In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("fake_job_postings.csv")

In [3]:
location = data["location"].str.split(",", expand= True, n= 2)
location.columns = ["country", "state", "city"]
data[["country", "state", "city"]] = location
data = data.drop(columns= "location")

In [4]:
salary = data["salary_range"].str.split("-", expand= True, n= 1)
data[["min_salary", "max_salary"]] = salary
data = data.drop(columns= "salary_range")

In [5]:
data = data.fillna("N/A")
data["state"] = data["state"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["country"] = data["country"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["city"] = data["city"].str.strip().apply(lambda x: "N/A" if x == '' else x)

In [6]:
for i in ["company_profile", "description", "requirements", "benefits"]:
    data[i] = data[i].str.lower()

In [7]:
data.isnull().sum()

job_id                 0
title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
min_salary             0
max_salary             0
dtype: int64

In [8]:
data.columns

Index(['job_id', 'title', 'department', 'company_profile', 'description',
       'requirements', 'benefits', 'telecommuting', 'has_company_logo',
       'has_questions', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'fraudulent', 'country',
       'state', 'city', 'min_salary', 'max_salary'],
      dtype='object')

In [9]:
pd.crosstab(data.fraudulent, data.min_salary=='N/A')

min_salary,False,True
fraudulent,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2645,14369
1,223,643


In [10]:
# train test split - 0.8, 0.1, 0.1, execute after preprocessing
# X_train, X_test, y_train, y_test = train_test_split(data, data["fraudulent"], test_size= 0.10, random_state= 42, stratify= data["fraudulent"])
# X_tran, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

### TF-IDF - Logistic Regression

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from matplotlib import pyplot

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
# tokenizer define
ps = PorterStemmer()
stop = set(stopwords.words('english'))
def tokenizer (doc):
    sentences = sent_tokenize(doc)
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [ps.stem(word) for word in words]
        tokens+=words
    return [w.lower() for w in tokens if w not in stop]

In [10]:
# combine text features and vectorize
df1 = data.copy()
text_feature = df1[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=tokenizer,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)

text_feature = tfidf.fit_transform(text_feature)


In [11]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df1[col] = lb.fit_transform(df1[col])


In [12]:
# scale
label_feature = df1[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [13]:
# build model
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=1.0, max_iter=10000).fit(X_train, y_train)
print(tfidf_clf.score(X_val, y_val))
print(precision_recall_fscore_support(y_val, tfidf_clf.predict(X_val), average='macro'))
print(precision_recall_fscore_support(y_val, tfidf_clf.predict(X_val), average='micro'))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='micro'))

0.9723320158102767
(0.9858708189158016, 0.7151162790697674, 0.7936471701657644, None)
(0.9723320158102767, 0.9723320158102767, 0.9723320158102767, None)
0.7151162790697674
0.7151162790697674


In [16]:
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", tfidf_clf.coef_[0][-8:][i])


employment_type | 0.07506910370737147
required_experience | 0.25703517165843615
required_education | 0.16477544425920318
industry | -0.01127769950150585
function | -0.4339682426136203
country | 0.43847843142883514
state | 0.07770028922464661
city | -0.007637709450692502


In [17]:
# if only processed with text - lower accruacy and other scores
# if keep 'required_experience', 'required_education', 'function', 'country' - same result as using all varialbes
# label_feature = df1[['required_experience', 'required_education', 'function', 'country']]
# scaler = StandardScaler().fit(label_feature)

# label_feature = scaler.transform(label_feature)
# X = hstack((text_feature, label_feature))
X = text_feature
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=1.0, max_iter=10000).fit(X_train, y_train)
print(tfidf_clf.score(X_val, y_val))
print(precision_recall_fscore_support(y_val, tfidf_clf.predict(X_val), average='macro'))
print(precision_recall_fscore_support(y_val, tfidf_clf.predict(X_val), average='micro'))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='micro'))

0.9700734048560136
(0.9847525891829689, 0.6918604651162791, 0.7695691773641973, None)
(0.9700734048560136, 0.9700734048560136, 0.9700734048560136, None)
0.6918604651162791
0.6918604651162791


### BOW - Logistic Regression

In [12]:
# combine text features and vectorize
df2 = data.copy()
text_feature = df2[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

bow = CountVectorizer(tokenizer=tokenizer)

text_feature = bow.fit_transform(text_feature)


In [13]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df2[col] = lb.fit_transform(df2[col])


In [14]:
# scale
label_feature = df2[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [15]:
# build model
X = hstack((text_feature, label_feature))
y = df2['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

bow_clf = LogisticRegression(C=1.0, max_iter=10000).fit(X_train, y_train)
print(bow_clf.score(X_val, y_val))
print(precision_recall_fscore_support(y_val, bow_clf.predict(X_val), average='macro'))
print(precision_recall_fscore_support(y_val, bow_clf.predict(X_val), average='micro'))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='micro'))

0.9847543760587238
(0.941065251572327, 0.8871609964805742, 0.9121561668145519, None)
(0.9847543760587238, 0.9847543760587238, 0.9847543760587238, None)
0.8871609964805741
0.8871609964805741


In [23]:
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", bow_clf.coef_[0][-8:][i])


employment_type | 0.2417920735850242
required_experience | 0.35007397092246073
required_education | 0.38439277528309646
industry | -0.4690553511020775
function | -0.6222970582320625
country | 0.2660940000419586
state | -0.04997706926619668
city | 0.07876774786835677


In [16]:
# adjusted model
label_feature = df2[['industry', 'function']]
#scaler = StandardScaler().fit(label_feature)

# label_feature = scaler.transform(label_feature)
X = hstack((text_feature, label_feature))
# X = text_feature
y = df2['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

bow_clf = LogisticRegression(C=1.0, max_iter=10000).fit(X_train, y_train)
print(bow_clf.score(X_val, y_val))
print(precision_recall_fscore_support(y_val, bow_clf.predict(X_val), average='macro'))
print(precision_recall_fscore_support(y_val, bow_clf.predict(X_val), average='micro'))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='micro'))

0.9864483342744212
(0.9596862533516448, 0.8880512041957077, 0.9205045336206121, None)
(0.9864483342744212, 0.9864483342744212, 0.9864483342744212, None)
0.8880512041957077
0.8880512041957077


### GloVe

In [10]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors, LdaModel, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import re
# load the GloVe 6B embeddings

glove_file = "glove.6B.100d.txt"
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)
glove_model = KeyedVectors.load_word2vec_format(tmp_file)

In [11]:
df3 = data.copy()
text_feature = df3[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)


In [12]:
def word_averaging(model, sentence):
    vectors = np.zeros(100)
    words = re.sub(r"\W+", " ", sentence).split()
    words = [w.lower() for w in words if w not in stop]
   # words = [ps.stem(word) for word in words]
    for i in words:
        try:
            vectors += model[i]
        except KeyError:
            pass
    return vectors / len(words)

In [13]:
# combine text features and vectorize
df3 = data.copy()
text = df3[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

text_feature = list(word_averaging(glove_model, i) for i in text_feature)



In [16]:
# scale
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df3[col] = lb.fit_transform(df3[col])

label_feature = df3[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [17]:
# build model
X = pd.concat([pd.DataFrame(text_feature),pd.DataFrame(label_feature)],axis=1).set_axis(list(range(108)), axis=1, inplace=False)
y = df3['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

glove_clf = LogisticRegression(C=1.0, max_iter=10000).fit(X_train, y_train)
print(glove_clf.score(X_val, y_val))
print(precision_recall_fscore_support(y_val, glove_clf.predict(X_val), average='macro'))
print(precision_recall_fscore_support(y_val, glove_clf.predict(X_val), average='micro'))
print(roc_auc_score(y_val, glove_clf.predict(X_val), average='macro'))
print(roc_auc_score(y_val, glove_clf.predict(X_val), average='micro'))

0.9565217391304348
(0.8478731074260994, 0.5799116693119868, 0.622131704005431, None)
(0.9565217391304348, 0.9565217391304348, 0.9565217391304348, None)
0.5799116693119868
0.5799116693119868


In [19]:
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", glove_clf.coef_[0][-8:][i])


employment_type | 0.08363491774954529
required_experience | 0.28352072276900686
required_education | 0.22064025558192668
industry | 0.1360812004010211
function | -0.507865381464418
country | 0.4400128548116464
state | 0.044940121431559524
city | -0.09606047404468133


In [20]:
X = pd.DataFrame(text_feature)#,pd.DataFrame(label_feature)],axis=1).set_axis(list(range(108)), axis=1, inplace=False)
y = df3['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

glove_clf = LogisticRegression(C=1.0, max_iter=10000).fit(X_train, y_train)
print(glove_clf.score(X_val, y_val))
print(precision_recall_fscore_support(y_val, glove_clf.predict(X_val), average='macro'))
print(precision_recall_fscore_support(y_val, glove_clf.predict(X_val), average='micro'))
print(roc_auc_score(y_val, glove_clf.predict(X_val), average='macro'))
print(roc_auc_score(y_val, glove_clf.predict(X_val), average='micro'))

0.9570863918690006
(0.8683526652722318, 0.5802084052170313, 0.6235624468608763, None)
(0.9570863918690006, 0.9570863918690006, 0.9570863918690006, None)
0.5802084052170313
0.5802084052170313
