In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("fake_job_postings.csv")

In [3]:
location = data["location"].str.split(",", expand= True, n= 2)
location.columns = ["country", "state", "city"]
data[["country", "state", "city"]] = location
data = data.drop(columns= "location")

In [4]:
salary = data["salary_range"].str.split("-", expand= True, n= 1)
data[["min_salary", "max_salary"]] = salary
data = data.drop(columns= "salary_range")

In [5]:
data = data.fillna("N/A")
data["state"] = data["state"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["country"] = data["country"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["city"] = data["city"].str.strip().apply(lambda x: "N/A" if x == '' else x)

In [6]:
for i in ["company_profile", "description", "requirements", "benefits"]:
    data[i] = data[i].str.lower()

In [7]:
data.isnull().sum()

job_id                 0
title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
min_salary             0
max_salary             0
dtype: int64

In [8]:
data.columns

Index(['job_id', 'title', 'department', 'company_profile', 'description',
       'requirements', 'benefits', 'telecommuting', 'has_company_logo',
       'has_questions', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'fraudulent', 'country',
       'state', 'city', 'min_salary', 'max_salary'],
      dtype='object')

In [9]:
pd.crosstab(data.fraudulent, data.min_salary=='N/A')

min_salary,False,True
fraudulent,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2645,14369
1,223,643


In [10]:
# train test split - 0.8, 0.1, 0.1, execute after preprocessing
# X_train, X_test, y_train, y_test = train_test_split(data, data["fraudulent"], test_size= 0.10, random_state= 42, stratify= data["fraudulent"])
# X_tran, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

### TF-IDF - Logistic Regression

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_selection import RFE
from matplotlib import pyplot

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
# tokenizer define
ps = PorterStemmer()
stop = set(stopwords.words('english'))
def tokenizer (doc):
    sentences = sent_tokenize(doc)
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [ps.stem(word) for word in words]
        tokens+=words
    return [w.lower() for w in tokens if w not in stop]

In [12]:
# combine text features and vectorize
df1 = data.copy()
text_feature = df1[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=tokenizer,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)

text_feature = tfidf.fit_transform(text_feature)


In [13]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df1[col] = lb.fit_transform(df1[col])


In [14]:
# scale
label_feature = df1[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [78]:
# build model
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=1, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992257  0.988724  0.990488      1685
           1   0.793478  0.848837  0.820225        86

    accuracy                       0.981931      1771
   macro avg   0.892868  0.918781  0.905356      1771
weighted avg   0.982605  0.981931  0.982220      1771

0.918780622455317


In [29]:
# checking feature importance
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", tfidf_clf.coef_[0][-8:][i])


employment_type | 0.07506910370737147
required_experience | 0.25703517165843615
required_education | 0.16477544425920318
industry | -0.01127769950150585
function | -0.4339682426136203
country | 0.43847843142883514
state | 0.07770028922464661
city | -0.007637709450692502


In [73]:
# if only processed with text - lower accruacy and other scores
# if keep 'required_experience', 'required_education', 'function', 'country' - same result as using all varialbes
# label_feature = df1[['required_experience', 'required_education', 'function', 'country']]
# scaler = StandardScaler().fit(label_feature)

# label_feature = scaler.transform(label_feature)
# X = hstack((text_feature, label_feature))
X = text_feature
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992253  0.988131  0.990187      1685
           1   0.784946  0.848837  0.815642        86

    accuracy                       0.981366      1771
   macro avg   0.888599  0.918484  0.902915      1771
weighted avg   0.982186  0.981366  0.981711      1771

0.9184838865502726


In [75]:
# tunning
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

tfidf_clf = LogisticRegression(C=10, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.991677  0.989911  0.990793      1685
           1   0.808989  0.837209  0.822857        86

    accuracy                       0.982496      1771
   macro avg   0.900333  0.913560  0.906825      1771
weighted avg   0.982805  0.982496  0.982638      1771

0.9135601407770341


In [83]:
# tunning
tfidf_clf = LogisticRegression(C=0.1, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.991627  0.983976  0.987787      1685
           1   0.727273  0.837209  0.778378        86

    accuracy                       0.976849      1771
   macro avg   0.859450  0.910593  0.883083      1771
weighted avg   0.978790  0.976849  0.977618      1771

0.910592781726589


In [79]:
confusion_matrix(y_val, tfidf_clf.predict(X_val))

array([[1666,   19],
       [  13,   73]])

In [87]:
# final model
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)


tfidf_clf = LogisticRegression(C=1, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
#print(classification_report(y_val, tfidf_clf.predict(X_val), digits=6))
#print(roc_auc_score(y_val, tfidf_clf.predict(X_val), average='macro'))

In [88]:
# Apply on test set
print(classification_report(y_test, tfidf_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, tfidf_clf.predict(X_test), average='macro'))


              precision    recall  f1-score   support

           0   0.994712  0.995297  0.995004      1701
           1   0.906977  0.896552  0.901734        87

    accuracy                       0.990492      1788
   macro avg   0.950844  0.945924  0.948369      1788
weighted avg   0.990443  0.990492  0.990466      1788

0.9459243041618521


In [97]:
confusion_matrix(y_test, tfidf_clf.predict(X_test), labels=[1,0]) 

array([[  78,    9],
       [   8, 1693]])

### BOW - Logistic Regression

In [106]:
# combine text features and vectorize
df2 = data.copy()
text_feature = df2[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

bow = CountVectorizer(tokenizer=tokenizer)

text_feature = bow.fit_transform(text_feature)


In [107]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df2[col] = lb.fit_transform(df2[col])


In [108]:
# scale
label_feature = df2[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [109]:
# build model
X = hstack((text_feature, label_feature))
y = df2['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

bow_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, bow_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992253  0.988131  0.990187      1685
           1   0.784946  0.848837  0.815642        86

    accuracy                       0.981366      1771
   macro avg   0.888599  0.918484  0.902915      1771
weighted avg   0.982186  0.981366  0.981711      1771

0.9184838865502726


In [55]:
# feature importance
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", bow_clf.coef_[0][-8:][i])


employment_type | 0.2417920735850242
required_experience | 0.35007397092246073
required_education | 0.38439277528309646
industry | -0.4690553511020775
function | -0.6222970582320625
country | 0.2660940000419586
state | -0.04997706926619668
city | 0.07876774786835677


In [110]:
# adjusted model
label_feature = df2[['industry', 'function']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)
X = hstack((text_feature, label_feature))
y = df2['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

bow_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, bow_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992257  0.988724  0.990488      1685
           1   0.793478  0.848837  0.820225        86

    accuracy                       0.981931      1771
   macro avg   0.892868  0.918781  0.905356      1771
weighted avg   0.982605  0.981931  0.982220      1771

0.918780622455317


In [111]:
# tuning
bow_clf = LogisticRegression(C=5, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)
print(classification_report(y_val, bow_clf.predict(X_val), digits=6))
print(roc_auc_score(y_val, bow_clf.predict(X_val), average='macro'))

              precision    recall  f1-score   support

           0   0.992262  0.989318  0.990788      1685
           1   0.802198  0.848837  0.824859        86

    accuracy                       0.982496      1771
   macro avg   0.897230  0.919077  0.907823      1771
weighted avg   0.983032  0.982496  0.982730      1771

0.9190773583603615


In [116]:
# final model
label_feature = df2[['industry', 'function']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)
X = hstack((text_feature, label_feature))
# X = text_feature
y = df2['fraudulent']

bow_clf = LogisticRegression(C=5, max_iter=10000, class_weight = 'balanced').fit(X_train, y_train)


In [117]:
# apply on test set
print(classification_report(y_test, bow_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, bow_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.993545  0.995297  0.994420      1701
           1   0.904762  0.873563  0.888889        87

    accuracy                       0.989374      1788
   macro avg   0.949153  0.934430  0.941654      1788
weighted avg   0.989225  0.989374  0.989285      1788

0.9344300512882888


In [119]:
confusion_matrix(y_test, bow_clf.predict(X_test), labels=[1,0])

array([[  76,   11],
       [   8, 1693]])

### GloVe

In [123]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors, LdaModel, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import re
# load the GloVe 6B embeddings

glove_file = "glove.6B.100d.txt"
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)
glove_model = KeyedVectors.load_word2vec_format(tmp_file)

In [120]:
df3 = data.copy()
text_feature = df3[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)


In [121]:
def word_averaging(model, sentence):
    vectors = np.zeros(100)
    words = re.sub(r"\W+", " ", sentence).split()
    words = [w.lower() for w in words if w not in stop]
   # words = [ps.stem(word) for word in words]
    for i in words:
        try:
            vectors += model[i]
        except KeyError:
            pass
    return vectors / len(words)

In [124]:
# combine text features and vectorize
df3 = data.copy()
text = df3[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

text_feature = list(word_averaging(glove_model, i) for i in text_feature)



In [125]:
# scale
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df3[col] = lb.fit_transform(df3[col])

label_feature = df3[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [128]:
# build model
X = pd.concat([pd.DataFrame(text_feature),pd.DataFrame(label_feature)],axis=1).set_axis(list(range(108)), axis=1, inplace=False)
y = df3['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

glove_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight="balanced").fit(X_train, y_train)
print(classification_report(y_test, glove_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, glove_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.987787  0.808348  0.889104      1701
           1   0.176768  0.804598  0.289855        87

    accuracy                       0.808166      1788
   macro avg   0.582278  0.806473  0.589480      1788
weighted avg   0.948325  0.808166  0.859946      1788

0.8064728658598391


In [129]:
cols = ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']
for i in range(8):
    print(cols[i], "|", glove_clf.coef_[0][-8:][i])


employment_type | 0.12492398007500559
required_experience | 0.2766101989436487
required_education | 0.15794061706875598
industry | 0.12733947898596698
function | -0.4751378421703102
country | 0.5027073672831244
state | 0.04529280621792042
city | -0.021495299046772352


In [143]:
# only include important features
X = pd.concat([pd.DataFrame(text_feature),pd.DataFrame(label_feature)[5]],axis=1).set_axis(list(range(101)), axis=1, inplace=False)
y = df3['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

glove_clf = LogisticRegression(C=1.0, max_iter=10000, class_weight="balanced").fit(X_train, y_train)
print(classification_report(y_test, glove_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, glove_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.990674  0.811875  0.892407      1701
           1   0.187817  0.850575  0.307692        87

    accuracy                       0.813758      1788
   macro avg   0.589246  0.831225  0.600050      1788
weighted avg   0.951609  0.813758  0.863956      1788

0.8312250400373006


In [144]:
# tuning
glove_clf = LogisticRegression(C=2, max_iter=10000, class_weight="balanced").fit(X_train, y_train)
print(classification_report(y_test, glove_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, glove_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.990728  0.816578  0.895263      1701
           1   0.191710  0.850575  0.312896        87

    accuracy                       0.818233      1788
   macro avg   0.591219  0.833577  0.604080      1788
weighted avg   0.951849  0.818233  0.866926      1788

0.8335765979444141


In [145]:
# tuning
glove_clf = LogisticRegression(C=100, max_iter=10000, class_weight="balanced").fit(X_train, y_train)
print(classification_report(y_test, glove_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, glove_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.991519  0.824809  0.900513      1701
           1   0.201072  0.862069  0.326087        87

    accuracy                       0.826622      1788
   macro avg   0.596296  0.843439  0.613300      1788
weighted avg   0.953058  0.826622  0.872563      1788

0.8434389507186442


In [146]:
# final model
X = pd.concat([pd.DataFrame(text_feature),pd.DataFrame(label_feature)[5]],axis=1).set_axis(list(range(101)), axis=1, inplace=False)
y = df3['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

glove_clf = LogisticRegression(C=100, max_iter=10000, class_weight="balanced").fit(X_train, y_train)
print(classification_report(y_test, glove_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, glove_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.991519  0.824809  0.900513      1701
           1   0.201072  0.862069  0.326087        87

    accuracy                       0.826622      1788
   macro avg   0.596296  0.843439  0.613300      1788
weighted avg   0.953058  0.826622  0.872563      1788

0.8434389507186442


In [147]:
# applying on test set
print(classification_report(y_test, glove_clf.predict(X_test), digits=6))
print(roc_auc_score(y_test, glove_clf.predict(X_test), average='macro'))

              precision    recall  f1-score   support

           0   0.991519  0.824809  0.900513      1701
           1   0.201072  0.862069  0.326087        87

    accuracy                       0.826622      1788
   macro avg   0.596296  0.843439  0.613300      1788
weighted avg   0.953058  0.826622  0.872563      1788

0.8434389507186442


In [148]:
confusion_matrix(y_test, glove_clf.predict(X_test), labels=[1,0])

array([[  75,   12],
       [ 298, 1403]])