In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import collections
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import PorterStemmer
nltk.download('stopwords')
from nltk.corpus import stopwords
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [53]:
from scipy import sparse
import numpy as np
from scipy import stats
from scipy.sparse import vstack

In [2]:
pd.set_option('display.max_columns', None)

In [3]:
data = pd.read_csv("/content/fake_job_postings.csv")

In [4]:
location = data["location"].str.split(",", expand= True, n= 2)
location.columns = ["country", "state", "city"]
data[["country", "state", "city"]] = location
data = data.drop(columns= "location")

In [5]:
salary = data["salary_range"].str.split("-", expand= True, n= 1)
data[["min_salary", "max_salary"]] = salary
data = data.drop(columns= "salary_range")

In [6]:
data = data.fillna("N/A")
data["state"] = data["state"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["country"] = data["country"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["city"] = data["city"].str.strip().apply(lambda x: "N/A" if x == '' else x)

In [7]:
for i in ["company_profile", "description", "requirements", "benefits"]:
    data[i] = data[i].str.lower()

In [8]:
data.isnull().sum()

job_id                 0
title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
min_salary             0
max_salary             0
dtype: int64

<h2>Prototype Selection with K-Means + TFIDF</h2>

In [9]:
# tokenizer define
ps = PorterStemmer()
stop = set(stopwords.words('english'))
def tokenizer (doc):
    sentences = sent_tokenize(doc)
    tokens = []
    for sent in sentences:
        words = word_tokenize(sent)
        words = [ps.stem(word) for word in words]
        tokens+=words
    return [w.lower() for w in tokens if w not in stop]

In [10]:
# combine text features and vectorize
df1 = data.copy()
text_feature = df1[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=True,
                        preprocessor=None,  # applied preprocessor in Data Cleaning
                        tokenizer=tokenizer,
                        use_idf=True,
                        norm='l2',
                        smooth_idf=True)

text_feature = tfidf.fit_transform(text_feature)

In [11]:
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df1[col] = lb.fit_transform(df1[col])

In [12]:
# scale
label_feature = df1[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [13]:
X = hstack((text_feature, label_feature))
y = df1['fraudulent']

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

In [49]:
kmx = X_train[np.array(y_train == 0), :]

In [50]:
y_train.value_counts()

0    13628
1      693
Name: fraudulent, dtype: int64

In [51]:
#Use K-means to select 693 non-fraudulent cases
km = KMeans(n_clusters= 693, random_state= 42).fit(kmx)
non_fraud_centers = km.cluster_centers_

In [65]:
X_train_ps = vstack([sparse.csr_matrix(non_fraud_centers), X_train[np.array(y_train == 1), :]])
y_train_ps = np.concatenate([np.repeat(0, 693), np.repeat(1, 693)])

In [97]:
#KNN
knn = KNeighborsClassifier(n_neighbors= 1)
knn.fit(X_train_ps, y_train_ps)
y_val_pred = knn.predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.98      0.98      0.98      1685
           1       0.61      0.60      0.61        86

    accuracy                           0.96      1771
   macro avg       0.80      0.79      0.79      1771
weighted avg       0.96      0.96      0.96      1771



In [98]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred)
metrics.auc(fpr, tpr)

0.7925332965288799

In [99]:
knn2 = KNeighborsClassifier(n_neighbors= 1)
knn2.fit(X_train, y_train)
y_val_pred_2 = knn2.predict(X_val)
print(classification_report(y_val, y_val_pred_2))

              precision    recall  f1-score   support

           0       0.98      0.99      0.98      1685
           1       0.68      0.62      0.65        86

    accuracy                           0.97      1771
   macro avg       0.83      0.80      0.81      1771
weighted avg       0.97      0.97      0.97      1771



In [100]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred_2)
metrics.auc(fpr, tpr)

0.8007211372576082

In [None]:
y_val_test_2 = knn2.predict(X_test)
print(classification_report(y_test, y_val_test_2))

In [None]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_val_test_2)
metrics.auc(fpr, tpr)

<h2>Prototype Selection with K-Means + Countvectorizer</h2>

In [101]:
# combine text features and vectorize
df2 = data.copy()
text_feature = df2[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

bow = CountVectorizer(tokenizer=tokenizer)

text_feature = bow.fit_transform(text_feature)

In [102]:
# encode label features
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df2[col] = lb.fit_transform(df2[col])


In [103]:
# scale
label_feature = df2[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [104]:
X = hstack((text_feature, label_feature))
y = df1['fraudulent']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)
kmx = X_train[np.array(y_train == 0), :]

In [105]:
y_train.value_counts()

0    13628
1      693
Name: fraudulent, dtype: int64

In [106]:
#Use K-means to select 693 non-fraudulent cases
km = KMeans(n_clusters= 693, random_state= 42).fit(kmx)
non_fraud_centers = km.cluster_centers_

In [107]:
X_train_ps = vstack([sparse.csr_matrix(non_fraud_centers), X_train[np.array(y_train == 1), :]])
y_train_ps = np.concatenate([np.repeat(0, 693), np.repeat(1, 693)])

In [108]:
#KNN
knn = KNeighborsClassifier(n_neighbors= 1)
knn.fit(X_train_ps, y_train_ps)
y_val_pred = knn.predict(X_val)
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1685
           1       0.91      0.70      0.79        86

    accuracy                           0.98      1771
   macro avg       0.95      0.85      0.89      1771
weighted avg       0.98      0.98      0.98      1771



In [124]:
y_val_pred = knn.predict(X_val)
print(classification_report(y_val, y_val_pred, digits= 6))

              precision    recall  f1-score   support

           0   0.984751  0.996439  0.990560      1685
           1   0.909091  0.697674  0.789474        86

    accuracy                       0.981931      1771
   macro avg   0.946921  0.847057  0.890017      1771
weighted avg   0.981077  0.981931  0.980796      1771



In [109]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred)
metrics.auc(fpr, tpr)

0.8470567938720585

In [115]:
y_test_pred = knn.predict(X_test)
print(classification_report(y_test, y_test_pred, digits= 6))

              precision    recall  f1-score   support

           0   0.984884  0.995885  0.990354      1701
           1   0.897059  0.701149  0.787097        87

    accuracy                       0.981544      1788
   macro avg   0.940971  0.848517  0.888725      1788
weighted avg   0.980610  0.981544  0.980464      1788



In [116]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred)
metrics.auc(fpr, tpr)

0.848517099474954

In [110]:
knn2 = KNeighborsClassifier(n_neighbors= 1)
knn2.fit(X_train, y_train)
y_val_pred_2 = knn2.predict(X_val)
print(classification_report(y_val, y_val_pred_2))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1685
           1       0.69      0.83      0.75        86

    accuracy                           0.97      1771
   macro avg       0.84      0.90      0.87      1771
weighted avg       0.98      0.97      0.97      1771



In [117]:
print(classification_report(y_val, y_val_pred_2, digits= 6))

              precision    recall  f1-score   support

           0   0.991007  0.981009  0.985983      1685
           1   0.689320  0.825581  0.751323        86

    accuracy                       0.973461      1771
   macro avg   0.840164  0.903295  0.868653      1771
weighted avg   0.976357  0.973461  0.974588      1771



In [111]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, y_val_pred_2)
metrics.auc(fpr, tpr)

0.9032951487129942

In [121]:
y_test_pred_2 = knn2.predict(X_test)
print(classification_report(y_test, y_test_pred_2, digits= 6))

              precision    recall  f1-score   support

           0   0.992294  0.984127  0.988194      1701
           1   0.732673  0.850575  0.787234        87

    accuracy                       0.977629      1788
   macro avg   0.862484  0.917351  0.887714      1788
weighted avg   0.979661  0.977629  0.978415      1788



In [122]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_test_pred_2)
metrics.auc(fpr, tpr)

0.9173508483853312

Glove

In [127]:
from gensim.test.utils import datapath, get_tmpfile
from gensim.corpora import Dictionary
from gensim.models import KeyedVectors, LdaModel, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import datapath, get_tmpfile
import re
# load the GloVe 6B embeddings

glove_file = "/content/glove.6B.100d.txt"
tmp_file = get_tmpfile("test_word2vec.txt")

_ = glove2word2vec(glove_file, tmp_file)
glove_model = KeyedVectors.load_word2vec_format(tmp_file)

In [128]:
df3 = data.copy()
text_feature = df3[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

In [129]:
def word_averaging(model, sentence):
    vectors = np.zeros(100)
    words = re.sub(r"\W+", " ", sentence).split()
    words = [w.lower() for w in words if w not in stop]
   # words = [ps.stem(word) for word in words]
    for i in words:
        try:
            vectors += model[i]
        except KeyError:
            pass
    return vectors / len(words)

In [130]:
# combine text features and vectorize
df3 = data.copy()
text = df3[['title', 'department','company_profile','description','requirements','benefits']].apply(lambda x: ' '.join(x), axis = 1)

text_feature = list(word_averaging(glove_model, i) for i in text_feature)

In [131]:
# scale
lb = LabelEncoder()

for col in ['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']:
    df3[col] = lb.fit_transform(df3[col])

label_feature = df3[['employment_type', 'required_experience', 'required_education', 'industry', 'function', 'country',
       'state', 'city']]
scaler = StandardScaler().fit(label_feature)

label_feature = scaler.transform(label_feature)

In [132]:
# build model
X = pd.concat([pd.DataFrame(text_feature),pd.DataFrame(label_feature)],axis=1).set_axis(list(range(108)), axis=1, inplace=False)
y = df3['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.10, random_state= 42, stratify= y)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42, stratify= y_train)

glove_clf = KNeighborsClassifier(n_neighbors= 1)
glove_clf.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=1, p=2,
                     weights='uniform')

In [144]:
glove_clf = KNeighborsClassifier(n_neighbors= 1)
glove_clf.fit(X_train, y_train)
glove_val_pred = glove_clf.predict(X_val)
print(classification_report(y_val, glove_val_pred, digits= 6))

              precision    recall  f1-score   support

           0   0.978274  0.988724  0.983471      1685
           1   0.720588  0.569767  0.636364        86

    accuracy                       0.968379      1771
   macro avg   0.849431  0.779246  0.809917      1771
weighted avg   0.965760  0.968379  0.966615      1771



In [145]:
fpr, tpr, thresholds = metrics.roc_curve(y_val, glove_val_pred)
metrics.auc(fpr, tpr)

0.779245738734387

In [146]:
glove_test_pred = glove_clf.predict(X_test)
print(classification_report(y_test, glove_test_pred, digits= 6))

              precision    recall  f1-score   support

           0   0.980198  0.989418  0.984786      1701
           1   0.746479  0.609195  0.670886        87

    accuracy                       0.970917      1788
   macro avg   0.863338  0.799307  0.827836      1788
weighted avg   0.968826  0.970917  0.969513      1788



In [147]:
fpr, tpr, thresholds = metrics.roc_curve(y_test, glove_test_pred)
metrics.auc(fpr, tpr)

0.79930669585842