In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score 
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

In [4]:
pd.set_option('display.max_columns', None)

In [5]:
data = pd.read_csv("fake_job_postings.csv")

In [6]:
location = data["location"].str.split(",", expand= True, n= 2)
location.columns = ["country", "state", "city"]
data[["country", "state", "city"]] = location
data = data.drop(columns= "location")

In [7]:
salary = data["salary_range"].str.split("-", expand= True, n= 1)
data[["min_salary", "max_salary"]] = salary
data = data.drop(columns= "salary_range")

In [8]:
data = data.fillna("N/A")
data["state"] = data["state"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["country"] = data["country"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["city"] = data["city"].str.strip().apply(lambda x: "N/A" if x == '' else x)

In [9]:
for i in ["company_profile", "description", "requirements", "benefits"]:
    data[i] = data[i].str.lower()

In [10]:
data.isnull().sum()

job_id                 0
title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
min_salary             0
max_salary             0
dtype: int64

<h2>Prototype Selection with K-Means</h2>

In [11]:
label_columns = ['telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function', 'country', 'state', 'city', 'min_salary', 'max_salary', 'department']
lb_make = LabelEncoder()
for i in label_columns:
  data[i] = lb_make.fit_transform(data[i])

In [12]:
data['text'] = data['title'].str.cat(data[['country', 'state', 'city', 'department','min_salary', 'max_salary','company_profile','description','requirements',
                                    'benefits','employment_type','required_education', 'required_experience', 'industry',
                                    'function']].astype(str), sep=' ')

In [13]:
data = data.drop(columns= ["job_id", "title", "company_profile", "description", "requirements", "benefits"])

In [14]:
data['text']=data.text.str.replace(r'\W',' ',regex=True)
data['text']=data.text.str.replace(r'\b\d+','',regex=True)
data['text']=data.text.str.replace(r'\S{20,}',' ',regex=True)
data['text']=data.text.str.replace(r'\s{2,}',' ',regex=True)

In [15]:
stop = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
data['text'] = data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [16]:
cv = TfidfVectorizer(max_features = 300)
x = cv.fit_transform(data['text'])
df1 = pd.DataFrame(x.toarray(), columns=cv.get_feature_names())
data.drop(["text"], axis=1, inplace=True)
main_df = pd.concat([df1,data], axis=1)

In [17]:
#train test split - 0.85:0.15
X_train, X_test, y_train, y_test = train_test_split(main_df, main_df["fraudulent"], test_size= 0.15, random_state= 42, stratify= main_df["fraudulent"])

In [18]:
y_train.value_counts()

0    14462
1      736
Name: fraudulent, dtype: int64

In [19]:
#Use K-means to select 736 non-fraudulent cases
km = KMeans(n_clusters= 736, random_state= 42).fit(X_train[y_train == 0])
non_fraud_centers = pd.DataFrame(data= km.cluster_centers_, columns= X_train.columns)

In [20]:
X_train_ps = pd.concat([X_train[y_train == 1], non_fraud_centers])
y_train_ps = np.concatenate([np.repeat(1, 736), np.repeat(0, 736)])

In [21]:
#KNN
knn = KNeighborsClassifier(n_neighbors= 2)
knn.fit(X_train_ps, y_train_ps)
y_test_pred = knn.predict(X_test)
print(classification_report(y_test, y_test_pred))

              precision    recall  f1-score   support

           0       0.97      0.91      0.94      2552
           1       0.24      0.53      0.33       130

    accuracy                           0.89      2682
   macro avg       0.61      0.72      0.64      2682
weighted avg       0.94      0.89      0.91      2682



In [22]:
knn2 = KNeighborsClassifier(n_neighbors= 10, weights= "distance")
knn2.fit(X_train, y_train)
y_test_pred_2 = knn2.predict(X_test)
print(classification_report(y_test, y_test_pred_2))

              precision    recall  f1-score   support

           0       0.97      1.00      0.98      2552
           1       0.89      0.44      0.59       130

    accuracy                           0.97      2682
   macro avg       0.93      0.72      0.79      2682
weighted avg       0.97      0.97      0.97      2682

