In [None]:
# Requires pip install:
# pip install xgboost imbalanced-learn

In [None]:
# Import the data
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE

# Read the original and the formatted versions of the job-postings data.
og_df = pd.read_csv('./original dataset/fake_job_postings.csv')
og_form = pd.read_csv('./formatted dataset/fake_job_postings.csv')

# Work on copies so the frames as loaded from disk stay untouched.
df, form = og_df.copy(), og_form.copy()

# Preview the first rows of both working frames.
display(df.head(), form.head())

In [None]:
# NOTE(review): the original note on this cell said the encoding was being moved
# below because running it before train_test_split causes data leakage (the
# encoder is fitted on every row). The code is still live, and it removes
# 'department' from `df`, which later cells that read df['department'] and
# x_train[['department']] rely on. Decide which of the two encodings stays.

# One-hot encode 'department'. min_frequency=20 groups rarely seen departments
# together; handle_unknown="ignore" tolerates unseen values at transform time.
ohe = OneHotEncoder(min_frequency=20, handle_unknown="ignore")
dept_encoded = ohe.fit_transform(df[['department']])
# Convert the sparse matrix to a dense array
dept_encoded_dense = dept_encoded.toarray()
# Build the encoded frame on df's own index so the join below is row-aligned
dept_df = pd.DataFrame(
    dept_encoded_dense,
    columns=ohe.get_feature_names_out(["department"]),
    index=df.index,
)
# Replace the raw column with its encoded columns
df = df.drop(columns=['department']).join(dept_df)

In [None]:
# Columns treated as leakage are removed from both frames.
data_leak = ['job_id']
form = form.drop(columns=data_leak)
df = df.drop(columns=data_leak)

# Baseline test: leave these columns out of the original-dataset frame.
df = df.drop(columns=['company_profile','description','requirements','benefits','location','salary_range','title'])

# Categorical fields that aren't long text fields.
categorical=['employment_type', 'required_experience', 'required_education', 'industry', 'function']

# Snapshot both frames while they still hold the categorical fields, typed as
# 'category' so the XGB DMatrix can handle them when flagged as categorical.
df_categorical = df.astype({col: 'category' for col in categorical})
form_cat = form.astype({col: 'category' for col in categorical})

# The plain frames continue without the categorical fields.
df = df.drop(columns=categorical)
form = form.drop(columns=categorical)

In [None]:
# most basic model: every column except the label is a feature
y = df['fraudulent']
x = df.drop(columns=['fraudulent'])
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# oversampling variant: same features and same split settings as the basic model.
# SMOTE is not applied in this cell; the resampling step lives in its own cell
# further down, after the department one-hot encoding.
y_os = df['fraudulent']
x_os = df.drop(columns=['fraudulent'])
x_train_os, x_test_os, y_train_os, y_test_os = train_test_split(
    x_os, y_os, test_size=0.2, random_state=0
)

# oversampling + categorical fields variant
y_os_cat = df_categorical['fraudulent']
x_os_cat = df_categorical.drop(columns=['fraudulent'])
x_train_os_cat, x_test_os_cat, y_train_os_cat, y_test_os_cat = train_test_split(
    x_os_cat, y_os_cat, test_size=0.2, random_state=0
)

In [None]:
# Wrap each train/test dataframe pair in an XGBoost DMatrix

# base model
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test, label=y_test)

# oversampling variant
dtrain_os = xgb.DMatrix(data=x_train_os, label=y_train_os)
dtest_os = xgb.DMatrix(data=x_test_os, label=y_test_os)

# categorical variant: 'category' columns are passed through as categorical
dtrain_os_cat = xgb.DMatrix(data=x_train_os_cat, label=y_train_os_cat, enable_categorical=True)
dtest_os_cat = xgb.DMatrix(data=x_test_os_cat, label=y_test_os_cat, enable_categorical=True)


In [None]:
# Booster settings shared by all three models, plus the number of boosting rounds
params = dict(max_depth=2, eta=1, objective='binary:logistic')
num_round = 10

# One booster per training DMatrix
xg_model = xgb.train(params, dtrain, num_boost_round=num_round)
xg_model_os = xgb.train(params, dtrain_os, num_boost_round=num_round)
xg_model_os_cat = xgb.train(params, dtrain_os_cat, num_boost_round=num_round)

In [None]:
# Extra cleanup of the formatted dataset

# More fields to set as category for the DMatrix.
# Note: this rebinds `categorical`, replacing the shorter list defined earlier.
categorical = ['company_profile', 'description', 'requirements', 'benefits', 'title', 'country','state','city','salary_min','salary_max']

form_cat[categorical] = form_cat[categorical].astype('category')

# The prediction-thresholding lines that used to sit here referenced preds,
# preds_os and preds_os_cat before any prediction had been made; that step is
# done in the prediction cell, so it is not repeated here.

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly rather than wrapped in display().
form_cat.info()

In [None]:
# Label missing departments as 'Unknown' and store the column as text.
# Guarded so the cell can run when 'department' is no longer in `df` (an earlier
# cell one-hot encodes and drops it) and so re-running it is harmless.
if 'department' in df.columns:
    df['department'] = df['department'].fillna('Unknown').astype('str')
    df['department'].info()
else:
    print("'department' is not a column of df (already encoded/dropped); nothing to clean")
# NOTE(review): x / x_train were derived from `df` in an earlier cell, so this
# clean-up presumably does not reach the split frames; confirm, and move it
# ahead of the split if it is meant to affect them.

In [None]:
# cleanup following the train_test_split to avoid data leakage

def encode_department(train, test, column='department', min_frequency=20):
    """One-hot encode `column`, learning the categories from `train` only.

    The encoder is fitted on the training frame and then applied unchanged to
    the test frame, so both frames end up with the same encoded columns and no
    information from the test rows is used when choosing categories.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Feature frames produced by train_test_split.
    column : str
        Name of the column to encode.
    min_frequency : int
        Passed to OneHotEncoder; rarer categories are grouped together.

    Returns
    -------
    (train_encoded, test_encoded) : tuple of pandas.DataFrame
        New frames with `column` replaced by its one-hot columns. If `column`
        is absent from either frame (for example the cell was already run),
        the inputs are returned unchanged.
    """
    if column not in train.columns or column not in test.columns:
        return train, test

    def _as_text(frame):
        # Missing values become an explicit 'Unknown' label; everything is text.
        return frame[[column]].astype(object).fillna('Unknown').astype(str)

    encoder = OneHotEncoder(min_frequency=min_frequency, handle_unknown="ignore")
    encoder.fit(_as_text(train))
    encoded_names = encoder.get_feature_names_out([column])

    def _swap_in_encoding(frame):
        dense = encoder.transform(_as_text(frame)).toarray()
        # Reuse the frame's own index: the split frames keep their original
        # (shuffled) row labels, and join() aligns on the index.
        encoded = pd.DataFrame(dense, columns=encoded_names, index=frame.index)
        return frame.drop(columns=[column]).join(encoded)

    return _swap_in_encoding(train), _swap_in_encoding(test)


x_train, x_test = encode_department(x_train, x_test)
x_train_os, x_test_os = encode_department(x_train_os, x_test_os)
x_train_os_cat, x_test_os_cat = encode_department(x_train_os_cat, x_test_os_cat)

# NOTE(review): x_train_form / x_test_form are not assigned anywhere in the
# visible notebook; confirm where the formatted-dataset split is created.
x_train_form, x_test_form = encode_department(x_train_form, x_test_form)

x_train.head()

In [None]:
# Resample the oversampling variant's training split with SMOTE. This runs only
# now that the department column has been one-hot encoded; the test split is
# not resampled.
smote = SMOTE(random_state=42)
x_train_os, y_train_os = smote.fit_resample(X=x_train_os, y=y_train_os)

In [None]:
# Wrap each train/test pandas dataframe pair in an XGBoost DMatrix

# base model
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test, label=y_test)

# oversampled variant
dtrain_os = xgb.DMatrix(data=x_train_os, label=y_train_os)
dtest_os = xgb.DMatrix(data=x_test_os, label=y_test_os)

# categorical variant: 'category' columns are passed through as categorical
dtrain_os_cat = xgb.DMatrix(data=x_train_os_cat, label=y_train_os_cat, enable_categorical=True)
dtest_os_cat = xgb.DMatrix(data=x_test_os_cat, label=y_test_os_cat, enable_categorical=True)

# formatted-dataset variant
# NOTE(review): x_train_form / y_train_form / x_test_form / y_test_form are not
# assigned anywhere in the visible notebook; confirm where they come from.
dtrain_form = xgb.DMatrix(data=x_train_form, label=y_train_form, enable_categorical=True)
dtest_form = xgb.DMatrix(data=x_test_form, label=y_test_form, enable_categorical=True)


In [None]:
# Booster settings shared by all three models, plus the number of boosting rounds
params = dict(max_depth=2, eta=1, objective='binary:logistic')
num_round = 10

# One booster per training DMatrix
xg_model = xgb.train(params, dtrain, num_boost_round=num_round)
xg_model_os = xgb.train(params, dtrain_os, num_boost_round=num_round)
xg_model_os_cat = xgb.train(params, dtrain_os_cat, num_boost_round=num_round)

In [None]:
# Score each test DMatrix with its matching booster
preds = xg_model.predict(dtest)
preds_os = xg_model_os.predict(dtest_os)
preds_os_cat = xg_model_os_cat.predict(dtest_os_cat)

# Threshold the model outputs at 0.5 to get 0/1 labels
preds_binary = np.greater_equal(preds, 0.5).astype(int)
preds_binary_os = np.greater_equal(preds_os, 0.5).astype(int)
preds_binary_os_cat = np.greater_equal(preds_os_cat, 0.5).astype(int)

# Balanced accuracy of each model on its test split
print(
    'Base Model: ',
    balanced_accuracy_score(y_test, preds_binary),
)
print(
    'Oversampled Model: ',
    balanced_accuracy_score(y_test_os, preds_binary_os),
)
print(
    'Oversample Categorized Model: ',
    balanced_accuracy_score(y_test_os_cat, preds_binary_os_cat),
)

In [None]:
# Confusion matrix of the base model's thresholded predictions on the test split
cm = confusion_matrix(y_test, preds_binary)
print('Confusion Matrix:')
print(cm)

# Plot it, with a title so this figure can be told apart from the other models'
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Base Model')
plt.show()

In [None]:
# Confusion matrix of the oversampled model's thresholded predictions on the test split
cm = confusion_matrix(y_test_os, preds_binary_os)
print('Confusion Matrix:')
print(cm)

# Plot it, with a title so this figure can be told apart from the other models'
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Oversampled Model')
plt.show()

In [None]:
# Confusion matrix of the oversampled + categorical model's thresholded
# predictions on the test split
cm = confusion_matrix(y_test_os_cat, preds_binary_os_cat)
print('Confusion Matrix:')
print(cm)

# Plot it, with a title so this figure can be told apart from the other models'
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
disp.plot(cmap=plt.cm.Blues)
disp.ax_.set_title('Oversample Categorized Model')
plt.show()

In [None]:
from xgboost import XGBClassifier, XGBRegressor

# Scikit-learn style XGBoost classifier with fixed hyper-parameters
xgbc = XGBClassifier(n_estimators=136, max_depth=4, learning_rate=.13, random_state=1)

# Fit on the training split only. Fitting on x_test_os / y_test_os (as this cell
# did before) lets the model see the evaluation rows, so the testing score
# would not measure generalisation.
xgbc.fit(x_train_os, y_train_os)

# Predict on both splits so training and testing scores can be compared
train_pred_os = xgbc.predict(x_train_os)
test_pred_os = xgbc.predict(x_test_os)

In [None]:

# Heading names the data the classifier actually used: the SMOTE-resampled
# `_os` frames, which do not include the categorical fields.
print('\nSMOTE Oversampled Data with Tuned XGB Classifier')
# Print balanced accuracy on the training and testing splits
print("--------------------------------------------------------")
print(balanced_accuracy_score(y_train_os, train_pred_os),'training score')
print(balanced_accuracy_score(y_test_os, test_pred_os),'testing score')