In [2]:
# Import the data
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

%matplotlib inline

# Read the raw job-postings CSV (path is relative to the notebook's working
# directory) and preview the first five rows.
raw_csv_path = "original dataset/fake_job_postings.csv"
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


## Explore the dataframe

In [3]:
# Check the shape of the dataframe: (number of rows, number of columns)
df.shape

(17880, 18)

In [4]:
# Display a quick summary of the dataframe: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   job_id               17880 non-null  int64 
 1   title                17880 non-null  object
 2   location             17534 non-null  object
 3   department           6333 non-null   object
 4   salary_range         2868 non-null   object
 5   company_profile      14572 non-null  object
 6   description          17879 non-null  object
 7   requirements         15184 non-null  object
 8   benefits             10668 non-null  object
 9   telecommuting        17880 non-null  int64 
 10  has_company_logo     17880 non-null  int64 
 11  has_questions        17880 non-null  int64 
 12  employment_type      14409 non-null  object
 13  required_experience  10830 non-null  object
 14  required_education   9775 non-null   object
 15  industry             12977 non-null  object
 16  function             11425 non-null  object
 17  fraudulent           17880 non-null  int64 
dtypes: int64(5), object(13)

In [5]:
# Count the missing (NaN) entries in each column
df.isna().sum()

job_id                     0
title                      0
location                 346
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
dtype: int64

In [7]:
# Count the distinct values in each column (NaN is excluded by default)
# to see which columns are binary flags, low-cardinality categories or free text
df.nunique()

job_id                 17880
title                  11231
location                3105
department              1337
salary_range             874
company_profile         1709
description            14801
requirements           11967
benefits                6204
telecommuting              2
has_company_logo           2
has_questions              2
employment_type            5
required_experience        7
required_education        13
industry                 131
function                  37
fraudulent                 2
dtype: int64

In [6]:
# Review summary statistics for the numeric columns
# (object columns are left out of describe() by default on a mixed-dtype frame)
df.describe()

Unnamed: 0,job_id,telecommuting,has_company_logo,has_questions,fraudulent
count,17880.0,17880.0,17880.0,17880.0,17880.0
mean,8940.5,0.042897,0.795302,0.491723,0.048434
std,5161.655742,0.202631,0.403492,0.499945,0.214688
min,1.0,0.0,0.0,0.0,0.0
25%,4470.75,0.0,1.0,0.0,0.0
50%,8940.5,0.0,1.0,0.0,0.0
75%,13410.25,0.0,1.0,1.0,0.0
max,17880.0,1.0,1.0,1.0,1.0


## Build the Baseline Model v1 (Run the model with no updates to the dataset)

In [None]:
# Make a copy of the dataframe; later changes to the copy leave `df` untouched
df_baseline_1 = df.copy()

In [None]:
# Return a list of columns with object dtypes to drop
drop_strings = df_baseline_1.select_dtypes(include="object").columns.tolist()

# Split features and target.
# "fraudulent" is an integer column, so dropping only the object columns would
# leave the target inside X and let the model read the answer directly
# (target leakage). Remove it from the features explicitly.
X = df_baseline_1.drop(columns=drop_strings + ["fraudulent"])
y = df_baseline_1["fraudulent"]

In [None]:
# Partition features and target into train/test subsets.
# The fixed random_state makes the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Create the random forest classifier instance:
# 500 trees, with a fixed random_state so repeated runs build the same forest.
# (The original line ended with a stray "." which is a syntax error.)
rf_baseline_model_1 = RandomForestClassifier(random_state=1, n_estimators=500)


In [None]:
# Fit the model on the training split.
# fit() is a method of the estimator (there is no free function `fit`) and it
# returns the fitted estimator itself, so the reassignment keeps the same
# object while suppressing the estimator repr in the cell output.
rf_baseline_model_1 = rf_baseline_model_1.fit(X_train, y_train)

In [None]:
# Predict the class label for every row of the held-out test features
predictions = rf_baseline_model_1.predict(X_test)

In [None]:
# Fetch the per-feature importance scores from the fitted forest
importances = rf_baseline_model_1.feature_importances_

# Pair each score with its column name and rank from most to least important,
# then show (up to) the ten highest-ranked features
importances_sorted = sorted(
    zip(importances, X.columns),
    reverse=True,
)
importances_sorted[:10]

In [None]:
# Plot the feature importances as a horizontal bar chart.
# Sorting ascending puts the most important feature at the top, because barh
# draws the first entry at the bottom of the axis.
features = sorted(zip(X.columns, importances), key=lambda pair: pair[1])
cols = [name for name, _score in features]
width = [score for _name, score in features]

# Use the Axes API throughout instead of mixing it with pyplot's implicit
# "current axes" state.
fig, ax = plt.subplots(figsize=(8, 6))
ax.margins(y=0.001)
ax.barh(y=cols, width=width)

# A figure should be readable on its own when the notebook is skimmed.
ax.set_title("Baseline model v1: random forest feature importances")
ax.set_xlabel("Importance")
ax.set_ylabel("Feature")

plt.show()

In [None]:
# Evaluate the model accuracy.
# Both scores are reported: a large gap between training and testing accuracy
# is the usual sign of overfitting.
print(f"Training Score: {rf_baseline_model_1.score(X_train, y_train)}")
print(f"Testing Score: {rf_baseline_model_1.score(X_test, y_test)}")

# Only about 5% of the postings are fraudulent, so plain accuracy is dominated
# by the majority class. Balanced accuracy averages the recall of both classes.
# The prediction is computed here, from this model, so that a stale
# `predictions` value left over from another cell cannot leak into the metric.
print(f"Balanced Testing Accuracy: {balanced_accuracy_score(y_test, rf_baseline_model_1.predict(X_test))}")

## Build the Baseline Model v2 (Minimal update to the dataset)

In [None]:
# Make a copy of the dataframe; later changes to the copy leave `df` untouched
df_baseline_2 = df.copy()

In [None]:
# Define list of columns with minimal data or lots of unique values
columns_to_drop = ["job_id", "title", "location", "department", "salary_range",
                   "company_profile", "description", "requirements", "benefits"]

# DataFrame.drop returns a new frame instead of modifying the original, so the
# result has to be assigned (the original call threw it away and kept every
# column). Dropping from the untouched `df` keeps this cell safe to re-run:
# there is no KeyError for columns that were already removed.
df_baseline_2 = df.drop(columns=columns_to_drop)

# NOTE(review): employment_type, required_experience, required_education,
# industry and function are still string columns with missing values; they
# presumably need encoding/imputation before RandomForestClassifier.fit can
# accept them — confirm where that is meant to happen.
df_baseline_2.head()

In [None]:
# Separate the label from the predictors:
# y is the "fraudulent" column, X is every other column of the v2 frame
y = df_baseline_2["fraudulent"]
X = df_baseline_2.drop(columns="fraudulent")

In [None]:
# Partition features and target into train/test subsets.
# The fixed random_state makes the partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [None]:
# Create the random forest classifier instance:
# 500 trees, with a fixed random_state so repeated runs build the same forest.
# (The original line ended with a stray "." which is a syntax error.)
rf_baseline_model_2 = RandomForestClassifier(random_state=1, n_estimators=500)

In [None]:
# Fit the model on the training split.
# fit() is a method of the estimator (there is no free function `fit`) and it
# returns the fitted estimator itself, so the reassignment keeps the same
# object while suppressing the estimator repr in the cell output.
rf_baseline_model_2 = rf_baseline_model_2.fit(X_train, y_train)

In [None]:
# Make predictions using the testing data.
# This section evaluates the v2 model, so predict with rf_baseline_model_2;
# the v1 model was trained on a different set of feature columns.
predictions = rf_baseline_model_2.predict(X_test)

In [None]:
# Evaluate the model accuracy.
# Both scores are reported: a large gap between training and testing accuracy
# is the usual sign of overfitting.
print(f"Training Score: {rf_baseline_model_2.score(X_train, y_train)}")
print(f"Testing Score: {rf_baseline_model_2.score(X_test, y_test)}")

# Only about 5% of the postings are fraudulent, so plain accuracy is dominated
# by the majority class. Balanced accuracy averages the recall of both classes.
# The prediction is computed here, from this model, so that a stale
# `predictions` value left over from another cell cannot leak into the metric.
print(f"Balanced Testing Accuracy: {balanced_accuracy_score(y_test, rf_baseline_model_2.predict(X_test))}")