In [1]:
# Import the data
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from encoders import build_encoders, encode
from imblearn.over_sampling import SMOTE

df = pd.read_csv('../group-project-2/original dataset/fake_job_postings.csv')
df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [2]:
df.columns

Index(['job_id', 'title', 'location', 'department', 'salary_range',
       'company_profile', 'description', 'requirements', 'benefits',
       'telecommuting', 'has_company_logo', 'has_questions', 'employment_type',
       'required_experience', 'required_education', 'industry', 'function',
       'fraudulent'],
      dtype='object')

In [3]:
# using only the country abbreivations to simplify vectorizing(?) later
df['country'] = df['location'].str.split(',').str[0]
df = df.drop(columns=['location'])
df

Unnamed: 0,job_id,title,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country
0,1,Marketing Intern,Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,US
1,2,Customer Service - Cloud Video Production,Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ
2,3,Commissioning Machinery Assistant (CMA),,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,US
3,4,Account Executive - Washington DC,Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US
4,5,Bill Review Manager,,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,Sales,,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,0,1,1,Full-time,Mid-Senior level,,Computer Software,Sales,0,CA
17876,17877,Payroll Accountant,Accounting,,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,US
17877,17878,Project Cost Control Staff Engineer - Cost Con...,,,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,Full-time,,,,,0,US
17878,17879,Graphic Designer,,,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0,NG


In [4]:
df['country'].unique()

array(['US', 'NZ', 'DE', 'GB', 'AU', 'SG', 'IL', 'AE', 'CA', 'IN', 'EG',
       'PL', 'GR', nan, 'PK', 'BE', 'BR', 'SA', 'DK', 'RU', 'ZA', 'CY',
       'HK', 'TR', 'IE', 'LT', 'JP', 'NL', 'AT', 'KR', 'FR', 'EE', 'TH',
       'PA', 'KE', 'MU', 'MX', 'RO', 'MY', 'FI', 'CN', 'ES', 'SE', 'CL',
       'UA', 'QA', 'IT', 'LV', 'IQ', 'BG', 'PH', 'CZ', 'VI', 'MT', 'HU',
       'BD', 'KW', 'LU', 'NG', 'RS', 'BY', 'VN', 'ID', 'ZM', 'NO', 'BH',
       'UG', 'CH', 'TT', 'SD', 'SK', 'AR', 'TW', 'PT', 'PE', 'CO', 'IS',
       'SI', 'MA', 'AM', 'TN', 'GH', 'AL', 'HR', 'CM', 'SV', 'NI', 'LK',
       'JM', 'KZ', 'KH'], dtype=object)

In [5]:
df.isnull().sum()

job_id                     0
title                      0
department             11547
salary_range           15012
company_profile         3308
description                1
requirements            2696
benefits                7212
telecommuting              0
has_company_logo           0
has_questions              0
employment_type         3471
required_experience     7050
required_education      8105
industry                4903
function                6455
fraudulent                 0
country                  346
dtype: int64

# **Baseline Model**

In [6]:
# isolate usable columns for basline model
numerical_columns = df[['telecommuting', 'has_company_logo', 'has_questions', 'fraudulent']]

In [7]:
numerical_columns

Unnamed: 0,telecommuting,has_company_logo,has_questions,fraudulent
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,1,1,0
...,...,...,...,...
17875,0,1,1,0
17876,0,1,1,0
17877,0,0,0,0
17878,0,0,1,0


In [8]:
# creating X and y for train test split
X = numerical_columns.drop(columns=['fraudulent'])
y = numerical_columns['fraudulent']

In [9]:
# train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [10]:
# declare logistic regression
lr = LogisticRegression(random_state=1)

In [11]:
# fit model
lr_baseline = lr.fit(X_train, y_train)

In [12]:
# validate model
print(f"training data score: {lr_baseline.score(X_train, y_train)}")
print(f"testing data score: {lr_baseline.score(X_test, y_test)}")

training data score: 0.9492915734526473
testing data score: 0.9583892617449664


In [13]:
# make predictions with the saved logistic regression model using the test data
baseline_predections = lr_baseline.predict(X_test)

In [14]:
# check the accuracy score for the test data
accuracy_score(y_test, baseline_predections)

0.9583892617449664

In [15]:
# balanced accuracy for test data
balanced_accuracy_score(y_test, baseline_predections)

0.5

# **Formatted CSV Model w/ Encoders**

In [16]:
# read in CSV
formatted_df = pd.read_csv('../group-project-2/formatted dataset/fake_job_postings.csv')
formatted_df.head()

Unnamed: 0,job_id,title,country,state,city,department,salary_min,salary_max,company_profile,description,...,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,us,ny,new york,Marketing,-1.0,-1.0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",...,,0,1,0,Other,Internship,Undefined,Undefined,Marketing,0
1,2,Customer Service - Cloud Video Production,nz,,auckland,Success,-1.0,-1.0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Undefined,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),us,ia,wever,Undefined,-1.0,-1.0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",...,,0,1,0,Undefined,Undefined,Undefined,Undefined,Undefined,0
3,4,Account Executive - Washington DC,us,dc,washington,Sales,-1.0,-1.0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,...,Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,us,fl,fort worth,Undefined,-1.0,-1.0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [17]:
# drop columns with too much data / irrelevant columns
columns_to_drop = ['job_id', 'title', 'company_profile', 'description', 'benefits']
formatted_df.drop(columns_to_drop, axis=1, inplace=True)
formatted_df

Unnamed: 0,country,state,city,department,salary_min,salary_max,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,us,ny,new york,Marketing,-1.0,-1.0,Experience with content management systems a m...,0,1,0,Other,Internship,Undefined,Undefined,Marketing,0
1,nz,,auckland,Success,-1.0,-1.0,What we expect from you:Your key responsibilit...,0,1,0,Full-time,Not Applicable,Undefined,Marketing and Advertising,Customer Service,0
2,us,ia,wever,Undefined,-1.0,-1.0,Implement pre-commissioning and commissioning ...,0,1,0,Undefined,Undefined,Undefined,Undefined,Undefined,0
3,us,dc,washington,Sales,-1.0,-1.0,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,us,fl,fort worth,Undefined,-1.0,-1.0,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,ca,on,toronto,Sales,-1.0,-1.0,To ace this role you:Will eat comprehensive St...,0,1,1,Full-time,Mid-Senior level,Undefined,Computer Software,Sales,0
17876,us,pa,philadelphia,Accounting,-1.0,-1.0,- B.A. or B.S. in Accounting- Desire to have f...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,us,tx,houston,Undefined,-1.0,-1.0,At least 12 years professional experience.Abil...,0,0,0,Full-time,Undefined,Undefined,Undefined,Undefined,0
17878,ng,la,lagos,Undefined,-1.0,-1.0,1. Must be fluent in the latest versions of Co...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [18]:
# define X and y variables
X = formatted_df.drop(columns=['fraudulent'])
y = formatted_df['fraudulent']

In [19]:
# train / test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
X_train

Unnamed: 0,country,state,city,department,salary_min,salary_max,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function
14808,us,sc,greenville,Undefined,-1.0,-1.0,Education and Experience: · Electrical...,0,0,0,Full-time,Mid-Senior level,Unspecified,Machinery,Engineering
7580,us,fl,miami,Construction,196000.0,230000.0,Experience Welders.,0,0,0,Contract,Mid-Senior level,Bachelor's Degree,Construction,Other
14112,gb,lnd,london,Engineering,-1.0,-1.0,Good working knowledge of Python or similar la...,0,1,0,Undefined,Undefined,Undefined,Undefined,Engineering
12879,us,ny,rye,4,-1.0,-1.0,Must be self-motivated with strong verbal and ...,0,1,1,Full-time,Associate,High School or equivalent,Banking,Customer Service
755,us,oh,ashland,Undefined,-1.0,-1.0,University degree required. TEFL / TESOL / CEL...,0,1,1,Contract,Undefined,Bachelor's Degree,Education Management,Undefined
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10955,gb,lnd,london,Undefined,-1.0,-1.0,About YouYou must have previous customer servi...,0,1,0,Full-time,Undefined,Undefined,Consumer Goods,Customer Service
17289,il,,,Undefined,-1.0,-1.0,You have to love QA and feel a compulsion to t...,0,0,1,Full-time,Undefined,Undefined,Undefined,Undefined
5192,gb,,,Undefined,-1.0,-1.0,HGV/LGV Class 1 license.Digi Tacho.,0,1,1,Undefined,Undefined,Undefined,Undefined,Undefined
12172,us,fl,daytona,Undefined,-1.0,-1.0,Experience in programming microcontrollersEmbe...,0,0,0,Full-time,Undefined,Undefined,Undefined,Undefined


In [20]:
# use encoders to process X data
encoder = build_encoders(X_train)
X_train_encoded = encode(X_train, encoder)
X_test_encoded = encode(X_test, encoder)
X_test_encoded

Unnamed: 0,salary_min,salary_max,telecommuting,has_company_logo,has_questions,x0_gb,x0_gr,x0_us,x0_infrequent_sklearn,x0_ca,...,x0_Administrative,x0_Customer Service,x0_Design,x0_Engineering,x0_Health Care Provider,x0_Information Technology,x0_Marketing,x0_Sales,x0_Undefined,x0_infrequent_sklearn.1
0,-1.0,-1.0,0,0,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.0,-1.0,0,1,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,90000.0,100000.0,0,0,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-1.0,-1.0,0,1,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-1.0,-1.0,0,0,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4465,-1.0,-1.0,0,1,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4466,-1.0,-1.0,0,1,0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4467,-1.0,-1.0,0,1,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4468,-1.0,-1.0,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [21]:
# initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train_encoded, y_train)

# make predictions on the encoded test set
y_pred = model.predict(X_test_encoded)
y_pred

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [22]:
# vvaluate the model
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f'Balanced Accuracy: {balanced_accuracy}')

Balanced Accuracy: 0.5399724405891388


# **Formatted CSV w/ Vectorization**

# **Original CSV w/ OneHotEncoder**

In [23]:
ohe_df = pd.read_csv('../group-project-2/original dataset/fake_job_postings.csv')
ohe_df.head()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [24]:
# columns for OHE
columns_to_encode = ['department', 'employment_type', 'required_experience', 'required_education', 'industry', 'function']

In [25]:
# encode specified columns
ohe = OneHotEncoder(sparse_output=False)
encoded_data = ohe.fit_transform(ohe_df[columns_to_encode])

# add the encoded column names back
encoded_df = pd.DataFrame(encoded_data, columns=ohe.get_feature_names_out(columns_to_encode))


# concatenate the encoded columns with the 'fraudulent' column
final_df = pd.concat([encoded_df, ohe_df['fraudulent'].reset_index(drop=True)], axis=1)
final_df

Unnamed: 0,department_,department_ \tCorporate Shared Services,department_ Lower Level Management,department_ Marketing,department_ Moni Technologies,department_ R&D,department_(Consultant),department_.NET,department_.net Development,department_0,...,function_Quality Assurance,function_Research,function_Sales,function_Science,function_Strategy/Planning,function_Supply Chain,function_Training,function_Writing/Editing,function_nan,fraudulent
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0
17876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
17877,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0
17878,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [26]:
# Define X and y
X = final_df.drop('fraudulent', axis=1)
y = final_df['fraudulent']

In [27]:
# train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# initialize and train the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# make predictions on the test set
y_pred = model.predict(X_test)

# evaluate the model
accuracy = balanced_accuracy_score(y_test, y_pred)
print(f'Balanced Accuracy: {accuracy}')

Balanced Accuracy: 0.6035526685111673


# **Formatted CSV w/ Oversampling and Encoders**

In [28]:
# read in CSV
formatted_df3 = pd.read_csv('../group-project-2/formatted dataset/fake_job_postings.csv')
formatted_df3.head()

Unnamed: 0,job_id,title,country,state,city,department,salary_min,salary_max,company_profile,description,...,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,us,ny,new york,Marketing,-1.0,-1.0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",...,,0,1,0,Other,Internship,Undefined,Undefined,Marketing,0
1,2,Customer Service - Cloud Video Production,nz,,auckland,Success,-1.0,-1.0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Undefined,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),us,ia,wever,Undefined,-1.0,-1.0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",...,,0,1,0,Undefined,Undefined,Undefined,Undefined,Undefined,0
3,4,Account Executive - Washington DC,us,dc,washington,Sales,-1.0,-1.0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,...,Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,us,fl,fort worth,Undefined,-1.0,-1.0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [29]:
# drop columns with too much data / irrelevant columns
columns_to_drop = ['job_id', 'title', 'company_profile', 'description', 'benefits']
formatted_df3.drop(columns_to_drop, axis=1, inplace=True)
formatted_df3

Unnamed: 0,country,state,city,department,salary_min,salary_max,requirements,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,us,ny,new york,Marketing,-1.0,-1.0,Experience with content management systems a m...,0,1,0,Other,Internship,Undefined,Undefined,Marketing,0
1,nz,,auckland,Success,-1.0,-1.0,What we expect from you:Your key responsibilit...,0,1,0,Full-time,Not Applicable,Undefined,Marketing and Advertising,Customer Service,0
2,us,ia,wever,Undefined,-1.0,-1.0,Implement pre-commissioning and commissioning ...,0,1,0,Undefined,Undefined,Undefined,Undefined,Undefined,0
3,us,dc,washington,Sales,-1.0,-1.0,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,us,fl,fort worth,Undefined,-1.0,-1.0,QUALIFICATIONS:RN license in the State of Texa...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,ca,on,toronto,Sales,-1.0,-1.0,To ace this role you:Will eat comprehensive St...,0,1,1,Full-time,Mid-Senior level,Undefined,Computer Software,Sales,0
17876,us,pa,philadelphia,Accounting,-1.0,-1.0,- B.A. or B.S. in Accounting- Desire to have f...,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0
17877,us,tx,houston,Undefined,-1.0,-1.0,At least 12 years professional experience.Abil...,0,0,0,Full-time,Undefined,Undefined,Undefined,Undefined,0
17878,ng,la,lagos,Undefined,-1.0,-1.0,1. Must be fluent in the latest versions of Co...,0,0,1,Contract,Not Applicable,Professional,Graphic Design,Design,0


In [30]:
# define X and y variables
X = formatted_df3.drop(columns=['fraudulent'])
y = formatted_df3['fraudulent']

In [31]:
# test/train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [32]:
# use encoders to process X data
encoder = build_encoders(X_train)
X_train_encoded2 = encode(X_train, encoder)
X_test_encoded2 = encode(X_test, encoder)
X_test_encoded2

Unnamed: 0,salary_min,salary_max,telecommuting,has_company_logo,has_questions,x0_gb,x0_gr,x0_us,x0_infrequent_sklearn,x0_ca,...,x0_Administrative,x0_Customer Service,x0_Design,x0_Engineering,x0_Health Care Provider,x0_Information Technology,x0_Marketing,x0_Sales,x0_Undefined,x0_infrequent_sklearn.1
0,-1.0,-1.0,0,0,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-1.0,-1.0,0,1,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,90000.0,100000.0,0,0,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-1.0,-1.0,0,1,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-1.0,-1.0,0,0,0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4465,-1.0,-1.0,0,1,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4466,-1.0,-1.0,0,1,0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4467,-1.0,-1.0,0,1,0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4468,-1.0,-1.0,0,1,0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [33]:
# apply SMOTE to oversample training data
smote = SMOTE(random_state=1)
X_train_smote, y_train_smote = smote.fit_resample(X_train_encoded2, y_train)

In [34]:
# fit LR model to oversampled and encoded training data
model = LogisticRegression()
model.fit(X_train_smote, y_train_smote)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [35]:
# predict
y_pred = model.predict(X_test_encoded)

In [36]:
# evaluate the model
balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
print(f'Balanced Accuracy: {balanced_accuracy}')

Balanced Accuracy: 0.7432494503177616
