In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from statsmodels.genmod.families.links import logit
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score

import nltk
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download("stopwords")
from nltk.corpus import wordnet as wn
from nltk.corpus import stopwords
from nltk import pos_tag

[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("fake_job_postings.csv")

In [3]:
location = data["location"].str.split(",", expand= True, n= 2)
location.columns = ["country", "state", "city"]
data[["country", "state", "city"]] = location
data = data.drop(columns= "location")

In [4]:
salary = data["salary_range"].str.split("-", expand= True, n= 1)
data[["min_salary", "max_salary"]] = salary
data = data.drop(columns= "salary_range")

In [5]:
data = data.fillna("N/A")
data["state"] = data["state"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["country"] = data["country"].str.strip().apply(lambda x: "N/A" if x == '' else x)
data["city"] = data["city"].str.strip().apply(lambda x: "N/A" if x == '' else x)

In [6]:
for i in ["company_profile", "description", "requirements", "benefits"]:
    data[i] = data[i].str.lower()

In [7]:
data.isnull().sum()

job_id                 0
title                  0
department             0
company_profile        0
description            0
requirements           0
benefits               0
telecommuting          0
has_company_logo       0
has_questions          0
employment_type        0
required_experience    0
required_education     0
industry               0
function               0
fraudulent             0
country                0
state                  0
city                   0
min_salary             0
max_salary             0
dtype: int64

In [8]:
data

Unnamed: 0,job_id,title,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,...,required_experience,required_education,industry,function,fraudulent,country,state,city,min_salary,max_salary
0,1,Marketing Intern,Marketing,"we're food52, and we've created a groundbreaki...","food52, a fast-growing, james beard award-winn...",experience with content management systems a m...,,0,1,0,...,Internship,,,Marketing,0,US,NY,New York,,
1,2,Customer Service - Cloud Video Production,Success,"90 seconds, the worlds cloud video production ...",organised - focused - vibrant - awesome!do you...,what we expect from you:your key responsibilit...,what you will get from usthrough being part of...,0,1,0,...,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,,Auckland,,
2,3,Commissioning Machinery Assistant (CMA),,valor services provides workforce solutions th...,"our client, located in houston, is actively se...",implement pre-commissioning and commissioning ...,,0,1,0,...,,,,,0,US,IA,Wever,,
3,4,Account Executive - Washington DC,Sales,our passion for improving quality of life thro...,the company: esri – environmental systems rese...,"education: bachelor’s or master’s in gis, busi...",our culture is anything but corporate—we have ...,0,1,0,...,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,DC,Washington,,
4,5,Bill Review Manager,,spotsource solutions llc is a global human cap...,job title: itemization review managerlocation:...,qualifications:rn license in the state of texa...,full benefits offered,0,1,1,...,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL,Fort Worth,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Account Director - Distribution,Sales,vend is looking for some awesome new talent to...,just in case this is the first time you’ve vis...,to ace this role you:will eat comprehensive st...,what can you expect from us?we have an open cu...,0,1,1,...,Mid-Senior level,,Computer Software,Sales,0,CA,ON,Toronto,,
17876,17877,Payroll Accountant,Accounting,weblinc is the e-commerce platform and service...,the payroll accountant will focus primarily on...,- b.a. or b.s. in accounting- desire to have f...,health &amp; wellnessmedical planprescription ...,0,1,1,...,Mid-Senior level,Bachelor's Degree,Internet,Accounting/Auditing,0,US,PA,Philadelphia,,
17877,17878,Project Cost Control Staff Engineer - Cost Con...,,we provide full time permanent positions for m...,experienced project cost control staff enginee...,at least 12 years professional experience.abil...,,0,0,0,...,,,,,0,US,TX,Houston,,
17878,17879,Graphic Designer,,,nemsia studios is looking for an experienced v...,1. must be fluent in the latest versions of co...,competitive salary (compensation will be based...,0,0,1,...,Not Applicable,Professional,Graphic Design,Design,0,NG,LA,Lagos,,


In [22]:
data["all_text"] = data["title"] + " " + data["department"] + " " + data["company_profile"] + " " 
+ data["description"] + " " + data["requirements"] + " " + data["benefits"] + " " 
+ data["employment_type"] + " " + data["required_experience"] + " " 
+ data["required_education"] + " " + data["industry"] + " " + data["function"] + " " + data['country'] + ' '
+ data['state'] + " " + data['city']

data['all_text'] = [i.lower() for i in data['all_text']]
data.head()

Unnamed: 0,job_id,title,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state,city,min_salary,max_salary,all_text
0,1,Marketing Intern,Marketing,"we're food52, and we've created a groundbreaki...","food52, a fast-growing, james beard award-winn...",experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0,US,NY,New York,,,"Marketing Intern Marketing we're food52, and w..."
1,2,Customer Service - Cloud Video Production,Success,"90 seconds, the worlds cloud video production ...",organised - focused - vibrant - awesome!do you...,what we expect from you:your key responsibilit...,what you will get from usthrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0,NZ,,Auckland,,,Customer Service - Cloud Video Production Succ...
2,3,Commissioning Machinery Assistant (CMA),,valor services provides workforce solutions th...,"our client, located in houston, is actively se...",implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0,US,IA,Wever,,,Commissioning Machinery Assistant (CMA) N/A va...
3,4,Account Executive - Washington DC,Sales,our passion for improving quality of life thro...,the company: esri – environmental systems rese...,"education: bachelor’s or master’s in gis, busi...",our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0,US,DC,Washington,,,Account Executive - Washington DC Sales our pa...
4,5,Bill Review Manager,,spotsource solutions llc is a global human cap...,job title: itemization review managerlocation:...,qualifications:rn license in the state of texa...,full benefits offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,US,FL,Fort Worth,,,Bill Review Manager N/A spotsource solutions l...


In [31]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
data['last_text'] = data['all_text'].apply(lambda x: ' '.join([lmtzr.lemmatize(word,'v') for word in x.split() ]))

stop_words = set(stopwords.words('english'))

In [32]:
data['last_text'] = data['last_text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
print(data['last_text'])

0        market intern market we're food52, we've creat...
1        customer service - cloud video production succ...
2        commission machinery assistant (cma) n/a valor...
3        account executive - washington dc sales passio...
4        bill review manager n/a spotsource solutions l...
                               ...                        
17875    account director - distribution sales vend loo...
17876    payroll accountant account weblinc e-commerce ...
17877    project cost control staff engineer - cost con...
17878                             graphic designer n/a n/a
17879    web application developers engineer vend look ...
Name: last_text, Length: 17880, dtype: object


In [40]:
#train test split - 0.8, 0.1, 0.1, execute after preprocessing
X_train, X_test, y_train, y_test = train_test_split(data['last_text'], data["fraudulent"], test_size= 0.10, random_state= 42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size= 0.11, random_state= 42)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(stop_words="english")

count_train = count_vectorizer.fit_transform(X_train)
count_test = count_vectorizer.transform(X_test)

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words="english")

tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

### Explore

In [16]:
pd.set_option('display.max_columns', None)
X_train.head()

Unnamed: 0,job_id,title,department,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,country,state,city,min_salary,max_salary
15455,15456,Digital Project Manager,,loop is an award winning interactive agency ba...,loop is an award winning interactive agency ba...,,,0,1,1,Full-time,Entry level,Bachelor's Degree,Online Media,Project Management,0,AT,5,,,
15724,15725,Partnership Manager,Strategic Initiatives,the financial clinic was founded in 2005 to ad...,about the position:the partnership manager ens...,bachelor’s degreecompetitive candidates will h...,excellent benefit package that includes medica...,0,1,1,Full-time,Associate,Bachelor's Degree,Nonprofit Organization Management,Training,0,US,NY,New York City,45000.0,50000.0
1687,1688,Customer Service Associate - Part Time,,"novitex enterprise solutions, formerly pitney ...",the customer service associate will be based i...,minimum requirements:minimum of 6 months custo...,,0,1,0,Part-time,Entry level,High School or equivalent,Legal Services,Administrative,0,US,IN,Indianapolis,,
5669,5670,Hiring for Sales Management Team in Houston Te...,U-Verse,"argenta field solutions values the client, cre...",hiring for sales management team in houston te...,,"we are argenta field solutions, an award winni...",0,1,0,Full-time,Not Applicable,Unspecified,Consumer Services,Management,0,US,TX,Houston,,
8621,8622,Remote Control Solutions Architect,,"come be a part of one of the fastest growing, ...",will lead the remote control aspect of the pro...,skillsbs/ms/phd computer science/electronics/e...,competitive base salarystock optionsfull benef...,0,1,1,Full-time,,,,,0,US,CA,Mountain View,,


In [None]:
rfr = RandomForestClassifier(class_weight = 'balanced', bootstrap = True)

grid = {'n_estimators': [200], 'max_depth': [5, 7, 8], 'max_features': [9, 10, 11], 'random_state': [42]}
test_scores = []

for i in ParameterGrid(grid):
    rfr.set_params(**i)
    rfr.fit(train_features, train_target)
    test_scores.append(rfr.score(test_features, test_target))
    
best_idx = np.argmax(test_scores)
print(test_scores[best_idx], ParameterGrid(grid)[best_idx])

In [50]:
rfr = RandomForestClassifier(class_weight = 'balanced', bootstrap = True)
rfr.fit(tfidf_train, y_train)
score = cross_val_score(rfr, tfidf_train, y_train)
score

array([0.98289703, 0.9825419 , 0.97660615, 0.97800279, 0.98114525])

In [51]:
tfidf_pred = rfr.predict(tfidf_test)
print(classification_report(y_test, tfidf_pred, target_names = ['0','1']))

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1693
           1       0.92      0.73      0.81        95

    accuracy                           0.98      1788
   macro avg       0.95      0.86      0.90      1788
weighted avg       0.98      0.98      0.98      1788

