In [1]:
pip install tensorflow

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# if I do a branched model
from tensorflow.keras.models import Model
from tensorflow.keras import layers
import tensorflow as tf

# for NLP
## Import the Counter class from the collections library.
from collections import Counter
# Import reuters and stopwords 
from nltk.corpus import reuters, stopwords
# Import ngrams
from nltk.util import ngrams
# Import the WordNetLemmatizer class 
from nltk.stem import WordNetLemmatizer 
# Import the word tokenizer
from nltk.tokenize import word_tokenize
# Import regular expressions
import re
# Import the nat lang toolkit
import nltk
nltk.download('punkt')
nltk.download('wordnet')

#  Import and read the attrition data
df = pd.read_csv('./formatted dataset/fake_job_postings.csv')
df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Jake\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Jake\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,job_id,title,country,state,city,department,salary_min,salary_max,company_profile,description,...,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,us,ny,new york,Marketing,-1.0,-1.0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",...,,0,1,0,Other,Internship,Undefined,Undefined,Marketing,0
1,2,Customer Service - Cloud Video Production,nz,,auckland,Success,-1.0,-1.0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,Undefined,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),us,ia,wever,Undefined,-1.0,-1.0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",...,,0,1,0,Undefined,Undefined,Undefined,Undefined,Undefined,0
3,4,Account Executive - Washington DC,us,dc,washington,Sales,-1.0,-1.0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,...,Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0
4,5,Bill Review Manager,us,fl,fort worth,Undefined,-1.0,-1.0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0


In [3]:
df.columns

Index(['job_id', 'title', 'country', 'state', 'city', 'department',
       'salary_min', 'salary_max', 'company_profile', 'description',
       'requirements', 'benefits', 'telecommuting', 'has_company_logo',
       'has_questions', 'employment_type', 'required_experience',
       'required_education', 'industry', 'function', 'fraudulent'],
      dtype='object')

In [4]:
drop_cols = ['job_id']

df.drop(columns=drop_cols, inplace=True)

In [5]:
# concatenate the long text columns into a single column called Job_Reqs
long_text_cols = ['title', 'country', 'state', 'city', 'department', 'company_profile', 'description'\
                , 'requirements', 'benefits', 'required_experience', 'required_education', 'employment_type'\
                , 'industry', 'function']

df['Job_Reqs'] = df[long_text_cols].apply(lambda row: ' '.join(row.values.astype(str)), axis=1)

df.drop(columns=long_text_cols, inplace=True)

df.head()

Unnamed: 0,salary_min,salary_max,telecommuting,has_company_logo,has_questions,fraudulent,Job_Reqs
0,-1.0,-1.0,0,1,0,0,Marketing Intern us ny new york Marketing We'r...
1,-1.0,-1.0,0,1,0,0,Customer Service - Cloud Video Production nz n...
2,-1.0,-1.0,0,1,0,0,Commissioning Machinery Assistant (CMA) us ia ...
3,-1.0,-1.0,0,1,0,0,Account Executive - Washington DC us dc washin...
4,-1.0,-1.0,0,1,1,0,Bill Review Manager us fl fort worth Undefined...


In [6]:
# now process the long text column
lemma = WordNetLemmatizer()
def process_text(job_req):

    # Get the stopwords
    sw = set(stopwords.words('english'))

    # Use regex to substitute everything that is not a letter with an empty string.
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', job_req)

    # Tokenize the words 
    words = word_tokenize(re_clean)

    # Lemmatize the words
    lem = [lemma.lemmatize(word) for word in words]

    # Retrieve only the words that aren't in the stopwords
    output = [word.lower() for word in lem if word.lower() not in sw]

    # ensure the ouptut is a Series with a single column with a single string
    output = pd.Series(' '.join(output))

    return output

In [7]:
df['Job_Reqs'] = df['Job_Reqs'].apply(process_text)

In [13]:
x = df['Job_Reqs']
y = df['fraudulent']

In [14]:
# Split the data into training and testing sets
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)

In [15]:
x_train.head()

14808    senior controls engineer u sc greenville undef...
7580     welder u fl miami construction nan tullow oil ...
14112    senior data scientist gb lnd london engineerin...
12879    call center representative u ny rye outstandin...
755      english teacher abroad u oh ashland undefined ...
Name: Job_Reqs, dtype: object

# Vectorize and build model

In [16]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn import metrics

In [17]:
pipe = Pipeline([('tfidf', TfidfVectorizer(stop_words='english')),('clf', LinearSVC())])

# Fit the model
pipe.fit(x_train, y_train)



In [20]:
print('Train Accuracy: %.3f' % pipe.score(x_train, y_train))
print('Test Accuracy: %.3f' % pipe.score(x_test, y_test))

Train Accuracy: 1.000
Test Accuracy: 0.989


In [23]:
test_predictions = pipe.predict(x_test)

# Create the confusion matrix on the test data and predictions
print(metrics.confusion_matrix(y_test,test_predictions))

# Print a classification report
print(metrics.classification_report(y_test,test_predictions))

# Print the overall accuracy
print(metrics.accuracy_score(y_test,test_predictions))

[[4275    9]
 [  39  147]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      4284
           1       0.94      0.79      0.86       186

    accuracy                           0.99      4470
   macro avg       0.97      0.89      0.93      4470
weighted avg       0.99      0.99      0.99      4470

0.9892617449664429


In [None]:
test_predictions = pipe.predict(x_test)