<a href="https://colab.research.google.com/github/jobrien1726/D3-challenge/blob/master/machinelearning/nlp_svm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Dependencies**

In [49]:
# Dependencies
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk import word_tokenize,sent_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## **Load/Clean Data**

In [50]:
# Read in csv file
url = "https://job-postings-dataviz.s3.amazonaws.com/fake_jobs_clean.csv"

fake_jobs_df = pd.read_csv(url, sep=",",  encoding = "UTF-8")
fake_jobs_df

Unnamed: 0,job_id,city,state/province,country,title,department,industry,function,salary_range,salary_provided,company_profile,description,requirements,benefits,benefits_provided,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,fraudulent
0,1,New York,NY,US,Marketing Intern,Marketing,,Marketing,,0,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,0,1,0,Other,Internship,,0
1,2,Auckland,,NZ,Customer Service - Cloud Video Production,Success,Marketing and Advertising,Customer Service,,0,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,1,0,1,0,Full-time,Not Applicable,,0
2,3,Wever,IA,US,Commissioning Machinery Assistant (CMA),,,,,0,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,0,1,0,,,,0
3,4,Washington,DC,US,Account Executive - Washington DC,Sales,Computer Software,Sales,,0,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,1,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,0
4,5,Fort Worth,FL,US,Bill Review Manager,,Hospital & Health Care,Health Care Provider,,0,SpotSource Solutions LLC is a Global Human Cap...,JOB TITLE: Itemization Review ManagerLOCATION:...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17875,17876,Toronto,ON,CA,Account Director - Distribution,Sales,Computer Software,Sales,,0,Vend is looking for some awesome new talent to...,Just in case this is the first time you’ve vis...,To ace this role you:Will eat comprehensive St...,What can you expect from us?We have an open cu...,1,0,1,1,Full-time,Mid-Senior level,,0
17876,17877,Philadelphia,PA,US,Payroll Accountant,Accounting,Internet,Accounting/Auditing,,0,WebLinc is the e-commerce platform and service...,The Payroll Accountant will focus primarily on...,- B.A. or B.S. in Accounting- Desire to have f...,Health &amp; WellnessMedical planPrescription ...,1,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,0
17877,17878,Houston,TX,US,Project Cost Control Staff Engineer - Cost Con...,,,,,0,We Provide Full Time Permanent Positions for m...,Experienced Project Cost Control Staff Enginee...,At least 12 years professional experience.Abil...,,0,0,0,0,Full-time,,,0
17878,17879,Lagos,LA,NG,Graphic Designer,,Graphic Design,Design,,0,,Nemsia Studios is looking for an experienced v...,1. Must be fluent in the latest versions of Co...,Competitive salary (compensation will be based...,1,0,0,1,Contract,Not Applicable,Professional,0


In [51]:
# Take a look at job description and fraudulent cols
df = fake_jobs_df[['description', 'fraudulent']]
df

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0
...,...,...
17875,Just in case this is the first time you’ve vis...,0
17876,The Payroll Accountant will focus primarily on...,0
17877,Experienced Project Cost Control Staff Enginee...,0
17878,Nemsia Studios is looking for an experienced v...,0


In [52]:
# Count how many Real vs Fraudulent postings 
from collections import Counter
print(Counter(df['fraudulent'].values))

Counter({0: 17014, 1: 866})


In [53]:
# Drop duplicate descriptions
df = df.drop_duplicates()
df

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0
...,...,...
17873,RESPONSIBILITIES:Will facilitate the recruitin...,0
17875,Just in case this is the first time you’ve vis...,0
17876,The Payroll Accountant will focus primarily on...,0
17877,Experienced Project Cost Control Staff Enginee...,0


In [54]:
# Count how many Real vs Fraudulent postings 
print(Counter(df['fraudulent'].values))

Counter({0: 14171, 1: 632})


In [55]:
# Make sure number of real vs fraudulent are more balanced
# Using UNDERsampling for balancing of classification groups
df_fraud= df[df['fraudulent'] == 1] 
df_normal = df[df['fraudulent'] == 0] 

df_normal = df_normal.sample(n=(7*len(df_fraud)), random_state=22)
df = df_normal.append(df_fraud)

df

Unnamed: 0,description,fraudulent
13898,Currently hiring per-diem caregivers to work o...,0
1688,"Μεγαλώνουμε την ομάδα μας,και χρειαζόμαστε ένα...",0
13100,PowToon is looking for a team-oriented Marketi...,0
5579,Wealth Management Advisor We are continuing ou...,0
15961,RECRUITING MILITARY VETERANSLooking to offer o...,0
...,...,...
17827,Student Positions Part-Time and Full-Time.You ...,1
17828,LEARN TO EARN AN EXECUTIVE LEVEL INCOMEFULL TR...,1
17829,inFullMobile Sp. z o.o. is a mobile software d...,1
17830,JOB DESCRIPTIONWe are seeking a full time payr...,1


In [56]:
# Shuffle the dataframe so fraudulent postings more evenly distributed 
from sklearn.utils import shuffle
df = shuffle(df, random_state=22)
df = df.reset_index(drop=True)
df

Unnamed: 0,description,fraudulent
0,Why is UX Design important for us?We want to b...,0
1,JOB DESCRIPTIONWe are seeking a full time payr...,1
2,"Client Services Manager - SM1San Mateo, CA#URL...",1
3,Lead projects from beginning to end while fost...,0
4,Fast-growing E-commerce company has a fantasti...,0
...,...,...
5051,Transifex is seeking a passionate UI/UX Develo...,0
5052,Normal 0 false false fals...,1
5053,CTO for a tourism start-upWe are just now star...,0
5054,OFFER Part Time Positions (Cash In Hand)You ca...,1


In [0]:
# Explicitly define data types
df['description'] = df['description'].astype(str)
df['fraudulent'] = df['fraudulent'].astype(int)

In [58]:
# Datatypes
df.dtypes

description    object
fraudulent      int64
dtype: object

In [59]:
# Count Real vs Fraudulent again to make sure we've achieved the desired result
print(Counter(df['fraudulent'].values))

Counter({0: 4424, 1: 632})


## **Data Pre-Processing**

In [60]:
# Change all the text to lowercase
df['description'] = [x.lower() for x in df['description']]
df['description']

0       why is ux design important for us?we want to b...
1       job descriptionwe are seeking a full time payr...
2       client services manager - sm1san mateo, ca#url...
3       lead projects from beginning to end while fost...
4       fast-growing e-commerce company has a fantasti...
                              ...                        
5051    transifex is seeking a passionate ui/ux develo...
5052             normal  0          false  false  fals...
5053    cto for a tourism start-upwe are just now star...
5054    offer part time positions (cash in hand)you ca...
5055    qubit: cutting edge big data engineeringat qub...
Name: description, Length: 5056, dtype: object

In [0]:
# Tokenization
df['description']= [word_tokenize(x) for x in df['description']]

In [62]:
# Remove Stop words and Non-Numeric. Perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

for index,entry in enumerate(df['description']):
    # Declare an Empty List to store the words that follow the rules for this step
    Final_words = []

    # Initialize WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()

    # pos_tag function to provide the 'tag' i.e if the word is Noun(N), Verb(V), ADJ(J) or ADV(R) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)

    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

df

Unnamed: 0,description,fraudulent,text_final
0,"[why, is, ux, design, important, for, us, ?, w...",0,"['ux', 'design', 'important', 'u', 'want', 're..."
1,"[job, descriptionwe, are, seeking, a, full, ti...",1,"['job', 'descriptionwe', 'seek', 'full', 'time..."
2,"[client, services, manager, -, sm1san, mateo, ...",1,"['client', 'service', 'manager', 'mateo', 'ca'..."
3,"[lead, projects, from, beginning, to, end, whi...",0,"['lead', 'project', 'begin', 'end', 'foster', ..."
4,"[fast-growing, e-commerce, company, has, a, fa...",0,"['company', 'fantastic', 'opportunity', 'talen..."
...,...,...,...
5051,"[transifex, is, seeking, a, passionate, ui/ux,...",0,"['transifex', 'seek', 'passionate', 'developer..."
5052,"[normal, 0, false, false, false, en-us, x-none...",1,"['normal', 'false', 'false', 'false', 'style',..."
5053,"[cto, for, a, tourism, start-upwe, are, just, ...",0,"['cto', 'tourism', 'start', 'want', 'join', 'a..."
5054,"[offer, part, time, positions, (, cash, in, ha...",1,"['offer', 'part', 'time', 'position', 'cash', ..."


In [63]:
# Reorganize Columns
df = df[['description','text_final','fraudulent']]
df

Unnamed: 0,description,text_final,fraudulent
0,"[why, is, ux, design, important, for, us, ?, w...","['ux', 'design', 'important', 'u', 'want', 're...",0
1,"[job, descriptionwe, are, seeking, a, full, ti...","['job', 'descriptionwe', 'seek', 'full', 'time...",1
2,"[client, services, manager, -, sm1san, mateo, ...","['client', 'service', 'manager', 'mateo', 'ca'...",1
3,"[lead, projects, from, beginning, to, end, whi...","['lead', 'project', 'begin', 'end', 'foster', ...",0
4,"[fast-growing, e-commerce, company, has, a, fa...","['company', 'fantastic', 'opportunity', 'talen...",0
...,...,...,...
5051,"[transifex, is, seeking, a, passionate, ui/ux,...","['transifex', 'seek', 'passionate', 'developer...",0
5052,"[normal, 0, false, false, false, en-us, x-none...","['normal', 'false', 'false', 'false', 'style',...",1
5053,"[cto, for, a, tourism, start-upwe, are, just, ...","['cto', 'tourism', 'start', 'want', 'join', 'a...",0
5054,"[offer, part, time, positions, (, cash, in, ha...","['offer', 'part', 'time', 'position', 'cash', ...",1


## **Prepare Training/Testing Data**

In [0]:
# Prepare Train and Test Data sets
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(df['text_final'],df['fraudulent'],test_size=0.3)

## **Word Vectorization w TF-IDF**

In [65]:
# Word Vectorization using method TF-IDF
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(df['text_final'])
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)

# View the vocab its learned
print(Tfidf_vect.vocabulary_)
print(X_train_Tfidf)

{'ux': 4739, 'design': 1225, 'important': 2201, 'want': 4844, 'reference': 3678, 'term': 4455, 'user': 4724, 'experience': 1645, 'within': 4917, 'financial': 1758, 'service': 4024, 'industry': 2250, 'aspect': 308, 'designer': 1228, 'shape': 4046, 'offer': 3012, 'customer': 1108, 'strive': 4300, 'everyday': 1597, 'make': 2658, 'great': 1982, 'possible': 3358, 'example': 1608, 'problem': 3452, 'work': 4926, 'develop': 1248, 'information': 2261, 'flow': 1803, 'new': 2932, 'optimize': 3067, 'touch': 4544, 'point': 3330, 'job': 2419, 'descriptionwe': 1223, 'seek': 3990, 'full': 1882, 'time': 4511, 'payroll': 3209, 'clerk': 770, 'manage': 2665, 'day': 1134, 'accounting': 37, 'operation': 3050, 'pay': 3203, 'scale': 3932, 'balancing': 430, 'company': 863, 'processing': 3461, 'weekly': 4870, 'credit': 1068, 'overall': 3122, 'record': 3657, 'keep': 2452, 'client': 773, 'manager': 2670, 'mateo': 2708, 'ca': 599, 'lead': 2520, 'way': 4857, 'technology': 4438, 'evolve': 1603, 'change': 700, 'mediu

# **Train/ Evaluate Model**

In [66]:
# Classifier Algorithm= Support Vector Machine

# Fit the Classifier w the Training Data
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(X_train_Tfidf, Y_train)

# Use model to get predictions from the Testing data
predictions_SVM = SVM.predict(X_test_Tfidf)

# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score: ", accuracy_score(predictions_SVM, Y_test)*100)

SVM Accuracy Score:  92.2874093605801


## **Hypertune Model**

In [0]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV
param_grid = {'C': [1, 5, 10, 100],
              'gamma': [0.001, 0.01, .1, 1],
              'kernel': ['linear','rbf', 'poly', 'sigmoid']}
grid = GridSearchCV(SVM, param_grid, verbose=3)

In [68]:
# Train the model with GridSearch
grid.fit(X_train_Tfidf, Y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits
[CV] C=1, gamma=0.001, kernel=linear .................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] ..... C=1, gamma=0.001, kernel=linear, score=0.912, total=   3.0s
[CV] C=1, gamma=0.001, kernel=linear .................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.0s remaining:    0.0s


[CV] ..... C=1, gamma=0.001, kernel=linear, score=0.921, total=   2.9s
[CV] C=1, gamma=0.001, kernel=linear .................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    5.9s remaining:    0.0s


[CV] ..... C=1, gamma=0.001, kernel=linear, score=0.917, total=   3.0s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV] ..... C=1, gamma=0.001, kernel=linear, score=0.924, total=   3.0s
[CV] C=1, gamma=0.001, kernel=linear .................................
[CV] ..... C=1, gamma=0.001, kernel=linear, score=0.907, total=   3.0s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.876, total=   2.4s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.876, total=   2.4s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.874, total=   2.4s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] ........ C=1, gamma=0.001, kernel=rbf, score=0.874, total=   2.3s
[CV] C=1, gamma=0.001, kernel=rbf ....................................
[CV] .

[Parallel(n_jobs=1)]: Done 320 out of 320 | elapsed: 19.1min finished


GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='auto', kernel='linear', max_iter=-1,
                           probability=False, random_state=None, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [1, 5, 10, 100], 'gamma': [0.001, 0.01, 0.1, 1],
                         'kernel': ['linear', 'rbf', 'poly', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=3)

In [69]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 100, 'gamma': 1, 'kernel': 'rbf'}
0.9251188678189852


In [70]:
# Train the hypertuned model
SVM2 = svm.SVC(kernel='rbf', C= 100, gamma= 1)
SVM2.fit(X_train_Tfidf, Y_train)

SVC(C=100, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=1, kernel='rbf', max_iter=-1,
    probability=False, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

In [71]:
# Hypertuned Model Accuracy
print('SVM2 Acc Score: %.3f' % SVM2.score(X_test_Tfidf, Y_test))

SVM2 Acc Score: 0.930


In [72]:
from sklearn.metrics import classification_report
predictions = SVM2.predict(X_test_Tfidf)
print("SVM2 Classification Report: \n" + classification_report(Y_test, predictions, target_names=['real','fake']))

SVM2 Classification Report: 
              precision    recall  f1-score   support

        real       0.93      1.00      0.96      1327
        fake       0.94      0.47      0.63       190

    accuracy                           0.93      1517
   macro avg       0.93      0.73      0.80      1517
weighted avg       0.93      0.93      0.92      1517



## **Save Model**