<h1>Part 3 : Modeling of Pre-processed Text Data</h1>

<h3>Import Packages</h3>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
import warnings
warnings.filterwarnings('ignore')

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score

<h3>Load Data</h3>

In [2]:
from sqlalchemy import create_engine, MetaData, Table, select

# Reflect the `data` table from the local SQLite file and select only the two
# columns used for modeling.
engine = create_engine("sqlite:///joblist.sqlite")
metadata = MetaData()
data = Table('data', metadata, autoload=True, autoload_with=engine)
stmt = select([data.columns.jobdescription, data.columns.label])

# Use the connection as a context manager so it is closed once the rows are fetched.
with engine.connect() as connection:
    results = connection.execute(stmt).fetchall()

# Name the columns explicitly so an empty result set still yields a well-formed frame.
df_data = pd.DataFrame(results, columns=['jobdescription', 'label'])

# `Series.replace` only swaps values that are *entirely* '\n'; `.str.replace`
# is needed to turn newlines embedded inside each description into spaces.
df_data['jobdescription'] = (
    df_data['jobdescription']
    .astype('string')
    .str.replace('\n', ' ', regex=False)
)

In [3]:
# Preview the first rows to confirm both selected columns loaded as expected.
df_data.head()

Unnamed: 0,jobdescription,label
0,Position Title:Pricing Analyst Position Type: ...,0
1,Title: Senior Data Analyst - Telephony Manager...,0
2,We are looking for a talented Fuel Cell Data E...,0
3,CAREER OPPORTUNITY SENIOR METER DATA ANALYST L...,0
4,The Data Engineer reports directly to the Dire...,0


In [4]:
# Show the text column on its own (pandas truncates the display) to check its length and dtype.
df_data['jobdescription']

0      Position Title:Pricing Analyst Position Type: ...
1      Title: Senior Data Analyst - Telephony Manager...
2      We are looking for a talented Fuel Cell Data E...
3      CAREER OPPORTUNITY SENIOR METER DATA ANALYST L...
4      The Data Engineer reports directly to the Dire...
                             ...                        
620    While people dream of a better energy future, ...
621    Yelp has hundreds of millions of pieces of use...
622    #TeamNextdoor

Nextdoor is the neighborhood hu...
623    Company Description
Hi there! We’re Razorfish....
624    Auto req ID: 230021BR

Job Description
Sr. Man...
Name: jobdescription, Length: 625, dtype: string

In [5]:
# Peek at the first few raw descriptions only; displaying the whole array
# floods the notebook with hundreds of full-length postings.
df_data['jobdescription'].values[:5]

<StringArray>
[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

<h2>Pre-Processing Steps</h2>

<h3>Feature Extraction Using TF-IDF Vectorization</h3>

In [6]:
#custom_stopwords = ['bachelor', 'degree', 'work', 'equal', 'opportunity', 'employer', 'objectives', 'ontario', 'canada', 'disability', 'strong', 'including', 'ensure', 'understanding', 'related']

In [7]:
#nltk.download('stopwords')
from nltk.corpus import stopwords

# Initialize TFIDF Vectorizer
# English and French stop words are combined. The union is a frozenset, but
# scikit-learn documents `stop_words` as 'english', a list, or None, so it is
# converted to a (sorted, hence deterministic) list before being passed in.
tvec = TfidfVectorizer(analyzer='word',
                       stop_words=sorted(ENGLISH_STOP_WORDS.union(stopwords.words('french'))),
                       lowercase=True,
                       min_df=4,   # keep only terms that appear in at least 4 documents
                       #max_df = 0.95,
                       ngram_range=(1, 3))   # unigrams, bigrams and trigrams

def preproc_text(txt, fit=True):
    """Turn raw documents into a TF-IDF matrix using the shared `tvec` vectorizer.

    Vectorization, stop-word removal and lowercasing are all performed by `tvec`.

    Parameters
    ----------
    txt : iterable of str
        Raw documents to vectorize.
    fit : bool, default True
        If True, (re)fit `tvec` on `txt` and transform it; use this for the
        training data. If False, only transform `txt` with the already-fitted
        `tvec`; use this for held-out data so its columns match the features
        learned from the training data.

    Returns
    -------
    The document-term matrix produced by `tvec` for `txt`.
    """
    if fit:
        return tvec.fit_transform(txt)
    # Re-fitting here would learn a different vocabulary and yield a matrix
    # with a different number of columns than the training matrix.
    return tvec.transform(txt)

In [8]:
#counts = pd.DataFrame(.toarray(), columns = tvec.get_feature_names())
#counts.head()

<h3>Create Train & Test Sets</h3>

In [9]:
# Split into training and test sets: 20% is held out, the split is stratified
# on the label so both parts keep the same class balance, and the seed is fixed.
x_train, x_test, y_train, y_test = train_test_split(
    df_data['jobdescription'].values,
    df_data['label'],
    test_size=0.20,
    random_state=123,
    stratify=df_data['label'],
)

In [12]:
# Preview a handful of training documents instead of dumping the whole array.
x_train[:5]

<StringArray>
[                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         

In [13]:
# Spot-check one full training document (the element at position 309).
x_train[309]

"Adresse : 234 Simcoe Street Groupe de famille d'emploi : Analyses données et rapports Creates a culture where data is managed as a valuable corporate asset. Provides input to, and follows, data governance standards and processes to improve the quality of BMO data and reduce overall data risk for the designated business/group. Develops and implements data strategies, tactics, products and a curated data model to achieve business objectives. Reviews, designs and implements data governance practices and data management processes within the business/group in alignment with Enterprise Data Governance Operating Directive. Provides guidance on the interpretation of enterprise data governance policies and maintains alignment between data governance practices and the data strategy. Develops data solutions and makes recommendations based on an understanding of the business strategy and stakeholder needs. Supports the assessment, root cause analysis, and formulating proposed solutions/remediatio

In [14]:
# Sanity check: the test labels should cover the 20% hold-out requested in the split.
print("Shape of y_test: ", y_test.shape)

Shape of y_test:  (125,)


<h3>Apply Pre-Processing Steps to Extract Features</h3>

In [None]:
# Learn the vocabulary and IDF weights from the training split only, then
# reuse that fitted vectorizer on the test split so both matrices share the
# same feature columns (re-fitting on the test text would produce a different
# number of features than the model was trained on).
x_train_TFIDF = tvec.fit_transform(x_train)
x_test_TFIDF = tvec.transform(x_test)

In [None]:
# A single TF-IDF matrix is built per split (covering 1- to 3-grams), so each
# one is reported once; the column counts of the two matrices should be equal.
print("Original training set shape: ", x_train.shape)
print("Preprocessed training set (TF-IDF, 1- to 3-grams): ", x_train_TFIDF.shape)
print("Preprocessed test set (TF-IDF, 1- to 3-grams): ", x_test_TFIDF.shape)

<h3>Modeling</h3>

In [None]:
# Initialize and fit Random Forest Classifier model
# class_weight='balanced' reweights classes inversely to their frequency;
# random_state is fixed (same seed as the train/test split) so the forest,
# and every score derived from it, is reproducible across runs.
rfc = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=123)
rfc_model = rfc.fit(x_train_TFIDF, y_train)

<h3>Cross-Validation</h3>

In [None]:
%%timeit
# Apply 10-fold cross-validation
rfc_result = cross_val_score(rfc_model, x_train_TFIDF, y_train, cv=10, scoring='accuracy')
print("The mean of cross validation is: ", rfc_result.mean())

<h3>Evaluation on Test Data</h3>

In [None]:
# Predict labels for the held-out test features
y_pred = rfc_model.predict(x_test_TFIDF)

# Precision, recall and F-score for the positive class (label 1)
precision, recall, fscore, support = score(
    y_test, y_pred, pos_label=1, average='binary'
)

# Accuracy is the share of test rows whose prediction equals the true label
print(
    "Classification Report: \nPrecision: {}, \nRecall: {}, \nF-score: {}, \nAccuracy: {}".format(
        round(precision, 3),
        round(recall, 3),
        round(fscore, 3),
        round((y_pred == y_test).sum() / len(y_test), 3),
    )
)


# Known failure observed when running this cell:
# ValueError: Number of features of the model must match the input.
# Model n_features is 6536 (x_train_TFIDF shape) and input n_features is 3213
# The train and test matrices have different column counts. TODO(review): make
# sure the test text is transformed with the vectorizer fitted on the training
# text rather than with a vectorizer re-fitted on the test text.

In [None]:
# Confusion matrix of true test labels vs. predictions
# (scikit-learn convention: rows are true labels, columns are predicted labels).
confusion_matrix(y_test, y_pred)