In [2]:
# Create a Naive Bayes classifier using a TF-IDF vectorizer and pickle the results. This model can be
# trained on job resumes or Indeed job postings. Steps include:
# 1. Set the model parameters
# 2. Get a combined list of salaries and keep only job titles with at least
#    `min_salary_records` salary records
# 3. Get a list of resumes for this list of job titles and remove any job title with
#    fewer than `min_job_summaries` resumes
# 4. Run the TF-IDF vectorizer and the Naive Bayes model training
# 5. Test the model using "List Most Relevant Skills"
# 6. Test the model using "Document Similarity Score"

In [303]:
import datetime
import math
import numpy as np
import pandas as pd
import pickle
from itertools import product
from random import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score


# Custom function in functions folder
from functions.word_preprocessing import *

pd.set_option('display.max_colwidth', 500)

In [304]:
# Folder prefix for the CSV files read and written by the cells below. Paths are built by plain
# string concatenation (directory + file name), so the trailing slash is required.
# Alternative local path kept for reference:
# directory = '/Users/kwheatley/Desktop/Capstone/gcloud_data/'
directory = '/mnt/disks/mnt_dir/data/'

# Model Parameters

In [281]:
# # Best parameters for doc similarity
# parameters = {
#                 "doc_type":"indeed_resume", # Use "indeed_resume" or "indeed_postings"
#                 "min_salary_records":100, # Filter out all jobs with less than specified salary records
#                 "min_job_summaries":1000, # Filter out all jobs with less than specified job summaries
#                 "min_ngram":1, # For TD-IDF vectorizer
#                 "max_ngram":3, # For TD-IDF vectorizer
#                 "min_df":5, # For TD-IDF vectorizer, ignore features in less than this number of documents
#                 "train_test_split":0.05, # For train-test split
#                 "random_state":1, # For train-test split
#                 "alpha":0.02, # For Naive Bayes model 0.02 is current best
#                 "num_skills":50, # Number of skill to show per job 
#                 "max_number_records":2500, # Number of records in each class, SMOTE is used to fill small classes
# }

In [305]:
# Best parameters for skills

parameters = dict(
    # Document source: "indeed_resume" or "indeed_postings"
    doc_type="indeed_resume",
    # Drop every job title with fewer salary records than this
    min_salary_records=100,
    # Drop every job title with fewer job summaries than this
    min_job_summaries=1000,
    # TF-IDF vectorizer: smallest and largest n-gram size
    min_ngram=3,
    max_ngram=3,
    # TF-IDF vectorizer: ignore features that appear in fewer documents than this
    min_df=5,
    # Train/test split: fraction of the data held out for testing
    train_test_split=0.05,
    # Train/test split: random seed
    random_state=1,
    # Naive Bayes smoothing parameter.
    # NOTE(review): the original remark said "0.02 is current best" while the value here is 0.2 --
    # confirm which one is intended for the skills model.
    alpha=0.2,
    # Number of skills to show per job
    num_skills=100,
    # Number of records in each class; SMOTE is used to fill small classes
    max_number_records=5000,
)

# Load Salary Datasets

In [161]:
# Load both salary datasets
salary1 = pd.read_csv(directory+'02_salaries_h1b.csv')
salary2 = pd.read_csv(directory+'02_salaries_greencard.csv')

# Pick the equivalent columns from each source and give both frames one shared schema
salary_columns = ['original_role','city','state','start_year','cleaned_job_title','experiences','salary']
temp_salary1 = salary1[['role','city','state','start_year','cleaned_job_title','experiences','salary']]
temp_salary2 = salary2[['job_title','city','state','decision_year','cleaned_job_title','experiences','salary_amount']]
temp_salary1.columns = salary_columns
temp_salary2.columns = salary_columns

# Stack the two sources. pd.concat is used instead of DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0; for two frames with identical columns the result is the same.
combined_salaries = pd.concat([temp_salary1, temp_salary2])

# Remove salaries with a null value and convert to int. The explicit copy makes the column
# assignment below operate on an independent frame rather than on a filtered view.
combined_salaries = combined_salaries[combined_salaries.salary.notnull()].copy()
combined_salaries['salary'] = combined_salaries['salary'].astype(int)

# Fill any remaining NaN fields with the literal 'no_value' (this includes cleaned_job_title, so
# rows without a cleaned title carry 'no_value' from here on).
combined_salaries = combined_salaries.fillna('no_value')

# Turn every cell into a list so the rows can be expanded with itertools.product:
# `experiences` is a comma-separated string and may yield several items, every other column
# becomes a one-element list.
combined_salaries['experiences'] = combined_salaries['experiences'].apply(
    lambda x: [item.strip() for item in x.split(',')])
for column in ['original_role','city','state','start_year','cleaned_job_title','salary']:
    combined_salaries[column] = combined_salaries[column].apply(lambda x: [x])

# Perform a pivot on the columns to split out rows with multiple experience level qualifiers:
# product(*row) emits one output row per experience value, repeating the other fields.
combined_salaries = pd.DataFrame(
    [combo for row in combined_salaries.values for combo in product(*row)],
    columns=combined_salaries.columns)

# Get List of Jobs

In [162]:
# Count the non-null salary records for each cleaned job title
temp = combined_salaries.groupby('cleaned_job_title').salary.count().reset_index()

# Keep the titles that reach `min_salary_records` and write the list of titles to disk
has_enough_records = temp.salary >= parameters['min_salary_records']
jobs_to_model = temp[has_enough_records]
relevant_titles = pd.DataFrame(jobs_to_model.cleaned_job_title.unique())
relevant_titles.to_csv(directory+'03_relevant_job_titles.csv',index=False)

# Restrict the salary table to the selected titles
is_modeled_job = combined_salaries.cleaned_job_title.fillna('').isin(jobs_to_model.cleaned_job_title)
combined_salaries = combined_salaries[is_modeled_job]

print("Number of jobs with "+str(parameters['min_salary_records'])+"+ salary records:", jobs_to_model.cleaned_job_title.count())

# combined_salaries.groupby(['cleaned_job_title'])\
# .salary.agg(['count','mean','min','max','median','std']).to_csv("sample.csv")

Number of jobs with 100+ salary records: 867


# Load Job Summaries

In [163]:
# Load the documents for the configured source. An unrecognised doc_type used to fall through
# both `if` statements, leaving `data` undefined (or silently reusing a stale value from an
# earlier run of the kernel); it is now rejected explicitly.
doc_sources = {
    'indeed_resume': '02_resumes_work.csv',
    'indeed_postings': '02_job_posts_indeed.csv',
}
if parameters['doc_type'] not in doc_sources:
    raise ValueError("Unknown doc_type %r; expected one of %s"
                     % (parameters['doc_type'], sorted(doc_sources)))
data = pd.read_csv(directory + doc_sources[parameters['doc_type']])

# Remove all null cleaned_job_title records
jobs_descriptions = data[~data.cleaned_job_title.isnull()]

In [164]:
# Filter to only jobs specified by the jobs_to_model list
is_modeled_job = jobs_descriptions.cleaned_job_title.isin(jobs_to_model.cleaned_job_title)
jobs_descriptions = jobs_descriptions[is_modeled_job]

# Rename the job summary field so later cells can always read `summary_text`.
# The frame is rebound instead of renamed with inplace=True: an in-place rename on a filtered
# frame mutates a derived object and can trigger pandas' chained-assignment warnings.
# NOTE(review): nothing is renamed for 'indeed_postings' -- presumably that file already has a
# `summary_text` column; confirm.
if parameters['doc_type'] == 'indeed_resume':
    jobs_descriptions = jobs_descriptions.rename(columns={'descript':'summary_text'})

In [165]:
# Report how many entries survive the job-title filter, as "<kept>/<loaded>"
entries_available = jobs_descriptions.cleaned_job_title.count()
entries_total = data.cleaned_job_title.count()
print("Number of resume entries available:",
      str(entries_available) +"/"+ str(entries_total))

Number of resume entries available: 997695/3200066


In [166]:
# Remove all jobs without `min_job_summaries` or more resume entries.
# The comparison is inclusive (>=) so it matches the "or more" wording, the "N+" message printed
# below and the salary-record filter; the previous strict `>` dropped titles with exactly
# `min_job_summaries` entries.
summaries_per_job = jobs_descriptions.groupby('cleaned_job_title').summary_text.count()
cnt_resumes_available = list(
    summaries_per_job[summaries_per_job >= parameters['min_job_summaries']].index)
jobs_descriptions = jobs_descriptions[
    jobs_descriptions.cleaned_job_title.isin(cnt_resumes_available)]

# Save off list of resume ids
if parameters['doc_type'] == 'indeed_resume':
    # Save the list of resume ids for resumes being used
    relevant_resume_ids = pd.DataFrame(jobs_descriptions.resume_id.unique())
    relevant_resume_ids.to_csv(directory+'03_relevant_resume_ids.csv',index=False)

# Keep only the columns used downstream and remove duplicate rows
jobs_descriptions = jobs_descriptions[['cleaned_job_title','summary_text','from_year']].drop_duplicates()

print("Number of jobs with "+str(parameters['min_job_summaries'])+"+ resume entries:", len(cnt_resumes_available))
print("Number of resume entries available now:", jobs_descriptions.cleaned_job_title.count())

Number of jobs with 1000+ resume entries: 152
Number of resume entries available now: 849706


# Save Salary Data for App

In [167]:
# This code saves the cleaned salary information back to the main data folder.
# The file is written under `directory` as 03_cleaned_salaries_for_app.csv; index=False leaves
# the DataFrame's row index out of the CSV.
combined_salaries.to_csv(directory+'03_cleaned_salaries_for_app.csv',index=False)

# Data Preprocess

In [236]:
# This code caps the number of records per job title at `max_number_records`, preferring the
# most recent `from_year` values:
#   * if 2016 alone has enough records, only records from before 2017 are considered;
#   * otherwise, if 2016 and 2017 together have enough, only records from before 2018;
#   * otherwise every record of the title is considered.
# The per-title samples are collected in a list and concatenated once. Growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration and starts from an
# empty, column-less frame.
max_records = parameters['max_number_records']
sampled_groups = []
for name, group in jobs_descriptions.groupby('cleaned_job_title'):
    if (group.from_year == 2016).sum() >= max_records:
        candidates = group[group.from_year < 2017]
    elif group.from_year.isin([2016, 2017]).sum() >= max_records:
        candidates = group[group.from_year < 2018]
    else:
        candidates = group
    sampled_groups.append(
        candidates.sort_values(by='from_year', ascending=False).head(max_records))

# pd.concat raises on an empty list, so fall back to an empty frame when there are no groups
new_job_descriptions = pd.concat(sampled_groups) if sampled_groups else pd.DataFrame()

new_job_descriptions.count()

cleaned_job_title    318102
summary_text         318102
from_year            318102
dtype: int64

In [237]:
# jobs_descriptions.groupby(['cleaned_job_title']).summary_text.count().sort_values(ascending=False).head(70)

In [238]:
# Timestamp before the preprocessing step so the run time can be read from the output
print("Start:", datetime.datetime.now())

# preprocess_list comes from functions.word_preprocessing (star-imported at the top).
# It is applied to the whole summary_text column; the result is the model's input documents.
# NOTE(review): the exact cleaning it performs is not visible in this notebook.
x_data = preprocess_list(new_job_descriptions.summary_text)

# Create labels using cleaned_job_title (a pandas Series that keeps the frame's index)
y_labels = new_job_descriptions.cleaned_job_title

print("End:", datetime.datetime.now())

Start: 2018-07-20 15:54:02.600306
End: 2018-07-20 16:03:29.120794


In [239]:
# Split the data into test and train datasets.
# `train_test_split` in the parameters is the fraction held out for testing (test_size);
# random_state fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(x_data, 
                                                    y_labels,
                                                    test_size=parameters['train_test_split'],
                                                    random_state=parameters['random_state'])

In [306]:
print("Start:", datetime.datetime.now())

# Train the TF-IDF vectorizer on the training documents and vectorize them in the same pass.
# fit_transform returns the same matrix as fit(...) followed by transform(...) without
# analysing the training documents a second time.
vect = TfidfVectorizer(min_df=parameters['min_df'],
                       ngram_range=(parameters['min_ngram'], parameters['max_ngram']))
X_train_vectorized = vect.fit_transform(X_train)

print("End:", datetime.datetime.now())

# vocabulary_ maps each learned term to its column index, so its length is the number of
# features. This avoids get_feature_names(), which newer scikit-learn releases no longer provide.
print('Vocabulary len:', len(vect.vocabulary_))

# Vocabulary len: 893268 When there are 3-4 ngrams
# Vocabulary len: 569472 When there are 3 ngrams

Start: 2018-07-20 20:13:52.625734
End: 2018-07-20 20:19:24.621358
Vocabulary len: 569472


In [307]:
print("Start:", datetime.datetime.now())

# Oversample the minority classes of the vectorized training set with SMOTE.
# * random_state reuses the notebook-wide seed so the synthetic samples are reproducible.
# * The `kind` argument is omitted: 'regular' was its default, and later imbalanced-learn
#   releases removed the argument.
sm = SMOTE(random_state=parameters['random_state'])

# imbalanced-learn renamed fit_sample to fit_resample; use whichever this installation offers
resample = sm.fit_resample if hasattr(sm, 'fit_resample') else sm.fit_sample
X_res, y_res = resample(X_train_vectorized, y_train)

print("End:", datetime.datetime.now())

Start: 2018-07-20 20:19:26.246658
End: 2018-07-20 20:20:22.709054


In [308]:
# Show the number of samples per label after resampling (first five labels only).
# `counter` is a column of ones, so the grouped count is the class size.
temp_display = pd.DataFrame({'range': y_res}).assign(counter=1)
temp_display.groupby('range').count().reset_index().head()

Unnamed: 0,range,counter
0,account executive,2402
1,account manager,2402
2,accountant,2402
3,accounting,2402
4,accounting clerk,2402


# Train Model

In [314]:
print("Start:", datetime.datetime.now())

# Fit the Multinomial Naive Bayes model on the resampled training data
model = MultinomialNB(alpha=parameters['alpha']).fit(X_res, y_res)

# Vectorize the held-out documents with the fitted vectorizer and score the predictions
X_test_vectorized = vect.transform(X_test)
y_pred = model.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f%%' % (test_accuracy * 100))

print("End:", datetime.datetime.now())

# Accuracy: 44.81% for doc similarity
# Accuracy: 33.43% for skills

Start: 2018-07-20 23:11:02.112940
Accuracy: 33.43%
End: 2018-07-20 23:11:22.428579


In [315]:
# Macro-averaged F1, precision and recall on the test set
macro_metrics = (
    ('f1_score: ', f1_score),
    ('precision_score: ', precision_score),
    ('recall_score: ', recall_score),
)
for metric_label, metric_fn in macro_metrics:
    print(metric_label, metric_fn(y_test, y_pred, average="macro"))

f1_score:  0.314963983989
precision_score:  0.342383574896
recall_score:  0.326414634437


In [316]:
# Per-class precision / recall / F-score / support on the test set.
# labels=model.classes_ pins the order and number of rows to the model's classes. Without it the
# scores are reported only for labels that occur in y_test or y_pred, so pairing them with
# model.classes_ could attach scores to the wrong class name whenever a class is absent.
precision, recall, fscore, support = score(y_test, y_pred, labels=model.classes_)

metric_columns = ['class','precision', 'recall', 'fscore', 'support']
metrics = pd.DataFrame({'class': model.classes_,
                        'precision': precision,
                        'recall': recall,
                        'fscore': fscore,
                        'support': support},
                       columns=metric_columns)

# Worst-performing classes first
metrics.sort_values(by='fscore',ascending=True).head()

Unnamed: 0,class,precision,recall,fscore,support
114,researcher,0.0,0.0,0.0,60
45,director,0.114286,0.037736,0.056738,106
0,account executive,0.032593,0.372881,0.059946,118
73,manager,0.142857,0.043478,0.066667,138
81,member of technical staff,0.111111,0.052632,0.071429,57


In [266]:
# import collections
# counted = collections.Counter(y_test)

# from operator import itemgetter
# import heapq
# import collections
# def least_common_values(array, to_find=None):
#     counter = collections.Counter(array)
#     if to_find is None:
#         return sorted(counter.items(), key=itemgetter(1), reverse=False)
#     return heapq.nsmallest(to_find, counter.items(), key=itemgetter(1))

# # counted.most_common()
# least_common_values(counted, 50)

In [268]:
# From the test population, get the most confused labels:
# count every (actual, prediction) pair, then list the most frequent mismatches.

# pd.options.display.max_rows = 10000
predictions = pd.DataFrame({'actual': list(y_test), 'prediction': list(y_pred)},
                           columns=['actual','prediction'])
predictions['count']=1
pred_group = predictions.groupby(['actual','prediction']).count().reset_index()

is_misclassified = pred_group.actual != pred_group.prediction
# Pairs predicted as 'account executive' are left out of the listing
not_account_executive = pred_group.prediction!='account executive'
pred_group[is_misclassified & not_account_executive].sort_values(by='count',ascending=False).head(20)
# .to_csv('most_confusion.csv')

Unnamed: 0,actual,prediction,count
1095,controller,accounting manager,37
2240,java software engineer,j2ee engineer,35
1996,human resources specialist,human resources manager,32
2375,marketing director,marketing manager,32
93,accountant,staff accountant,26
3739,salesforce administrator,salesforce engineer,26
2230,j2ee engineer,java software engineer,25
145,accounting manager,controller,24
4219,systems administrator,network administrator,22
2426,marketing specialist,marketing manager,22


# List Most Relevant Skills

In [317]:
# This code finds the top parameters['num_skills'] features to show the user for one job title.
# It then drops any n-gram that adds fewer than two new words compared with the job title or
# with an n-gram that was already accepted. This cuts down on repetition.

label_id = 80
job_title = model.classes_[label_id]

print(job_title)
print('-------')

# Feature names in column order, rebuilt from the vectorizer's term -> column-index mapping.
# This gives the same list as get_feature_names() and also works on scikit-learn releases
# where that method no longer exists.
feature_names = sorted(vect.vocabulary_, key=vect.vocabulary_.get)

# feature_log_prob_[label_id] holds the per-feature log-probabilities for this class. For a
# multi-class MultinomialNB the legacy `coef_` attribute mirrored this array; `coef_` has since
# been removed from scikit-learn.
num_skills = parameters['num_skills']
ranked_features = sorted(zip(model.feature_log_prob_[label_id], feature_names))
# Guard the slice: [-0:] would return the whole list instead of nothing
topn_class1 = ranked_features[-num_skills:] if num_skills > 0 else []
features_list = [feat for coef, feat in topn_class1]

# Seed the accepted list with the job title so n-grams that merely repeat it are rejected
accepted_skill_list = [job_title]
# Longest n-grams first (stable sort keeps the ranking order among equal lengths)
for potential_skill in sorted(features_list, key=lambda x: -len(x.split())):
    potential_words = set(potential_skill.split())
    # Smallest number of words this n-gram adds relative to any accepted phrase
    highest_match = min(
        [len(potential_skill.split())]
        + [len(potential_words - set(accepted_skill.split()))
           for accepted_skill in accepted_skill_list])
    if highest_match > 1:
        accepted_skill_list.append(potential_skill)

# Drop the job-title seed and present the skills in random order
accepted_skill_list = accepted_skill_list[1:]
shuffle(accepted_skill_list)

for skill in accepted_skill_list:
    print(skill)

mechanical engineer
-------
computer aid design
product data management
provide technical support
mechanical electrical component
geometric dimensioning tolerancing
designing building testing
3d model 2d
using ansys fluent
design review presentation
perform finite element
mechanical engineering support
computational fluid dynamic
plastic sheet metal
piping instrumentation diagram
using autodesk inventor
print circuit board
element analysis fea
autocad solidworks design
cooling load calculation
engineering change request
subject matter expert
meeting strict deadline
injection mold part
product life cycle
design next generation
design 3d print
perform root analysis
design test fixture
plastic injection molding
drawing using autocad
create bill material
design stress analysis
using catia v5
mode effect analysis
cross functional team
new existing product
draft engineering drawing
model using solidworks
create part drawing
first article inspection
cad model drawing
product design developmen

# Document Similarity Score

In [255]:
# This code returns the prediction probabilities for an example input taken from the test set

example_index = 11
example_text = X_test[example_index]

# Actual label of the example (one-element slice of the label Series)
print(y_test[example_index:example_index+1])

print()
print("---------------------")
print(example_text)

print()
print("---------------------")
# NOTE(review): X_test comes from x_data, which was already passed through preprocess_list, so
# the example is preprocessed a second time here -- confirm that preprocess_list is idempotent.
vector_example = vect.transform(preprocess_list([example_text]))
class_probabilities = model.predict_proba(vector_example)[0]
job_rankings = list(zip(class_probabilities, model.classes_))

# Twenty most probable job titles, highest probability first
sorted(job_rankings,reverse=True)[:20]

756656    quality assurance tester
Name: cleaned_job_title, dtype: object

---------------------
 definition functional testing strategy module design test plan checklist test case validation structure visual monitoring standard template accomplishment smoke testing exploratory testing functional testing regression testing integration testing registration bug found development production assign task development team fix error monitoring resolution evaluate approval rejection work done development team research develop necessary documentation improve process test plan test case use case report preparing suggestion document improve quality application produce analyze customer report progress project regularly 

---------------------


[(0.97465683265290171, 'quality assurance analyst'),
 (0.0078351714292003022, 'quality assurance tester'),
 (0.0059408891885279948, 'software tester'),
 (0.0043898792065852482, 'test engineer'),
 (0.0043151850727030623, 'software quality engineer'),
 (0.0013814186011663091, 'software test engineer'),
 (0.00066400263754561105, 'quality assurance engineer'),
 (0.00040898850899171005, 'quality assurance'),
 (0.00030397507120115977, 'test'),
 (4.5799081373216704e-05, 'automation engineer'),
 (1.9464541054858887e-05, 'quality assurance specialist'),
 (1.590720864434804e-05, 'quality assurance manager'),
 (9.0489167140637224e-06, 'business analyst'),
 (5.8303131247096438e-06, 'project'),
 (2.0566547023995492e-06, 'programmer analyst'),
 (1.9071343583170404e-06, 'dit analyst'),
 (1.4204332071099108e-06, 'systems analyst'),
 (3.3992404932238059e-07, 'analyst'),
 (3.3005341030906443e-07, 'technical'),
 (2.4755806702249873e-07, 'engineer')]

# Save New Model

In [318]:
# This code saves the model to the models folder.
# NOTE(review): this rebinds `directory` (previously the data folder); re-running earlier cells
# afterwards will look for the CSV files in the wrong place unless the configuration cell is
# run again.
directory = '/mnt/disks/mnt_dir/'

# Digits-only timestamp used as the file-name prefix. strftime gives the same result as stripping
# the punctuation from str(datetime.now()), but always has 20 digits (str() omits the
# microseconds when they are zero) and does not rely on `re` being in scope.
save_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
print(save_time)

# Record the parameters used for this run, one key=value pair per line.
# `with` closes the file even if a write fails.
with open(directory+"models/" + save_time + '_parameters.txt','w') as write_param:
    for key in parameters:
        write_param.write(key + "=" + str(parameters[key]) + '\n')

# File-name suffix -> object to pickle
artifacts = [
    ("_x_data.pkl", x_data),                               # preprocessed x data
    ("_y_labels.pkl", y_labels),                           # preprocessed y labels
    ("_x_SMOTE_data.pkl", X_res),                          # SMOTE-resampled x data
    ("_y_SMOTE_labels.pkl", y_res),                        # SMOTE-resampled y labels
    ("_tdidf_vect.pkl", vect),                             # TF-IDF vectorizer
    ("_x_trained_tdidf_vect.pkl", X_train_vectorized),     # vectorized x_train
    ("_nb_model.pkl", model),                              # NB model
]
for suffix, artifact in artifacts:
    with open(directory+"models/"+save_time+suffix,"wb") as pickling_on:
        pickle.dump(artifact, pickling_on)

20180720231219859630


# Load Model

In [228]:
# This code loads an old model
directory = '/mnt/disks/mnt_dir/'

save_time = '20180720171945426156' # Currently best model for doc similarity
# save_time = '20180720231219859630' # Currently best model for skills


def load_artifact(suffix):
    """Unpickle and return the object stored at <directory>models/<save_time><suffix>.

    `directory` and `save_time` are read from the notebook globals at call time.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load files written by this notebook.
    with open(directory+"models/"+save_time+suffix,"rb") as pickling_on:
        return pickle.load(pickling_on)


# Load preprocessed x data
x_data = load_artifact("_x_data.pkl")

# Load preprocessed y labels
y_labels = load_artifact("_y_labels.pkl")

# Load TF-IDF vectorizer
vect = load_artifact("_tdidf_vect.pkl")

# Load vectorized x_train
X_train_vectorized = load_artifact("_x_trained_tdidf_vect.pkl")

# # Load NB model (left disabled, as before: `model` is not restored by this cell)
# model = load_artifact("_nb_model.pkl")

# End