In [None]:
# Create a Naive Bayes classifier using a TF-IDF vectorizer and pickle the results. This model can be
# trained on job resumes or Indeed job postings. Steps include:
# 1. Set the model parameters
# 2. Get a combined list of salaries and keep only job titles with 500+ salary records
# 3. Get a list of resumes for those job titles and remove any job titles with
#    fewer than 500 resumes
# 4. Run the TF-IDF vectorizer and train the Naive Bayes model
# 5. Test the model using "List Most Relevant Skills"
# 6. Test the model using "Document Similarity Score"

In [11]:
import pandas as pd
from functions.word_preprocessing import *
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
import datetime
from sklearn.naive_bayes import MultinomialNB
import numpy as np
from random import shuffle

In [12]:
# Directory holding the cleaned CSV inputs. Later cells build file paths with
# `directory + filename`, so the value must end with a path separator.
# Set the CAPSTONE_DATA_DIR environment variable to run on another machine;
# when it is unset the original local path is used.
import os

directory = os.path.join(
    os.environ.get('CAPSTONE_DATA_DIR', '/Users/kwheatley/Desktop/Capstone/gcloud_data/'),
    '',  # joining with '' appends a trailing separator only when one is missing
)

# Model Parameters

In [23]:
# All tunable settings for a run, kept in one place. The save cell writes
# every key/value pair to a *_parameters.txt file next to the pickled model.
parameters = dict(
    # Which corpus to train on: "indeed_resume" or "indeed_postings"
    doc_type="indeed_resume",
    # Minimum number of salary records a job title needs to be modelled
    min_salary_records=500,
    # Minimum number of job summaries a job title needs to be modelled
    min_job_summaries=500,
    # TF-IDF vectorizer: smallest and largest n-gram sizes to extract
    min_ngram=2,
    max_ngram=4,
    # TF-IDF vectorizer: ignore features found in fewer than this many documents
    min_df=0,
    # Train-test split: fraction of the data held out for testing
    train_test_split=0.05,
    # Train-test split: seed that makes the split repeatable
    random_state=1,
    # Naive Bayes model: smoothing parameter
    alpha=0.1,
    # Number of skills to show per job
    num_skills=50,
)

# Load Salary Datasets

In [13]:
# Read both cleaned salary CSVs from the data directory:
# salary1 <- H-1B file, salary2 <- green-card file.
salary1, salary2 = (
    pd.read_csv(directory + csv_name)
    for csv_name in ('02_h1b_salaries_CLEAN.csv',
                     '02_greencard_salaries_CLEAN.csv')
)

In [14]:
# Combine salary datasets
# Both sources are reduced to one shared six-column schema. The green-card file
# names two of those columns differently ('decision_year', 'salary_amount'),
# so they are renamed to match the H-1B file before stacking.
temp_salary1 = salary1[['city','state','start_year','cleaned_job_title','experiences','salary']]
temp_salary2 = (
    salary2[['city','state','decision_year','cleaned_job_title','experiences','salary_amount']]
    .rename(columns={'decision_year': 'start_year', 'salary_amount': 'salary'})
)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the rows the same way and, like append, keeps each frame's own index.
combined_salaries = pd.concat([temp_salary1, temp_salary2])

# Get List of Jobs

In [15]:
# Choose all jobs with at least parameters['min_salary_records'] salary records.
# The threshold is read from the parameters dict (instead of a literal 500) so
# that the value saved alongside the model is the one that was actually applied.
min_salary_records = parameters['min_salary_records']
# Count non-null salary values per job title
temp = combined_salaries.groupby(['cleaned_job_title']).count().salary.reset_index()
jobs_to_model = temp[temp.salary >= min_salary_records]
print("Number of jobs with %d+ salary records:" % min_salary_records,
      jobs_to_model.cleaned_job_title.count())

# combined_salaries.groupby(['cleaned_job_title'])\
# .salary.agg(['count','mean','min','max','median','std']).to_csv("sample.csv")

Number of jobs with 500+ salary records: 275


# Load Job Summaries

In [16]:
# Load the document corpus selected by parameters['doc_type']
doc_type = parameters['doc_type']
if doc_type == 'indeed_resume':
    data = pd.read_csv(directory+'02_processed_resumes_work_CLEAN.csv')
elif doc_type == 'indeed_postings':
    data = pd.read_csv(directory+'02_indeed_job_postings_CLEAN.csv')
else:
    # Without this branch an unknown doc_type leaves `data` undefined and the
    # cell fails on the next line with an unhelpful NameError.
    raise ValueError(
        "parameters['doc_type'] must be 'indeed_resume' or 'indeed_postings', got %r"
        % (doc_type,))

# Remove all null cleaned_job_title records
jobs_descriptions = data[data.cleaned_job_title.notnull()]

In [17]:
# Keep only the rows whose job title appears in the jobs_to_model list
title_is_modelled = jobs_descriptions.cleaned_job_title.isin(jobs_to_model.cleaned_job_title)
jobs_descriptions = jobs_descriptions[title_is_modelled]

# Reduce to two columns: the job title and the free-text field of the chosen
# corpus ('descript' for resumes, 'summary_text' for postings)
selected_doc_type = parameters['doc_type']
if selected_doc_type == 'indeed_resume':
    jobs_descriptions = jobs_descriptions[['cleaned_job_title', 'descript']]
if selected_doc_type == 'indeed_postings':
    jobs_descriptions = jobs_descriptions[['cleaned_job_title', 'summary_text']]

# Give the text column one common name, then drop exact duplicate rows
jobs_descriptions.columns = ['cleaned_job_title', 'summary']
jobs_descriptions = jobs_descriptions.drop_duplicates()

In [18]:
# Report how many entries survived the job-title filter, shown as kept/total
kept_entries = jobs_descriptions.cleaned_job_title.count()
total_entries = data.cleaned_job_title.count()
print("Number of resume entries available:",
      "{}/{}".format(kept_entries, total_entries))

Number of resume entries available: 252678/1215292


In [19]:
# Remove all jobs with fewer than parameters['min_job_summaries'] resume entries
min_job_summaries = parameters['min_job_summaries']

# Non-null summaries per job title (a Series indexed by cleaned_job_title)
summaries_per_job = jobs_descriptions.groupby('cleaned_job_title')['summary'].count()

# ">=" rather than ">": the intent stated here and in the printed message is
# "500 or more", so a title with exactly the minimum must be kept.
cnt_resumes_available = list(
    summaries_per_job[summaries_per_job >= min_job_summaries].index)
jobs_descriptions = jobs_descriptions[
    jobs_descriptions.cleaned_job_title.isin(cnt_resumes_available)]

print("Number of jobs with %d+ resume entries:" % min_job_summaries,
      len(cnt_resumes_available))
print("Number of resume entries available now:", jobs_descriptions.cleaned_job_title.count())

Number of jobs with 500+ resume entries: 90
Number of resume entries available now: 235496


# Preprocess Job Summaries

In [20]:
# Can we add a spell checker somehow?

In [21]:
# preprocess_list comes from functions.word_preprocessing (star-imported at the
# top); it is applied to the whole 'summary' column to produce the model inputs.
raw_summaries = jobs_descriptions['summary']
x_data = preprocess_list(raw_summaries)

In [22]:
# The cleaned job title of each row is the class label the model learns
y_labels = jobs_descriptions['cleaned_job_title']

# Train Model

In [24]:
# Hold out a test set; both the held-out fraction and the seed come from the
# parameters dict so the split is repeatable.
split_options = {
    'test_size': parameters['train_test_split'],
    'random_state': parameters['random_state'],
}
X_train, X_test, y_train, y_test = train_test_split(x_data, y_labels, **split_options)

In [None]:
print("Start:", datetime.datetime.now())

# Train TF-IDF vectorizer model
vect = TfidfVectorizer(min_df=parameters['min_df'],
                       ngram_range=(parameters['min_ngram'], parameters['max_ngram']))
# fit_transform learns the vocabulary and builds the training matrix in a single
# pass over the corpus; fit() followed by transform() gives the same matrix but
# tokenises every document twice, which matters on a run this long.
X_train_vectorized = vect.fit_transform(X_train)

print("End:", datetime.datetime.now())

# vocabulary_ maps each term to its column index, so its length is the feature
# count. This avoids get_feature_names(), which was removed in scikit-learn 1.2
# and would build a list of every term just to measure it.
print('Vocabulary len:', len(vect.vocabulary_))

Start: 2018-07-03 18:15:57.234151
End: 2018-07-03 18:44:17.999290
Vocabulary len: 35083980


In [None]:
# Fit a Multinomial Naive Bayes classifier on the TF-IDF training matrix
model = MultinomialNB(alpha=parameters['alpha']).fit(X_train_vectorized, y_train)

# Score on the held-out documents, vectorised with the already-fitted TF-IDF model
X_test_vectorized = vect.transform(X_test)
y_pred = model.predict(X_test_vectorized)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % accuracy_pct)

In [None]:
# predictions = pd.DataFrame(list(zip(y_test, y_pred)))
# predictions.columns=['actual','prediction']
# predictions['count']=1
# predictions.groupby(['actual','prediction']).count().reset_index().to_csv('most_confusion.csv')

# List Most Relevant Skills

In [None]:
# This code finds the top parameters['num_skills'] features for one job and shows them to the
# user. It then drops any n-gram that differs from an already accepted n-gram by fewer than
# two words, which cuts down on near-duplicate phrases.

label_id = 61

print(model.classes_[label_id])
print('-------')

# Per-class feature scores. feature_log_prob_ holds the values that
# MultinomialNB.coef_ exposed for a multi-class model; coef_ itself was
# deprecated in scikit-learn 0.24 and later removed.
class_scores = model.feature_log_prob_[label_id]

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only on
# versions that do not yet provide get_feature_names_out().
feature_names_getter = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
feature_names = feature_names_getter()

# Indices of the highest-scoring features, in ascending score order. A stable
# argsort on the score array replaces sorting one (score, name) tuple per
# vocabulary entry; ties keep index order.
top_feature_idx = np.argsort(class_scores, kind='mergesort')[-parameters['num_skills']:]
features_list = [feature_names[idx] for idx in top_feature_idx]

# Start from an empty list: the previous 'none' placeholder entry took part in
# the overlap comparison and so could reject n-grams containing the word "none".
accepted_skill_list = []
# Longest n-grams first, so that shorter overlapping variants are the ones dropped
for potential_skill in sorted(features_list, key=lambda x: -len(x.split())):
    candidate_words = set(potential_skill.split())
    # Smallest number of words this candidate has that some accepted skill lacks;
    # with nothing accepted yet it is simply the candidate's word count.
    highest_match = min(
        [len(potential_skill.split())] +
        [len(candidate_words - set(accepted_skill.split()))
         for accepted_skill in accepted_skill_list])
    if highest_match > 1:
        accepted_skill_list.append(potential_skill)
shuffle(accepted_skill_list)

for skill in accepted_skill_list:
    print(skill)

# Document Similarity Score

In [None]:
# Show one held-out example (its true label and its text) and then the 20 job
# titles the model considers most probable for it.

example_index = 11
divider = "---------------------"

# True label of the chosen example
print(y_test[example_index:example_index+1])

print()
print(divider)
# Text of the chosen example
print(X_test[example_index])

print()
print(divider)
example_docs = preprocess_list([X_test[example_index]])
vector_example = vect.transform(example_docs)
class_probabilities = model.predict_proba(vector_example)[0]
# Pair every class probability with its class name; highest probability first
job_rankings = list(zip(class_probabilities, model.classes_))
sorted(job_rankings, reverse=True)[:20]

# Save New Model

In [None]:
# This code saves the model to the models folder

# Imported here explicitly: `re` was otherwise only available as a side effect
# of the star import from functions.word_preprocessing.
import os
import re

# Timestamp with every non-alphanumeric character stripped; used as the shared
# filename prefix for all artifacts of this run.
save_time = re.sub('[^A-Za-z0-9]+', '', str(datetime.datetime.now()))
print(save_time)

# Make sure the target folder exists so a long training run is not lost to a
# missing directory.
os.makedirs("models", exist_ok=True)

# Write the run parameters as key=value lines. `with` closes the file even if
# a write raises.
with open("models/" + save_time + '_parameters.txt', 'w') as write_param:
    for key in parameters:
        write_param.write(key + "=" + str(parameters[key]) + '\n')

# (filename suffix, object) for everything that gets pickled:
# preprocessed x data, y labels, TF-IDF vectorizer, vectorized x_train, NB model
artifacts_to_pickle = [
    ("_x_data.pkl", x_data),
    ("_y_labels.pkl", y_labels),
    ("_tdidf_vect.pkl", vect),
    ("_x_trained_tdidf_vect.pkl", X_train_vectorized),
    ("_nb_model.pkl", model),
]
for suffix, artifact in artifacts_to_pickle:
    with open("models/" + save_time + suffix, "wb") as pickling_on:
        pickle.dump(artifact, pickling_on)

# Load Model

In [None]:
# This code loads an old model

# Filename prefix (timestamp) of the saved run to restore
save_time = '20180703161959266229'


def _load_artifact(suffix):
    """Unpickle and return models/<save_time><suffix>.

    NOTE: pickle.load can execute arbitrary code contained in the file, so only
    load artifacts that were written by this notebook or another trusted source.
    """
    with open("models/" + save_time + suffix, "rb") as pickling_on:
        return pickle.load(pickling_on)


# Load preprocessed x data
x_data = _load_artifact("_x_data.pkl")

# Load preprocessed y labels
y_labels = _load_artifact("_y_labels.pkl")

# Load TF-IDF vectorizer
vect = _load_artifact("_tdidf_vect.pkl")

# Load vectorized x_train
X_train_vectorized = _load_artifact("_x_trained_tdidf_vect.pkl")

# Load NB model
model = _load_artifact("_nb_model.pkl")

# End

Most likely I will need to train with a 1-5 ngram model and then return skills
based on a 3-4 ngram model