In [1]:
# Create a Naive Bayes classifier using a TF-IDF vectorizer and pickle the results. This model can be
# trained on job resumes or job postings. Steps include:
# 1. Set the model parameters
# 2. Get a combined list of salaries and only use job titles with 500+ salary records
# 3. Get a list of resumes using this list of job titles and remove any job titles with
#    fewer than 500 resumes
# 4. Run the TF-IDF vectorizer and Naive Bayes model training
# 5. Test the model using "List Most Relevant Skills"
# 6. Test the model using "Document Similarity Score"

In [22]:
import datetime
import heapq
import math
import os
import pickle
from itertools import product
from random import shuffle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB


# Custom function in functions folder
from functions.word_preprocessing import *

# Allow up to 500 characters per cell so long text columns are not cut off when displayed
pd.set_option('display.max_colwidth', 500)

In [2]:
# Root folder holding the input CSVs and the "models/" output folder.
# Set the CAPSTONE_DATA_DIR environment variable to run on another machine; the
# original local path is kept as the fallback so existing runs are unchanged.
# os.path.join(..., '') guarantees the trailing separator that the
# `directory + 'file'` concatenations in later cells rely on.
directory = os.path.join(
    os.environ.get('CAPSTONE_DATA_DIR') or '/Users/kwheatley/Desktop/Capstone/gcloud_data/',
    '',
)

# Model Parameters

In [3]:
# Tunable settings for the notebook; the save cell writes them out next to the model.
parameters = {
    # Filter out all jobs with fewer than this many salary records
    "min_salary_records": 100,
    # Filter out all jobs with fewer than this many job summaries
    "min_job_summaries": 1000,
    # TF-IDF vectorizer: smallest and largest n-gram size
    "min_ngram": 2,
    "max_ngram": 4,
    # TF-IDF vectorizer: ignore features that appear in fewer than this many documents
    "min_df": 0,
    # Train-test split: fraction of the data held out for testing
    "train_test_split": 0.05,
    # Train-test split: seed
    "random_state": 1,
    # Naive Bayes smoothing parameter
    "alpha": 0.1,
    # Number of skills to show per job
    "num_skills": 50,
}

# Load Job Summaries

In [4]:
# Load the resume CSV, keep only the 'software engineer' rows and the three
# columns used below, then drop exact duplicate rows.
data = (
    pd.read_csv(directory + '02_resumes_work.csv')
    .loc[lambda resumes: resumes.cleaned_job_title == 'software engineer',
         ['cleaned_job_title', 'descript', 'from_year']]
    .drop_duplicates()
)

In [5]:
# Bucket every row by `from_year` into a five-year label. Conditions are tested in
# order and the first match wins, so each lower bound implicitly excludes the newer
# buckets listed before it. Rows matching no condition (e.g. a missing year) get 'none'.
from_year = data.from_year
data['range'] = np.select(
    [from_year >= 2013, from_year >= 2008, from_year >= 2003, from_year >= 1998, from_year < 1998],
    ['2013-2018', '2008-2013', '2003-2008', '1998-2003', '1900-1998'],
    default='none',
)

In [6]:
# Non-null row counts per year bucket, to see how unbalanced the classes are
data.groupby('range').count()

Unnamed: 0_level_0,cleaned_job_title,descript,from_year
range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1900-1998,8852,8852,8852
1998-2003,9627,9627,9627
2003-2008,12842,12842,12842
2008-2013,16111,16111,16111
2013-2018,18295,18295,18295


# Data Preprocess Unbalanced Classes

In [7]:
# Down sample the first model
# SMOTE up sample the second model
# Run the model on different periods of time (just for software engineers)

In [8]:
# Features: the description text passed through preprocess_list (not defined in this
# notebook; presumably the custom helper star-imported from functions.word_preprocessing — confirm there).
# Labels: the year bucket assigned to each row.
x_data = preprocess_list(data.descript)
y_labels = data.range

In [9]:
# Hold out parameters['train_test_split'] of the rows for testing; the seed keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_labels,
    test_size=parameters['train_test_split'],
    random_state=parameters['random_state'],
)

In [13]:
# Report how many documents landed in each split
for split_label, split_rows in (("X_train: ", X_train), ("X_test: ", X_test)):
    print(split_label, len(split_rows))

X_train:  62440
X_test:  3287


In [14]:
print("Start:", datetime.datetime.now())

# Fit the TF-IDF vectorizer on the training documents only and vectorize them in the
# same pass. fit_transform yields the same matrix as fit(...).transform(...) but
# processes the corpus once instead of twice.
vect = TfidfVectorizer(min_df=parameters['min_df'],
                       ngram_range=(parameters['min_ngram'], parameters['max_ngram']))
X_train_vectorized = vect.fit_transform(X_train)

print("End:", datetime.datetime.now())

# vocabulary_ maps each n-gram to its column index, so its size is the feature count.
# Used instead of get_feature_names(), which materialises every feature name just to
# count them and is no longer available in recent scikit-learn releases.
print('Vocabulary len:', len(vect.vocabulary_))

Start: 2018-08-04 11:39:31.380607
End: 2018-08-04 11:41:34.457882
Vocabulary len: 9312849


In [15]:
# Oversample the training matrix with SMOTE so the classes are balanced before
# fitting the classifier (the test split is left untouched).
# - random_state makes the synthetic samples reproducible between runs.
# - kind='regular' is omitted: it was the default, and newer imbalanced-learn
#   releases no longer accept the argument.
# - fit_resample is the current method name; older releases only offer fit_sample.
sm = SMOTE(random_state=parameters['random_state'])
_smote_resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_res, y_res = _smote_resample(X_train_vectorized, y_train)

In [16]:
# Count rows per class after resampling to confirm the classes are now balanced
temp_display = pd.DataFrame({'range': y_res, 'counter': 1})
temp_display.groupby('range', as_index=False).count()

Unnamed: 0,range,counter
0,1900-1998,17363
1,1998-2003,17363
2,2003-2008,17363
3,2008-2013,17363
4,2013-2018,17363


# Train Model

In [17]:
# Fit Multinomial Naive Bayes on the resampled training data
model = MultinomialNB(alpha=parameters['alpha']).fit(X_res, y_res)

# Score on the held-out documents, vectorized with the already-fitted vocabulary
X_test_vectorized = vect.transform(X_test)
y_pred = model.predict(X_test_vectorized)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % accuracy_pct)

Accuracy: 54.58%


In [None]:
# predictions = pd.DataFrame(list(zip(y_test, y_pred)))
# predictions.columns=['actual','prediction']
# predictions['count']=1
# predictions.groupby(['actual','prediction']).count().reset_index().to_csv('most_confusion.csv')

In [20]:
# Macro-averaged scores: every class contributes equally, whatever its support
for metric_label, metric_fn in (('f1_score: ', f1_score),
                                ('precision_score: ', precision_score),
                                ('recall_score: ', recall_score)):
    print(metric_label, metric_fn(y_test, y_pred, average="macro"))

f1_score:  0.535192742896
precision_score:  0.54810767289
recall_score:  0.528271931128


In [24]:
# Per-class precision / recall / F-score on the test split.
# labels=model.classes_ pins the order of the returned arrays to the class column
# built below, instead of relying on both sides happening to sort the same way.
precision, recall, fscore, support = score(y_test, y_pred, labels=model.classes_)

metrics = pd.DataFrame(
    {
        'class': model.classes_,
        'precision': precision,
        'recall': recall,
        'fscore': fscore,
        'support': support,
    },
    columns=['class', 'precision', 'recall', 'fscore', 'support'],
)

# Rank on the numeric scores first; formatting to percent strings happens afterwards
# so that no sort is ever done on text ("9.50%" would sort after "43.00%").
top_metrics = metrics.sort_values(by='fscore', ascending=False).head(5)
metrics_samples = top_metrics.copy()
for percent_column in ['precision', 'recall', 'fscore']:
    metrics_samples[percent_column] = top_metrics[percent_column].map('{:.2%}'.format)

# Export the same formatted rows, ordered by ascending numeric F-score
csv_order = top_metrics.sort_values(by='fscore', ascending=True).index
metrics_samples.loc[csv_order].to_csv('temp.csv')
metrics_samples

Unnamed: 0,class,precision,recall,fscore,support
4,2013-2018,70.46%,67.06%,68.72%,932
0,1900-1998,67.66%,57.38%,62.10%,474
3,2008-2013,43.86%,56.57%,49.41%,776
1,1998-2003,46.68%,42.27%,44.37%,466
2,2003-2008,45.39%,40.85%,43.00%,639


# List Most Relevant Skills

In [70]:
# Show the most characteristic n-grams ("skills") for one class. Takes the
# parameters['num_skills'] highest-weighted features for that class, then drops any
# n-gram that has at most one word not already found in some accepted n-gram (or in
# the class name). This cuts down on repetition.

label_id = 4

print(model.classes_[label_id])
print('-------')

# feature_log_prob_ is the per-class log-probability of each feature. It is the value
# model.coef_ exposed for a multi-class MultinomialNB, but coef_ is gone from recent
# scikit-learn releases; likewise get_feature_names was replaced by get_feature_names_out.
class_weights = model.feature_log_prob_[label_id]
_get_feature_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names
feature_names = _get_feature_names()

# nlargest avoids sorting the entire vocabulary; reversing it gives the same ascending
# (weight, name) order as sorted(...)[-n:]. Unlike that slice, asking for 0 skills
# yields an empty list instead of every feature.
topn_class1 = heapq.nlargest(parameters['num_skills'], zip(class_weights, feature_names))[::-1]
features_list = [feat for _, feat in topn_class1]

# Seed with the class name so n-grams that merely restate it are rejected as well;
# the seed is removed again before printing.
accepted_skill_list = [model.classes_[label_id]]
# Longest n-grams first, so they are accepted before their shorter sub-phrases are considered
for potential_skill in sorted(features_list, key=lambda x: -len(x.split())):
    skill_words = potential_skill.split()
    # Fewest words of this candidate that are missing from any single accepted phrase
    fewest_new_words = min(
        [len(skill_words)]
        + [len(set(skill_words) - set(accepted_skill.split()))
           for accepted_skill in accepted_skill_list]
    )
    if fewest_new_words > 1:
        accepted_skill_list.append(potential_skill)
accepted_skill_list = accepted_skill_list[1:]
# Unseeded shuffle: the display order differs from run to run
shuffle(accepted_skill_list)

for skill in accepted_skill_list:
    print(skill)

2013-2018
-------
version control
unit testing
web application using
visual studio
full stack
develop maintain
store procedure
technology use
new feature
code review
agile scrum
rest api
design implement
using asp net
entity framework
management system
development team
software engineer
sql server
ruby rail
front end
test case
continuous integration
html5 css3
html cs javascript
user interface


# Save New Model

In [76]:
# This code saves the model and its inputs to the models folder. Every file shares
# one timestamp prefix so the pieces of a run can be found (and reloaded) together.

# strftime always emits all 20 digits. Stripping punctuation from str(now()) gave a
# shorter stamp whenever the microsecond field was 0, and needed `re`, which this
# notebook never imports explicitly.
save_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
print(save_time)

model_prefix = directory + "models/" + save_time

# Record the parameters as key=value lines
with open(model_prefix + '_parameters.txt', 'w') as write_param:
    for key in parameters:
        write_param.write(key + "=" + str(parameters[key]) + '\n')

# (file suffix, object to pickle)
artifacts_to_save = [
    ("_x_data.pkl", x_data),                             # preprocessed x data
    ("_y_labels.pkl", y_labels),                         # preprocessed y labels
    ("_x_SMOTE_data.pkl", X_res),                        # SMOTE-resampled x data
    ("_y_SMOTE_labels.pkl", y_res),                      # SMOTE-resampled y labels
    ("_tdidf_vect.pkl", vect),                           # TF-IDF vectorizer
    ("_x_trained_tdidf_vect.pkl", X_train_vectorized),   # vectorized x_train
    ("_nb_model.pkl", model),                            # NB model
]
for artifact_suffix, artifact in artifacts_to_save:
    # `with` closes the file even if pickling fails part-way
    with open(model_prefix + artifact_suffix, "wb") as pickling_on:
        pickle.dump(artifact, pickling_on)

20180718171406220007


# Load Model

In [15]:
# This code loads an old model saved by the "Save New Model" cell.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
# The SMOTE-resampled data (X_res / y_res) is saved as well but is not reloaded here.

save_time = '20180718171406220007' # for software_engineer


def _load_artifact(suffix):
    """Unpickle <directory>models/<save_time><suffix>, closing the file even on error."""
    with open(directory+"models/"+save_time+suffix, "rb") as pickling_on:
        return pickle.load(pickling_on)


# Load preprocessed x data
x_data = _load_artifact("_x_data.pkl")

# Load preprocessed y labels
y_labels = _load_artifact("_y_labels.pkl")

# Load TF-IDF vectorizer
vect = _load_artifact("_tdidf_vect.pkl")

# Load vectorized x_train
X_train_vectorized = _load_artifact("_x_trained_tdidf_vect.pkl")

# Load NB model
model = _load_artifact("_nb_model.pkl")



KeyboardInterrupt: 

# End

Most likely I will need to train with a 1-5 ngram model and then return skills
based on a 3-4 ngram model