In [1]:
# Create a Naive Bayes classifier using a TF-IDF vectorizer and pickle the results. This model
# can be trained on job resumes or Indeed job postings. Steps include:
# 1. Set the model parameters
# 2. Get a combined list of salaries and keep only job titles with at least
#    `min_salary_records` salary records
# 3. Get the list of resumes for those job titles and remove any job title with too few
#    resume entries (see `min_job_summaries`)
# 4. Run the TF-IDF vectorizer and train the Naive Bayes model
# 5. Test the model using "List Most Relevant Skills"
# 6. Test the model using "Document Similarity Score"


# We only look at the most recent 5 years of salary data (see `min_salary_year`)
# We only look at the most recent 10 years of work start dates (see `min_job_year`)

In [2]:
import datetime
import math
import numpy as np
import pandas as pd
import pickle
from itertools import product
from random import shuffle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.preprocessing import FunctionTransformer
from imblearn.pipeline import Pipeline 


# Custom function in functions folder
from functions.word_preprocessing import *

pd.set_option('display.max_colwidth', 500)

In [3]:
# Root folder for the input CSVs and the pickled artifacts. Paths are built by plain string
# concatenation (e.g. directory+'02_salaries_h1b.csv'), so the trailing slash is required.
# The "Save New Model" and "Load Model" cells reassign this variable to a different root.
directory = '/Users/kwheatley/Desktop/Capstone/gcloud_data/'
# directory = '/mnt/disks/mnt_dir/data/'

# Model Parameters

In [4]:
# Best parameters for doc similarity.
# Every tunable value of the notebook lives in this dict; it is also written out next to the
# pickled model in the "Save New Model" cell.
parameters = {
                "doc_type":"indeed_resume", # Use "indeed_resume" or "indeed_postings"
                "min_salary_year":2014, # Keep only salary records whose start_year is this year or later
                "min_job_year":2008, # Keep only job summaries whose from_year is this year or later
                "min_salary_records":100, # Keep only job titles with at least this many salary records
                "min_job_summaries":1000, # Minimum number of job summaries a job title needs to be kept
                "min_ngram":1, # Lower bound of the TF-IDF vectorizer ngram_range
                "max_ngram":3, # Upper bound of the TF-IDF vectorizer ngram_range
                "min_df":5, # TF-IDF vectorizer min_df: ignore features in fewer than this number of documents
                "train_test_split":0.05, # Passed as test_size to train_test_split
                "random_state":1, # Passed as random_state to train_test_split
                "alpha":0.02, # Smoothing alpha for the Naive Bayes model; 0.02 is current best
                "num_skills":50, # Number of top-weighted features considered per job when listing skills
                "max_number_records":2500, # Cap on records kept per job title; SMOTE is used to fill small classes
}

In [5]:
# # Best parameters for skills

# parameters = {
#                 "doc_type":"indeed_resume", # Use "indeed_resume" or "indeed_postings"
#                 "min_salary_year":2014, # Filter out all salaries older than specified number
#                 "min_job_year":2008, # Filter out all resume jobs older than specified number
#                 "min_salary_records":100, # Filter out all jobs with less than specified salary records
#                 "min_job_summaries":1000, # Filter out all jobs with less than specified job summaries
#                 "min_ngram":3, # For TD-IDF vectorizer
#                 "max_ngram":3, # For TD-IDF vectorizer
#                 "min_df":5, # For TD-IDF vectorizer, ignore features in less than this number of documents
#                 "train_test_split":0.05, # For train-test split
#                 "random_state":1, # For train-test split
#                 "alpha":0.02, # For Naive Bayes model 0.02 is current best
#                 "num_skills":100, # Number of skill to show per job 
#                 "max_number_records":5000, # Number of records in each class, SMOTE is used to fill small classes
# }

# Load Salary Datasets

In [6]:
# Load both salary datasets
salary1 = pd.read_csv(directory+'02_salaries_h1b.csv')
salary2 = pd.read_csv(directory+'02_salaries_greencard.csv')

# Bring both datasets onto one shared column layout. The .copy() calls make the selections
# independent frames, so relabelling their columns never touches the raw frames.
salary_columns = ['original_role','city','state','start_year','cleaned_job_title','experiences','salary']
temp_salary1 = salary1[['role','city','state','start_year','cleaned_job_title','experiences','salary']].copy()
temp_salary2 = salary2[['job_title','city','state','decision_year','cleaned_job_title','experiences','salary_amount']].copy()
temp_salary1.columns = salary_columns
temp_salary2.columns = salary_columns

# Combine salary datasets. pd.concat replaces DataFrame.append, which no longer exists in
# current pandas; like append it keeps the original row labels of both inputs.
combined_salaries = pd.concat([temp_salary1, temp_salary2])

# Remove salaries with null value and convert to int
combined_salaries = combined_salaries[~combined_salaries.salary.isnull()].copy()
combined_salaries['salary'] = combined_salaries.salary.astype(int)

# Fill any NaN fields with no_value and convert each column into a list.
# `experiences` is a comma-separated string that can hold several qualifiers per row;
# every other column is wrapped in a one-element list so itertools.product can expand rows.
combined_salaries = combined_salaries.fillna('no_value')
combined_salaries['experiences'] = combined_salaries.experiences.apply(
    lambda x: [item.strip() for item in x.split(',')])
for single_value_column in ['original_role','city','state','start_year','cleaned_job_title','salary']:
    combined_salaries[single_value_column] = combined_salaries[single_value_column].apply(lambda x: [x])

# Perform a pivot on the columns to split out rows with multiple experience level qualifiers:
# the cartesian product of each row's lists yields one output row per experience value.
combined_salaries = pd.DataFrame([j for i in combined_salaries.values for j in product(*i)],
                                      columns = combined_salaries.columns)

# Only keep salary records from `min_salary_year` onwards
combined_salaries = combined_salaries[combined_salaries.start_year >= parameters['min_salary_year']]

# Get List of Jobs

In [7]:
# Count salary records per job title and keep the titles that reach `min_salary_records`
temp = combined_salaries.groupby('cleaned_job_title').salary.count().reset_index()
has_enough_salary_records = temp.salary >= parameters['min_salary_records']
jobs_to_model = temp[has_enough_salary_records]

# Restrict the salary table itself to those titles
title_is_modeled = combined_salaries.cleaned_job_title.fillna('').isin(jobs_to_model.cleaned_job_title)
combined_salaries = combined_salaries[title_is_modeled]
print("Number of jobs with "+str(parameters['min_salary_records'])+"+ salary records:", jobs_to_model.cleaned_job_title.count())

Number of jobs with 100+ salary records: 732


# Load Job Summaries

In [8]:
# Load the job summaries for the configured document type.
# Later cells read the free text from a `summary_text` column.
if parameters['doc_type'] == 'indeed_resume':
    # Resume work-history rows keep their text in `descript`; rename it to the shared name.
    # Reassigning (instead of inplace=True) keeps the cell safe to re-run.
    data = pd.read_csv(directory+'02_resumes_work.csv')
    data = data.rename(columns = {'descript':'summary_text'})
elif parameters['doc_type'] == 'indeed_postings':
    # presumably this file already carries a `summary_text` column — TODO confirm
    data = pd.read_csv(directory+'02_job_posts_indeed.csv')
else:
    # Fail here with a clear message rather than with a NameError on `data` in the next cell
    raise ValueError('Unknown doc_type ' + repr(parameters['doc_type'])
                     + '; use "indeed_resume" or "indeed_postings"')

In [12]:
# Remove all null cleaned_job_title records
jobs_descriptions = data[~data.cleaned_job_title.isnull()]

# Keep only jobs whose from_year is `min_job_year` or later
jobs_descriptions = jobs_descriptions[jobs_descriptions.from_year >= parameters['min_job_year']]

# Drop insignificant job names
jobs_to_remove = ['technical','team','test','project']
jobs_descriptions = jobs_descriptions[~jobs_descriptions.cleaned_job_title.isin(jobs_to_remove)]

# Filter to only jobs specified by the jobs_to_model list
jobs_descriptions = jobs_descriptions[jobs_descriptions.cleaned_job_title\
                                      .isin(jobs_to_model.cleaned_job_title)]

# Keep only job titles with `min_job_summaries` or more non-null summaries.
# The comparison is inclusive (>=) so it matches the "N+" wording printed below and the
# inclusive threshold used for `min_salary_records`.
summaries_per_title = jobs_descriptions.groupby('cleaned_job_title').summary_text.count()
cnt_resumes_available = list(
    summaries_per_title[summaries_per_title >= parameters['min_job_summaries']].index)
jobs_descriptions = jobs_descriptions[jobs_descriptions.cleaned_job_title\
                       .isin(cnt_resumes_available)]

# Remove duplicate data: one row per (title, text, year), keeping the first resume_id of
# each combination. Note that the per-title counts above were taken before this step.
jobs_descriptions = jobs_descriptions.groupby(['cleaned_job_title','summary_text','from_year'])\
        .resume_id.first().reset_index()

print("Number of jobs with "+str(parameters['min_job_summaries'])+"+ resume entries:", len(cnt_resumes_available))
print("Number of resume entries available:", jobs_descriptions.cleaned_job_title.count())

Number of jobs with 1000+ resume entries: 114
Number of resume entries available: 497928


In [13]:
# This code caps the number of records per job title to remove excessive numbers
def _cap_group_records(group, max_records):
    """Return at most `max_records` rows of one job title, newest `from_year` first.

    The candidate rows depend on how many rows the title has in specific years:
      * 2016 alone has `max_records` or more rows  -> only rows with from_year < 2017;
      * 2016 and 2017 together reach `max_records` -> only rows with from_year < 2018;
      * otherwise                                  -> every row of the title.

    Args:
        group: DataFrame holding the rows of a single job title (needs a `from_year` column).
        max_records: maximum number of rows to return.

    Returns:
        DataFrame with the selected rows, sorted by `from_year` descending.
    """
    in_2016 = group.from_year == 2016
    in_2016_or_2017 = in_2016 | (group.from_year == 2017)

    if in_2016.sum() >= max_records:
        candidates = group[group.from_year < 2017]
    elif in_2016_or_2017.sum() >= max_records:
        candidates = group[group.from_year < 2018]
    else:
        candidates = group
    return candidates.sort_values(by='from_year', ascending=False).head(max_records)


# Collect the capped groups first and concatenate once; growing a DataFrame with pd.concat
# inside the loop re-copies all previously collected rows on every iteration.
sampled_groups = [_cap_group_records(group, parameters['max_number_records'])
                  for name, group in jobs_descriptions.groupby('cleaned_job_title')]

# pd.concat rejects an empty list, so fall back to an empty frame that keeps the columns
new_job_descriptions = pd.concat(sampled_groups) if sampled_groups else jobs_descriptions.iloc[0:0]

new_job_descriptions.count()

cleaned_job_title    234714
summary_text         234714
from_year            234714
resume_id            234714
dtype: int64

# Save Data for Viz

In [14]:
# Persist the distinct resume ids behind the sampled records (resume documents only)
if parameters['doc_type'] == 'indeed_resume':
    relevant_resume_ids = pd.DataFrame(new_job_descriptions.resume_id.unique())
    relevant_resume_ids.to_csv(directory+'03_relevant_resume_ids.csv',index=False)

# Persist the distinct job titles that remain after sampling, under a named column
distinct_titles = new_job_descriptions.cleaned_job_title.unique()
relevant_job_titles = pd.DataFrame(distinct_titles)
relevant_job_titles.columns = ['cleaned_job_title']
relevant_job_titles.to_csv(directory+'03_relevant_job_titles.csv',index=False)

In [16]:
# This code saves the cleaned salary information back to the main data folder.
# At this point combined_salaries has already been reduced (in the "Get List of Jobs" cell)
# to the job titles listed in jobs_to_model.
combined_salaries.to_csv(directory+'03_cleaned_salaries_for_app.csv',index=False)

# Data Preprocess

In [17]:
# Model inputs: the summary text is the feature, the cleaned job title is the class label
x_data = new_job_descriptions.summary_text
y_labels = new_job_descriptions.cleaned_job_title

In [18]:
# Split the data into test and train datasets.
# test_size and random_state both come from `parameters`, so the split is repeatable.
# NOTE(review): the "Testing Models" cell below reassigns x_data and repeats this split, so
# the content of X_train/X_test depends on which of the two cells ran last.
X_train, X_test, y_train, y_test = train_test_split(x_data, 
                                                    y_labels,
                                                    test_size=parameters['train_test_split'],
                                                    random_state=parameters['random_state'])

# Testing Models

In [19]:
print("Start:", datetime.datetime.now())

# Labels are the cleaned job titles; features are the summaries cleaned by the project
# helper preprocess_list (imported from functions.word_preprocessing)
y_labels = new_job_descriptions.cleaned_job_title
x_data = preprocess_list(new_job_descriptions.summary_text)
print("Done with preprocess")

# Split the data into test and train datasets
X_train, X_test, y_train, y_test = train_test_split(x_data, 
                                                    y_labels,
                                                    test_size=parameters['train_test_split'],
                                                    random_state=parameters['random_state'])

# Train TF-IDF vectorizer model on the training split only; fit_transform learns the
# vocabulary and vectorizes in a single pass over the text
vect = TfidfVectorizer(min_df=parameters['min_df'], 
                       ngram_range=(parameters['min_ngram'], parameters['max_ngram'])
                      )
X_train_vectorized = vect.fit_transform(X_train)

print("Done with TD-IDF")

# vocabulary_ has one entry per learned feature and is available in every scikit-learn
# release, unlike get_feature_names(), which newer releases removed
print('Vocabulary len:', len(vect.vocabulary_))

# Oversample the training matrix with SMOTE. The default SMOTE() is the former
# kind='regular' variant (newer imbalanced-learn releases reject the `kind` argument), and
# the seed makes the synthetic samples repeatable.
sm = SMOTE(random_state=parameters['random_state'])
# fit_resample is the current method name; fall back to fit_sample on old releases
resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
X_res, y_res = resample(X_train_vectorized, y_train)

print("End:", datetime.datetime.now())

Start: 2018-07-28 08:13:46.112849
Done with preprocess
Done with TD-IDF
Vocabulary len: 1031373
End: 2018-07-28 08:31:21.942723


In [20]:
# Persist the intermediate artifacts of the "Testing Models" cell so later sessions can skip
# preprocessing, vectorizing and resampling.
# y_res is written as well: the resampled matrix is only usable together with its labels,
# and the commented-out reload code below reads models/dev_y_SMOTE_labels.pkl.
dev_artifacts = [
    ("models/dev_X_test.pkl", X_test),
    ("models/dev_y_test.pkl", y_test),
    ("models/dev_x_SMOTE_data.pkl", X_res),
    ("models/dev_y_SMOTE_labels.pkl", y_res),
    ("models/vect.pkl", vect),
]
for relative_path, artifact in dev_artifacts:
    # `with` closes the file even when pickling raises part-way through
    with open(directory+relative_path, "wb") as pickling_on:
        pickle.dump(artifact, pickling_on)


# pickling_on = open(directory+"models/dev_X_test.pkl","rb")
# X_test = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/dev_X_test.pkl","rb")
# X_test = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/dev_y_test.pkl","rb")
# y_test = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/dev_x_SMOTE_data.pkl","rb")
# X_res = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/dev_y_SMOTE_labels.pkl","rb")
# y_res = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/vect.pkl","wb")
# pickle.dump(vect, pickling_on)
# pickling_on.close()


In [21]:
# Multinomial Naive Bayes baseline trained on the SMOTE-resampled TF-IDF matrix.
# alpha is read from `parameters` (currently 0.02) so this baseline and the pipeline built
# in the "Setup Pipeline" cell always use the same smoothing value.
model = MultinomialNB(alpha=parameters['alpha'])
model.fit(X_res, y_res)

# The held-out text is vectorized with the vectorizer that was fitted on the training split
y_pred = model.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

Accuracy: 48.90%


In [32]:
print("Start:", datetime.datetime.now())

from sklearn.linear_model import SGDClassifier
# Linear SVM (hinge loss) with an L2 penalty, trained by stochastic gradient descent.
# `n_iter` no longer exists in scikit-learn; max_iter=5 together with tol=None runs the
# same fixed five passes over the training data.
svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=.0012, max_iter=5, tol=None, random_state=42)

svm_model.fit(X_res, y_res)

y_pred = svm_model.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

print("End:", datetime.datetime.now())
# Results noted from earlier runs (the trailing number is presumably the alpha used — confirm):
# Accuracy: 47.06% .001
# Accuracy: 47.03% .0008
# Accuracy: 46.97% .01

Start: 2018-07-28 10:28:49.561347




Accuracy: 46.85%
End: 2018-07-28 10:34:02.868062


In [24]:
print("Start:", datetime.datetime.now())

from sklearn.linear_model import SGDClassifier
# Same linear SVM as the previous experiment but with an L1 penalty.
# `n_iter` no longer exists in scikit-learn; max_iter=5 together with tol=None runs the
# same fixed five passes over the training data.
svm_model = SGDClassifier(loss='hinge', penalty='l1', alpha=.001, max_iter=5, tol=None, random_state=42)

svm_model.fit(X_res, y_res)

y_pred = svm_model.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

print("End:", datetime.datetime.now())
# Result noted from an earlier run (the trailing number is presumably the alpha used — confirm):
# Accuracy: 18.10% .0001

Start: 2018-07-28 09:15:22.899457




Accuracy: 1.12%
End: 2018-07-28 09:27:33.729109


In [None]:
print("Start:", datetime.datetime.now())

# NOTE(review): this cell has no execution count and relies on `layers`, `models`,
# `optimizers` and `xtrain_tfidf_ngram`, none of which are imported or defined in the
# visible notebook — it looks like an unfinished experiment; confirm before running.
def create_model_architecture(input_size):
    """Build and compile a feed-forward network with one hidden layer.

    Args:
        input_size: width of the sparse input vector accepted by the input layer.

    Returns:
        The compiled model object created by `models.Model`.
    """
    # create input layer (declared sparse)
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer: 100 ReLU units
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer: a single sigmoid unit
    # NOTE(review): one sigmoid unit with binary_crossentropy is a two-class setup, while
    # y_res holds job-title labels — looks mismatched, confirm the intended output layer.
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])

classifier.fit(X_res, y_res)

# NOTE(review): predict() output is passed straight to accuracy_score against the
# job-title labels; confirm the two are comparable.
y_pred = classifier.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

print("End:", datetime.datetime.now())

In [None]:
# NOTE(review): this cell has no execution count and relies on `layers`, `models`,
# `optimizers`, `word_index`, `embedding_matrix`, `train_model`, `train_seq_x`, `train_y`
# and `valid_seq_x`, none of which are defined in the visible notebook — confirm before running.
def create_cnn():
    """Build and compile a 1-D convolutional network over fixed word embeddings.

    Layer order: input of length 70 -> embedding (300-dim, weights taken from
    `embedding_matrix`, not trainable) -> spatial dropout -> Convolution1D -> global max
    pooling -> Dense(50) + dropout -> single sigmoid output.

    Returns:
        The compiled model object created by `models.Model`.
    """
    # Add an Input Layer
    input_layer = layers.Input((70, ))

    # Add the word embedding Layer
    embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
    embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

    # Add the convolutional Layer
    conv_layer = layers.Convolution1D(100, 3, activation="relu")(embedding_layer)

    # Add the pooling Layer
    pooling_layer = layers.GlobalMaxPool1D()(conv_layer)

    # Add the output Layers
    output_layer1 = layers.Dense(50, activation="relu")(pooling_layer)
    output_layer1 = layers.Dropout(0.25)(output_layer1)
    output_layer2 = layers.Dense(1, activation="sigmoid")(output_layer1)

    # Compile the model
    model = models.Model(inputs=input_layer, outputs=output_layer2)
    model.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    
    return model

classifier = create_cnn()
accuracy = train_model(classifier, train_seq_x, train_y, valid_seq_x, is_neural_net=True)
# print() call instead of the Python 2 print statement, which is a SyntaxError in Python 3
print("CNN, Word Embeddings",  accuracy)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# create a count vectorizer object; the token pattern keeps every run of word characters,
# including single-character tokens
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vectorized_X_train = count_vect.fit_transform(X_train)

# Oversample the count matrix with SMOTE. The default SMOTE() is the former kind='regular'
# variant (newer imbalanced-learn releases reject the `kind` argument), and the seed makes
# the synthetic samples repeatable.
sm = SMOTE(random_state=parameters['random_state'])
# fit_resample is the current method name; fall back to fit_sample on old releases
resample = getattr(sm, 'fit_resample', None) or sm.fit_sample
count_vect_X_res, count_vect_y_res = resample(count_vectorized_X_train, y_train)

# transform the training and validation data using count vectorizer object
# xtrain_count =  count_vect.transform(train_x)
count_vectorized_X_test =  count_vect.transform(X_test)

In [None]:
# NOTE(review): `train_model`, `ensemble`, `xtrain_count`, `xvalid_count`, `xtrain_tfidf`,
# `xvalid_tfidf` and `train_y` are not defined in the visible notebook — confirm before running.
# The print statements were Python 2 syntax (a SyntaxError in Python 3); they are now
# print() calls, which produce the same text.

# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)

In [11]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier

# clf1 = LogisticRegression(random_state=1)
# clf2 = RandomForestClassifier(random_state=1)
# clf3 = MultinomialNB(alpha=parameters['alpha'])

# eclf1 = VotingClassifier(estimators=[
#             ('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='hard')
# eclf2 = VotingClassifier(estimators=[
#             ('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='soft')
# eclf3 = VotingClassifier(estimators=[
#             ('lr', clf1), ('rf', clf2), ('nb', clf3)], voting='soft', weights=[1,1,2], flatten_transform=True)

In [None]:
# Time one fit/score round for clf1.
# The cell previously fitted clf2 and then predicted with clf1, so it scored a model that
# this cell never trained; fit and predict now use the same estimator.
# NOTE(review): clf1 is only defined in the commented-out cell above — uncomment it first.
print("Start:", datetime.datetime.now())
clf1.fit(X_res, y_res)
y_pred = clf1.predict(vect.transform(X_test))
print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
print("End:", datetime.datetime.now())

Start: 2018-07-26 11:32:46.196836


In [None]:
# Time one fit/score round for clf2 on the resampled training matrix
print("Start:", datetime.datetime.now())
clf2.fit(X_res, y_res)
X_test_vectorized = vect.transform(X_test)
y_pred = clf2.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % test_accuracy)
print("End:", datetime.datetime.now())

In [None]:
# Time one fit/score round for clf3 on the resampled training matrix
print("Start:", datetime.datetime.now())
clf3.fit(X_res, y_res)
X_test_vectorized = vect.transform(X_test)
y_pred = clf3.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % test_accuracy)
print("End:", datetime.datetime.now())

In [None]:
# Time one fit/score round for the eclf1 ensemble on the resampled training matrix
print("Start:", datetime.datetime.now())
eclf1.fit(X_res, y_res)
X_test_vectorized = vect.transform(X_test)
y_pred = eclf1.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % test_accuracy)
print("End:", datetime.datetime.now())

In [None]:
# Time one fit/score round for the eclf2 ensemble on the resampled training matrix
print("Start:", datetime.datetime.now())
eclf2.fit(X_res, y_res)
X_test_vectorized = vect.transform(X_test)
y_pred = eclf2.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % test_accuracy)
print("End:", datetime.datetime.now())

In [None]:
# Time one fit/score round for the eclf3 ensemble on the resampled training matrix
print("Start:", datetime.datetime.now())
eclf3.fit(X_res, y_res)
X_test_vectorized = vect.transform(X_test)
y_pred = eclf3.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % test_accuracy)
print("End:", datetime.datetime.now())

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
import matplotlib.pyplot as plt

# `from sklearn.pipeline import Pipeline` was dropped from this cell: it rebound `Pipeline`
# to the scikit-learn class and so replaced the imblearn Pipeline imported at the top,
# which the "Setup Pipeline" cell needs for its SMOTE step.

# X_res is the sparse TF-IDF matrix returned by SMOTE; TruncatedSVD works on sparse input
# directly. Only the first two components are plotted, so only two are computed.
svd = TruncatedSVD(n_components=2, random_state=parameters['random_state']).fit(X_res)
data2D = svd.transform(X_res)

# Colour every point by its class. y_res is the label array that lines up row by row with
# X_res; `data` is the raw CSV frame, whose rows do not correspond to the rows of X_res.
class_codes = pd.factorize(y_res)[0]
plt.scatter(data2D[:,0], data2D[:,1], c=class_codes)
plt.show()              #not required if using ipython notebook

In [None]:
# Draw the confusion matrix of the latest y_pred against y_test as an annotated heatmap.
# NOTE(review): `sns` and `category_id_df` are not imported or defined anywhere in the
# visible notebook, so this cell cannot run as written. The tick labels need to be the class
# names in the row/column order of conf_mat — confirm the intended source.
from sklearn.metrics import confusion_matrix
conf_mat = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(conf_mat, annot=True, fmt='d',
            xticklabels=category_id_df.Product.values, yticklabels=category_id_df.Product.values)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Setup Pipeline

In [57]:
# Pipeline stages: raw text -> cleaned text -> TF-IDF features -> SMOTE oversampling -> Naive Bayes.
# `Pipeline` must be the imblearn class imported at the top of the notebook, because the
# SMOTE step is a sampler rather than a transformer.

# validate=False hands the text to preprocess_list unchanged instead of first running
# array validation on it
preprocess = FunctionTransformer(preprocess_list, validate=False)
tfidf = TfidfVectorizer(min_df=parameters['min_df'], ngram_range=(parameters['min_ngram'], parameters['max_ngram']))
# Seeded so that repeated fits generate the same synthetic samples
smt = SMOTE(random_state=parameters['random_state'])
nb = MultinomialNB(alpha=parameters['alpha'])

pipeline = Pipeline([('preprocess', preprocess), ('tfidf', tfidf), ('smt', smt), ('nb', nb)])

# Train Model

In [58]:
# Report the size of the training split, then time the full pipeline fit
for split_label, split_values in (('X_train:', X_train), ('y_train:', y_train)):
    print(split_label, len(split_values))

print("Start:", datetime.datetime.now())
pipeline.fit(X_train,y_train)
print("End:", datetime.datetime.now())

X_train: 459707
y_train: 459707
Start: 2018-07-24 23:43:16.778521
End: 2018-07-25 00:07:21.414853


In [7]:
# Score the fitted pipeline on the held-out split
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred) * 100
print('Accuracy: %.2f%%' % test_accuracy)

# Accuracy: 44.81% for doc similarity
# Accuracy: 35.53% for skills

Accuracy: 35.53%


In [60]:
# Macro-averaged F1, precision and recall of the latest predictions
for metric_label, metric_fn in (('f1_score: ', f1_score),
                                ('precision_score: ', precision_score),
                                ('recall_score: ', recall_score)):
    print(metric_label, metric_fn(y_test, y_pred, average="macro"))

f1_score:  0.323935852893
precision_score:  0.336582884615
recall_score:  0.332000061005


In [61]:
# Per-class precision / recall / F-score / support on the test split.
# labels=pipeline.classes_ pins the order of the returned arrays to the order of the class
# names they are zipped with; without it the arrays follow only the labels that occur in
# y_test or y_pred, so an absent class would shift every following row.
precision, recall, fscore, support = score(y_test, y_pred, labels=pipeline.classes_)

metrics = pd.DataFrame(list(zip(pipeline.classes_, precision, recall, fscore, support)),
                       columns=['class','precision', 'recall', 'fscore', 'support'])

# Show the five classes with the lowest F-score
metrics.sort_values(by='fscore',ascending=True).head()

Unnamed: 0,class,precision,recall,fscore,support
129,specialist,0.071429,0.030769,0.043011,65
87,operations analyst,0.078947,0.040541,0.053571,74
31,ceo,0.085714,0.044118,0.058252,68
23,business development,0.067416,0.073171,0.070175,82
12,assistant,0.190476,0.054422,0.084656,147


In [62]:
# import collections
# counted = collections.Counter(y_test)

# from operator import itemgetter
# import heapq
# import collections
# def least_common_values(array, to_find=None):
#     counter = collections.Counter(array)
#     if to_find is None:
#         return sorted(counter.items(), key=itemgetter(1), reverse=False)
#     return heapq.nsmallest(to_find, counter.items(), key=itemgetter(1))

# # counted.most_common()
# least_common_values(counted, 50)

In [63]:
# From the test population, list the (actual, predicted) label pairs that are confused most often

# pd.options.display.max_rows = 10000
actual_vs_predicted = list(zip(y_test, y_pred))
predictions = pd.DataFrame(actual_vs_predicted)
predictions.columns=['actual','prediction']
predictions['count']=1

# One row per (actual, prediction) pair with the number of test records in it
pred_group = predictions.groupby(['actual','prediction']).count().reset_index()

# Keep misclassifications only; pairs predicted as 'account executive' are left out
is_misclassified = pred_group.actual != pred_group.prediction
not_account_executive = pred_group.prediction!='account executive'
pred_group[is_misclassified & not_account_executive].sort_values(by='count',ascending=False).head(20)
# .to_csv('most_confusion.csv')

Unnamed: 0,actual,prediction,count
117,accountant,staff accountant,61
2369,human resources specialist,human resources manager,61
4811,staff accountant,accountant,53
3225,process engineer,manufacturing engineer,51
4946,systems administrator,network administrator,48
2620,j2ee engineer,java software engineer,47
2636,java software engineer,j2ee engineer,42
3032,network administrator,systems administrator,42
1304,controller,accounting manager,40
5096,teacher,preschool teacher,38


# List Most Relevant Skills

In [8]:
# This code takes the parameters['num_skills'] highest-weighted features of one class and
# prints a de-duplicated selection of them. A phrase is kept only when at least two of its
# words are missing from every phrase accepted so far (and from the class name itself),
# which cuts down on near-repetitions; single words are therefore never shown.

label_id = 93

print(pipeline.classes_[label_id])
print('-------')

nb_step = pipeline.named_steps['nb']
tfidf_step = pipeline.named_steps['tfidf']

# Per-feature log probabilities of the chosen class. feature_log_prob_ is what the removed
# `coef_` attribute exposed for a multi-class MultinomialNB.
class_feature_weights = nb_step.feature_log_prob_[label_id]

# Feature names in column order, rebuilt from vocabulary_ (term -> column index); this
# works in every scikit-learn release, unlike the removed get_feature_names().
feature_names = sorted(tfidf_step.vocabulary_, key=tfidf_step.vocabulary_.get)

# Highest-weighted features, ascending by weight
topn_class1 = sorted(zip(class_feature_weights, feature_names))[-parameters['num_skills']:]
features_list = [feat for coef, feat in topn_class1]

# Seed the accepted list with the class name so phrases that merely repeat it are rejected
accepted_skill_list = [pipeline.classes_[label_id]]
# Longest phrases first, so a long phrase takes precedence over shorter overlapping ones
for potential_skill in sorted(features_list, key=lambda x: -len(x.split())):
    skill_words = potential_skill.split()
    # Smallest number of words not covered by any single accepted phrase
    fewest_new_words = len(skill_words)
    for accepted_skill in accepted_skill_list:
        leftovers = set(skill_words) - set(accepted_skill.split())
        fewest_new_words = min(fewest_new_words, len(leftovers))
    if fewest_new_words > 1:
        accepted_skill_list.append(potential_skill)

# Drop the seed entry (the class name) and present the skills in random order
accepted_skill_list = accepted_skill_list[1:]
shuffle(accepted_skill_list)

for skill in accepted_skill_list:
    print(skill)

product manager
-------
define product requirement
io android apps
market research competitive
within first month
life cycle management
work closely engineering
successful product launch
responsible product management
key performance indicator
quality assurance team
product development process
product strategy vision
work crossfunctional team
agile scrum team
create product roadmap
product road map
people get job
conduct competitive analysis
manage entire product
increase market share
agile software development
lead product team
minimum viable product
business case development
team engineer designer
internal external customer
support sale team
prioritize product backlog
user experience design
story acceptance criteria
content management system
lead cross functional
writing user story
new product service
using agile methodology
user acceptance testing
improve customer experience
serve product owner
go market strategy
new business opportunity
subject matter expert


# Document Similarity Score

In [65]:
# Show one test example with its true label, then the 20 most probable job titles for it

example_index = 40
example_rows = slice(example_index, example_index+1)
print(y_test[example_rows])

print()
print("---------------------")
example = X_test[example_rows]
print(example)

print()
print("---------------------")

# Pair every class with its predicted probability and list the highest-ranked ones
class_probabilities = pipeline.predict_proba(example)[0]
job_rankings = list(zip(class_probabilities, pipeline.classes_))
sorted(job_rankings,reverse=True)[:20]

299648    consultant
Name: cleaned_job_title, dtype: object

---------------------
299648    .as part of academically sponsored team-based project .• Conducted industry research and competitive benchmarking in order to develop pricing strategy .• Determined feasibility of program based on financial analysis of Bay Cove Academy.
Name: summary_text, dtype: object

---------------------


[(0.22726698136641141, 'financial analyst'),
 (0.10846793866030462, 'marketing specialist'),
 (0.090911912318914545, 'chief financial officer'),
 (0.076218383979244672, 'business development manager'),
 (0.047572097864376342, 'analyst'),
 (0.037165532420553309, 'technical writer'),
 (0.033379704417007695, 'consultant'),
 (0.024299218120848249, 'business manager'),
 (0.021285235634691892, 'product marketing manager'),
 (0.018763913424894845, 'program manager'),
 (0.014960565077392386, 'accounting'),
 (0.013389055843904769, 'marketing manager'),
 (0.013364387274254676, 'product manager'),
 (0.009982822959913918, 'business development'),
 (0.0098005846809103812, 'business consultant'),
 (0.0083345510721833452, 'sales manager'),
 (0.0077160757680939246, 'operations manager'),
 (0.007528294906168775, 'buyer'),
 (0.0072209841150210619, 'operations analyst'),
 (0.0067130493753592216, 'creative director')]

# Save New Model

In [66]:
# This code saves the model to the models folder
directory = '/mnt/disks/mnt_dir/'

# Timestamp prefix shared by every file of this save, e.g. 20180725002822539809.
# strftime always writes the six microsecond digits, whereas str(datetime) omits them when
# they are zero, so the prefix keeps a fixed width; it also needs no `re` import.
save_time = datetime.datetime.now().strftime('%Y%m%d%H%M%S%f')
print(save_time)

# Record the parameters this model was built with, one key=value pair per line
with open(directory+"models/" + save_time + '_parameters.txt','w') as write_param:
    for key in parameters:
        write_param.write(key + "=" + str(parameters[key]) + '\n')

# Pickle the x data, the y labels and the fitted pipeline (NB model) side by side.
# `with` closes each file even when pickling raises.
model_artifacts = (
    ("_x_data.pkl", x_data),
    ("_y_labels.pkl", y_labels),
    ("_pipeline_model.pkl", pipeline),
)
for file_suffix, artifact in model_artifacts:
    with open(directory+"models/"+save_time+file_suffix,"wb") as pickling_on:
        pickle.dump(artifact, pickling_on)

20180725002822539809


# Load Model

In [5]:
# This code loads an old model
directory = '/mnt/disks/mnt_dir/'

# save_time = '20180724220628349336' # Currently best model for doc similarity
save_time = '20180725002822539809' # Currently best model for skills

# NOTE: pickle.load can execute code embedded in the file, so only load pickles that were
# written by the "Save New Model" cell of this notebook.
# `with` closes each file even when unpickling raises.
with open(directory+"models/"+save_time+"_x_data.pkl","rb") as pickling_on:
    x_data = pickle.load(pickling_on)

with open(directory+"models/"+save_time+"_y_labels.pkl","rb") as pickling_on:
    y_labels = pickle.load(pickling_on)

with open(directory+"models/"+save_time+"_pipeline_model.pkl","rb") as pickling_on:
    pipeline = pickle.load(pickling_on)

# End