In [1]:
# Create a Naive Bayes classifier using a TF-IDF vectorizer and pickle the results. This model can be 
# trained on job resumes or job postings. Steps include:
# 1. Set the model parameters
# 2. Get a combined list of salaries and only use job titles with enough salary records
#    (threshold: `min_salary_records` in the parameters)
# 3. Get a list of resumes using this list of job titles and remove any job titles with 
#    too few resumes (threshold: `min_job_summaries` in the parameters)
# 4. Run the TF-IDF vectorizer and Naive Bayes model training
# 5. Test the model using "List Most Relevant Skills"
# 6. Test the model using "Document Similarity Score"


# We are only looking at the most recent 5 years of salary
# We are only looking at the most recent 10 years of work start date

In [2]:
import datetime
import math
import os
import pickle
import re
from itertools import product
from random import shuffle

import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import FunctionTransformer


# Custom function in functions folder (kept last so its star-imported names win, as before)
from functions.word_preprocessing import *

pd.set_option('display.max_colwidth', 500)

In [3]:
# Root data folder. Every later cell builds paths as `directory + filename`, so the value
# must end with a path separator.
# The original local path remains the default; set the CAPSTONE_DATA_DIR environment
# variable to run against another location without editing the notebook.
_DEFAULT_DATA_DIR = '/Users/kwheatley/Desktop/Capstone/gcloud_data/'
# os.path.join(path, '') appends a trailing separator only when one is missing.
directory = os.path.join(os.environ.get('CAPSTONE_DATA_DIR', _DEFAULT_DATA_DIR), '')
# directory = '/mnt/disks/mnt_dir/data/'

# Model Parameters

In [11]:
# Best parameters for doc similarity
# Single place for every tunable used by the cells below; the whole mapping is also
# written next to the saved model so a run can be traced back to its settings.
parameters = dict(
    doc_type="resume",         # Corpus to train on: "resume" or "postings"
    min_salary_year=2014,      # Salary records with a start year before this are dropped
    min_job_year=2008,         # Job summaries with a start year before this are dropped
    min_salary_records=100,    # A job title needs at least this many salary records
    min_job_summaries=1000,    # A job title needs this many job summaries to be modelled
    min_ngram=1,               # Lower bound of the TF-IDF n-gram range
    max_ngram=3,               # Upper bound of the TF-IDF n-gram range
    min_df=5,                  # TF-IDF ignores features found in fewer documents than this
    train_test_split=0.05,     # Passed as `test_size` to the train/test split
    random_state=1,            # Seed for the train/test split
    alpha=0.02,                # Naive Bayes smoothing; 0.02 is the current best
    num_skills=50,             # Number of top features to pull per job
    max_number_records=2500,   # Cap on records kept per class; SMOTE fills the small classes
)

In [12]:
# # Best parameters for skills

# parameters = {
#                 "doc_type":"resume", # Use "resume" or "postings"
#                 "min_salary_year":2014, # Filter out all salaries older than specified number
#                 "min_job_year":2008, # Filter out all resume jobs older than specified number
#                 "min_salary_records":100, # Filter out all jobs with less than specified salary records
#                 "min_job_summaries":1000, # Filter out all jobs with less than specified job summaries
#                 "min_ngram":3, # For TD-IDF vectorizer
#                 "max_ngram":3, # For TD-IDF vectorizer
#                 "min_df":5, # For TD-IDF vectorizer, ignore features in less than this number of documents
#                 "train_test_split":0.05, # For train-test split
#                 "random_state":1, # For train-test split
#                 "alpha":0.02, # For Naive Bayes model 0.02 is current best
#                 "num_skills":100, # Number of skill to show per job 
#                 "max_number_records":5000, # Number of records in each class, SMOTE is used to fill small classes
# }

# Load Salary Datasets

In [6]:
# Load both salary datasets
salary1 = pd.read_csv(directory+'02_salaries_h1b.csv')
salary2 = pd.read_csv(directory+'02_salaries_greencard.csv')

# Combine salary datasets under one shared set of column names.
# `rename` returns new frames, so the loaded frames are left untouched.
temp_salary1 = salary1[['role','city','state','start_year','cleaned_job_title','experiences','salary']]\
    .rename(columns={'role':'original_role'})
temp_salary2 = salary2[['job_title','city','state','decision_year','cleaned_job_title','experiences','salary_amount']]\
    .rename(columns={'job_title':'original_role','decision_year':'start_year','salary_amount':'salary'})
# pd.concat replaces DataFrame.append, which was deprecated and then removed in pandas 2.0.
combined_salaries = pd.concat([temp_salary1, temp_salary2])

# Remove salaries with null value and convert to int.
# .copy() makes the filtered frame independent so the column assignments below are unambiguous.
combined_salaries = combined_salaries[combined_salaries.salary.notnull()].copy()
combined_salaries['salary'] = combined_salaries.salary.astype(int)

# Fill any NaN fields with no_value and convert each column into a list
combined_salaries = combined_salaries.fillna('no_value')
# `experiences` is a comma separated string: split it into one list item per qualifier
combined_salaries['experiences'] = combined_salaries.experiences.apply(
    lambda x: [item.strip() for item in x.split(',')])
# Every other column becomes a one-item list so itertools.product can expand the row
for column in ['original_role','city','state','start_year','cleaned_job_title','salary']:
    combined_salaries[column] = combined_salaries[column].apply(lambda x: [x])

# Perform a pivot on the columns to split out rows with multiple experience level qualifiers:
# the product of a row's lists yields one output row per experience qualifier.
combined_salaries = pd.DataFrame([j for i in combined_salaries.values for j in product(*i)],
                                      columns = combined_salaries.columns)

# Only keep salaries from `min_salary_year` onwards.
# The fillna above turns a missing year into the string 'no_value'; comparing that with an int
# raises TypeError, so the comparison runs on a numeric copy where such values become NaN
# (and are therefore dropped). The stored start_year values themselves are not modified.
start_year_numeric = pd.to_numeric(combined_salaries.start_year, errors='coerce')
combined_salaries = combined_salaries[start_year_numeric >= parameters['min_salary_year']]

# Get List of Jobs

In [7]:
# Count salary records per job title and keep the titles that reach `min_salary_records`
min_salary_records = parameters['min_salary_records']
temp = combined_salaries.groupby('cleaned_job_title')['salary'].count().reset_index()
jobs_to_model = temp.loc[temp['salary'] >= min_salary_records]

# Restrict the salary table itself to the titles that will be modelled
title_is_modelled = combined_salaries['cleaned_job_title'].fillna('')\
                        .isin(jobs_to_model['cleaned_job_title'])
combined_salaries = combined_salaries[title_is_modelled]

number_of_jobs = jobs_to_model['cleaned_job_title'].count()
print("Number of jobs with "+str(min_salary_records)+"+ salary records:", number_of_jobs)

Number of jobs with 100+ salary records: 732


# Load Job Summaries

In [8]:
# Load the job summaries for the configured document type
if parameters['doc_type'] == 'resume':
    # The resume file stores its text in `descript`; later cells read `summary_text`.
    # `rename` without inplace returns the renamed frame directly.
    data = pd.read_csv(directory+'02_resumes_work.csv')\
             .rename(columns = {'descript':'summary_text'})
elif parameters['doc_type'] == 'postings':
    data = pd.read_csv(directory+'02_job_posts.csv')
else:
    # Fail here instead of leaving `data` undefined (or silently reusing a stale `data`
    # left in the kernel by an earlier run).
    raise ValueError("parameters['doc_type'] must be 'resume' or 'postings', got %r"
                     % (parameters['doc_type'],))

In [9]:
# Remove all null cleaned_job_title records
jobs_descriptions = data[~data.cleaned_job_title.isnull()]

# Remove all jobs that started before `min_job_year`
jobs_descriptions = jobs_descriptions[jobs_descriptions.from_year >= parameters['min_job_year']]

# Drop insignificant job names
jobs_to_remove = ['technical','team','test','project']
jobs_descriptions = jobs_descriptions[~jobs_descriptions.cleaned_job_title.isin(jobs_to_remove)] 

# Filter to only jobs specified by the jobs_to_model list
jobs_descriptions = jobs_descriptions[jobs_descriptions.cleaned_job_title\
                                      .isin(jobs_to_model.cleaned_job_title)]
    
# Remove all jobs without `min_job_summaries` or more resume entries.
# The comparison is `>=` so that a title with exactly `min_job_summaries` entries is kept,
# matching the parameter description and the "N+" wording printed below (it used to be `>`).
# Note: the count is of non-null summary_text values and is taken before de-duplication.
cnt_resumes_available = jobs_descriptions.groupby('cleaned_job_title')\
                                .count().reset_index()
cnt_resumes_available = list(cnt_resumes_available[
            cnt_resumes_available.summary_text>=parameters['min_job_summaries']].cleaned_job_title)
jobs_descriptions = jobs_descriptions[jobs_descriptions.cleaned_job_title\
                       .isin(cnt_resumes_available)]

# Remove duplicate data: one row per (title, text, start year), keeping the first resume_id.
# Only these three key columns plus resume_id survive this step.
jobs_descriptions = jobs_descriptions.groupby(['cleaned_job_title','summary_text','from_year'])\
        .resume_id.first().reset_index()

print("Number of jobs with "+str(parameters['min_job_summaries'])+"+ resume entries:", len(cnt_resumes_available))
print("Number of resume entries available:", jobs_descriptions.cleaned_job_title.count())

Number of jobs with 1000+ resume entries: 114
Number of resume entries available: 497928


In [10]:
# This code caps the number of records per job title at `max_number_records`,
# preferring the most recent start years.
max_number_records = parameters['max_number_records']

# Collect one capped frame per job title and concatenate once at the end; calling
# pd.concat inside the loop re-copies everything gathered so far on every iteration.
sampled_groups = []
for job_title, group in jobs_descriptions.groupby('cleaned_job_title'):
    count_2016 = (group.from_year == 2016).sum()
    count_2016_2017 = ((group.from_year == 2016) | (group.from_year == 2017)).sum()
    if count_2016 >= max_number_records:
        # 2016 alone fills the quota: ignore anything starting in 2017 or later
        candidates = group[group.from_year < 2017]
    elif count_2016_2017 >= max_number_records:
        # 2016 and 2017 together fill the quota: ignore anything starting in 2018 or later
        candidates = group[group.from_year < 2018]
    else:
        candidates = group
    sampled_groups.append(candidates.sort_values(by='from_year', ascending=False)
                                    .head(max_number_records))

# pd.concat raises on an empty list, so fall back to an empty frame when there are no groups
new_job_descriptions = pd.concat(sampled_groups) if sampled_groups else pd.DataFrame()

new_job_descriptions.count()

cleaned_job_title    342934
summary_text         342934
from_year            342934
resume_id            342934
dtype: int64

# Save Data for Viz

In [11]:
# Export the ids of the resumes that made it into the modelling set (resume runs only)
if parameters['doc_type'] == 'resume':
    used_resume_ids = pd.DataFrame(new_job_descriptions['resume_id'].unique())
    used_resume_ids.to_csv(directory+'03_relevant_resume_ids.csv', index=False)

# Export the distinct job titles that made it into the modelling set
relevant_job_titles = pd.DataFrame(new_job_descriptions['cleaned_job_title'].unique(),
                                   columns=['cleaned_job_title'])
relevant_job_titles.to_csv(directory+'03_relevant_job_titles.csv', index=False)

In [12]:
# Write the filtered salary table back to the main data folder for the app to use
cleaned_salaries_path = directory+'03_cleaned_salaries_for_app.csv'
combined_salaries.to_csv(cleaned_salaries_path, index=False)

# Data Preprocess

In [13]:
# Model inputs are the summary texts; the labels are their cleaned job titles
x_data = new_job_descriptions['summary_text']
y_labels = new_job_descriptions['cleaned_job_title']

In [14]:
# Hold out a test set; the fraction and the seed both come from the model parameters
split_options = {
    'test_size': parameters['train_test_split'],
    'random_state': parameters['random_state'],
}
X_train, X_test, y_train, y_test = train_test_split(x_data, y_labels, **split_options)

In [15]:
# Report the size of each side of the split (labels kept exactly as printed before)
for split_label, split_part in (("X_train:", X_train),
                                ("y_train: ", y_train),
                                ("X_test: ", X_test),
                                ("y_test: ", y_test)):
    print(split_label, len(split_part))

X_train: 325787
y_train:  325787
X_test:  17147
y_test:  17147


In [16]:
# Number of distinct job titles, i.e. the number of classes the model will learn
new_job_descriptions['cleaned_job_title'].nunique()

114

In [17]:
# for i in new_job_descriptions.cleaned_job_title.unique():
#     print(i)

# Model Development

In [18]:
# from sklearn.feature_extraction.text import CountVectorizer

# print("Start:", datetime.datetime.now())

# x_data = new_job_descriptions.summary_text
# y_labels = new_job_descriptions.cleaned_job_title

# x_data = preprocess_list(new_job_descriptions.summary_text)
# print("Done with preprocess")

# # create a count vectorizer object 
# count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
# count_vectorized_X_train = count_vect.fit_transform(X_train)

# sm = SMOTE(kind='regular')
# count_vect_X_res, count_vect_y_res = sm.fit_sample(count_vectorized_X_train, y_train)

# # transform the training and validation data using count vectorizer object
# # xtrain_count =  count_vect.transform(train_x)
# count_vectorized_X_test =  count_vect.transform(X_test)

# print("End:", datetime.datetime.now())

In [19]:
# print("Start:", datetime.datetime.now())

# x_data = new_job_descriptions.summary_text
# y_labels = new_job_descriptions.cleaned_job_title

# x_data = preprocess_list(new_job_descriptions.summary_text)
# print("Done with preprocess")

# # Split the data into test and train datasets
# X_train, X_test, y_train, y_test = train_test_split(x_data, 
#                                                     y_labels,
#                                                     test_size=parameters['train_test_split'],
#                                                     random_state=parameters['random_state'])

# # Train TF-IDF vectorizer model
# vect = TfidfVectorizer(min_df=parameters['min_df'], 
#                        ngram_range=(parameters['min_ngram'], parameters['max_ngram'])
#                       ).fit(X_train)
# X_train_vectorized = vect.transform(X_train)

# print("Done with TD-IDF")

# print('Vocabulary len:', len(vect.get_feature_names()))

# sm = SMOTE(kind='regular')
# X_res, y_res = sm.fit_sample(X_train_vectorized, y_train)

# print("End:", datetime.datetime.now())

In [20]:
# # pickling_on = open(directory+"models/dev_X_test.pkl","wb")
# # pickle.dump(X_test, pickling_on)
# # pickling_on.close()

# # pickling_on = open(directory+"models/dev_y_test.pkl","wb")
# # pickle.dump(y_test, pickling_on)
# # pickling_on.close()

# # pickling_on = open(directory+"models/dev_x_SMOTE_data.pkl","wb")
# # pickle.dump(X_res, pickling_on)
# # pickling_on.close()

# # pickling_on = open(directory+"models/dev_y_SMOTE_data.pkl","wb")
# # pickle.dump(y_res, pickling_on)
# # pickling_on.close()

# # pickling_on = open(directory+"models/vect.pkl","wb")
# # pickle.dump(vect, pickling_on)
# # pickling_on.close()


# pickling_on = open(directory+"models/dev_X_test.pkl","rb")
# X_test = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/dev_y_test.pkl","rb")
# y_test = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/dev_x_SMOTE_data.pkl","rb")
# X_res = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/dev_y_SMOTE_data.pkl","rb")
# y_res = pickle.load(pickling_on)
# pickling_on.close()

# pickling_on = open(directory+"models/vect.pkl","rb")
# vect = pickle.load(pickling_on)
# pickling_on.close()

In [21]:
# nb_model = MultinomialNB(alpha=0.02)
# nb_model.fit(X_res, y_res)

# y_pred = nb_model.predict(vect.transform(X_test))
# print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

# # Accuracy: 35.83% .012

In [22]:
# print("Start:", datetime.datetime.now())

# from sklearn.linear_model import SGDClassifier
# svm_model = SGDClassifier(loss='hinge', penalty='l2', alpha=.0008, n_iter=3, random_state=42)

# svm_model.fit(X_res, y_res)

# y_pred = svm_model.predict(vect.transform(X_test))
# print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))

# print("End:", datetime.datetime.now())
# # Accuracy: 47.21% alpha=.0008, n_iter=3
# # Count Vectorizer Accuracy: 40.81% .0008 n_iter=3
# # Accuracy: 41.42% .001 5
# # Accuracy: 41.41% .009 5

In [23]:
# from sklearn.ensemble import VotingClassifier

# eclf1 = VotingClassifier(estimators=[
#             ('nb', nb_model), ('svm', svm_model)], voting='hard')
# eclf2 = VotingClassifier(estimators=[
#             ('nb', nb_model), ('svm', svm_model)], voting='soft')
# # eclf3 = VotingClassifier(estimators=[
# #             ('nb', nb_model), ('svm', svm_model)], voting='soft', weights=[1,1,2], flatten_transform=True)

In [24]:
# print("Start:", datetime.datetime.now())
# eclf1.fit(X_res, y_res)
# y_pred = eclf1.predict(vect.transform(X_test))
# print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
# print("End:", datetime.datetime.now())

In [25]:
# print("Start:", datetime.datetime.now())
# eclf2.fit(X_res, y_res)
# y_pred = eclf2.predict(vect.transform(X_test))
# print('Accuracy: %.2f%%' % (accuracy_score(y_test, y_pred) * 100))
# print("End:", datetime.datetime.now())

In [26]:
# # from sklearn.feature_extraction.text import TfidfTransformer
# # from sklearn.decomposition import PCA
# # from sklearn.pipeline import Pipeline
# # import matplotlib.pyplot as plt

# # pca = PCA(n_components=152).fit(X_res)
# # data2D = pca.transform(X_res)
# # plt.scatter(data2D[:,0], data2D[:,1], c=data.target)
# # plt.show()              #not required if using ipython notebook


# # Load libraries
# from sklearn.preprocessing import StandardScaler
# from sklearn.decomposition import TruncatedSVD
# from scipy.sparse import csr_matrix
# from sklearn import datasets
# import numpy as np

# tsvd = TruncatedSVD(n_components=50)

# X_sparse_tsvd = tsvd.fit_transform(X_res)

# # # Show results
# print('Original number of features:', X_res.shape[1])
# print('Reduced number of features:', X_sparse_tsvd.shape[1])
# # Original number of features: 64
# # Reduced number of features: 10
# # View Percent Of Variance Explained By New Features
# # # Sum of first three components' explained variance ratios
# # tsvd.explained_variance_ratio_[0:3].sum()

In [27]:
# dense_matrix = X_res.toarray()

In [28]:
# list_of_remove_index = []
# # filevector = open('docvectors.txt', 'w')
# vector_list = []
# meta_list = []
# meta_list = list(y_res)
# # filemeta = open('docmeta.txt', 'w')
# # edu_filemeta.write("%s\n" % 'title')

# for index in range(X_res.shape[0]):
#     cleaned_vectors = '\t'.join(str(vector) for vector in X_sparse_tsvd[index])
#     if not cleaned_vectors or cleaned_vectors == '':
#         print(index)
#         list_of_remove_index.append(index)
#         continue
#     vector_list.append(cleaned_vectors)
# #     filevector.write("%s\n" % cleaned_vectors)
# #     cleaned_meta = (str(index) + ' ' + ' '.join(traindocs[index].words))
# #     if index >= 94999:
# #         print(index)
# #     if not cleaned_meta:
# #         print("issue with meta")
# #         print(index)
# #     filemeta.write("%s\n" %cleaned_meta )
# #     meta_list.append(cleaned_meta)

In [29]:
# pd.DataFrame(X_sparse_tsvd).to_csv('teest.txt',index=False)

In [30]:
# len(vector_list)

In [31]:
# print(len(vector_list))
# print(X_sparse_tsvd.shape)
# print(len(meta_list))

In [32]:
# # len(vector_list)
# # len(meta_list)
# filevector = open('docvectors.txt', 'w')
# filemeta = open('docmeta.txt', 'w')
# for row in vector_list:
#     filevector.write("%s\n" % row)
# for row in meta_list:
#     filemeta.write("%s\n" % row)

In [33]:
# import seaborn as sns

# from sklearn.metrics import confusion_matrix
# conf_mat = confusion_matrix(y_test, y_pred)
# fig, ax = plt.subplots(figsize=(50,50))
# sns.heatmap(conf_mat, annot=True, fmt='d', cmap='Greys',
#             xticklabels=nb_model.classes_, yticklabels=nb_model.classes_)
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()

# Setup Pipeline

In [34]:
# Text clean-up step. validate=False because the inputs are raw strings, not a numeric array.
# (This step used to be defined twice in this cell; only the later definition took effect.)
preprocess = FunctionTransformer(preprocess_list, validate=False)
tfidf = TfidfVectorizer(min_df=parameters['min_df'], ngram_range=(parameters['min_ngram'], parameters['max_ngram']))
# Seed SMOTE from the same parameter as the train/test split so that the synthetic
# samples, and therefore the trained model, are reproducible between runs.
smt = SMOTE(random_state=parameters['random_state'])
nb = MultinomialNB(alpha=parameters['alpha'])

# imblearn's Pipeline is used (rather than sklearn's) because it accepts a sampler step
pipeline = Pipeline([('preprocess', preprocess), ('tfidf', tfidf), ('smt', smt), ('nb', nb)])

# Train Model

In [35]:
# Show the training set size, then fit the whole pipeline and log wall-clock start/end
for train_label, train_part in (('X_train:', X_train), ('y_train:', y_train)):
    print(train_label, len(train_part))

fit_started_at = datetime.datetime.now()
print("Start:", fit_started_at)
pipeline.fit(X_train, y_train)
fit_finished_at = datetime.datetime.now()
print("End:", fit_finished_at)

X_train: 325787
y_train: 325787
Start: 2018-08-05 08:49:20.819808
End: 2018-08-05 09:07:29.022073


In [36]:
# Predict the held-out set and report overall accuracy as a percentage
y_pred = pipeline.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f%%' % (test_accuracy * 100))

# Accuracy: 48.86% for doc similarity
# Accuracy: 39.39% for skills

Accuracy: 39.39%


In [37]:
# Macro-averaged scores: every class counts equally regardless of its support
for metric_label, metric_function in (('f1_score: ', f1_score),
                                      ('precision_score: ', precision_score),
                                      ('recall_score: ', recall_score)):
    print(metric_label, metric_function(y_test, y_pred, average="macro"))

f1_score:  0.356097542179
precision_score:  0.368799372728
recall_score:  0.36099768835


In [38]:
# Per-class precision / recall / F-score / support.
# `labels` pins the order (and length) of the returned arrays to pipeline.classes_, so the
# zip below cannot misalign class names if a class is absent from both y_test and y_pred.
precision, recall, fscore, support = score(y_test, y_pred, labels=pipeline.classes_)

metrics = pd.DataFrame(list(zip(pipeline.classes_, precision, recall, fscore, support)),
                       columns=['class','precision', 'recall', 'fscore', 'support'])

# Pick the five best classes while the scores are still numeric
top_numeric = metrics.sort_values(by='fscore',ascending=False).head(5)

# Display copy with the three score columns rendered as percentages
metrics_samples = top_numeric.copy()
for metric_column in ['precision', 'recall', 'fscore']:
    metrics_samples[metric_column] = top_numeric[metric_column].map('{:.2%}'.format)

# Write the CSV in ascending F-score order. The order is taken from the numeric scores:
# sorting the formatted strings would be lexicographic (e.g. '100.00%' before '79.52%').
ascending_order = top_numeric.sort_values(by='fscore',ascending=True).index
metrics_samples.loc[ascending_order].to_csv('temp.csv')
metrics_samples

Unnamed: 0,class,precision,recall,fscore,support
17,build and release engineer,81.01%,88.28%,84.49%,145
37,devops engineer,83.59%,82.95%,83.27%,258
94,salesforce engineer,80.31%,85.71%,82.93%,119
34,database administrator,80.66%,79.91%,80.28%,214
68,network engineer,81.30%,77.82%,79.52%,257


In [39]:
# import collections
# counted = collections.Counter(y_test)

# from operator import itemgetter
# import heapq
# import collections
# def least_common_values(array, to_find=None):
#     counter = collections.Counter(array)
#     if to_find is None:
#         return sorted(counter.items(), key=itemgetter(1), reverse=False)
#     return heapq.nsmallest(to_find, counter.items(), key=itemgetter(1))

# # counted.most_common()
# least_common_values(counted, 50)

In [40]:
# From the test population, get the most confused labels

# pd.options.display.max_rows = 10000
# One row per test example, then count how often each (actual, prediction) pair occurs
predictions = pd.DataFrame({'actual': list(y_test), 'prediction': list(y_pred)})
predictions['count'] = 1
pred_group = predictions.groupby(['actual','prediction']).count().reset_index()

# Keep only the misclassified pairs; predictions of 'account executive' are excluded
is_misclassified = pred_group['actual'] != pred_group['prediction']
not_account_executive = pred_group['prediction'] != 'account executive'
pred_group[is_misclassified & not_account_executive]\
    .sort_values(by='count',ascending=False).head(10)
# .to_csv('most_confusion.csv')

Unnamed: 0,actual,prediction,count
108,accountant,staff accountant,101
3065,staff accountant,accountant,65
3207,systems engineer,systems administrator,55
1985,network administrator,systems administrator,46
1717,j2ee engineer,java software engineer,41
1727,java software engineer,j2ee engineer,40
2089,process engineer,manufacturing engineer,37
2598,quality assurance engineer,quality assurance analyst,34
3097,systems administrator,network administrator,33
2565,quality assurance analyst,quality assurance tester,32


# List Most Relevant Skills

In [7]:
# This code finds the top parameters['num_skills'] features for one class and then thins them
# out to cut down on repetition: a phrase is kept only if at least two of its distinct words
# are new compared with the job title and with every phrase already kept.

label_id = 51  # index into pipeline.classes_ of the job title to inspect

print(pipeline.classes_[label_id])
print('-------')

nb_step = pipeline.named_steps['nb']
tfidf_step = pipeline.named_steps['tfidf']

# Per-class log probability of every feature. This is the array the old `coef_` attribute
# exposed for models with more than two classes; `coef_` itself was deprecated and later
# removed from MultinomialNB in scikit-learn.
class_feature_scores = nb_step.feature_log_prob_[label_id]

# get_feature_names() was removed from newer scikit-learn in favour of
# get_feature_names_out(); use whichever the installed version provides.
if hasattr(tfidf_step, 'get_feature_names_out'):
    feature_names = tfidf_step.get_feature_names_out()
else:
    feature_names = tfidf_step.get_feature_names()

# Ascending sort by score, so the last `num_skills` entries are the strongest features
topn_class1 = sorted(zip(class_feature_scores, feature_names))[-parameters['num_skills']:]
features_list = [feat for coef, feat in topn_class1]

# Seed the accepted list with the job title so phrases that merely repeat it are rejected
accepted_skill_list = [pipeline.classes_[label_id]]
# Longest phrases first, so a long phrase is accepted before its shorter variants are tested
for potential_skill in sorted(features_list, key=lambda x: -len(x.split())):
    potential_words = set(potential_skill.split())
    # Smallest number of new words this phrase adds relative to any accepted phrase
    highest_match = len(potential_skill.split())
    for accepted_skill in accepted_skill_list:
        leftovers = potential_words - set(accepted_skill.split())
        if len(leftovers) < highest_match:
            highest_match = len(leftovers)
    if highest_match > 1:
        accepted_skill_list.append(potential_skill)
# Drop the seed job title again and randomise the display order
accepted_skill_list = accepted_skill_list[1:]
shuffle(accepted_skill_list)

for index, skill in enumerate(accepted_skill_list):
    print(index, skill)

ios engineer
-------
0 io iphone sdk
1 objective cocoa framework
2 url http itunes
3 adobe cs5 suite
4 available app store
5 ui navigation controller
6 code fix bug
7 storyboards autolayout constraint
8 json parsing include
9 cross functional team
10 mapkit core location
11 work backend team
12 breakpoints lldb statement
13 suite photoshop dreamweaver
14 web service json
15 parse json response
16 call xml json
17 gdb xcode debugging
18 view controller made
19 implement core data
20 tester developer work
21 use xib design
22 sdk mac objective
23 customize table view
24 tool xcode objectivec
25 apple com app
26 app link http
27 checkout update codebase
28 work closely within
29 responsibility work extensively
30 design development work
31 view mixture using
32 xcode interface builder
33 display correct data
34 work git checkout
35 smooth transitioning better
36 apple push notification
37 work uikit framework
38 create model mvc
39 updating model information
40 xcode cocoa touch
41 enviro

# Document Similarity Score

In [17]:
# Show, for one held-out example, its true label, its text, and the 20 job titles the
# model considers most probable.

example_index = 43
example_slice = slice(example_index, example_index + 1)
section_break = ["", "---------------------", ""]

print("ACTUAL LABEL")
print(y_test[example_slice])

for output_line in section_break + ["INPUT"]:
    print(output_line)
example = X_test[example_slice]
print(example)

for output_line in section_break + ["OUTPUT"]:
    print(output_line)

# Pair each class with its predicted probability and list the most probable first
example_probabilities = pipeline.predict_proba(example)[0]
job_rankings = list(zip(example_probabilities, pipeline.classes_))
top_job_rankings = sorted(job_rankings, reverse=True)
top_job_rankings[:20]

ACTUAL LABEL
326718    project manager
Name: cleaned_job_title, dtype: object

---------------------

INPUT
326718    .Produced 7 videos to be used a part of an online interactive trainin .module. "Shortening Management." Heavily involved in all aspects of production. Created light diagrams, shot logs, and initial rough cuts. .Attended overnight shoots and managed team..
Name: summary_text, dtype: object

---------------------

OUTPUT


[(0.33121678375734054, 'designer'),
 (0.14559227791404752, 'art director'),
 (0.10827273831558622, 'creative director'),
 (0.10219457961707287, 'graphic designer'),
 (0.049579636789399292, 'mechanical engineer'),
 (0.044342383494734759, 'production manager'),
 (0.032139331633062838, 'marketing manager'),
 (0.017060862261132688, 'project manager'),
 (0.014791250897224082, 'director'),
 (0.013185577762211452, 'mechanical design engineer'),
 (0.01040982642872439, 'process engineer'),
 (0.009375135533138226, 'software test engineer'),
 (0.0092984499230077504, 'manufacturing engineer'),
 (0.0080556784073791358, 'design engineer'),
 (0.0070383626963855068, 'technical writer'),
 (0.005623314604888875, 'product engineer'),
 (0.0054593857647914235, 'architect'),
 (0.0042924682071691349, 'quality assurance manager'),
 (0.0034467212089058078, 'applications engineer'),
 (0.0032189923779301064, 'engineer')]

# Save New Model

In [43]:
# This code saves the run's parameters and data to the models folder
# directory = '/mnt/disks/mnt_dir/'

# Current timestamp with every non-alphanumeric character stripped; it prefixes all the
# files written for this run so they can be loaded together later.
save_time = re.sub('[^A-Za-z0-9]+', '', str(datetime.datetime.now()))
print(save_time)

# Each file is opened in a `with` block so the handle is closed even if a write fails.

# Save the parameters as one `key=value` line each
with open(directory+"models/" + save_time + '_parameters.txt','w') as write_param:
    for key in parameters:
        write_param.write(key + "=" + str(parameters[key]) + '\n')

# Save x data
with open(directory+"models/"+save_time+"_x_data.pkl","wb") as pickling_on:
    pickle.dump(x_data, pickling_on)

# Save y labels
with open(directory+"models/"+save_time+"_y_labels.pkl","wb") as pickling_on:
    pickle.dump(y_labels, pickling_on)

# # Save NB model
# pickling_on = open(directory+"models/"+save_time+"_pipeline_model.pkl","wb")
# pickle.dump(pipeline, pickling_on)
# pickling_on.close()

20180805090815480402


In [44]:
class MacOSFile(object):
    """File wrapper that splits very large reads and writes into smaller chunks.

    Presumably a workaround for macOS failing on single reads/writes of 2 GiB or more
    (inferred from the class name -- confirm). Any attribute not defined here is
    forwarded to the wrapped file object.
    """

    # Largest number of bytes passed to one underlying read()/write() call.
    # The original expression was `1 << 31 - 1`, which Python parses as `1 << (31 - 1)`,
    # i.e. 1 GiB. That effective value is kept so chunk boundaries are unchanged.
    MAX_CHUNK_BYTES = 1 << 30
    # Reads of at least this many bytes are split into chunks.
    CHUNKED_READ_THRESHOLD = 1 << 31

    def __init__(self, f):
        """Wrap `f`, an already opened binary file object."""
        self.f = f

    def __getattr__(self, item):
        """Forward every attribute not defined on the wrapper to the wrapped file."""
        # Only reached when normal lookup fails. If `f` itself is missing (e.g. the
        # instance was created without __init__), forwarding would recurse forever.
        if item == 'f':
            raise AttributeError(item)
        return getattr(self.f, item)

    def read(self, n):
        """Read up to `n` bytes, chunking requests at or above CHUNKED_READ_THRESHOLD.

        Returns whatever the wrapped file returns for small requests, and a bytearray
        for chunked ones. A chunked read that hits end-of-file returns only the bytes
        actually read (previously the result was padded with zero bytes).
        """
        if n >= self.CHUNKED_READ_THRESHOLD:
            buffer = bytearray()
            while len(buffer) < n:
                chunk = self.f.read(min(n - len(buffer), self.MAX_CHUNK_BYTES))
                if not chunk:
                    # End of file reached before `n` bytes were available
                    break
                buffer += chunk
            return buffer
        return self.f.read(n)

    def write(self, buffer):
        """Write `buffer` in chunks of at most MAX_CHUNK_BYTES, printing progress.

        Returns the number of bytes written, like a regular binary file's write().
        """
        n = len(buffer)
        print("writing total_bytes=%s..." % n, flush=True)
        idx = 0
        while idx < n:
            batch_size = min(n - idx, self.MAX_CHUNK_BYTES)
            print("writing bytes [%s, %s)... " % (idx, idx + batch_size), end="", flush=True)
            self.f.write(buffer[idx:idx + batch_size])
            print("done.", flush=True)
            idx += batch_size
        return n


def pickle_dump(obj, file_path):
    """Pickle `obj` to `file_path` with the highest protocol, writing through MacOSFile.

    The file is opened in binary write mode and closed when the dump finishes or fails.
    Returns the result of `pickle.dump`.
    """
    with open(file_path, "wb") as raw_handle:
        chunked_handle = MacOSFile(raw_handle)
        result = pickle.dump(obj, chunked_handle, protocol=pickle.HIGHEST_PROTOCOL)
        return result

In [45]:
# Persist the fitted pipeline next to the other files of this run (same save_time prefix)
pipeline_model_path = directory+"models/"+save_time+"_pipeline_model.pkl"
pickle_dump(pipeline, pipeline_model_path)

writing total_bytes=1721537373...
writing bytes [0, 1073741824)... done.
writing bytes [1073741824, 1721537373)... done.


# Load Model

In [9]:
# This code loads an old model. It REPLACES x_data, y_labels and pipeline in the kernel
# with the saved versions.
# directory = '/mnt/disks/mnt_dir/'

save_time = '20180805074152591471' # Currently best model for doc similarity
# save_time = '20180805090815480402' # Currently best model for skills

# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# Each file is opened in a `with` block so the handle is closed even if unpickling fails.
with open(directory+"models/"+save_time+"_x_data.pkl","rb") as pickling_on:
    x_data = pickle.load(pickling_on)

with open(directory+"models/"+save_time+"_y_labels.pkl","rb") as pickling_on:
    y_labels = pickle.load(pickling_on)

with open(directory+"models/"+save_time+"_pipeline_model.pkl","rb") as pickling_on:
    pipeline = pickle.load(pickling_on)

# End