In this notebook, experience embedding model is trained. It will later be used in order to find best matching candidates for every position.

Train/test split, evaluating different models and hyperparameter tuning were done in another notebook, so in this notebook I'm already training the best model.

# Imports

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import FastText # using FastText and not word2vec as it can handle out-of-vocabulary terms

In [16]:
max_matches = 10
emb_dim = 16
context_window = 128
min_term_count = 3
epochs = 150

# Load data

In [2]:
contacts = pd.read_csv('Contacts.csv', index_col=[0])
edu = pd.read_csv('Education.csv', index_col=[0])
exp = pd.read_csv('Experience.csv', index_col=[0], parse_dates = [1,2])
hires = pd.read_csv('Hires.csv', index_col=[1])
old_req_titles = pd.read_csv('Old RUS req titles.csv', index_col=[0])
open_reqs = pd.read_excel('Open Req Data.xlsx', index_col=[0], sheet_name = 'ReqDetails')

# Create a combined Candidate record dataframe
* Calculate max degree
* Calculate Years of experience
* Concat all expereinces together

In [3]:
# group contact experience by contactid and cacl Years of experience & list of job titles
gb = exp.reset_index().groupby(['Contact Id'])
exp_summary = gb.agg({'Job Start Date':'min', 'Job End Date':'max', 'Job Title': list})
exp_summary['Years Of Experience'] = (exp_summary['Job End Date'] - exp_summary['Job Start Date']) / np.timedelta64(1, 'Y')
exp_summary.drop(['Job Start Date', 'Job End Date'], axis=1, inplace=True)
exp_summary.columns = ['All Job Titles','Years Of Experience']

In [4]:
bachelor_terms = ['bsee','bsce','bscs','bachelor','bachelors','btech','bs','bsc','ba','bcom','bca','baccalaureate','beng','bachiller','undergraduate']
masters_terms = ['msee','msce','mscs','master','masters','mba','ms','msc','mtech','mca','postgraduate','ma','magister','meng']
phd_terms = ['doctor','doctorate','phd']
    
def get_degree(text):
    text = text.lower()
    for token in bachelor_terms:
        if token in text:
            return 1
    for token in masters_terms:
         if token in text:
            return 2
    for token in phd_terms:
         if token in text:
            return 3
    return 0

In [5]:
edu['Degree Level'].fillna('', inplace=True)
edu['Degree Level Int'] = edu['Degree Level'].apply(get_degree)
edu.head()

Unnamed: 0_level_0,Degree Level,Discipline Category,Major,Grad Year,Graduated,Degree Level Int
Contact Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
41967680,,,,4031,Y,0
41962093,Bachelor of Computer Science,Computer Science/Software,Software Engineering,2021,Y,1
41961851,,,British and American Studies,2018,Y,0
41961851,Bachelor's,,English & Spanish languages,2016,Y,1
41961851,Bachelor's,,Finance and Credit,2017,Y,1


In [6]:
edu_summary = edu.reset_index().groupby('Contact Id').agg({'Degree Level Int':'max'}) # get max degree for each contact
# merge education and experience summary data into the main contact df
full_contacts = contacts.merge(edu_summary, how='inner', left_index = True, right_index=True).merge(exp_summary, how='inner', left_index = True, right_index=True)

# Prepare data for experience embedding model training

* Combine hire experiences with filled req titles into one dataframe
* Concatenate hire experiences with filled req titles into one column

## Testset

In [7]:
exp_testset = old_req_titles.merge(hires, how='inner', left_index = True, right_index=True) # merge req titles with hire candidate id
# merge contact experience based on candidate id
exp_testset = full_contacts[['WD Candidate Id','All Job Titles']].merge(exp_testset[['Candidate Id','Job Title']], how='inner', left_on = 'WD Candidate Id', right_on = 'Candidate Id')

In [8]:
exp_testset['Combined'] = exp_testset['All Job Titles'].astype(str) + exp_testset['Job Title'].astype(str)

In [9]:
def clean_text_column(col):
    col = col.astype(str)                              # convert to string
    col = col.str.lower()                              # lowercase
    col = col.replace("\W"," ", regex = True)          # remove all non-alphanumeric characters
    col = col.replace("\d"," ", regex = True)          # remove all numeric characters
    col = col.replace("\s+", " ", regex = True)        # replace all whitspace chars with a single space character
    col = col.replace("nan", "", regex = True)         # remove 'nan' entries
    col = col.str.strip()                              # strip training spaces
    return col

In [10]:
# clean all text columns
exp_testset['All Job Titles'] = clean_text_column(exp_testset['All Job Titles'])
exp_testset['Job Title'] = clean_text_column(exp_testset['Job Title'])
exp_testset['Combined'] = clean_text_column(exp_testset['Combined'])

## Trainset

In [11]:
exp_trainset = clean_text_column(full_contacts['All Job Titles']) # get experiences of all candiates and clean them
exp_trainset = exp_trainset[exp_trainset!=''] # remove blanks

## Combine train+test and format them

In [14]:
# combine data used for model training (candidate experience) as well as combined hires experience including their req tile
testset_list_of_lists = exp_testset['Combined'].apply(remove_stopwords).str.split().to_list() # prepare combined hires experience in format required by fasttext
trainset_list_of_lists = exp_trainset.apply(remove_stopwords).str.split().to_list() # prepare candidates experience in format required by fasttext
full_exp_data = trainset_list_of_lists + testset_list_of_lists # combine training data + test data

# Train word embedding model for experience embedding

In [23]:
# train the model
fasttext_model = FastText(sentences=full_exp_data, vector_size=emb_dim, window=context_window, min_count=min_term_count, epochs=epochs)
word_vectors = fasttext_model.wv # get word_vectors object

# Inference on testset

In [36]:
# take a list of documents and word vectors object and return document embedding (as mean word embedding)
def embed_documents(documents, word_vectors):
    if (type(documents)!=list): # if the input is not a list, but a series, we will convert it to list of lists
        documents = documents.str.split().to_list() # convert each document to a list of tokens, so this will return a list of lists
    doc_vectors = []
    for doc in documents:
        if len(doc)>0:
            doc_vectors.append(word_vectors[doc].mean(axis=0)) # embed each document token and save mean embedding
        else:
            doc_vectors.append(np.zeros(word_vectors.vector_size)) # if the document is empty, return an zero vector of an approprizte size
    return np.array(doc_vectors)

In [25]:
# clean prepare columns for inference
exp_testset_req_job_titles_clean = exp_testset['Job Title'].apply(remove_stopwords)
exp_testset_hire_job_titles_clean = exp_testset['All Job Titles'].apply(remove_stopwords)

In [37]:
# validate on hire data (of course, the result will be high due to target leakage)
req_titles_matrix = embed_documents(exp_testset_req_job_titles_clean, word_vectors) # embed hire req title
experience_matrix = embed_documents(exp_testset_hire_job_titles_clean, word_vectors) # embed candidate experience text

In [46]:
sim = cosine_similarity(req_titles_matrix, experience_matrix) # calculate similarity for each row in these matrices
mean_hire_match_topic_model = sim.trace() / len(req_titles_matrix) # caclulate average similarity between relevant items (hire req title and hire previous experience)
mean_match_topic_model = sim.mean()
print(f'Average hire got a match of {mean_hire_match_topic_model:.3f}, while baseline match of a random req/hire combination is {mean_match_topic_model:.3f}')
print(f'lift is {mean_hire_match_topic_model/mean_match_topic_model:.3f}')

Average hire got a match of 0.401, while baseline match of a random req/hire combination is 0.171
lift is 2.340


In [47]:
word_vectors.similarity("developer", "programmer")

0.44259718

## Pickle the model

In [48]:
fasttext_model.save('FastText_Embedding') # save the full model, with dictionary and other attributes

## Test model loading

In [50]:
loaded_model = FastText.load('FastText_Embedding') # load the model later, at inference

In [51]:
loaded_model

<gensim.models.fasttext.FastText at 0x21577b4cc70>

In [52]:
word_vectors = loaded_model.wv # get word_vectors object
word_vectors.similarity("developer", "programmer")

0.44259718

## Some manual testing of matches

In [53]:
term1 = 'Software Validation Engineer (Computer Vision, Integration)'
term2 = 'collecting and processing employee payroll data'
term3 = 'System Administrator, Entrepreneur, Software Engineer, Software Engineer'
term4 = 'Network Engineer, Engineer, Software Engineer'

terms = pd.Series([term1, term2, term3, term4])

In [54]:
terms_embeddings = embed_documents(terms, word_vectors)
cosine_similarity(terms_embeddings, terms_embeddings)

array([[1.0000001 , 0.17078729, 0.5373685 , 0.6881934 ],
       [0.17078729, 0.9999998 , 0.19011727, 0.04270445],
       [0.5373685 , 0.19011727, 0.9999999 , 0.8458737 ],
       [0.6881934 , 0.04270445, 0.8458737 , 0.9999999 ]], dtype=float32)