The code below uses a pre-trained experience embedding model to find the best matches for every position (also called a requisition, or req for short).

The match is based on degree level, years of experience, and experience matching.

# Imports

In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from gensim.parsing.preprocessing import remove_stopwords
from gensim.models import FastText # using FastText and not word2vec as it can handle out-of-vocabulary terms

In [3]:
# maximum number of top-ranked candidates kept for each req
max_matches = 10

# Load data

In [4]:
# Candidate data: contacts, education records and work experience.
# In every file the first column is used as the index.
contacts = pd.read_csv('Contacts.csv', index_col=[0])
edu = pd.read_csv('Education.csv', index_col=[0])
exp = pd.read_csv(
    'Experience.csv',
    index_col=[0],
    parse_dates=[1, 2],  # columns at positions 1 and 2 are parsed as dates
)

# Open requisitions are read from the 'ReqDetails' sheet of the workbook
open_reqs = pd.read_excel(
    'Open Req Data.xlsx',
    index_col=[0],
    sheet_name='ReqDetails',
)

In [5]:
# req titles are stored in a separate file; its first column is used as the index
old_req_titles = pd.read_csv('Old RUS req titles.csv', index_col=[0])

# Create a combined Candidate record dataframe
* Calculate max degree
* Calculate Years of experience
* Concat all experiences together

In [6]:
# group contact experience by Contact Id and calculate years of experience & the list of job titles
gb = exp.reset_index().groupby(['Contact Id'])
exp_summary = gb.agg({'Job Start Date':'min', 'Job End Date':'max', 'Job Title': list})

# Years of experience = time between the earliest job start and the latest job end.
# pandas 2.x rejects the ambiguous np.timedelta64(1, 'Y') unit, so divide by an explicit
# fixed-length year instead (365.2425 days is the same length numpy used for 'Y').
career_span = exp_summary['Job End Date'] - exp_summary['Job Start Date']
exp_summary['Years Of Experience'] = career_span / pd.Timedelta(days=365.2425)

# keep only the summary columns; rename by name rather than by position so the
# result does not depend on column order
exp_summary = (
    exp_summary
    .drop(columns=['Job Start Date', 'Job End Date'])
    .rename(columns={'Job Title': 'All Job Titles'})
)

In [7]:
bachelor_terms = ['bsee','bsce','bscs','bachelor','bachelors','btech','bs','bsc','ba','bcom','bca','baccalaureate','beng','bachiller','undergraduate']
masters_terms = ['msee','msce','mscs','master','masters','mba','ms','msc','mtech','mca','postgraduate','ma','magister','meng']
phd_terms = ['doctor','doctorate','phd']

# Terms at least this long may also match as a word prefix (e.g. 'doctor' -> 'doctoral').
# Shorter abbreviations ('ba', 'ms', ...) must match a whole word, otherwise they fire
# inside unrelated words such as 'database', 'diploma' or 'systems'.
_MIN_PREFIX_TERM_LENGTH = 6


def _has_degree_term(tokens, terms):
    """Return True if any word in `tokens` matches one of the degree `terms`."""
    for term in terms:
        allow_prefix = len(term) >= _MIN_PREFIX_TERM_LENGTH
        for token in tokens:
            if token == term or (allow_prefix and token.startswith(term)):
                return True
    return False


def get_degree(text):
    """Map a free-text degree description to a numeric degree level.

    Parameters
    ----------
    text : str
        Degree description, e.g. "Bachelor of Computer Science" or "M.Sc.".
        Non-string values (None, NaN) are treated as unknown.

    Returns
    -------
    int
        3 for a doctorate, 2 for a masters, 1 for a bachelor, 0 when no known
        degree term is found. When several levels are mentioned the highest wins.
    """
    if not isinstance(text, str):
        return 0

    # drop periods so dotted abbreviations collapse ("B.Sc." -> "bsc", "Ph.D" -> "phd"),
    # then split on anything that is not a letter or digit
    normalized = text.lower().replace('.', '')
    tokens = ''.join(ch if ch.isalnum() else ' ' for ch in normalized).split()

    # check the highest level first so a text that mentions several degrees gets the top one
    for level, terms in ((3, phd_terms), (2, masters_terms), (1, bachelor_terms)):
        if _has_degree_term(tokens, terms):
            return level
    return 0

In [8]:
# Treat a missing degree description as empty text so get_degree can process every row.
# The result is assigned back to the column: an inplace fillna on a column selection
# is a chained assignment and may leave `edu` unchanged in recent pandas versions.
edu['Degree Level'] = edu['Degree Level'].fillna('')
edu['Degree Level Int'] = edu['Degree Level'].apply(get_degree)
edu.head()

Unnamed: 0_level_0,Degree Level,Discipline Category,Major,Grad Year,Graduated,Degree Level Int
Contact Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
41967680,,,,4031,Y,0
41962093,Bachelor of Computer Science,Computer Science/Software,Software Engineering,2021,Y,1
41961851,,,British and American Studies,2018,Y,0
41961851,Bachelor's,,English & Spanish languages,2016,Y,1
41961851,Bachelor's,,Finance and Credit,2017,Y,1


In [9]:
# highest degree level found for each contact
edu_summary = (
    edu.reset_index()
    .groupby('Contact Id')
    .agg({'Degree Level Int': 'max'})
)

# Attach the education and experience summaries to the main contact df.
# Both are inner joins on the index, so only contacts present in all three frames remain.
contacts_with_edu = contacts.merge(edu_summary, how='inner', left_index=True, right_index=True)
full_contacts = contacts_with_edu.merge(exp_summary, how='inner', left_index=True, right_index=True)

# Prepare data

In [10]:
def clean_text_column(col):
    """Normalise a text column into lowercase, letters-only, single-spaced strings.

    Parameters
    ----------
    col : pandas.Series
        Values of any type; each one is converted to its string form first.

    Returns
    -------
    pandas.Series
        Cleaned strings. Missing values and standalone 'nan' words become empty.
    """
    col = col.fillna('').astype(str)                       # missing -> '', everything else to string
    col = col.str.lower()                                  # lowercase
    col = col.str.replace(r"\W", " ", regex=True)          # replace non-alphanumeric characters with a space
    col = col.str.replace(r"\d", " ", regex=True)          # replace digits with a space
    # remove 'nan' only as a whole word, so words that merely contain it
    # ('finance', 'tenant') are left intact
    col = col.str.replace(r"\bnan\b", " ", regex=True)
    col = col.str.replace(r"\s+", " ", regex=True)         # collapse whitespace runs into a single space
    col = col.str.strip()                                  # strip leading/trailing spaces
    return col

## Embedding method

In [11]:
def embed_documents(documents, word_vectors):
    """Embed each document as the mean of its word vectors.

    Parameters
    ----------
    documents : list of list of str, or pandas.Series of str
        Tokenised documents. A Series of strings is split on whitespace first.
    word_vectors : object
        Word-vector lookup: indexing it with a list of tokens must return one
        vector per token, and it must expose `vector_size`.

    Returns
    -------
    numpy.ndarray
        Array of shape (number of documents, vector_size). A document without
        tokens is represented by a zero vector.
    """
    if not isinstance(documents, list):
        # not a list: assume a Series of strings and convert it to a list of token lists
        documents = documents.str.split().to_list()

    if len(documents) == 0:
        # keep the result 2-D so downstream similarity code still gets a matrix
        return np.empty((0, word_vectors.vector_size))

    doc_vectors = []
    for doc in documents:
        if len(doc) > 0:
            doc_vectors.append(word_vectors[doc].mean(axis=0))      # mean of the token embeddings
        else:
            doc_vectors.append(np.zeros(word_vectors.vector_size))  # empty document -> zero vector
    return np.array(doc_vectors)

## Load embedding model

In [12]:
# load the previously saved FastText embedding model from disk (used here for inference only)
loaded_model = FastText.load('FastText_Embedding')

In [13]:
word_vectors = loaded_model.wv # word-vector lookup object of the loaded model
# quick sanity check: similarity between two related job words
word_vectors.similarity("developer", "programmer")

0.44259718

# Full match calculation

* Experience match using the embedding model
* Min degree match
* Years of experience match
* Combined match

## Embed candidate experiences and req titles

In [14]:
# normalise every candidate's 'All Job Titles' text, then drop stop words
contacts_job_titles = clean_text_column(full_contacts['All Job Titles'])
contacts_job_titles = contacts_job_titles.apply(remove_stopwords)

In [15]:
# embed candidate experience: one mean word vector per candidate, built from the cleaned job titles
full_contacts_exp_embedding = embed_documents(contacts_job_titles, word_vectors)

In [16]:
# find open req titles (in the final solution I should already have it from BOBJ feed).
# Only columns that open_reqs does not have yet are merged in, so re-running this cell
# does not create suffixed duplicates such as 'Job Title_x' / 'Job Title_y'.
missing_title_columns = [col for col in old_req_titles.columns if col not in open_reqs.columns]
open_reqs = open_reqs.merge(old_req_titles[missing_title_columns], left_index=True, right_index=True) # bring req titles to open reqs

In [17]:
# clean the open req titles the same way as the candidate titles ...
req_job_titles_clean = clean_text_column(open_reqs['Job Title'])
req_job_titles_clean = req_job_titles_clean.apply(remove_stopwords)

# ... and embed them: one vector per req title
req_titles_embedding = embed_documents(req_job_titles_clean, word_vectors)

## Find for each req candidates who match based on degree and min experience

In [18]:
def find_minimally_qualified_candidates(req_num):
    """Return the candidates that meet a req's minimum degree and experience.

    Parameters
    ----------
    req_num : label
        Index label of the req in `open_reqs`.

    Returns
    -------
    pandas.DataFrame
        Rows of `full_contacts` that satisfy the req. A requirement is applied
        only when its value on the req is greater than 0; when neither applies,
        `full_contacts` itself is returned.

    Raises
    ------
    KeyError
        If `req_num` is not in the `open_reqs` index.
    """
    req_details = open_reqs.loc[req_num]
    minimally_qualified_candidates = full_contacts

    # if we found degree requirements on the req, filter candidates by that requirement
    if req_details.LowestDegree > 0:
        has_degree = minimally_qualified_candidates['Degree Level Int'] >= req_details.LowestDegree
        minimally_qualified_candidates = minimally_qualified_candidates[has_degree]

    # if we found years of experience requirements on the req, filter candidates by that requirement
    if req_details.MinExperience > 0:
        has_experience = minimally_qualified_candidates['Years Of Experience'] >= req_details.MinExperience
        minimally_qualified_candidates = minimally_qualified_candidates[has_experience]

    return minimally_qualified_candidates

## Calculate experience similarity matrix 

Between each open req and each candidate

In [19]:
# cosine similarity between every req title embedding and every candidate experience embedding
similarity = cosine_similarity(req_titles_embedding, full_contacts_exp_embedding) # rows are positions and columns are candidates
# label the matrix so it can be looked up by req number (rows) and contact id (columns)
similarity_df = pd.DataFrame(similarity, index=open_reqs.index, columns=full_contacts.index)

In [20]:
# (number of reqs, number of candidates)
similarity.shape

(65, 5609)

## Find matches for each position

In [21]:
def find_req_matches(req_num, match_count):
    """Return the best experience matches among the candidates qualified for a req.

    Parameters
    ----------
    req_num : label
        Req label, used to look up both the req and its similarity row.
    match_count : int
        Number of top rows to return.

    Returns
    -------
    pandas.DataFrame
        Qualified candidates with a 'Match' column holding their experience
        similarity, sorted from best to worst and cut to `match_count` rows.
    """
    qualified = find_minimally_qualified_candidates(req_num)  # candidates meeting the minimal requirements
    req_similarity = similarity_df.loc[req_num]               # similarity of every candidate to this req

    # join the similarity on the candidate index; it arrives as the last column
    scored = qualified.merge(req_similarity, left_index=True, right_index=True)
    scored.columns = [*scored.columns[:-1], 'Match']

    ranked = scored.sort_values(by='Match', ascending=False)
    return ranked[:match_count]

In [22]:
# collect the top matches of every open req, tagging each result with its req number
top_candidates_for_all_positions = []

for req in open_reqs.index:
    matches = find_req_matches(req, max_matches).assign(**{'Req Number': req})
    top_candidates_for_all_positions.append(matches)

In [23]:
# stack the per-req results into one df and turn the contact index into a 'Contact Id' column
top_candidates_for_all_positions_df = (
    pd.concat(top_candidates_for_all_positions)
    .rename_axis('Contact Id')
    .reset_index()
)

In [24]:
# add position title to the dataframe: each row's 'Req Number' is looked up in the old_req_titles index
top_candidates_for_all_positions_df = top_candidates_for_all_positions_df.merge(old_req_titles, left_on='Req Number', right_index=True)

In [25]:
# keep only the columns needed in the output, in presentation order
output_columns = [
    'Req Number', 'Job Title',
    'Contact Id', 'WD Candidate Id', 'Full Name', 'Email',
    'Degree Level Int', 'All Job Titles', 'Years Of Experience', 'Match',
]
top_candidates_for_all_positions_df = top_candidates_for_all_positions_df[output_columns]

In [26]:
# map degree number to degree level name (0 means no known degree term was found)
degree_mapping = {0:'Unknown', 1:'Bachelor', 2:'Masters', 3:'Doctorate'}
top_candidates_for_all_positions_df['Degree Level Int'] = top_candidates_for_all_positions_df['Degree Level Int'].map(degree_mapping)

In [27]:
# Replace missing WD Candidate Ids with an empty string.
# Assigned back to the column: an inplace fillna on a column selection is a chained
# assignment and may leave the dataframe unchanged in recent pandas versions.
top_candidates_for_all_positions_df['WD Candidate Id'] = top_candidates_for_all_positions_df['WD Candidate Id'].fillna('')

In [28]:
# round the experience and match columns to the number of decimals shown in the report
rounding_decimals = {'Years Of Experience': 1, 'Match': 2}
for column, decimals in rounding_decimals.items():
    top_candidates_for_all_positions_df[column] = top_candidates_for_all_positions_df[column].round(decimals)

In [29]:
# the column now holds degree names rather than numbers, so drop the 'Int' suffix
top_candidates_for_all_positions_df = top_candidates_for_all_positions_df.rename(
    columns={'Degree Level Int': 'Degree Level'}
)

In [30]:
# remove extra spaces in Full Name column (collapse each whitespace run into one space).
# The pattern is a raw string: "\s" in a normal string literal is an invalid escape sequence.
top_candidates_for_all_positions_df['Full Name'] = top_candidates_for_all_positions_df['Full Name'].str.replace(r"\s+", " ", regex=True)

In [31]:
# blank out the personal details of anonymized candidates (WD Candidate Id starting with '0x').
# astype(str) and na=False keep the mask strictly boolean even when an id is missing
# or is not text; a mask containing missing values cannot be used with .loc.
annonimized_mask = top_candidates_for_all_positions_df['WD Candidate Id'].astype(str).str.startswith('0x', na=False)
top_candidates_for_all_positions_df.loc[annonimized_mask, ['Full Name','Email','WD Candidate Id']] = ''

In [32]:
# clean All Job Titles, leaving only the titles separated by commas
def _format_job_titles(titles):
    """Join a candidate's job titles into one comma-separated string, skipping missing entries."""
    if isinstance(titles, (list, tuple)):
        # join the actual values instead of cleaning up the list's text representation:
        # this drops a missing entry wherever it appears (including last or only)
        # and keeps apostrophes and brackets that belong to a title
        return ', '.join(str(title) for title in titles if pd.notna(title))
    # not a list (e.g. the cell was re-run and the value is already text)
    return '' if pd.isna(titles) else str(titles)

top_candidates_for_all_positions_df['All Job Titles'] = top_candidates_for_all_positions_df['All Job Titles'].apply(_format_job_titles)

In [33]:
# preview the first rows, leaving out the personal columns (WD Candidate Id, Full Name, Email)
top_candidates_for_all_positions_df[['Req Number', 'Job Title', 'Contact Id', 'Degree Level', 'All Job Titles', 'Years Of Experience', 'Match']].head()

Unnamed: 0,Req Number,Job Title,Contact Id,Degree Level,All Job Titles,Years Of Experience,Match
0,JR0137595,Deep Learning Software Engineer (Federated Lea...,38519944,Unknown,"Machine learning engineer, Deep Learning Engin...",2.7,0.98
1,JR0137595,Deep Learning Software Engineer (Federated Lea...,37781211,Unknown,"Machine learning engineer, Deep Learning Engin...",2.7,0.98
2,JR0137595,Deep Learning Software Engineer (Federated Lea...,34551721,Unknown,Deep Learning Software Engineering Intern,4.4,0.94
3,JR0137595,Deep Learning Software Engineer (Federated Lea...,33771037,Bachelor,"Machine learning engineer, Machine learning en...",2.8,0.93
4,JR0137595,Deep Learning Software Engineer (Federated Lea...,39178007,Masters,MACHINE LEARNING RESEARCH ENGINEER,7.0,0.89


In [34]:
# write the final match table to an Excel file, without the row index
top_candidates_for_all_positions_df.to_excel('Match Results.xlsx', index=False)