In [13]:
from gensim.models import KeyedVectors, Word2Vec
from gensim.models.fasttext import FastText
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.test.utils import get_tmpfile
from pandas_profiling import ProfileReport
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.preprocessing.text import Tokenizer

from utils.evaluate import cosine_similarity
from utils.load import load_data
from utils.transform import process_text, convert_terms, convert_words_to_vectors

### 1. Load and explore data

In [2]:
# Load the candidate data and take a first look at its structure.
data = load_data(file_name="potential-talents.xlsx", folder_name="data")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line.
data.info()
print()
print(data.describe(), "\n")
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB
None 

               id  fit
count  104.000000  0.0
mean    52.500000  NaN
std     30.166206  NaN
min      1.000000  NaN
25%     26.750000  NaN
50%     52.500000  NaN
75%     78.250000  NaN
max    104.000000  NaN 



Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
# Optional exploratory report (ProfileReport is imported from pandas_profiling); left disabled.
# ProfileReport(data)

The id column is just an index column that would not be relevant to the fitness of any roles.
Although the job_title and location columns are highly correlated, the job_title column seems to be the only relevant column in determining the fitness of a particular role based on the column values and information we have about the requirements.

Therefore, only the job_title column will be used in the ranking procedures. Having said that, the other columns will still be returned in the result so that the user (i.e. the client) can have the full information about each of the relevant candidates.
The fit column will be filled with a fitness score for each row/candidate later.

### 2. Pre-process job titles

Convert human resources-related terms in a way that job titles containing those terms will have better fitness scores. That is, those job titles might end up having a fitness score of 0 without conversion because for instance "HR" and "Human Resources" would be considered to have nothing in common by most algorithms.

In [4]:
# Distinct whitespace-separated tokens found across all job titles.
all_title_text = " ".join(data['job_title'])
job_title_words = list(set(all_title_text.split()))

# Keep only the tokens that contain the abbreviation "HR" (case-sensitive).
hr_words = list(filter(lambda token: "HR" in token, job_title_words))
hr_words

['CHRO,', 'GPHR', 'HR', 'SPHR', 'HRIS']

In [5]:
# Mapping from HR-related abbreviations/terms to their expanded form.
hr_terms_dict = {'CHRO,': 'Chief Human Resources Officer,',
                'GPHR': 'Global Professional in Human Resources',
                'SPHR': 'Senior Professional in Human Resources',
                'HR': 'Human Resources',
                'HRIS': 'Human Resources Information System',
                'People': 'Human'} # this is for titles like 'People Development Coordinator at Ryan'.

# Convert every whitespace-separated word of every job title with convert_terms
# (project helper; presumably a lookup in hr_terms_dict -- confirm in utils.transform)
# and join the results back into one string.
# The whole column is assigned positionally in one go: the previous loop wrote
# through data.loc[i, ...] with an enumerate() counter, which is only correct while
# the index is exactly 0..n-1 and would otherwise append new rows.
data['job_title'] = [
    " ".join(convert_terms(word, hr_terms_dict) for word in job_title.split())
    for job_title in data['job_title']
]

Similar conversions can be done for terms like "staff*", "employ*", but we will leave the decision to domain experts and only convert terms that specifically include "HR" as above.

### 3. Get fitness scores
based on cosine similarity between job titles and keywords using different algorithms

some descriptions about tfidf to be added.

In [6]:
# TF-IDF vectorizer configuration: accent stripping, lowercasing, English stop
# words, and uni- to tri-grams.
tfidf_args = {'strip_accents':'unicode',
              'lowercase':True,
              'stop_words':'english',
              'ngram_range':(1,3)}
tfidf_vectorizer = TfidfVectorizer(**tfidf_args)

# Job titles and keywords must go through the *same* text normalisation,
# otherwise processed title tokens may not match the keyword tokens. The keywords
# previously relied on process_text's defaults, which are not visible here, so the
# flags are now passed explicitly to both.
tfidf_text_args = {'remove_stopwords': True,
                   'lemmatize': True,
                   'stem': True}

job_title_processed_tfidf = data['job_title'].apply(process_text, **tfidf_text_args)

keywords = ["Aspiring human resources", "seeking human resources"]
keywords_processed_tfidf = [process_text(keyword, **tfidf_text_args) for keyword in keywords]

# The vocabulary is learned from the job titles only; keywords are projected onto it.
job_title_tfidf = tfidf_vectorizer.fit_transform(job_title_processed_tfidf)
keywords_tfidf = tfidf_vectorizer.transform(keywords_processed_tfidf)

# Row-wise sum of the similarities (presumably one column per keyword -- confirm
# against utils.evaluate.cosine_similarity).
data['fit_tfidf'] = cosine_similarity(job_title_tfidf, keywords_tfidf).sum(axis=1)

For other vectorizers, only remove stopwords without lemmatization or stemming since stopwords do not add any values/meanings.

In [7]:
# Shared preprocessing flags for the non-TF-IDF vectorizers:
# drop stopwords, but keep the original word forms (no lemmatization/stemming).
plain_text_args = dict(remove_stopwords=True, lemmatize=False, stem=False)

job_title_processed = data['job_title'].apply(process_text, **plain_text_args)
keywords_processed = [process_text(keyword, **plain_text_args) for keyword in keywords]

some descriptions about tensorflow.keras Tokenizer to be added.

In [8]:
tokenizer = Tokenizer()
# fit_on_texts builds the tokenizer's internal vocabulary from the job titles.
tokenizer.fit_on_texts(job_title_processed)

# Encode both sides against that vocabulary (texts_to_matrix with its default mode).
job_title_matrix = tokenizer.texts_to_matrix(job_title_processed)
keywords_matrix = tokenizer.texts_to_matrix(keywords_processed)

data['fit_keras_tokenizer'] = cosine_similarity(job_title_matrix, keywords_matrix).sum(axis=1)

some descriptions about gensim, glove, and word2vec to be added.

In [9]:
# GloVe file source: https://nlp.stanford.edu/projects/glove/
glove_dimension = 50  # dimensionality passed to convert_words_to_vectors (the 50d GloVe file is used)

# Convert the GloVe text file into word2vec format inside a temp file, then load it.
word2vec_file = get_tmpfile('word2vec.6B.50d.txt')
glove2word2vec('data/glove/glove.6B.50d.txt', word2vec_file)
glove_vectors = KeyedVectors.load_word2vec_format(word2vec_file)

# Transform job titles and keywords into GloVe vectors.
glove_vectors_job_title, glove_vectors_keywords = (
    convert_words_to_vectors(texts, glove_vectors, glove_dimension)
    for texts in (job_title_processed, keywords_processed)
)
data['fit_glove'] = cosine_similarity(glove_vectors_job_title, glove_vectors_keywords).sum(axis=1)

In [11]:
# Train Word2Vec on the job titles themselves: one lowercased token list per title.
word2vec_sentences = job_title_processed.str.lower().str.split()
word2vec = Word2Vec(sentences=word2vec_sentences)
word2vec_dimension = word2vec.vector_size

# Transform job titles and keywords into word2vec vectors.
word2vec_job_title, word2vec_keywords = (
    convert_words_to_vectors(texts, word2vec, word2vec_dimension)
    for texts in (job_title_processed, keywords_processed)
)
data['fit_word2vec'] = cosine_similarity(word2vec_job_title, word2vec_keywords).sum(axis=1)

What is the disadvantage of GloVe embedding?
One of the main disadvantages of Word2Vec and GloVe embedding is that they are unable to encode unknown or out-of-vocabulary words. So, to deal with this problem Facebook proposed a model FastText. It is an extension to Word2Vec and follows the same Skip-gram and CBOW model.

In [14]:
# Train FastText on the job titles themselves: one lowercased token list per title.
fasttext_sentences = job_title_processed.str.lower().str.split()
fasttext = FastText(sentences=fasttext_sentences)
fasttext_dimension = fasttext.vector_size

# Transform job titles and keywords into fasttext vectors.
fasttext_job_title, fasttext_keywords = (
    convert_words_to_vectors(texts, fasttext, fasttext_dimension)
    for texts in (job_title_processed, keywords_processed)
)
data['fit_fasttext'] = cosine_similarity(fasttext_job_title, fasttext_keywords).sum(axis=1)

Transform fit scores so that different fit scores will have the same range between 0 and 1.<br>
This is for easier comparisons among different fit scores.

In [None]:
# Rescale every individual score column to [0, 1] so the scores are comparable.
# MinMaxScaler is imported in the notebook's top import cell, so that all
# dependencies are declared in one place.
minmax_scaler = MinMaxScaler()

# Every column whose name contains "fit_" (the plain 'fit' column is not matched).
fit_columns = [col for col in data.columns if "fit_" in col]
data[fit_columns] = minmax_scaler.fit_transform(data[fit_columns])

data.describe()

Unnamed: 0,id,fit,fit_tfidf,fit_keras_tokenizer
count,104.0,0.0,104.0,104.0
mean,52.5,,0.328938,0.541808
std,30.166206,,0.315271,0.359278
min,1.0,,0.0,0.0
25%,26.75,,0.057644,0.106066
50%,52.5,,0.253802,0.642826
75%,78.25,,0.497634,0.8
max,104.0,,1.0,1.0


In [None]:
# Overall fitness score: row-wise sum of the individual score columns.
fit_total = data[fit_columns].sum(axis=1)
data['fit'] = fit_total

In [None]:
# Job titles (with counts) whose overall fitness score is exactly 0.
zero_fit_mask = data['fit'].eq(0)
data.loc[zero_fit_mask, 'job_title'].value_counts()

Series([], Name: job_title, dtype: int64)

In [None]:
# The five candidates with the highest overall fitness score.
by_fit_descending = data.sort_values(by='fit', ascending=False)
by_fit_descending.iloc[:5]

Unnamed: 0,id,job_title,location,connection,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.875922,0.921024,1.0,1.0,0.982373,0.972524
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.875922,0.921024,1.0,1.0,0.982373,0.972524
96,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,4.816638,1.0,1.0,0.924198,0.892439,1.0
45,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0


Looking at the job titles for the candidates who got the highest fitness score, they indeed look very relevant - in fact the job titles include one of the exact keywords "Aspiring Human Resources" in them.

In [None]:
# The five candidates with the lowest overall fitness score.
by_fit_ascending = data.sort_values(by='fit', ascending=True)
by_fit_ascending.iloc[:5]

Unnamed: 0,id,job_title,location,connection,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
84,85,RRP Brand Portfolio Executive at JTI (Japan To...,Greater Philadelphia Area,500+,0.224758,0.0,0.0,0.0,0.083847,0.140912
47,48,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.314628,0.0,0.0,0.11365,0.145094,0.055883
34,35,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.314628,0.0,0.0,0.11365,0.145094,0.055883
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.314628,0.0,0.0,0.11365,0.145094,0.055883
22,23,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,0.314628,0.0,0.0,0.11365,0.145094,0.055883


In [None]:
# Distinct job titles whose overall fitness score is exactly 0.
zero_fit_rows = data.loc[data['fit'].eq(0)]
zero_fit_rows['job_title'].unique()

array([], dtype=object)

In [None]:
# Summary statistics of the numeric columns, including the combined 'fit' score.
data.describe()

Unnamed: 0,id,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
count,104.0,104.0,104.0,104.0,104.0,104.0,104.0
mean,52.5,2.556934,0.328938,0.511697,0.639963,0.559545,0.516792
std,30.166206,1.470973,0.315271,0.34902,0.265058,0.304454,0.310942
min,1.0,0.224758,0.0,0.0,0.0,0.0,0.0
25%,26.75,1.35201,0.057644,0.1,0.473497,0.290395,0.183819
50%,52.5,2.531407,0.253802,0.591047,0.669484,0.632579,0.472658
75%,78.25,3.515162,0.497634,0.725639,0.87098,0.773567,0.768933
max,104.0,4.875922,1.0,1.0,1.0,1.0,1.0


No candidate has an overall fitness score of 0, although the minimum of each of the fit_columns is 0.

Add a filter column 'has_zero_scores' for candidates with at least 1 'zero' fitness score from the fit_columns.

In [None]:
# Flag (1/0) every candidate that has at least one individual score equal to 0.
# Vectorized: the previous row loop passed the iterrows() index *label* to
# data.iloc (a positional indexer), which is only correct on a 0..n-1 index,
# and never used the row it iterated over.
has_zero_scores = data[fit_columns].eq(0).any(axis=1).astype(int)

data['has_zero_scores'] = has_zero_scores

In [None]:
# Distinct job titles of the candidates flagged with at least one zero score.
flagged_rows = data.loc[data['has_zero_scores'].eq(1)]
flagged_rows['job_title'].unique()

array(['Native English Teacher at EPIK (English Program in Korea)',
       'Advisory Board Member at Celal Bayar University',
       'Student at Chapman University',
       'Junior MES Engineer| Information Systems',
       'RRP Brand Portfolio Executive at JTI (Japan Tobacco International)',
       'Information Systems Specialist and Programmer with a love for data and organization.',
       'Bachelor of Science in Biology from Victoria University of Wellington',
       'Undergraduate Research Assistant at Styczynski Lab',
       'Lead Official at Western Illinois University',
       'Admissions Representative at Community medical center long beach',
       'Student at Westfield State University',
       'Student at Indiana University Kokomo - Business Management - Retail Manager at Delphi Hardware and Paint',
       'Student', 'Business Intelligence and Analytics at Travelers',
       'Always set them up for Success',
       'Director Of Administration at Excellence Logging'], dtype=

We can drop these values as they indeed seem irrelevant to our keywords, "Aspiring human resources" and "seeking human resources".

In [None]:
# For every scoring method, list the five job titles it rates lowest.
for fit_col in fit_columns:
    lowest_rows = data.sort_values(fit_col).head()
    worst_titles = lowest_rows.job_title.values
    for i, job_title in enumerate(worst_titles):
        print(f"{fit_col} least fit job title {i+1}: {job_title}")
    print()

fit_tfidf least fit job title 1: Director Of Administration at Excellence Logging
fit_tfidf least fit job title 2: Native English Teacher at EPIK (English Program in Korea)
fit_tfidf least fit job title 3: Bachelor of Science in Biology from Victoria University of Wellington
fit_tfidf least fit job title 4: Student at Chapman University
fit_tfidf least fit job title 5: Advisory Board Member at Celal Bayar University

fit_keras_tokenizer least fit job title 1: Director Of Administration at Excellence Logging
fit_keras_tokenizer least fit job title 2: Native English Teacher at EPIK (English Program in Korea)
fit_keras_tokenizer least fit job title 3: Advisory Board Member at Celal Bayar University
fit_keras_tokenizer least fit job title 4: Native English Teacher at EPIK (English Program in Korea)
fit_keras_tokenizer least fit job title 5: Advisory Board Member at Celal Bayar University

fit_glove least fit job title 1: RRP Brand Portfolio Executive at JTI (Japan Tobacco International)
fi

Looking at the job titles with the worst fitness scores, each evaluation method for fitness seems to perform fine - i.e. none of those 'worst' job titles seem relevant to our keywords. In other words, it would be safe to discard candidates with zero fitness scores.

In [None]:
# Keep only candidates without any zero score, drop the helper flag column,
# and renumber the rows from 0.
keep_mask = data['has_zero_scores'].ne(1)
data_filtered = (
    data.loc[keep_mask]
    .drop(columns='has_zero_scores')
    .reset_index(drop=True)
)

Here are the top 20 'best-fit' candidates, after dropping candidates with at least one zero fitness score.

In [None]:
# Dense rank on the overall score: rank 1 is the best fit, ties share a rank.
fit_rank = data_filtered['fit'].rank(ascending=False, method='dense')
data_filtered['rank'] = fit_rank

# Order by rank, breaking ties by id and then connection.
data_filtered = data_filtered.sort_values(by=['rank', 'id', 'connection'])
data_filtered.iloc[:20]

Unnamed: 0,id,job_title,location,connection,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext,fit_adjusted,rank
21,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.875922,0.921024,1.0,1.0,0.982373,0.972524,4.875922,1.0
23,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.875922,0.921024,1.0,1.0,0.982373,0.972524,4.875922,1.0
1,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0
12,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0
15,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0
25,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0
35,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0
45,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0
74,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0
75,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,4.804741,0.913385,1.0,0.95578,0.982373,0.953203,4.804741,3.0


Star an ideal candidate and re-rank candidates based on the adjusted fitness scores.
* Starring one candidate sets this candidate as an ideal candidate for the given role. Then, we expect the list to be re-ranked each time a candidate is starred.
* For starred candidates, set the adjusted fit score to the maximum value (i.e. 5).

In [None]:
# Score given to a starred candidate: 5, the value the notebook treats as the maximum fit.
STARRED_FIT = 5

# Start from the unadjusted scores every time this cell runs.
data_filtered['fit_adjusted'] = data_filtered['fit']

ideal_candidate = input("Enter an ID of an ideal candidate: ")

# Look the candidate up by the 'id' column. The prompt asks for a candidate ID, but
# the row labels of data_filtered no longer match the ids after filtering and
# reset_index, and .loc with an unknown label would silently append a new row.
is_ideal = data_filtered['id'] == int(ideal_candidate)
if not is_ideal.any():
    raise ValueError(f"No candidate with id {ideal_candidate} is left after filtering.")
data_filtered.loc[is_ideal, 'fit_adjusted'] = STARRED_FIT

# Re-rank on the adjusted score; the remaining sort keys only break ties.
data_filtered['rank_adjusted'] = data_filtered['fit_adjusted'].rank(method='dense', ascending=False)
data_filtered = data_filtered.sort_values(
    ['rank_adjusted', 'rank', 'fit_adjusted', 'fit', 'id', 'connection']
)
data_filtered.head(20)

Unnamed: 0,id,job_title,location,connection,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext,fit_adjusted,rank,rank_adjusted
1,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,5.0,2.0,1.0
3,6,Aspiring Human Resources Specialist,Greater New York City Area,1,4.650038,0.830646,1.0,0.964299,0.894137,0.960955,5.0,4.0,1.0
5,8,Human Resources Senior Specialist,San Francisco Bay Area,500+,3.515162,0.295208,0.8,0.87098,0.744879,0.804095,5.0,10.0,1.0
4,7,Student at Humber College and Aspiring Human R...,Kanada,61,3.127897,0.497634,0.666667,0.669484,0.632579,0.661532,5.0,15.0,1.0
21,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.875922,0.921024,1.0,1.0,0.982373,0.972524,4.875922,1.0,2.0
23,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.875922,0.921024,1.0,1.0,0.982373,0.972524,4.875922,1.0,2.0
12,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0,3.0
15,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0,3.0
25,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0,3.0
35,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.816638,1.0,1.0,0.924198,0.892439,1.0,4.816638,2.0,3.0


##### next steps
Train a ranking model
- training features: all the original columns except for the id.
- target feature: rank
- return rankings.

how would I incorporate the starring and re-ranking operations into the model?

resources:

XGBRanker

https://medium.com/predictly-on-tech/learning-to-rank-using-xgboost-83de0166229d

https://towardsdatascience.com/learning-to-rank-for-product-recommendations-a113221ad8a7



RankNet, LambdaRank, and LambdaMART

My first choice would probably be XGBoost, the extreme gradient boosting algorithm. The benefit here (apart from the fact that it’s nearly always brilliant) is that you can set your distance metrics easily to match those of the RankNet, LambdaRank, and LambdaMART models explained above, by passing in the objective parameter in your param dictionary. Here, 'objective: rank:map' corresponds to RankNet, 'objective: rank:ndcg' corresponds to LambdaRank, and 'objective: rank:pairwise' corresponds to LambdaMART.

insertion sort, merge sort, and quicksort, bubble sort

Learning to rank (LTR) is a method that is used in the construction of classification models for information retrieval systems. The training data consists of lists of articles with an induced partial order that gives a numerical or ordinal score, or a binary judgment for each article. The purpose of the model is to order the elements into new lists according to the scores that take into account the judgments obtained from the articles.