In [1]:
from utils.load import load_data
from utils.split import split_data
from utils.process_text import process_text, convert_terms, convert_words_to_vectors, get_word_vectors
from utils.predict import get_rank_predictions
from utils.evaluate import cosine_similarity

import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors, Word2Vec
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models.fasttext import FastText
from xgboost import XGBRanker
from lightgbm import LGBMRanker

import warnings
warnings.simplefilter("ignore", UserWarning)

### 1. Load and explore data

In [2]:
data = load_data(file_name="potential-talents.xlsx", folder_name="data")
print(data.info(), "\n")
print(data.describe(), "\n")
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          104 non-null    int64  
 1   job_title   104 non-null    object 
 2   location    104 non-null    object 
 3   connection  104 non-null    object 
 4   fit         0 non-null      float64
dtypes: float64(1), int64(1), object(3)
memory usage: 4.2+ KB
None 

               id  fit
count  104.000000  0.0
mean    52.500000  NaN
std     30.166206  NaN
min      1.000000  NaN
25%     26.750000  NaN
50%     52.500000  NaN
75%     78.250000  NaN
max    104.000000  NaN 



Unnamed: 0,id,job_title,location,connection,fit
0,1,2019 C.T. Bauer College of Business Graduate (...,"Houston, Texas",85,
1,2,Native English Teacher at EPIK (English Progra...,Kanada,500+,
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,
3,4,People Development Coordinator at Ryan,"Denton, Texas",500+,
4,5,Advisory Board Member at Celal Bayar University,"İzmir, Türkiye",500+,


In [3]:
# ProfileReport(data)

The id column is just an index column that would not be relevant to the fitness of any roles.
Although the job_title and location columns are highly correlated, the job_title column seems to be the only relevant column in determining the fitness of a particular role based on the column values and information we have about the requirements.

Therefore, only the job_title column will be used in the ranking procedures. Having said that, the other columns will still be returned in the result so that the user (i.e. the client) can have the full information about each of the relevant candidates.
The fit column will be filled with a fitness score for each row/candidate later.

### 2. Pre-process job titles

Convert human resources-related terms in a way that job titles containing those terms will have better fitness scores. That is, those job titles might end up having a fitness score of 0 without conversion because for instance "HR" and "Human Resources" would be considered to have nothing in common by most algorithms.

In [4]:
job_title_words = list(set(" ".join(data['job_title']).split()))
hr_words = [word for word in job_title_words if "HR" in word]
hr_words

['HR', 'SPHR', 'HRIS', 'CHRO,', 'GPHR']

In [5]:
hr_terms_dict = {'CHRO,': 'Chief Human Resources Officer,',
                'GPHR': 'Global Professional in Human Resources',
                'SPHR': 'Senior Professional in Human Resources',
                'HR': 'Human Resources',
                'HRIS': 'Human Resources Information System',
                'People': 'Human'} # this is for titles like 'People Development Coordinator at Ryan'.

for i, job_title in enumerate(data['job_title']):
    converted = []
    for word in job_title.split():
        converted.append(convert_terms(word, hr_terms_dict))
    data.loc[i, 'job_title'] = " ".join(converted)

Similar conversions can be done for terms like "staff*", "employ*", but we will leave the decision to domain experts and only convert terms that specifically include "HR" as above.

### 3. Get fitness scores
based on cosine similary between job titles and keywords using different algorithms

some descriptions about tfidf to be added.

In [6]:
tfidf_args = {'strip_accents':'unicode',
              'lowercase':True,
              'stop_words':'english',
              'ngram_range':(1,3)}
tfidf_vectorizer = TfidfVectorizer(**tfidf_args)

job_title_processed_tfidf = data['job_title'].apply(
    process_text,
    remove_stopwords=True,
    lemmatize=True,
    stem=True
)

keywords = ["Aspiring human resources", "seeking human resources"]
keywords_processed_tfidf = [process_text(keyword) for keyword in keywords]

data['fit_tfidf'] = cosine_similarity(tfidf_vectorizer.fit_transform(job_title_processed_tfidf),
                                      tfidf_vectorizer.transform(keywords_processed_tfidf)).sum(axis=1)

For other vectorizers, only remove stopwords without lemmatization or stemming since stopwords do not add any values/meanings.

In [7]:
job_title_processed = data['job_title'].apply(
    process_text,
    remove_stopwords=True,
    lemmatize=False,
    stem=False
)
keywords_processed = [process_text(
    keyword,
    remove_stopwords=True,
    lemmatize=False,
    stem=False
    ) for keyword in keywords]

some descriptions about tensorflow.keras Tokenizer to be added.

In [8]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(job_title_processed) # fit_on_texts updates internal vocabulary based on a list of texts; similar to tf-idf.
data['fit_keras_tokenizer'] = cosine_similarity(tokenizer.texts_to_matrix(job_title_processed),
                                                tokenizer.texts_to_matrix(keywords_processed)).sum(axis=1)

some descriptions about gensim, glove, and word2vec to be added.

In [9]:
# glove file source: https://nlp.stanford.edu/projects/glove/
word2vec_file = get_tmpfile('word2vec.6B.50d.txt') # Create a temp file
glove2word2vec('data/glove/glove.6B.50d.txt', word2vec_file) # Save glove2word2vec into the temp file
glove_vectors = KeyedVectors.load_word2vec_format(word2vec_file) # Load the glove2word2vec from the teamp file
glove_dimension = 50

# Transform job titles and keywords into glove vectors
glove_vectors_job_title = convert_words_to_vectors(job_title_processed, glove_vectors, glove_dimension)
glove_vectors_keywords = convert_words_to_vectors(keywords_processed, glove_vectors, glove_dimension)
data['fit_glove'] = cosine_similarity(glove_vectors_job_title, glove_vectors_keywords).sum(axis=1)

In [10]:
word2vec = Word2Vec(sentences=job_title_processed.apply(lambda x: [word.lower() for word in x.split()]))
word2vec_dimension = word2vec.vector_size

# Transform job titles and keywords into word2vec vectors
word2vec_job_title = convert_words_to_vectors(job_title_processed, word2vec, word2vec_dimension)
word2vec_keywords = convert_words_to_vectors(keywords_processed, word2vec, word2vec_dimension)
data['fit_word2vec'] = cosine_similarity(word2vec_job_title, word2vec_keywords).sum(axis=1)

What is the disadvantage of GloVe embedding?
One of the main disadvantages of Word2Vec and GloVe embedding is that they are unable to encode unknown or out-of-vocabulary words. So, to deal with this problem Facebook proposed a model FastText. It is an extension to Word2Vec and follows the same Skip-gram and CBOW model.

some descriptions about fasttext to be added.

In [11]:
fasttext = FastText(sentences=job_title_processed.apply(lambda x: [word.lower() for word in x.split()]))
fasttext_dimension = fasttext.vector_size

# Transform job titles and keywords into fasttext vectors
fasttext_job_title = convert_words_to_vectors(job_title_processed, fasttext, fasttext_dimension)
fasttext_keywords = convert_words_to_vectors(keywords_processed, fasttext, fasttext_dimension)
data['fit_fasttext'] = cosine_similarity(fasttext_job_title, fasttext_keywords).sum(axis=1)

Transform fit scores so that different fit scores will have the same range between 0 and 1.<br>
This is for easier comparisons among different fit scores.

In [12]:
minmax_scaler = MinMaxScaler()
fit_columns = [col for col in data.columns if "fit_" in col]
data[fit_columns] = minmax_scaler.fit_transform(data[fit_columns])

data.describe()

Unnamed: 0,id,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
count,104.0,0.0,104.0,104.0,104.0,104.0,104.0
mean,52.5,,0.328938,0.541808,0.604935,0.594117,0.586781
std,30.166206,,0.315271,0.359278,0.313345,0.322589,0.290476
min,1.0,,0.0,0.0,0.0,0.0,0.0
25%,26.75,,0.057644,0.106066,0.286359,0.282529,0.4224
50%,52.5,,0.253802,0.642826,0.664575,0.720907,0.684321
75%,78.25,,0.497634,0.8,0.864163,0.845231,0.776623
max,104.0,,1.0,1.0,1.0,1.0,1.0


In [13]:
data['fit'] = data[fit_columns].sum(axis=1)

In [14]:
data[data['fit']==0].job_title.value_counts()

Series([], Name: job_title, dtype: int64)

In [15]:
data.sort_values('fit', ascending=False).head()

Unnamed: 0,id,job_title,location,connection,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
27,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.821504,0.921024,1.0,1.0,0.966611,0.933869
29,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.821504,0.921024,1.0,1.0,0.966611,0.933869
98,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,4.75586,0.913385,1.0,0.953443,0.966611,0.922421
32,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589
2,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589


Looking at the job titles for the candidates who got the highest fitness score, they indeed look very relevant - in fact the job titles include one of the exact keywords "Aspiring Human Resources" in them.

In [16]:
data.sort_values('fit', ascending=True).head()

Unnamed: 0,id,job_title,location,connection,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
94,95,Student at Westfield State University,"Bridgewater, Massachusetts",57,0.243059,0.0,0.0,0.11799,0.125069,0.0
53,54,Student at Chapman University,"Lake Forest, California",2,0.288568,0.0,0.0,0.032275,0.125069,0.131224
62,63,Student at Chapman University,"Lake Forest, California",2,0.288568,0.0,0.0,0.032275,0.125069,0.131224
40,41,Student at Chapman University,"Lake Forest, California",2,0.288568,0.0,0.0,0.032275,0.125069,0.131224
10,11,Student at Chapman University,"Lake Forest, California",2,0.288568,0.0,0.0,0.032275,0.125069,0.131224


In [17]:
data[data['fit']==0].job_title.unique()

array([], dtype=object)

In [18]:
data.describe()

Unnamed: 0,id,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
count,104.0,104.0,104.0,104.0,104.0,104.0,104.0
mean,52.5,2.656578,0.328938,0.541808,0.604935,0.594117,0.586781
std,30.166206,1.492243,0.315271,0.359278,0.313345,0.322589,0.290476
min,1.0,0.243059,0.0,0.0,0.0,0.0,0.0
25%,26.75,1.585619,0.057644,0.106066,0.286359,0.282529,0.4224
50%,52.5,2.951848,0.253802,0.642826,0.664575,0.720907,0.684321
75%,78.25,3.549718,0.497634,0.8,0.864163,0.845231,0.776623
max,104.0,4.821504,1.0,1.0,1.0,1.0,1.0


No candidates with a fitness score of 0 although the min fit score of each of the fit_columns is all 0.

Add a filter column 'has_zero_scores' for candidates with at least 1 'zero' fitness score from the fit_columns.

In [19]:
has_zero_scores = []
for i, row in data.iterrows():
    has_zero_score = 0
    for fit in data.iloc[i][fit_columns]:
        if fit == 0:
            has_zero_score = 1
    
    has_zero_scores.append(has_zero_score)

data['has_zero_scores'] = has_zero_scores

In [20]:
data[data['has_zero_scores'] == 1].job_title.unique()

array(['Native English Teacher at EPIK (English Program in Korea)',
       'Advisory Board Member at Celal Bayar University',
       'Student at Chapman University',
       'Junior MES Engineer| Information Systems',
       'RRP Brand Portfolio Executive at JTI (Japan Tobacco International)',
       'Information Systems Specialist and Programmer with a love for data and organization.',
       'Bachelor of Science in Biology from Victoria University of Wellington',
       'Undergraduate Research Assistant at Styczynski Lab',
       'Lead Official at Western Illinois University',
       'Admissions Representative at Community medical center long beach',
       'Student at Westfield State University',
       'Student at Indiana University Kokomo - Business Management - Retail Manager at Delphi Hardware and Paint',
       'Student', 'Business Intelligence and Analytics at Travelers',
       'Always set them up for Success',
       'Director Of Administration at Excellence Logging'], dtype=

We can drop these values as they indeed seem irrelavant to our keywords, "Aspiring human resources" and "seeking human resources".

In [21]:
for fit_col in fit_columns:
    for i, job_title in enumerate(data.sort_values(fit_col).head().job_title.values):
        print(f"{fit_col} least fit job title {i+1}: {job_title}")
    print()

fit_tfidf least fit job title 1: Director Of Administration at Excellence Logging
fit_tfidf least fit job title 2: Native English Teacher at EPIK (English Program in Korea)
fit_tfidf least fit job title 3: Bachelor of Science in Biology from Victoria University of Wellington
fit_tfidf least fit job title 4: Student at Chapman University
fit_tfidf least fit job title 5: Advisory Board Member at Celal Bayar University

fit_keras_tokenizer least fit job title 1: Director Of Administration at Excellence Logging
fit_keras_tokenizer least fit job title 2: Native English Teacher at EPIK (English Program in Korea)
fit_keras_tokenizer least fit job title 3: Advisory Board Member at Celal Bayar University
fit_keras_tokenizer least fit job title 4: Native English Teacher at EPIK (English Program in Korea)
fit_keras_tokenizer least fit job title 5: Advisory Board Member at Celal Bayar University

fit_glove least fit job title 1: Advisory Board Member at Celal Bayar University
fit_glove least fit j

Looking at the job titles with the worst fitness scores, each evaluation methods for fitness seems to perform fine - i.e. all those 'worst' job titles do not seem relevant to our keywords. In other words, it would be safe to discard candidates with zero fitness scores.

In [22]:
data_filtered = data[ data['has_zero_scores'] != 1 ].drop('has_zero_scores', axis=1)

Here are top 20 'best-fit' candidates and their job titles, after dropping candidates with at least 1 zero fitness scores.

In [23]:
data_filtered = data_filtered.sort_values(['fit', 'id', 'connection'], ascending=[False, True, True]).reset_index(drop=True)
print(f"Unique job titles of top 20 candidates:\n{data_filtered.head(20).job_title.unique()}")
data_filtered.head(20)

Unique job titles of top 20 candidates:
['Seeking Human Resources Opportunities'
 'Seeking Human Resources Position'
 'Aspiring Human Resources Professional'
 'Aspiring Human Resources Manager, seeking internship in Human Resources.'
 'Aspiring Human Resources Specialist'
 'Seeking Human Resources Human Resources Information System and Generalist Positions']


Unnamed: 0,id,job_title,location,connection,fit,fit_tfidf,fit_keras_tokenizer,fit_glove,fit_word2vec,fit_fasttext
0,28,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.821504,0.921024,1.0,1.0,0.966611,0.933869
1,30,Seeking Human Resources Opportunities,"Chicago, Illinois",390,4.821504,0.921024,1.0,1.0,0.966611,0.933869
2,99,Seeking Human Resources Position,"Las Vegas, Nevada Area",48,4.75586,0.913385,1.0,0.953443,0.966611,0.922421
3,3,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589
4,17,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589
5,21,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589
6,33,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589
7,46,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589
8,58,Aspiring Human Resources Professional,"Raleigh-Durham, North Carolina Area",44,4.75213,1.0,1.0,0.920193,0.874349,0.957589
9,97,Aspiring Human Resources Professional,"Kokomo, Indiana Area",71,4.75213,1.0,1.0,0.920193,0.874349,0.957589


### 4. Train ranking models - XGBoost and LGBM Rankers.

Vectorize job titles using fasttext and use the word vectors as training features.<br>
Set the 'id' column as index, and the fitness score of each candidate under the 'fit' column will be the target.

In [24]:
training_features = pd.DataFrame(
    get_word_vectors(data_filtered, 'job_title', vectorizer='fasttext',
                     to_process_text=True, remove_stopwords=True, lemmatize=False, stem=False)
)
data_selected = pd.concat([data_filtered, training_features], axis=1).set_index('id')
X = data_selected[training_features.columns]
y = data_selected['fit']

Split data into train and test sets before train any model.

In [25]:
test_size = 0.2
random_state = 1
X_train, X_test, y_train_fitness, y_test_fitness = split_data(
    X, y, test_size, random_state=random_state, oversampling=False)

Convert fitness scores into ranks for y_train and y_test separately so that each data set has ranks starting from 1 to the number of data points. Also change the name of the series (or column) from 'fit' to 'rank.

In [26]:
y_train = y_train_fitness.rank(method='dense', ascending=False)
y_train.name = 'rank'

y_test = y_test_fitness.rank(method='dense', ascending=False)
y_test.name = 'rank'

##### 1) Train an XGBoost Ranker, and get prediction results.

descriptions about XGBoost to be added.
* Evaluation metric: NDCG (normalized discounted cumulative gain) is a measure of the effectiveness of a ranking system, taking into account the position of relevant items in the ranked list. It is based on the idea that items that are higher in the ranking should be given more credit than items that are lower in the ranking.

In [59]:
xgb_params = {
    # 'n_estimators': 40,
    # 'max_depth': 2,
    # 'learning_rate': 0.02, # same as xgb's eta; default=0.3
    'objective': 'rank:pairwise', # perform better than 'rank:ndcg'
    'booster': 'gbtree',
    'eval_metric': 'ndcg',
    # 'subsample': 0.5,
    # 'gamma': 4.5, # default=0; the larger the more conservative
    # 'min_child_weight': 2, # default=1; the larger the more conservative
    'random_state': random_state
}

xgb_ranker = XGBRanker(**xgb_params)
xgb_ranker.fit(X_train, y_train,
               group=y_train.value_counts(),
               eval_set=[(X_test, y_test)],
               eval_group=[y_test.value_counts()]
               )

stats_df = pd.DataFrame(
    index=["Mean (Top 5 rankers)", "Mean", "Std"],
    columns=["y_train", "y_test"],
    data=[
        [round(y_train[y_train<=5].mean(), 4),
         round(y_test[y_test<=5].mean(), 4)],
        [round(y_train.mean(), 4),
         round(y_test.mean(), 4)],
        [round(y_train.std(), 4),
         round(y_test.std(), 4)]
    ]
)
print("Ground truth stats:")
print(stats_df,"\n")

print("Train stats:")
xgb_train_result = get_rank_predictions(X_train, y_train, xgb_ranker, target_column="rank", target="candidates")

print("Test stats:")
xgb_test_result = get_rank_predictions(X_test, y_test, xgb_ranker, target_column="rank", target="candidates")

Ground truth stats:
                      y_train  y_test
Mean (Top 5 rankers)   2.7368  3.3750
Mean                  15.0000  6.1875
Std                   10.4865  3.3310 

Train stats:
Mean rank of top 5 candidates based on predictions: 5.6316
Mean rank of all candidates based on predictions: 15.5
Std rank of all candidates based on predictions: 9.7354
Mean absolute difference between each pair of rank and predicted rank: 2.9194
    rank  pred_rank  abs_diff
id                           
27   6.0        1.0       5.0
29   6.0        1.0       5.0
28   1.0        2.0       1.0
30   1.0        2.0       1.0
60   3.0        3.0       0.0 

Test stats:
Mean rank of top 5 candidates based on predictions: 2.875
Mean rank of all candidates based on predictions: 5.9375
Std rank of all candidates based on predictions: 3.8204
Mean absolute difference between each pair of rank and predicted rank: 1.875
    rank  pred_rank  abs_diff
id                           
99   1.0        1.0       0.0
25 

##### 2) Train an LGBM (Light Gradient-Boosting Machine) Ranker, and get prediction results.

descriptions about LGBM to be added.
* Ranking algorithm: LambdaRank. This is a technique where ranking is transformed into a pairwise classification or regression problem. Basically, the algorithms consider a pair of items at a single time to come up with a viable ordering of those items before initiating the final order of the entire list.

In [28]:
lgbm_ranker = LGBMRanker(
    boosting_type="dart", # 'gbdt', 'dart', 'rf'
    # max_depth=2,
    objective="lambdarank",
    metric= "ndcg",
    label_gain =[i for i in range(int(max(y_train.max(), y_test.max())) + 2)],
    random_state=random_state
    )

lgbm_ranker.fit(
    X=X_train,
    y=y_train,
    group=y_train.value_counts(),
    eval_set=[(X_test, y_test)],
    eval_group=[y_test.value_counts()],
    verbose=-1
    )

print("Ground truth stats:")
print(stats_df,"\n")

print("Train stats:")
lgbm_train_result = get_rank_predictions(X_train, y_train, lgbm_ranker, target_column="rank", target="candidates")

print("Test stats:")
lgbm_test_result = get_rank_predictions(X_test, y_test, lgbm_ranker, target_column="rank", target="candidates")

Ground truth stats:
                         y_train  y_test
0  Mean (Top 5 rankers)   2.7368  3.3750
1                  Mean  15.0000  6.1875
2                   Std  10.4865  3.3310 

Train stats:
Mean rank of top 5 candidates based on predictions: 4.7368
Mean rank of all candidates based on predictions: 15.2258
Std rank of all candidates based on predictions: 10.2436
Mean absolute difference between each pair of rank and predicted rank: 2.6129
    rank  pred_rank  abs_diff
id                           
60   3.0        1.0       2.0
36   3.0        1.0       2.0
49   3.0        1.0       2.0
6    3.0        1.0       2.0
24   3.0        1.0       2.0 

Test stats:
Mean rank of top 5 candidates based on predictions: 4.875
Mean rank of all candidates based on predictions: 6.8125
Std rank of all candidates based on predictions: 3.2087
Mean absolute difference between each pair of rank and predicted rank: 2.0
    rank  pred_rank  abs_diff
id                           
66   3.0        1.0

Both rankers successfully predicted the top-ranked candidates (i.e. pred_rank == 1.0) as they all were selected candidates.

In terms of the mean of predicted ranks for selected candidates, the LGBM ranker performed better at predicting selected candidates. Therefore, the model will be used instead of the XGB ranker for the rest of the project.

* Overall, the predictions by the LGBM ranker were quite close to the ground truths without much hyperparameter tuning, even though some hyperparameter tuning was done for the XGB ranker.

### 5. Star ideal candidates and re-train the ranking model based on the updated ranks/criteria.

Since we've built our base ranking model, proceed to starring ideal candidates and re-rank all candidates based on the stars.
* Starring one candidate sets this candidate as an ideal candidate for the given role. Then, we re-rank the list each time a candidate or a list of candidates is starred.

5-1. Take user input for the ids of ideal candidates (i.e. the candidates to star).

In [29]:
# input_ids = input(f"""
# Enter IDs of ideal candidates, separated by comma:
# * IDs: {unique_ids}
# """)
input_ids = "1, 2, 5, 10, 12"
ideal_candidates = sorted([int(id.strip()) for id in input_ids.split(',')])

5-2. Create a copy of the train and test data to rerank candidates based on starring.

In [30]:
X_train_updated = X_train.copy()
X_test_updated = X_test.copy()
y_train_updated = y_train.copy()
y_test_updated = y_test.copy()

5-3. Add a binary feature 'star' to X_train and X_test.

If a candidate is 'starred', the feature value will be 1, otherwise 0.

In [31]:
X_train_updated['star'] = [1 if idx in ideal_candidates else 0 for idx in X_train_updated.index]
X_test_updated['star'] = [1 if idx in ideal_candidates else 0 for idx in X_test_updated.index]

5-4. Add 1 to the ranks of all candidates (e.g. rank 1 will become rank 2) so that the starred candidates can become the top ranker with a rank of 1.

Update y_train and y_test with the updated ranks.

In [32]:
y_train_updated += 1
y_test_updated += 1

ideal_rank = 1
for id in ideal_candidates:
    if id in y_train_updated.index:
        y_train_updated[id] = ideal_rank
        print(f"Rank of candidate {id} in y_train updated to {ideal_rank}.")
    elif id in y_test.index:
        y_test_updated[id] = ideal_rank
        print(f"Rank of candidate {id} in y_test updated to {ideal_rank}.")
    else:
        print(f"Candidate {id} not found!")

Rank of candidate 1 in y_train updated to 1.
Candidate 2 not found!
Candidate 5 not found!
Rank of candidate 10 in y_train updated to 1.
Rank of candidate 12 in y_test updated to 1.


5-5. Re-train ranking models.

In [48]:
xgb_ranker.fit(X_train_updated, y_train_updated,
               group=y_train_updated.value_counts(),
               eval_set=[(X_test_updated, y_test_updated)],
               eval_group=[y_test_updated.value_counts()]
               )

stats_df_updated = pd.DataFrame(
    index=["Mean (Top 5 rankers)", "Mean", "Std"],
    columns=["y_train_updated", "y_test_updated"],
    data=[
        [round(y_train_updated[y_train_updated<=5].mean(), 4),
         round(y_test_updated[y_test_updated<=5].mean(), 4)],
        [round(y_train_updated.mean(), 4),
         round(y_test_updated.mean(), 4)],
        [round(y_train_updated.std(), 4),
         round(y_test_updated.std(), 4)]
    ]
)
print("(Updated) Ground truth stats:")
print(stats_df_updated,"\n")

print("(Updated) Train stats:")
xgb_train_result_updated = get_rank_predictions(
    X_train_updated, y_train_updated, xgb_ranker, target_column="rank", target="candidates")

print("(Updated) Test stats:")
xgb_test_result_updated = get_rank_predictions(
    X_test_updated, y_test_updated, xgb_ranker, target_column="rank", target="candidates")

(Updated) Ground truth stats:
                      y_train_updated  y_test_updated
Mean (Top 5 rankers)           3.2632          3.7500
Mean                          15.4677          6.8125
Std                           10.5764          3.6737 

(Updated) Train stats:
Mean rank of top 5 candidates based on predictions: 5.0
Mean rank of all candidates based on predictions: 15.2581
Std rank of all candidates based on predictions: 10.34
Mean absolute difference between each pair of rank and predicted rank: 2.8226
    rank  pred_rank  abs_diff
id                           
60   4.0        1.0       3.0
36   4.0        1.0       3.0
49   4.0        1.0       3.0
6    4.0        1.0       3.0
24   4.0        1.0       3.0 

(Updated) Test stats:
Mean rank of top 5 candidates based on predictions: 3.125
Mean rank of all candidates based on predictions: 5.875
Std rank of all candidates based on predictions: 3.7394
Mean absolute difference between each pair of rank and predicted rank: 1.9375


In [34]:
lgbm_ranker.fit(
    X=X_train_updated,
    y=y_train_updated,
    group=y_train_updated.value_counts(),
    eval_set=[(X_test_updated, y_test_updated)],
    eval_group=[y_test_updated.value_counts()],
    verbose=-1
    )

print("(Updated) Ground truth stats:")
print(stats_df_updated,"\n")

print("(Updated) Train stats:")
lgbm_train_result_updated = get_rank_predictions(
    X_train_updated, y_train_updated, lgbm_ranker, target_column="rank", target="candidates")

print("(Updated) Test stats:")
lgbm_test_result_updated = get_rank_predictions(
    X_test_updated, y_test_updated, lgbm_ranker, target_column="rank", target="candidates")

(Updated) Ground truth stats:
                         y_train_updated  y_test_updated
0  Mean (Top 5 rankers)           3.2632          3.7500
1                  Mean          15.4677          6.8125
2                   Std          10.5764          3.6737 

(Updated) Train stats:
Mean rank of top 5 candidates based on predictions: 3.8947
Mean rank of all candidates based on predictions: 14.9839
Std rank of all candidates based on predictions: 10.6916
Mean absolute difference between each pair of rank and predicted rank: 2.6774
    rank  pred_rank  abs_diff
id                           
3    3.0        1.0       2.0
58   3.0        1.0       2.0
46   3.0        1.0       2.0
17   3.0        1.0       2.0
33   3.0        1.0       2.0 

(Updated) Test stats:
Mean rank of top 5 candidates based on predictions: 5.0
Mean rank of all candidates based on predictions: 6.5625
Std rank of all candidates based on predictions: 3.1616
Mean absolute difference between each pair of rank and predict

5-4. Evaluate results.

Collate and re-arrange statistics for easier evaluation of the results and model performance.

In [80]:
stats_df_concat = pd.concat([stats_df, stats_df_updated], axis=1)
stats_df_concat = stats_df_concat.iloc[:, [0, 2, 1, 3]] # re-arrange columns
stats_df_concat

Unnamed: 0,y_train,y_train_updated,y_test,y_test_updated
Mean (Top 5 rankers),2.7368,3.2632,3.375,3.75
Mean,15.0,15.4677,6.1875,6.8125
Std,10.4865,10.5764,3.331,3.6737


Predictions on top 5 candidates were similar before and after the starring of ideal candidates. In other words, the ranking models can handle the update of ranks based on stars, i.e. the new binary column 'star'.

In [84]:
overall_mean_dict = {}

overall_mean_dict["xgb_train"] = round(xgb_train_result.mean(), 4).to_dict()
overall_mean_dict["xgb_train_updated"] = round(xgb_train_result_updated.mean(), 4).to_dict()
overall_mean_dict["xgb_test"] = round(xgb_test_result.mean(), 4).to_dict()
overall_mean_dict["xgb_test_updated"] = round(xgb_test_result_updated.mean(), 4).to_dict()

overall_mean_dict["lgbm_train"] = round(lgbm_train_result.mean(), 4).to_dict()
overall_mean_dict["lgbm_train_updated"] = round(lgbm_train_result_updated.mean(), 4).to_dict()
overall_mean_dict["lgbm_test"] = round(lgbm_test_result.mean(), 4).to_dict()
overall_mean_dict["lgbm_test_updated"] = round(lgbm_test_result_updated.mean(), 4).to_dict()

overall_mean_df = pd.DataFrame(overall_mean_dict)
print(f"Overall mean statistics:\n{overall_mean_df}")

Overall mean statistics:
           xgb_train  xgb_train_updated  xgb_test  xgb_test_updated  \
rank         15.0000            15.4677    6.1875            6.8125   
pred_rank    15.5000            15.2581    5.9375            5.8750   
abs_diff      2.9194             2.8226    1.8750            1.9375   

           lgbm_train  lgbm_train_updated  lgbm_test  lgbm_test_updated  
rank          15.0000             15.4677     6.1875             6.8125  
pred_rank     15.2258             14.9839     6.8125             6.5625  
abs_diff       2.6129              2.6774     2.0000             1.8750  


Overall, the mean absolute difference between the real ranks and predicted ranks did not change much after re-training of the models based on the starring of ideal candidates.

Based on these statistics, the model performance of the two rankers do not seem significantly different, although the LGBM ranker's predictions were slightly better than the XGB ranker's predictions - i.e. smaller abs_diffs.

In [83]:
top5_mean_dict = {}

top5_mean_dict["xgb_train"] = round(xgb_train_result[xgb_train_result["rank"]<=5].mean(), 4).to_dict()
top5_mean_dict["xgb_train_updated"] = round(xgb_train_result_updated[xgb_train_result_updated["rank"]<=5].mean(), 4).to_dict()
top5_mean_dict["xgb_test"] = round(xgb_test_result[xgb_test_result["rank"]<=5].mean(), 4).to_dict()
top5_mean_dict["xgb_test_updated"] = round(xgb_test_result_updated[xgb_test_result_updated["rank"]<=5].mean(), 4).to_dict()

top5_mean_dict["lgbm_train"] = round(lgbm_train_result[lgbm_train_result["rank"]<=5].mean(), 4).to_dict()
top5_mean_dict["lgbm_train_updated"] = round(lgbm_train_result_updated[lgbm_train_result_updated["rank"]<=5].mean(), 4).to_dict()
top5_mean_dict["lgbm_test"] = round(lgbm_test_result[lgbm_test_result["rank"]<=5].mean(), 4).to_dict()
top5_mean_dict["lgbm_test_updated"] = round(lgbm_test_result_updated[lgbm_test_result_updated["rank"]<=5].mean(), 4).to_dict()

top5_mean_df = pd.DataFrame(top5_mean_dict)
print(f"Top 5 mean statistics:\n{top5_mean_df}")

Top 5 mean statistics:
           xgb_train  xgb_train_updated  xgb_test  xgb_test_updated  \
rank          2.7368             3.2632     3.375             3.750   
pred_rank     5.6316             5.0000     2.875             3.125   
abs_diff      2.8947             3.3158     1.500             2.625   

           lgbm_train  lgbm_train_updated  lgbm_test  lgbm_test_updated  
rank           2.7368              3.2632      3.375               3.75  
pred_rank      4.7368              3.8947      4.875               5.00  
abs_diff       3.0526              3.1579      2.000               2.00  


If we only look at the top 5 rankers (based on the ground truth ranks), the LGBM ranker still performed better.

Also worth noting is that the differences between mean values before and after the starring operation (e.g. between xgb_train and xgb_train_updated) are larger for top 5 candidates, whereas the same differences for all candidates were quite small.

- save and load the model for re-training model and making predictions.