In [1]:
#import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

# For modeling later
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier


#commands for better readability 
pd.set_option('display.max_colwidth', None)  # show full content in each column
pd.set_option('display.max_columns', None)   # show all columns
pd.set_option('display.width', 2000)         # widen the display to avoid wrapping

pd.set_option('display.max_columns', None)  #display all columns 
pd.set_option('display.max_rows', None)  #display all rows 
warnings.filterwarnings("ignore", category=UserWarning, module='pandas')  #surpress warning messages

In [2]:
#import csv file and convert it to dataframe


file_path_full='/Users/India/Documents/match_names/df_full.csv'  #replace with respective file path 
df_full=pd.read_csv(file_path_full)  #specify delimiters to avoid reading errors

file_path_labelled='/Users/India/Documents/match_names/training_data.csv'  #replace with respective file path 
df_label=pd.read_csv(file_path_labelled)  #specify delimiters to avoid reading errors



In [3]:
df_full.drop(columns=['cap_rule_applied','num_matched_tokens'], inplace=True)
df_label.drop(columns=['cap_rule_applied','num_matched_tokens'], inplace=True)


In [4]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3608 entries, 0 to 3607
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   uk_id                     3608 non-null   int64  
 1   uk_name                   3608 non-null   object 
 2   eu_id                     3608 non-null   int64  
 3   eu_name                   3608 non-null   object 
 4   final_multi_score         3608 non-null   int64  
 5   avg_adjusted_token_score  3608 non-null   float64
 6   jaccard_similarity        3608 non-null   float64
 7   overlap_count             3608 non-null   int64  
 8   min_token_len_matched     3608 non-null   int64  
 9   max_token_len_matched     3608 non-null   int64  
 10  mean_token_len_matched    3608 non-null   float64
 11  num_uk_tokens             3608 non-null   int64  
 12  num_eu_tokens             3608 non-null   int64  
dtypes: float64(3), int64(8), object(2)
memory usage: 366.6+ KB


In [5]:
df_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 385 entries, 0 to 384
Data columns (total 14 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   uk_id                     385 non-null    int64  
 1   uk_name                   385 non-null    object 
 2   eu_id                     385 non-null    int64  
 3   eu_name                   385 non-null    object 
 4   final_multi_score         385 non-null    int64  
 5   avg_adjusted_token_score  385 non-null    float64
 6   jaccard_similarity        385 non-null    float64
 7   overlap_count             385 non-null    int64  
 8   min_token_len_matched     385 non-null    int64  
 9   max_token_len_matched     385 non-null    int64  
 10  mean_token_len_matched    385 non-null    float64
 11  num_uk_tokens             385 non-null    int64  
 12  num_eu_tokens             385 non-null    int64  
 13  label                     385 non-null    object 
dtypes: float64

In [6]:
df_full.head()

Unnamed: 0,uk_id,uk_name,eu_id,eu_name,final_multi_score,avg_adjusted_token_score,jaccard_similarity,overlap_count,min_token_len_matched,max_token_len_matched,mean_token_len_matched,num_uk_tokens,num_eu_tokens
0,6894,"{'Rahman', 'Fikiruddin', 'Fihiruddin', 'Muqti', 'A', 'Jibril', 'Abu', 'Abdurrahman', 'Iqbal', 'Abdul', 'Mohamad'}",1004,"{'Fikiruddin', 'Rahman', 'Fihiruddin', 'A', 'Muqti', 'Jibril', 'Abu', 'Abdurrahman', 'Iqbal', 'Abdul', 'Mohamad'}",97,96.0,1.0,9,1,11,6.0,11,11
1,6895,"{'Hazem', 'Qader', 'Abdul', 'Hai'}",505,"{'Abdul', 'Hai', 'Hazem', 'Qader'}",95,93.0,1.0,4,3,5,4.5,4,4
2,6897,"{'Am', 'Agha', 'Man', 'Abd', 'Manan', 'Saiyid', 'Abdul', 'Al'}",514,"{'Abdul', 'Am', 'Bd', 'Agha', 'Ag', 'Man', 'Abd', 'Saiyid', 'Manan', 'Lmnn', 'Al'}",93,90.5,1.0,6,2,6,3.33,8,11
3,6899,"{'Abdallah', 'Shahata', 'Tharwat', 'Tarwat', 'Shihata', 'Ali', 'Salah', 'Thirwat'}",796,"{'Abdallah', 'Shahata', 'Tharwat', 'Tarwat', 'Shihata', 'Ali', 'Salah', 'Thirwat'}",97,96.4,1.0,5,3,8,6.0,8,8
4,6901,"{'Abdul', 'Majeed', 'Majid', 'Chaudhry'}",641,"{'Abdul', 'Majeed', 'Majid', 'Chaudhry'}",99,98.33,1.0,3,5,8,6.33,4,4


In [7]:
df_label.head()

Unnamed: 0,uk_id,uk_name,eu_id,eu_name,final_multi_score,avg_adjusted_token_score,jaccard_similarity,overlap_count,min_token_len_matched,max_token_len_matched,mean_token_len_matched,num_uk_tokens,num_eu_tokens,label
0,6894,"{'Rahman', 'Fikiruddin', 'Fihiruddin', 'Muqti', 'A', 'Jibril', 'Abu', 'Abdurrahman', 'Iqbal', 'Abdul', 'Mohamad'}",1004,"{'Fikiruddin', 'Rahman', 'Fihiruddin', 'A', 'Muqti', 'Jibril', 'Abu', 'Abdurrahman', 'Iqbal', 'Abdul', 'Mohamad'}",97,96.0,1.0,9,1,11,6.0,11,11,match
1,6905,"{'Zakir', 'Bari', 'Haji', 'Sahib', 'Abdul', 'Mullah', 'Akhund'}",556,"{'Zakir', 'Bari', 'Haji', 'Sahib', 'Abdul', 'Mullah', 'Akhund'}",97,96.43,1.0,7,4,6,5.0,7,7,match
2,6912,"{'Abdul', 'Khadem', 'Rauf', 'Aliza'}",719,"{'Khadem', 'Aliza', 'Abdul', 'Mullah', 'Rauf'}",97,96.25,1.0,4,4,6,5.0,4,5,match
3,6932,"{'Samman', 'Ismail', 'Filistini', 'Othman', 'Takfiri', 'Mahmoud', 'Uthman', 'Omar', 'Qatada', 'Umar', 'Abu', 'Mohammed', 'Al', 'Umr'}",836,"{'Samman', 'Ismail', 'Filistini', 'Othman', 'Takfiri', 'Mahmoud', 'Uthman', 'Omar', 'Qatada', 'Umar', 'Abu', 'Mohammed', 'Al', 'Umr'}",97,96.33,1.0,12,2,9,5.58,14,14,match
4,7024,"{'Abdallah', 'Hannachi', 'Ben', 'Belgacem', 'Mohamed', 'Fathi', 'Aouadi', 'Belkacem', 'Al'}",927,"{'Abdallah', 'Ben', 'Belgacem', 'Fathi', 'Mohamed', 'Hannachi', 'Aouadi', 'Belkacem', 'Al'}",97,96.12,1.0,8,2,8,5.88,9,9,match


### Defining variables

In [8]:
#define relevant columns as features
feature_cols=[
    'final_multi_score',
    'avg_adjusted_token_score',
    'jaccard_similarity',
    'overlap_count',
    'min_token_len_matched',
    'max_token_len_matched',
    'mean_token_len_matched',
    'num_uk_tokens',
    'num_eu_tokens'
]

#get the uk ids of labelled data
labelled_ids=df_label['uk_id']

#create a dataframe without the labelled rows
df_holdout=df_full[~df_full['uk_id'].isin(labelled_ids)]


#extract features and target labels for training and holdout sets
X=df_label[feature_cols]  
y=df_label['label'] 
X_holdout=df_holdout[feature_cols]

# Methodology & Rationale

### Classifier 

- **Random Forest** was initially selected as a starting point. It's a robust and versatile classifier that performs well with minimal tuning and can handle complex patterns in the data. As a first model, it provided a solid baseline for comparison, delivering good results with relatively low effort

- **Logistic Regression** was explored next to test whether a simpler, more interpretable model could offer similar performance. Both L1 and L2 regularization were tested against each other. Since the L1 model showed better results, it was optimized in a dedicated follow-up experiment.


- **XGBoost**  was selected as the next step. This more advanced boosting algorithm is known for handling complex relationships in data while offering higher flexibility. Given its track record of high performance on structured data and its ability to manage overfitting, it was a natural progression to test whether it could outperform the previous models, especially after tuning hyperparameters


### Hyperparameter Tuning

- **GridSearchCV** was used to find the combination of hyperparameters that gave the best performance estimate. A custom scoring function guided the search by reflecting the task’s specific priorities, using a weighted combination of the metrics used for evaluation.



### Validation Strategy

- **Train-Test Split** was initially used for quick evaluation. Although class balance was maintained using stratification, the results may have underestimated model performance due to the limited sample size (385) and the variability introduced by a single random split. This limitation motivated a shift to Stratified K-Fold Cross-Validation, which provides more stable and reliable performance estimates.


       
- **Stratified K-Fold Cross-Validation** was adopted to address those isses and still maintain class blanace. By generating multiple train-test splits and allowing every data point to be used for both training and validation, it produced more stable and robust estimates.

    - 5 folds were chosen to simultaneously provide sufficient variability across splits, while preventing overfitting.
 
    

### Evaluation Metrics

In this task, the goal is to classify names as not match, preliminary match, or match, with the emphasis on avoiding missed true matches while minimizing unnecessary manual review.

We focus on the following priorities:

 - **Precision** in “Match” and “Not Match” is our top priority. If the system labels something as a match, it should truly be a match. Likewise, if it says not match, it should be genuinely safe to ignore. These two classes represent confident decisions, so we want them to be as reliable as possible.

 - “Preliminary Match” acts as a safety net where uncertain cases go for human review. We do not directly optimize for this class, but we rely on it to catch true matches missed

 - **Recall** in “Not Match” is important but secondary. It tells us how many of the actual “not match” cases are caught confidently by the model. Higher recall here means fewer names fall into the “preliminary match” category, which helps reduce manual workload. However, this is less critical than being certain about the predictions in the match and not match classes.



In summary, we want the model to be confident only when it is correct. Precision ensures that confidence is deserved, while recall helps reduce human burden. This approach ensures that true matches are not missed and that attention is focused where it is truly needed.

# Experimental Setup & Results

### Experiment 1:
- **Classifier**: Random Forest
- **Hyperparameters**: Default
- **Validation Strategy**: Simple Train-test Split
    - split ratio: 80/20
    - startified: yes (to maintain class balance)



In [9]:
#split the data into training and testing sets ensuring class balance using stratification
X_train_0, X_test_0, y_train_0, y_test_0=train_test_split(X, y, test_size=0.2, random_state=30, stratify=y)


#initialize and train model  
model_1=RandomForestClassifier(n_estimators=100, random_state=30)  #random_state=30 for reproducibility  
model_1.fit(X_train_0, y_train_0)

#get predictions on test set 
y_pred_1=model_1.predict(X_test_0)


In [10]:
#define class labels
class_labels=['not match','preliminary match', 'match']

#create confusion matrix
conf_matrix_1=confusion_matrix(y_test_0, y_pred_1)

#create classification report 
report_1=classification_report(y_test_0, y_pred_1, target_names=class_labels)

#extract summary metrics from report 
report_dict=classification_report(y_test_0, y_pred_1, target_names=class_labels,output_dict=True)
precision_match_1=round(report_dict['match']['precision'], 3)
precision_not_match_1=round(report_dict['not match']['precision'], 3)
recall_not_match_1=round(report_dict['not match']['recall'], 3)

#print results
print("Confusion Matrix:")
print(conf_matrix_1)
print("\nClassification Report:")
print(report_1)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_1}")
print(f"'Not Match' Precision: {precision_not_match_1}")
print(f"'Not Match' Recall:    {recall_not_match_1}")




Confusion Matrix:
[[50  0  2]
 [ 0  9  1]
 [ 3  1 11]]

Classification Report:
                   precision    recall  f1-score   support

        not match       0.94      0.96      0.95        52
preliminary match       0.90      0.90      0.90        10
            match       0.79      0.73      0.76        15

         accuracy                           0.91        77
        macro avg       0.88      0.86      0.87        77
     weighted avg       0.91      0.91      0.91        77


Classification Report Summary:
'Match' Precision:     0.786
'Not Match' Precision: 0.943
'Not Match' Recall:    0.962


### Experiment 2:
- **Classifier**: Random Forest  
- **Hyperparameters**: Default  
- **Validation Strategy**: Stratified k-Fold CV
   - no folds: 5 (standard choice)
   - startified: yes (to maintain class balance)



In [11]:
#initialize cross validator
skf=StratifiedKFold(n_splits=5, shuffle=True, random_state=30)

#initialize model 
model_2=RandomForestClassifier(n_estimators=100, random_state=30)



conf_matrices=[]
reports=[]

#for each k fold
for train_idx, test_idx in skf.split(X, y):
    
    #split data into train and test sets
    X_train, X_test=X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test=y.iloc[train_idx], y.iloc[test_idx]

    #train model on training set
    model_2.fit(X_train, y_train)  

    #predict target values for test set
    y_pred=model_2.predict(X_test)

    #store confusion matrix and classification report for this fold
    conf_matrices.append(confusion_matrix(y_test, y_pred))
    reports.append(classification_report(y_test, y_pred, target_names=class_labels, output_dict=True))



#aggregate confusion matrices acorss all folds
conf_matrix_2=np.sum(conf_matrices, axis=0)

#extract class-specific metrics from each fold
precision_match=[r['match']['precision'] for r in reports]
precision_not_match=[r['not match']['precision'] for r in reports]
recall_not_match=[r['not match']['recall'] for r in reports]

#average metrics across all folds
precision_match_2= round(np.mean(precision_match), 3)
precision_not_match_2= round(np.mean(precision_not_match), 3)
recall_not_match_2= round(np.mean(recall_not_match), 3)




#print results
print("Confusion Matrix:")
print(conf_matrix_2)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_2}")
print(f"'Not Match' Precision: {precision_not_match_2}")
print(f"'Not Match' Recall:    {recall_not_match_2}")


Confusion Matrix:
[[253   0   8]
 [  0  47   4]
 [  5   1  67]]

Classification Report Summary:
'Match' Precision:     0.848
'Not Match' Precision: 0.981
'Not Match' Recall:    0.969


### Experiment 3:
- **Classifier**: Random Forest  
- **Hyperparameters**: tuned with GridSearchCV + custom scoring function
   - number of trees: 50, 100, 200
   - tree depth: None, 10, 20
   - min samples to split: 2, 5
   - min samples per leaf: 1, 2
- **Validation Strategy**: Stratified k-Fold CV
    - number of folds: 5 



In [12]:
#define a custom scoring function 
def priority_metric(y_true, y_pred):

    #extract the relevant metrics
    report=classification_report(y_true, y_pred, target_names=class_labels, output_dict=True)    
    precision_match=report['match']['precision']
    precision_not_match=report['not match']['precision']
    recall_not_match=report['not match']['recall']
    
    #create weighted score based on the relative importance of each metric
    weighted_score=0.4*precision_match + 0.4*precision_not_match + 0.4*recall_not_match
    
    return weighted_score

#wrap the custom scoring function using sklearn's make_scorer for use in GridSearchCV
custom_scorer=make_scorer(priority_metric, greater_is_better=True)




In [13]:
#initialize base model
base_model_3=RandomForestClassifier(random_state=30)

#define standard hyperparameter grid
param_grid_3 = {
    'n_estimators':[50, 100, 200],
    'max_depth':[None, 10, 20],
    'min_samples_split':[2, 5],
    'min_samples_leaf':[1, 2]
}

#perform grid search with custom scoring function and cross-validation startegy
grid_search_3=GridSearchCV(base_model_3, param_grid_3, scoring=custom_scorer, cv=skf, n_jobs=-1)

#fit the grid search to the entire labelled dataset
grid_search_3.fit(X, y)

#store the best performing model and parameters
model_3=grid_search_3.best_estimator_
best_parameters_3=grid_search_3.best_params_



In [14]:
#initialize lists to store results
conf_matrices = []
reports = []

#for each k-fold
for train_idx, test_idx in skf.split(X, y):

    #split data into train and test sets
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    #train model on training set
    model_3.fit(X_train, y_train)

    #predict target values based on test data
    y_pred_3=model_3.predict(X_test)

    #store confusion matrix and classification report for this fold
    conf_matrices.append(confusion_matrix(y_test, y_pred_3)) 
    reports.append(classification_report(y_test, y_pred_3, target_names=class_labels, output_dict=True))




#aggregate confusion matrix across all folds
conf_matrix_3=np.sum(conf_matrices, axis=0)

#extract class-specific metrics from each fold
precision_match=[r['match']['precision'] for r in reports]
precision_not_match=[r['not match']['precision'] for r in reports]
recall_not_match=[r['not match']['recall'] for r in reports]

#average metrics across all folds
precision_match_3=round(np.mean(precision_match), 3)
precision_not_match_3=round(np.mean(precision_not_match), 3)
recall_not_match_3=round(np.mean(recall_not_match), 3)



#print results
print("Best hyperparameters:")
print(best_parameters_3)

print("\nConfusion Matrix:")
print(conf_matrix_3)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_3}")
print(f"'Not Match' Precision: {precision_not_match_3}")
print(f"'Not Match' Recall:    {recall_not_match_3}")


Best hyperparameters:
{'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}

Confusion Matrix:
[[254   0   7]
 [  0  47   4]
 [  6   0  67]]

Classification Report Summary:
'Match' Precision:     0.859
'Not Match' Precision: 0.977
'Not Match' Recall:    0.973


### Experiment 4
- **Classifier**: Logistic Regression
- **Hyperparameters**: tuned with GridSearchCV + custom scoring function
    - regularization strength C: 0.001, 0.01, 0.1, 1, 10, 100
    - penalty: L1, L2
    - solver: saga (works for both L1 and L2)
- **Validation Strategy**: Stratified K-Fold
    - number of folds: 5


In [15]:
#standardize features 
scaler=StandardScaler()
X_scaled=scaler.fit_transform(X)

#initialize base model
base_model_4=LogisticRegression(random_state=30, max_iter=10000,solver='saga')

#define hyperparameter grid for tuning 
param_grid_4={
    'C': [0.001, 0.01, 0.1, 1, 10, 100],  #wide range of values
    'penalty': ['l1','l2']
}

#grid search with custom scoring and stratified CV
grid_search_4 = GridSearchCV(base_model_4, param_grid_4, scoring=custom_scorer, cv=skf, n_jobs=-1)
grid_search_4.fit(X_scaled, y)

#store best performing model and parameters
model_4=grid_search_4.best_estimator_
best_parameters_4=grid_search_4.best_params_



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [16]:
conf_matrices=[]
reports=[]

#for each k-fold
for train_idx, test_idx in skf.split(X, y):

    #split data into train and test sets
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    #standardize data 
    X_train_scaled=scaler.fit_transform(X_train)
    X_test_scaled=scaler.transform(X_test)
    
    #train model
    model_4.fit(X_train_scaled, y_train)

    #predict target values based on test data
    y_pred_4=model_4.predict(X_test_scaled)

    #store confusion matrix and classification report for this fold
    conf_matrices.append(confusion_matrix(y_test, y_pred_4)) 
    reports.append(classification_report(y_test, y_pred_4, target_names=class_labels, output_dict=True))




#aggregate confusion matrix across all folds
conf_matrix_4=np.sum(conf_matrices, axis=0)

#extract class-specific metrics from each fold
precision_match=[r['match']['precision'] for r in reports]
precision_not_match=[r['not match']['precision'] for r in reports]
recall_not_match=[r['not match']['recall'] for r in reports]

#average metrics across all folds
precision_match_4=round(np.mean(precision_match), 3)
precision_not_match_4=round(np.mean(precision_not_match), 3)
recall_not_match_4=round(np.mean(recall_not_match), 3)



#print results
print("Best hyperparameters:")
print(best_parameters_4)

print("\nConfusion Matrix:")
print(conf_matrix_4)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_4}")
print(f"'Not Match' Precision: {precision_not_match_4}")
print(f"'Not Match' Recall:    {recall_not_match_4}")


Best hyperparameters:
{'C': 100, 'penalty': 'l1'}

Confusion Matrix:
[[252   1   8]
 [  1  48   2]
 [  8   3  62]]

Classification Report Summary:
'Match' Precision:     0.874
'Not Match' Precision: 0.966
'Not Match' Recall:    0.965


### Experiment 5
- **Classifier**: Logistic Regression
- **Hyperparameters**: tuned with GridSearchCV + custom scoring function
    - regularization strength C: 0.001, 0.01, 0.1, 1, 10, 100
    - penalty: L1
    - solver: liblinear (efficient for L1)
- **Validation Strategy**: Stratified K-Fold
    - number of folds: 5


In [17]:
#initilize base model
base_model_5=LogisticRegression(random_state=30, max_iter=10000)

#define hyperparameter grid
param_grid_5={
    'C': [0.001,0.01, 0.1, 1, 10, 100],
    'penalty': ['l1'],
    'solver': ['liblinear'] 
}

#grid search with custom scoring and stratified CV 
grid_search_5=GridSearchCV(base_model_5,param_grid_5,scoring=custom_scorer,cv=skf,n_jobs=-1)
grid_search_5.fit(X_scaled, y)

#store best performing model and parameters
model_5=grid_search_5.best_estimator_
best_parameters_5=grid_search_5.best_params_

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [18]:
conf_matrices=[]
reports=[]

#for each k-fold
for train_idx, test_idx in skf.split(X, y):

    #split data into train and test sets
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    X_train_scaled=scaler.fit_transform(X_train)
    X_test_scaled=scaler.transform(X_test)
    
    #train model
    model_5.fit(X_train_scaled, y_train)

    #predict target values 
    y_pred_5=model_5.predict(X_test_scaled)

    #store confusion matrix and classification report for this fold
    conf_matrices.append(confusion_matrix(y_test, y_pred_5)) 
    reports.append(classification_report(y_test, y_pred_5, target_names=class_labels, output_dict=True))




#aggregate confusion matrix across all folds
conf_matrix_5=np.sum(conf_matrices, axis=0)

#extract class-specific metrics from each fold
precision_match=[r['match']['precision'] for r in reports]
precision_not_match=[r['not match']['precision'] for r in reports]
recall_not_match=[r['not match']['recall'] for r in reports]

#average metrics across all folds
precision_match_5=round(np.mean(precision_match), 3)
precision_not_match_5=round(np.mean(precision_not_match), 3)
recall_not_match_5=round(np.mean(recall_not_match), 3)




#print results
print("Best hyperparameters:")
print(best_parameters_5)

print("\nConfusion Matrix:")
print(conf_matrix_5)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_5}")
print(f"'Not Match' Precision: {precision_not_match_5}")
print(f"'Not Match' Recall:    {recall_not_match_5}")


Best hyperparameters:
{'C': 10, 'penalty': 'l1', 'solver': 'liblinear'}

Confusion Matrix:
[[257   1   3]
 [  1  47   3]
 [ 16   4  53]]

Classification Report Summary:
'Match' Precision:     0.898
'Not Match' Precision: 0.938
'Not Match' Recall:    0.985


### Experiment 6
- **Classifier**: XGBoost Classifier
- **Hyperparameters**: tuned with GridSearchCV + custom scoring function
    - no of estimators: 50, 100, 200
    - max depth of each tree: 3, 6, 10
    - step size shrinkage: 0.01, 0.1, 0.2
    - row sampling: 0.7, 1.0
    - feature sampling: 0.7, 1.0
    - L1 regularization term: 0, 0.1, 1 
    - L2 regularization term: 1, 10 
- **Validation Strategy**: Stratified K-Fold
    - number of folds: 5

In [19]:
#initialize base model
base_model_6=XGBClassifier(random_state=30, eval_metric='mlogloss')

#define hyperparameter grid for tuning
param_grid_6 = {
    'n_estimators':[50, 100, 200],           #number of trees
    'max_depth':[3, 6, 10],                  #depth of each tree
    'learning_rate':[0.01, 0.1, 0.2],        #step size shrinkage
    'subsample':[0.7, 1.0],                  # row sampling
    'colsample_bytree':[0.7, 1.0],           # feature sampling
    'reg_alpha':[0, 0.1, 1],                 # L1 regularization
    'reg_lambda':[1, 10]                     # L2 regularization
}

#encode labels as integers for compatibility with XGBClassifier
label_mapping={
    'not match': 0,
    'preliminary match': 1,
    'match': 2
}
df_label['encoded_label'] = df_label['label'].map(label_mapping)
y=df_label['encoded_label']

#create a reverse operation to use in the future and go back to readable labels
reverse_mapping={v: k for k, v in label_mapping.items()}

#grid search with custom scoring and stratified CV 
grid_search_6=GridSearchCV(estimator=base_model_6,param_grid=param_grid_6,scoring=custom_scorer,cv=skf,n_jobs=-1)
grid_search_6.fit(X_scaled,y)

#store best performing model and parameters
model_6=grid_search_6.best_estimator_
best_parameters_6=grid_search_6.best_params_


In [20]:
#initialize base model
base_model_6=XGBClassifier(random_state=30, eval_metric='mlogloss')

#define hyperparameter grid for tuning
param_grid_6 = {
    'n_estimators':[50, 100, 200],           # number of trees
    'max_depth':[3, 6, 10],                  # depth of each tree
    'learning_rate':[0.01, 0.1, 0.2],        # step size shrinkage
    'subsample':[0.7, 1.0],                  # row sampling
    'colsample_bytree':[0.7, 1.0],           # feature sampling
    'reg_alpha':[0, 0.1, 1],                 # L1 regularization
    'reg_lambda':[1, 10]                     # L2 regularization
}

#grid search with custom scoring and stratified CV 
grid_search_6=GridSearchCV(estimator=base_model_6,param_grid=param_grid_6,scoring=custom_scorer,cv=skf,n_jobs=-1)
grid_search_6.fit(X_scaled,y)

#store best performing model and parameters
model_6=grid_search_6.best_estimator_
best_parameters_6=grid_search_6.best_params_


In [21]:
conf_matrices=[]
reports=[]

#for each fold
for train_idx, test_idx in skf.split(X, y):

    #split data into 
    X_train, X_test=X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test=y.iloc[train_idx], y.iloc[test_idx]

    #standardize features
    X_train_scaled=scaler.fit_transform(X_train)
    X_test_scaled=scaler.transform(X_test)

    #train model 
    model_6.fit(X_train_scaled, y_train)

    #predict target 
    y_pred_6=model_6.predict(X_test_scaled)

    #store confusion matrix and classification report
    conf_matrices.append(confusion_matrix(y_test, y_pred_6))
    reports.append(classification_report(y_test, y_pred_6, target_names=class_labels, output_dict=True))

#aggregate confusion matrices from all folds
conf_matrix_6=np.sum(conf_matrices, axis=0)

#extract class-specific metrics from each fold
precision_match=[r['match']['precision'] for r in reports]
precision_not_match=[r['not match']['precision'] for r in reports]
recall_not_match =[r['not match']['recall'] for r in reports]

#average metrics across folds
precision_match_6=round(np.mean(precision_match), 3)
precision_not_match_6=round(np.mean(precision_not_match), 3)
recall_not_match_6=round(np.mean(recall_not_match), 3)

#print results
print("Best hyperparameters for Experiment 6:")
print(best_parameters_6)

print("\nConfusion Matrix:")
print(conf_matrix_6)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_6}")
print(f"'Not Match' Precision: {precision_not_match_6}")
print(f"'Not Match' Recall:    {recall_not_match_6}")


Best hyperparameters for Experiment 6:
{'colsample_bytree': 0.7, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.7}

Confusion Matrix:
[[ 47   4   0]
 [  1  68   4]
 [  0  10 251]]

Classification Report Summary:
'Match' Precision:     0.985
'Not Match' Precision: 0.978
'Not Match' Recall:    0.92


# Discussion

In [22]:
#create dataframe with results from all experiments
results_comparison= [
    {
        'Experiment': '1 (Baseline)',
        'Match Precision': precision_match_1,
        'Not Match Precision': precision_not_match_1,
        'Not Match Recall': recall_not_match_1
    },
    {
        'Experiment': '2 (Random Forest)',
        'Match Precision': precision_match_2,
        'Not Match Precision': precision_not_match_2,
        'Not Match Recall': recall_not_match_2
    },
    {
        'Experiment': '3 (Tuned Random Forest)',
        'Match Precision': precision_match_3,
        'Not Match Precision': precision_not_match_3,
        'Not Match Recall': recall_not_match_3
    },
    {
        'Experiment': '4 (LogReg L1 vs L2)',
        'Match Precision': precision_match_4,
        'Not Match Precision': precision_not_match_4,
        'Not Match Recall': recall_not_match_4
    },
    {
        'Experiment': '5 (LogReg L1 refined)',
        'Match Precision': precision_match_5,
        'Not Match Precision': precision_not_match_5,
        'Not Match Recall': recall_not_match_5
    },
    {
        'Experiment': '6 (XGBoost)',
        'Match Precision': precision_match_6,
        'Not Match Precision': precision_not_match_6,
        'Not Match Recall': recall_not_match_6
    }
]

results_df=pd.DataFrame(results_comparison)

#create new column with the custom scoring function
results_df['Custom Score']=(0.4*results_df['Match Precision']+0.4 * results_df['Not Match Precision']+0.2*results_df['Not Match Recall'])

#display cleaned results
results_df=results_df.round(3)
print(results_df)

                Experiment  Match Precision  Not Match Precision  Not Match Recall  Custom Score
0             1 (Baseline)            0.786                0.943             0.962         0.884
1        2 (Random Forest)            0.848                0.981             0.969         0.925
2  3 (Tuned Random Forest)            0.859                0.977             0.973         0.929
3      4 (LogReg L1 vs L2)            0.874                0.966             0.965         0.929
4    5 (LogReg L1 refined)            0.898                0.938             0.985         0.931
5              6 (XGBoost)            0.985                0.978             0.920         0.969


- As expected, the baseline experiment without stratified k fold method resulted in overly optimistic results, with perfect precision and very high recall, therefore these results were not considered. 
- The highest precision for the 'match' class was 0.985, achieved in experiment 6.
- The highest precision for the 'not match' class was 0.98, achieved by the refined L2 Logistic Regression model.
- The highest recall for the 'not match' category was 0.94, achievd by the tuned Random Forest Model.
- The best model overall appears to be Model 3, with precision value svery close to the maximum and maximum recall.
- The highest Custom score was 0.972, achieved by Model 3 and corroborating its superiority. 

# Predicting Labels

In [23]:
#define model
chosen_model=model_6

In [24]:
#train final model with the labelled data 
chosen_model.fit(X,y)

#generate class probabilities for each class 
df_predicted_1=df_holdout.copy()
df_predicted_1[['not_match_prob','preliminary_prob','match_prob']]=chosen_model.predict_proba(X_holdout)

#predict class label 
df_predicted_1['Predicted Label 1']=chosen_model.predict(X_holdout)

#reverse mapping from experiment 6 for readability of labels
df_predicted_1['Predicted Label 1']=df_predicted_1['Predicted Label 1'].map(reverse_mapping)


In [25]:
#create new sample 
df_review_1=df_predicted_1.sample(200, random_state=30).sort_values('uk_id').reset_index(drop=True)

#print relevant columns from the sample to inspect the labels
df_review_1[['uk_id','uk_name','eu_name','final_multi_score','avg_adjusted_token_score','jaccard_similarity','overlap_count','Predicted Label 1','match_prob','not_match_prob','preliminary_prob']].head(200)

Unnamed: 0,uk_id,uk_name,eu_name,final_multi_score,avg_adjusted_token_score,jaccard_similarity,overlap_count,Predicted Label 1,match_prob,not_match_prob,preliminary_prob
0,7309,"{'Abdul', 'Haq', 'Motmaen', 'Abdulhai'}","{'Abdul', 'Haq', 'Motmaen', 'Abdulhai'}",96,94.0,1.0,3,match,0.709686,0.143808,0.146506
1,7489,"{'Thaer', 'Alouche', 'Mohamed', 'Ali', 'Isam', 'Mansour'}","{'Thaer', 'Alouche', 'Mohamed', 'Ali', 'Isam', 'Mansour'}",97,96.17,1.0,6,match,0.71058,0.143989,0.145431
2,7860,"{'Damel', 'Belkasam', 'Djamel', 'Mostafa', 'Mostefa', 'Kalad', 'Bekasam', 'Fjamel', 'Mustafa', 'Barkani', 'Ali', 'Djamal', 'Moustfa', 'Balkasam'}","{'Ly', 'Ben', 'Belkassem', 'Abou', 'Drissi', 'Noureddine', 'Ali', 'Ldrysy', 'Bn', 'Ldyn', 'Blqsm', 'Nwr', 'Faycal', 'Al'}",79,85.91,0.6,2,not match,0.195555,0.569903,0.234542
3,7878,"{'Kamal', 'Kamel', 'Ben', 'Kimo', 'Maoeldi', 'Hassan', 'Hamraoui', 'Mouldi', 'Al', 'Hamroui'}","{'Kamal', 'Kamel', 'Ben', 'Lhmrwy', 'Maoeldi', 'Kimo', 'Hassan', 'Kml', 'Lmwldy', 'Hamraoui', 'Mouldi', 'Bn', 'Hsn', 'Al', 'Hamroui'}",96,94.86,1.0,7,match,0.70938,0.143746,0.146874
4,8246,"{'Nidal', 'Rabi', 'I', 'Al'}","{'Nidal', 'Rabi', 'I', 'Al'}",93,91.0,1.0,4,match,0.710147,0.143974,0.145879
5,8253,"{'Hussein', 'Raghad', 'Saddam', 'Tikriti', 'Al'}","{'Hussein', 'Raghad', 'Saddam', 'Tikriti', 'Al'}",98,97.4,1.0,5,match,0.706473,0.144611,0.148916
6,8720,"{'Yusef', 'Yusif', 'Elsayed', 'Sebai', 'I', 'Elsebai', 'Abu', 'El', 'Tusnin', 'Sababt', 'Sayyed', 'Sayyid', 'Hani', 'Hany', 'Akram', 'Siba', 'Karim', 'Sabai', 'Youssef', 'Yousef', 'Sabaay', 'Al', 'Youseff'}","{'Yusif', 'Mhmd', 'Ibn', 'Muhammad', 'Lslm', 'Ywsf', 'Khattab', 'Abd', 'Yusuf', 'Salam', 'Thmn', 'Uthman', 'Shrf', 'Bd', 'Ashraf', 'Al'}",81,91.0,0.5,2,preliminary match,0.21739,0.384714,0.397896
7,8781,"{'Kaua', 'Ahmad', 'Farhad', 'Omar', 'Kawa', 'Hamawandi', 'Achmed', 'Kanabi', 'Ahmed'}","{'Kaua', 'Ahmad', 'Farhad', 'Omar', 'Kawa', 'Hamawandi', 'Achmed', 'Kanabi', 'Ahmed'}",98,97.5,1.0,6,match,0.710174,0.143843,0.145984
8,9065,"{'Straton', 'Io', 'Musoni'}","{'Straton', 'Io', 'Musoni'}",97,95.67,1.0,3,match,0.71058,0.143989,0.145431
9,10674,"{'Mateso', 'Bigurura', 'Deogratias', 'Izabayo', 'Jules', 'Deo', 'Bigaruka', 'Nzeyimana', 'Mlamba', 'Stanislas'}","{'Mateso', 'Bigurura', 'Deogratias', 'Jules', 'Izabayo', 'Deo', 'Bigaruka', 'Nzeyimana', 'Mlamba', 'Stanislas'}",98,98.0,1.0,9,match,0.710669,0.143943,0.145388


In [26]:
#store the uk ids of rows with incorrect labels 
change_to_match=[14408,14437,15168,15350]
change_to_not_match=[8720,13378,13879,15653,15764,15821,16547]
change_to_preliminary_match=[11805,12657,12850,13401,13719,13914]


In [27]:
#create a new column for the human labels where the incorrect ones are properly labelled
df_review_1['Human-assigned Label']=df_review_1['Predicted Label 1'].copy()

df_review_1.loc[df_review_1['uk_id'].isin(change_to_match), 'Human-assigned Label']='match'
df_review_1.loc[df_review_1['uk_id'].isin(change_to_not_match), 'Human-assigned Label']='not match'
df_review_1.loc[df_review_1['uk_id'].isin(change_to_preliminary_match), 'Human-assigned Label']='preliminary match'

In [51]:
#create classification report 
report_review_1=classification_report(
    df_review_1['Human-assigned Label'],
    df_review_1['Predicted Label 1'],
    labels=class_labels,
    output_dict=True
)

#create confusion matrix
confusion_matrix_r_1=confusion_matrix(df_review_1['Human-assigned Label'], df_review_1['Predicted Label 1'],labels=class_labels)


#store relevant metrics
precision_match_r_1=report_review_1['match']['precision']
precision_not_match_r_1=report_review_1['not match']['precision']
recall_not_match_r_1=report_review_1['not match']['recall']

#print results
print(confusion_matrix_r_1)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_r_1}")
print(f"'Not Match' Precision: {precision_not_match_r_1}")
print(f"'Not Match' Recall:    {recall_not_match_r_1}")

[[ 20   7   0]
 [  0  29   6]
 [  0   4 134]]

Classification Report Summary:
'Match' Precision:     0.9571428571428572
'Not Match' Precision: 1.0
'Not Match' Recall:    0.7407407407407407


# Retrain with new labelled data 

In [29]:
#update training data with the most recently assigned labels
new_labelled_ids=df_review_1['uk_id']
new_labels=df_review_1['Human-assigned Label'].map(label_mapping)

new_df_label=df_full[df_full['uk_id'].isin(new_labelled_ids)].copy()
new_df_label['label']=new_labels.values

df_label=pd.concat([df_label,new_df_label],ignore_index=True)


#remove most recently assigned labelled data from holdout 
total_labelled_ids=pd.concat([labelled_ids,new_labelled_ids],ignore_index=True).sort_values().reset_index(drop=True)
df_holdout=df_full[~df_full['uk_id'].isin(total_labelled_ids)].copy()


#create final training and holdout sets
new_X=new_df_label[feature_cols]  
new_y=new_df_label['label']  
new_X_holdout=df_holdout[feature_cols]

In [30]:
#retrain the same model using newly labelled data 
chosen_model.fit(new_X,new_y)

#generate class probabilities for each class 
df_predicted_2=df_holdout.copy()
df_predicted_2[['not_match_prob','preliminary_prob','match_prob']]=chosen_model.predict_proba(new_X_holdout)

#predict class label 
df_predicted_2['Predicted Label 2']=chosen_model.predict(new_X_holdout)

#reverse mapping from experiment 6 for readability of labels
df_predicted_2['Predicted Label 2']=df_predicted_2['Predicted Label 2'].map(reverse_mapping)

In [31]:
#create a new sample from the new predictions
df_review_2=df_predicted_2.sample(200, random_state=30).sort_values('uk_id').reset_index(drop=True)

#print relevant columns for inspection of predicted labels
df_review_2[['uk_id','uk_name','eu_name','final_multi_score','avg_adjusted_token_score','jaccard_similarity','overlap_count','Predicted Label 2','match_prob','not_match_prob','preliminary_prob']].head(200)

Unnamed: 0,uk_id,uk_name,eu_name,final_multi_score,avg_adjusted_token_score,jaccard_similarity,overlap_count,Predicted Label 2,match_prob,not_match_prob,preliminary_prob
0,6936,"{'Abu', 'Hidayatullah', 'Turab'}","{'Abu', 'Hidayatullah', 'Turab'}",96,94.0,1.0,3,match,0.705767,0.145738,0.148495
1,7091,"{'Mouhajer', 'Ben', 'Elsseid', 'Omar', 'Saber', 'Saleh', 'El', 'Sami', 'Khamis'}","{'Mouhajer', 'Ben', 'Elsseid', 'Omar', 'Saber', 'Saleh', 'El', 'Sami', 'Khamis'}",96,94.89,1.0,9,match,0.705574,0.145979,0.148447
2,7123,"{'Sarfida', 'Akhundzada', 'Ehsanullah', 'Hesamuddin', 'Sarfadi'}","{'Sarfida', 'Akhundzada', 'Ehsanullah', 'Hesamuddin', 'Sarfadi'}",100,100.0,1.0,4,match,0.706245,0.145836,0.147919
3,7164,"{'Mohammad', 'Hanif', 'Din'}","{'Hanif', 'Iadena', 'Din', 'Mohammad', 'Qari'}",96,94.0,1.0,3,match,0.70448,0.145753,0.149767
4,7329,"{'Karim', 'Abdullah', 'Mohammad', 'Hammad', 'Hamad', 'Al'}","{'Karim', 'Abdullah', 'Mohammad', 'Hammad', 'Hamad', 'Al'}",97,95.4,1.0,5,match,0.705565,0.145978,0.148457
5,7449,"{'Abdul', 'Ghafar', 'Shinwari'}","{'Abdul', 'Ghafar', 'Shinwari'}",99,98.33,1.0,3,match,0.704398,0.146749,0.148853
6,7527,"{'Karim', 'Ud', 'Din', 'Muhammad', 'Nooruddin', 'Qasim', 'Noor', 'Turabi'}","{'Karim', 'Ud', 'Din', 'Muhammad', 'Haji', 'Nooruddin', 'Qasim', 'Noor', 'Turabi'}",96,94.88,1.0,8,match,0.705974,0.145781,0.148245
7,7887,"{'Mishaal', 'Khalid'}","{'Mizban', 'Khadr', 'Hadi'}",50,69.09,0.6,1,not match,0.154548,0.684076,0.161376
8,8252,"{'Ibrahim', 'Barzan', 'Noor', 'Tikriti', 'Al', 'Hasan'}","{'Ibrahim', 'Barzan', 'Noor', 'Tikriti', 'Al', 'Hasan'}",97,96.17,1.0,6,match,0.70566,0.145997,0.148343
9,8352,"{'Kamel', 'Abdeljalil', 'Adel', 'Fodhil', 'Abou', 'Djermane', 'Bilal'}","{'Kamel', 'Abdeljalil', 'Adel', 'Fodhil', 'Abou', 'Djermane', 'Bilal', 'L', 'N'}",98,97.14,1.0,7,match,0.704865,0.145552,0.149583


In [32]:
#store uk ids of incorrectly labelled rows
change_to_match_2=[11039,13676,13810,14637]
change_to_not_match_2=[13888,14742,16224]
change_to_preliminary_match_2=[]


In [36]:
#create new column for human assigned labels by replacing the incorrect labels 
df_review_2['Human-assigned Label 2']=df_review_2['Predicted Label 2']

df_review_2.loc[df_review_2['uk_id'].isin(change_to_not_match_2), 'Human-assigned Label 2']='not match'
df_review_2.loc[df_review_2['uk_id'].isin(change_to_preliminary_match_2), 'Human-assigned Label 2']='preliminary match'
df_review_2.loc[df_review_2['uk_id'].isin(change_to_match_2), 'Human-assigned Label 2']='match'



In [58]:
#create classification report
report_review_2=classification_report(    
    df_review_2['Human-assigned Label 2'],
    df_review_2['Predicted Label 2'],
    labels=class_labels,
    output_dict=True
)

#create confusion matrix
confusion_matrix_r_2=confusion_matrix(df_review_2['Human-assigned Label 2'], df_review_2['Predicted Label 2'],labels=class_labels)

#store relevant metrics
precision_match_r_2=report_review_2['match']['precision']
precision_not_match_r_2=report_review_2['not match']['precision']
recall_not_match_r_2=report_review_2['not match']['recall']


#print results
print(confusion_matrix_r_2)


print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_r_2}")
print(f"'Not Match' Precision: {precision_not_match_r_2}")
print(f"'Not Match' Recall:    {recall_not_match_r_2}")

[[ 31   3   0]
 [  0  36   0]
 [  0   4 126]]

Classification Report Summary:
'Match' Precision:     1.0
'Not Match' Precision: 1.0
'Not Match' Recall:    0.9117647058823529


In [57]:
#display value counts for each precited class
df_predicted_2['Predicted Label 2'].value_counts()

Predicted Label 2
match                1991
preliminary match     641
not match             391
Name: count, dtype: int64