This notebook prepares the matched name pairs generated in the matching pipeline for use in a machine learning classification task. It extracts and structures relevant features into a model-ready dataset that will be used to train and evaluate classifiers in the next stage.

# 1. Imports

In [1]:
#necessary libraries
from pathlib import Path
import pandas as pd  
import numpy as np  
import warnings  
from unidecode import unidecode
import re  
import matplotlib.pyplot as plt
import seaborn as sns
import time
from rapidfuzz import process, fuzz
import random
import scipy.stats as st
import math
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
 
#commands for better output readability 
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)  
warnings.filterwarnings("ignore", category=UserWarning, module='pandas')  
pd.set_option('display.max_colwidth', None)

In [2]:
#paths
project_dir=Path.cwd().parent.parent
processed_dir=project_dir/'data'/'processed'

final_file=processed_dir/'automation_data.pkl'
labeled_file=processed_dir/'automation_sample_labeled.pkl'

df=pd.read_pickle(final_file)  
sample=pd.read_pickle(labeled_file)  

# 2. Preprocessing

In [3]:
df.head()

Unnamed: 0,UK ID,UK Sanction Programme,UK Name,UK Letters,Candidate EU IDs,Candidate Count,Name Overlap,EU Name Match,EU ID,Multi Score,Coverage Ratio,Length Adjusted Scores,Weighted Score,Raw Scores,Label
0,6894,ISIL (Da'esh) and Al-Qaida,"{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}","{A, I, M, J, R, F}","[630, 643, 1004, 3140, 4686, 5240, 5262, 5271, 5623, 6133, 6211, 6478, 6494, 6830, 6974, 7250, 113355, 115714, 117974, 123615, 125562, 126101, 127538, 129864, 130225, 133935, 134828, 135060, 136530, 136975, 138176, 145803, 146560, 147032, 150914, 159126, 162479, 162975, 165652, 166799, 167049, 167477, 171171, 172374, 172394]",45,"[Jibril, Abdurrahman, Abu, A, Rahman, Fihiruddin, Mohamad, Iqbal, Muqti]","{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}",1004,97,1.0,"[100.0, 100, 87.0, 87.0, 100.0, 100, 100.0, 95.0, 95.0]",97.0,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match
1,6895,Afghanistan,"{Qader, Abdul, Hazem, Hai}","{Q, A, H}","[20, 505, 590, 591, 595, 603, 651, 661, 706, 709, 829, 842, 2193, 5270, 5416, 6078, 6093, 6095, 6113, 6130, 6207, 6224, 6240, 6312, 6583, 6616, 6694, 6873, 6887, 6898, 7085, 7146, 7368, 7386, 7447, 7501, 7513, 7586, 105305, 106138, 107163, 109900, 110138, 113244, 115145, 119228, 124306, 126530, 126538, 126554, 127417, 128186, 141403, 141872, 144861, 145430, 145486, 145521, 145691, 146428, 148293, 149178, 149390, 150680, 151879, 151903, 152704, 153011, 154125, 165849, 166346, 170630]",72,"[Abdul, Qader, Hazem, Hai]","{Qader, Abdul, Hazem, Hai}",505,95,1.0,"[95.0, 95.0, 95.0, 87.0]",94.75,"[100.0, 100.0, 100.0, 100.0]",match
2,6897,ISIL (Da'esh) and Al-Qaida,"{Abd, Al, Am, Man, Saiyid, Agha, Manan, Abdul}","{M, A, S}","[54, 58, 76, 83, 103, 136, 143, 154, 156, 157, 176, 508, 514, 515, 516, 517, 522, 524, 526, 528, 545, 548, 553, 556, 581, 593, 595, 599, 600, 603, 604, 641, 643, 644, 656, 659, 661, 676, 696, 727, 733, 739, 758, 760, 765, 779, 781, 796, 826, 840, 931, 965, 1064, 1065, 1069, 1092, 1102, 1924, 2193, 2208, 2700, 3144, 3225, 3341, 3361, 3663, 3741, 3793, 3862, 4142, 5268, 5271, 5279, 5294, 5416, 5417, 5499, 5616, 5619, 5623, 5793, 5804, 6084, 6095, 6101, 6113, 6114, 6116, 6130, 6133, 6206, 6211, 6223, 6228, 6230, 6231, 6238, 6303, 6305, 6309, ...]",610,"[Agha, Man, Al, Am, Abdul, Saiyid]","{Abd, Bd, Ag, Lmnn, Al, Am, Man, Saiyid, Agha, Manan, Abdul}",514,94,1.0,"[95.0, 87.0, 87.0, 87.0, 95.0, 100.0]",93.88,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match
3,6899,ISIL (Da'esh) and Al-Qaida,"{Shihata, Shahata, Tarwat, Salah, Thirwat, Abdallah, Tharwat, Ali}","{T, A, S}","[13, 20, 23, 54, 58, 67, 76, 83, 87, 98, 157, 507, 515, 516, 553, 604, 727, 733, 758, 765, 779, 786, 789, 796, 826, 829, 840, 1880, 1883, 1886, 1888, 1892, 1896, 1924, 2193, 2208, 2921, 3080, 3085, 3225, 3341, 3862, 5279, 5357, 5610, 5793, 6101, 6113, 6114, 6116, 6231, 6306, 6496, 6584, 6619, 6625, 6695, 6696, 6916, 6917, 6973, 6982, 7027, 7069, 7077, 7094, 7137, 7166, 7294, 7300, 7336, 7343, 7359, 7361, 7406, 7434, 7483, 7492, 7496, 7504, 7524, 7556, 105424, 106138, 106544, 106548, 107009, 110103, 110164, 112198, 113224, 113334, 113787, 113926, 117506, 118875, 119026, 119200, 119561, 119633, ...]",339,"[Tarwat, Shahata, Abdallah, Ali, Salah]","{Shihata, Shahata, Tarwat, Salah, Thirwat, Abdallah, Tharwat, Ali}",796,97,1.0,"[100.0, 100.0, 100.0, 87.0, 95.0]",97.3,"[100.0, 100.0, 100.0, 100.0, 100.0]",match
4,6901,ISIL (Da'esh) and Al-Qaida,"{Majeed, Chaudhry, Majid, Abdul}","{M, C, A}","[83, 143, 154, 515, 517, 522, 526, 545, 548, 581, 603, 641, 643, 659, 661, 676, 696, 727, 1064, 1086, 1092, 1102, 3185, 3190, 3793, 5271, 5416, 5420, 5499, 5522, 5553, 5555, 5623, 6095, 6130, 6133, 6206, 6211, 6230, 6238, 6303, 6309, 6485, 6506, 6569, 6615, 6616, 6617, 6652, 6695, 6830, 6831, 6908, 6944, 6972, 6974, 6982, 7035, 7069, 7166, 7168, 7206, 7237, 7285, 7307, 7367, 7388, 7446, 7456, 7472, 7483, 7496, 7499, 7508, 105631, 106554, 106732, 109887, 112198, 113282, 115829, 117360, 118362, 118875, 119561, 119637, 121034, 121036, 123615, 124966, 124980, 125453, 125550, 126101, 126106, 126554, 127531, 127538, 128186, 128241, ...]",254,"[Abdul, Majeed, Chaudhry, Majid]","{Majeed, Chaudhry, Majid, Abdul}",641,98,1.0,"[95.0, 100.0, 100.0, 95.0]",98.12,"[100.0, 100.0, 100.0, 100.0]",match


In [4]:
sample.head()

Unnamed: 0,UK ID,UK Sanction Programme,UK Name,UK Letters,Candidate EU IDs,Candidate Count,Name Overlap,EU Name Match,EU ID,Multi Score,Coverage Ratio,Length Adjusted Scores,Weighted Score,Raw Scores,Label,True Label
0,6894,ISIL (Da'esh) and Al-Qaida,"{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}","{A, I, M, J, R, F}","[630, 643, 1004, 3140, 4686, 5240, 5262, 5271, 5623, 6133, 6211, 6478, 6494, 6830, 6974, 7250, 113355, 115714, 117974, 123615, 125562, 126101, 127538, 129864, 130225, 133935, 134828, 135060, 136530, 136975, 138176, 145803, 146560, 147032, 150914, 159126, 162479, 162975, 165652, 166799, 167049, 167477, 171171, 172374, 172394]",45,"[Jibril, Abdurrahman, Abu, A, Rahman, Fihiruddin, Mohamad, Iqbal, Muqti]","{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}",1004,97,1.0,"[100.0, 100, 87.0, 87.0, 100.0, 100, 100.0, 95.0, 95.0]",97.0,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match
1,6905,Afghanistan,"{Akhund, Sahib, Zakir, Bari, Haji, Mullah, Abdul}","{A, M, H, B, Z, S}","[25, 46, 54, 103, 514, 515, 528, 553, 556, 593, 595, 599, 600, 603, 604, 661, 727, 750, 927, 928, 937, 965, 966, 1016, 1065, 1069, 1085, 1102, 1894, 1896, 2193, 2208, 3084, 3144, 3225, 3341, 3361, 3663, 3808, 4142, 5294, 5295, 5416, 5417, 5529, 5616, 5619, 5625, 6044, 6047, 6049, 6052, 6053, 6054, 6073, 6074, 6078, 6079, 6084, 6085, 6090, 6116, 6130, 6213, 6223, 6238, 6309, 6484, 6494, 6607, 6608, 6616, 6695, 6898, 6947, 6982, 7065, 7069, 7074, 7166, 7199, 7223, 7231, 7443, 7472, 7474, 7475, 7477, 7483, 7484, 7496, 7597, 106138, 106144, 106385, 106389, 106534, 106538, 106584, 106654, ...]",196,"[Sahib, Bari, Akhund, Zakir, Mullah, Abdul, Haji]","{Akhund, Sahib, Zakir, Bari, Haji, Mullah, Abdul}",556,97,1.0,"[95.0, 95.0, 100.0, 95.0, 100.0, 95.0, 95.0]",97.32,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match
2,6912,Afghanistan,"{Khadem, Abdul, Rauf, Aliza}","{A, K, R}","[143, 146, 513, 659, 676, 719, 724, 786, 849, 1096, 2921, 3062, 3140, 3342, 3793, 5266, 5925, 6212, 6215, 6216, 6240, 6582, 6584, 6650, 6945, 6974, 6989, 6993, 7085, 7221, 7229, 7237, 7250, 7291, 7361, 7368, 7405, 7435, 7455, 7456, 7504, 7513, 7519, 7523, 7567, 107818, 108640, 109244, 109980, 110174, 110425, 113326, 117485, 117974, 119548, 119656, 123615, 125538, 125637, 126338, 126507, 126698, 127526, 127528, 127726, 128270, 128279, 129815, 129836, 129884, 130225, 130355, 132522, 133907, 133959, 134035, 134083, 134107, 134171, 134175, 134340, 134484, 134488, 134532, 134556, 134684, 134860, 134872, 134876, 134880, 134884, 134892, 134900, 134920, 134924, 134928, 134932, 134936, 134964, 134976, ...]",248,"[Abdul, Aliza, Khadem, Rauf]","{Khadem, Aliza, Mullah, Rauf, Abdul}",719,97,1.0,"[95.0, 95.0, 100.0, 95.0]",97.19,"[100.0, 100.0, 100.0, 100.0]",match,match
3,6932,ISIL (Da'esh) and Al-Qaida,"{Mohammed, Uthman, Takfiri, Umar, Al, Othman, Filistini, Qatada, Umr, Ismail, Mahmoud, Samman, Abu, Omar}","{A, I, O, M, S, T, Q, F, U}","[23, 58, 157, 550, 836, 3081, 3082, 3083, 3225, 3962, 5609, 6515, 7025, 7478, 7484, 7497, 115714, 117251, 139991, 152885]",20,"[Umar, Takfiri, Filistini, Othman, Qatada, Ismail, Abu, Al, Samman, Mahmoud, Mohammed]","{Mohammed, Uthman, Takfiri, Umar, Al, Othman, Filistini, Qatada, Umr, Ismail, Mahmoud, Samman, Abu, Omar}",836,98,1.0,"[95.0, 100.0, 100, 100.0, 100.0, 100.0, 87.0, 87.0, 100.0, 100.0, 100.0]",97.89,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match
4,7024,ISIL (Da'esh) and Al-Qaida,"{Belkacem, Hannachi, Al, Mohamed, Abdallah, Belgacem, Aouadi, Fathi, Ben}","{A, M, H, B, F}","[25, 54, 69, 89, 103, 139, 175, 191, 528, 544, 556, 603, 651, 653, 661, 669, 702, 706, 923, 927, 928, 1016, 1065, 1085, 1894, 2700, 3000, 3060, 3144, 3181, 4140, 5416, 5417, 5500, 5619, 5623, 5625, 6047, 6049, 6052, 6074, 6078, 6079, 6080, 6082, 6096, 6118, 6130, 6133, 6213, 6309, 6457, 6555, 6605, 6613, 6614, 6616, 6889, 6947, 6958, 7065, 7074, 7080, 7086, 7150, 7167, 7199, 7246, 7262, 7356, 7379, 7443, 7472, 7540, 7601, 7602, 105748, 106536, 106538, 106542, 106732, 107693, 107695, 108458, 109872, 109896, 110845, 110994, 111553, 111567, 117539, 117869, 120797, 121825, 122300, 125450, 126101, 126546, 126554, 126837, ...]",128,"[Ben, Fathi, Belkacem, Al, Aouadi, Abdallah, Hannachi, Mohamed]","{Belkacem, Hannachi, Al, Mohamed, Abdallah, Belgacem, Aouadi, Fathi, Ben}",927,97,1.0,"[87.0, 95.0, 100.0, 87.0, 100.0, 100.0, 100.0, 100.0]",97.09,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match


The features here are chosen to mirror the composite components of the multi_score metric rather than breaking them down further. This aligns the data directly with the existing rule-based logic, making it easier to evaluate how well the multi_score captures key signals for matching.

In [5]:
def feature_engineering(df):


    def get_count(token_set_or_list):

        if isinstance(token_set_or_list, (set, list)):
            return len(token_set_or_list)
            
        return 0

    
    def get_average(list_scores):

        if isinstance(list_scores, list) and len(list_scores) > 0:
            return round(sum(list_scores)/len(list_scores), 1)
            
        return 0



    df['uk_letters_count']=df['UK Letters'].apply(get_count)
    df['candidate_count']=df['Candidate Count']
    df['uk_name_count']=df['UK Name'].apply(get_count)
    df['overlap_name_count']=df['UK Name'].apply(get_count)
    df['eu_name_match_count']=df['EU Name Match'].apply(get_count)
    df['multi_score']=df['Multi Score']
    df['coverage_ratio']=df['Coverage Ratio'].round(2)
    df['length_adj_avg_score']=df['Length Adjusted Scores'].apply(get_average).round(2)
    df['avg_raw_score']=df['Raw Scores'].apply(get_average).round(2)
    
    columns_to_drop=['UK Sanction Programme',
                     'UK Letters',
                     'Candidate EU IDs',
                     'Candidate Count',
                     'Multi Score',
                     'Coverage Ratio',
                     'Length Adjusted Scores',
                     'Weighted Score',
                     'Raw Scores',
                     'Label']
    
    df=df.drop(columns=columns_to_drop)
    
    return df

In [6]:
df=feature_engineering(df)
sample=feature_engineering(sample)

In [7]:
sample=sample[['UK ID','UK Name','Name Overlap','EU Name Match','EU ID','uk_letters_count','candidate_count','uk_name_count','overlap_name_count','eu_name_match_count','multi_score','coverage_ratio','length_adj_avg_score','avg_raw_score','True Label']]
sample=sample.rename(columns={'True Label': 'Label'})

In [8]:
df.head()

Unnamed: 0,UK ID,UK Name,Name Overlap,EU Name Match,EU ID,uk_letters_count,candidate_count,uk_name_count,overlap_name_count,eu_name_match_count,multi_score,coverage_ratio,length_adj_avg_score,avg_raw_score
0,6894,"{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}","[Jibril, Abdurrahman, Abu, A, Rahman, Fihiruddin, Mohamad, Iqbal, Muqti]","{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}",1004,6,45,11,11,11,97,1.0,96.0,100.0
1,6895,"{Qader, Abdul, Hazem, Hai}","[Abdul, Qader, Hazem, Hai]","{Qader, Abdul, Hazem, Hai}",505,3,72,4,4,4,95,1.0,93.0,100.0
2,6897,"{Abd, Al, Am, Man, Saiyid, Agha, Manan, Abdul}","[Agha, Man, Al, Am, Abdul, Saiyid]","{Abd, Bd, Ag, Lmnn, Al, Am, Man, Saiyid, Agha, Manan, Abdul}",514,3,610,8,8,11,94,1.0,91.8,100.0
3,6899,"{Shihata, Shahata, Tarwat, Salah, Thirwat, Abdallah, Tharwat, Ali}","[Tarwat, Shahata, Abdallah, Ali, Salah]","{Shihata, Shahata, Tarwat, Salah, Thirwat, Abdallah, Tharwat, Ali}",796,3,339,8,8,8,97,1.0,96.4,100.0
4,6901,"{Majeed, Chaudhry, Majid, Abdul}","[Abdul, Majeed, Chaudhry, Majid]","{Majeed, Chaudhry, Majid, Abdul}",641,3,254,4,4,4,98,1.0,97.5,100.0


In [9]:
sample.head()

Unnamed: 0,UK ID,UK Name,Name Overlap,EU Name Match,EU ID,uk_letters_count,candidate_count,uk_name_count,overlap_name_count,eu_name_match_count,multi_score,coverage_ratio,length_adj_avg_score,avg_raw_score,Label
0,6894,"{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}","[Jibril, Abdurrahman, Abu, A, Rahman, Fihiruddin, Mohamad, Iqbal, Muqti]","{A, Fihiruddin, Fikiruddin, Jibril, Mohamad, Iqbal, Abdul, Abdurrahman, Muqti, Rahman, Abu}",1004,6,45,11,11,11,97,1.0,96.0,100.0,match
1,6905,"{Akhund, Sahib, Zakir, Bari, Haji, Mullah, Abdul}","[Sahib, Bari, Akhund, Zakir, Mullah, Abdul, Haji]","{Akhund, Sahib, Zakir, Bari, Haji, Mullah, Abdul}",556,6,196,7,7,7,97,1.0,96.4,100.0,match
2,6912,"{Khadem, Abdul, Rauf, Aliza}","[Abdul, Aliza, Khadem, Rauf]","{Khadem, Aliza, Mullah, Rauf, Abdul}",719,3,248,4,4,5,97,1.0,96.2,100.0,match
3,6932,"{Mohammed, Uthman, Takfiri, Umar, Al, Othman, Filistini, Qatada, Umr, Ismail, Mahmoud, Samman, Abu, Omar}","[Umar, Takfiri, Filistini, Othman, Qatada, Ismail, Abu, Al, Samman, Mahmoud, Mohammed]","{Mohammed, Uthman, Takfiri, Umar, Al, Othman, Filistini, Qatada, Umr, Ismail, Mahmoud, Samman, Abu, Omar}",836,9,20,14,14,14,98,1.0,97.2,100.0,match
4,7024,"{Belkacem, Hannachi, Al, Mohamed, Abdallah, Belgacem, Aouadi, Fathi, Ben}","[Ben, Fathi, Belkacem, Al, Aouadi, Abdallah, Hannachi, Mohamed]","{Belkacem, Hannachi, Al, Mohamed, Abdallah, Belgacem, Aouadi, Fathi, Ben}",927,5,128,9,9,9,97,1.0,96.1,100.0,match


# 3. Output

In [10]:
df.to_pickle(processed_dir/'features_data.pkl')
sample.to_pickle(processed_dir/'features_sample_labeled.pkl')