The goal of this notebook is to compare two name-matching methods: `Standard Name Matching` and `Grouped Name Matching`.

The notebook is structured as follows:

- Preprocessing

- Summary Statistics: Overview of labeling outcomes across the dataset.

- Labeling Validation: Evaluating how well the methods categorize name pairs.

- Disagreement Analysis: Comparing the methods’ choices for best-match.

- Conclusion

- Future Improvements

- Limitations



# 1. Imports

In [1]:
#necessary libraries
from pathlib import Path
import pandas as pd  
import numpy as np  
import warnings  
from unidecode import unidecode
import re  
import matplotlib.pyplot as plt
import seaborn as sns
import time
from rapidfuzz import process, fuzz
import random
import scipy.stats as st
import math
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
 
#commands for better output readability 
pd.set_option('display.max_columns', None)  
pd.set_option('display.max_rows', None)  
warnings.filterwarnings("ignore", category=UserWarning, module='pandas')  
pd.set_option('display.max_colwidth', None)

In [2]:
#paths
project_dir=Path.cwd().parent.parent
processed_dir=project_dir/'data'/'processed'

matched_standard_file=processed_dir/'matched_standard.pkl'
matched_grouped_file=processed_dir/'matched_grouped.pkl'

df_standard=pd.read_pickle(matched_standard_file)  
df_grouped=pd.read_pickle(matched_grouped_file)  

# 2. Preprocessing

In [3]:
df_standard.head()

Unnamed: 0,UK Sanction Programme,UK ID,UK Name,EU Name Match,Score,EU ID,String Length,Label
0,ISIL (Da'esh) and Al-Qaida,6894,Abdul Rahman Mohamad Iqbal,Mohamad Iqbal Abdul Rahman,100,1004,26,match
1,Afghanistan,6895,Abdul Hai Hazem Abdul Qader,Abdul Hai Hazem Abdul Qader,100,505,27,match
2,ISIL (Da'esh) and Al-Qaida,6897,Abdul Man Am Saiyid,Abdul Man Am Saiyid,100,514,19,match
3,ISIL (Da'esh) and Al-Qaida,6899,Tharwat Salah Shihata Ali,Tharwat Salah Shihata Ali,100,796,25,match
4,ISIL (Da'esh) and Al-Qaida,6901,Majeed Abdul Chaudhry,Majeed Chaudhry Abdul,100,641,21,match


In [4]:
df_grouped.head()

Unnamed: 0,UK ID,UK Sanction Programme,UK Name,UK Letters,Candidate EU IDs,Candidate Count,Name Overlap,EU Matched Name,EU Matched ID,Multi Score,Coverage Ratio,Length Adjusted Scores,Weighted Score,Raw Scores,Label
0,6894,ISIL (Da'esh) and Al-Qaida,"{Muqti, Abdurrahman, Mohamad, Jibril, Rahman, Fihiruddin, Fikiruddin, Abdul, A, Abu, Iqbal}","{M, I, J, A, F, R}","[630, 643, 1004, 3140, 4686, 5240, 5262, 5271, 5623, 6133, 6211, 6478, 6494, 6830, 6974, 7250, 113355, 115714, 117974, 123615, 125562, 126101, 127538, 129864, 130225, 133935, 134828, 135060, 136530, 136975, 138176, 145803, 146560, 147032, 150914, 159126, 162479, 162975, 165652, 166799, 167049, 167477, 171171, 172374, 172394]",45,"[Fikiruddin, Rahman, Mohamad, A, Abdul, Jibril, Iqbal, Muqti, Abdurrahman]","{Muqti, Abdurrahman, Mohamad, Jibril, Rahman, Fihiruddin, Fikiruddin, Abdul, A, Abu, Iqbal}",1004,98,1.0,"[100, 100.0, 100.0, 87.0, 95.0, 100.0, 95.0, 95.0, 100]",97.67,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match
1,6895,Afghanistan,"{Hai, Abdul, Hazem, Qader}","{A, H, Q}","[20, 505, 590, 591, 595, 603, 651, 661, 706, 709, 829, 842, 2193, 5270, 5416, 6078, 6093, 6095, 6113, 6130, 6207, 6224, 6240, 6312, 6583, 6616, 6694, 6873, 6887, 6898, 7085, 7146, 7368, 7386, 7447, 7501, 7513, 7586, 105305, 106138, 107163, 109900, 110138, 113244, 115145, 119228, 124306, 126530, 126538, 126554, 127417, 128186, 141403, 141872, 144861, 145430, 145486, 145521, 145691, 146428, 148293, 149178, 149390, 150680, 151879, 151903, 152704, 153011, 154125, 165849, 166346, 170630]",72,"[Hazem, Abdul, Hai, Qader]","{Hai, Abdul, Hazem, Qader}",505,95,1.0,"[95.0, 95.0, 87.0, 95.0]",94.75,"[100.0, 100.0, 100.0, 100.0]",match
2,6897,ISIL (Da'esh) and Al-Qaida,"{Am, Saiyid, Al, Man, Abd, Manan, Abdul, Agha}","{M, A, S}","[54, 58, 76, 83, 103, 136, 143, 154, 156, 157, 176, 508, 514, 515, 516, 517, 522, 524, 526, 528, 545, 548, 553, 556, 581, 593, 595, 599, 600, 603, 604, 641, 643, 644, 656, 659, 661, 676, 696, 727, 733, 739, 758, 760, 765, 779, 781, 796, 826, 840, 931, 965, 1064, 1065, 1069, 1092, 1102, 1924, 2193, 2208, 2700, 3144, 3225, 3341, 3361, 3663, 3741, 3793, 3862, 4142, 5268, 5271, 5279, 5294, 5416, 5417, 5499, 5616, 5619, 5623, 5793, 5804, 6084, 6095, 6101, 6113, 6114, 6116, 6130, 6133, 6206, 6211, 6223, 6228, 6230, 6231, 6238, 6303, 6305, 6309, ...]",610,"[Saiyid, Al, Am, Abd, Agha, Man]","{Am, Saiyid, Ag, Al, Man, Abd, Manan, Lmnn, Bd, Abdul, Agha}",514,93,1.0,"[100.0, 87.0, 87.0, 87.0, 95.0, 87.0]",92.88,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match
3,6899,ISIL (Da'esh) and Al-Qaida,"{Ali, Salah, Tharwat, Thirwat, Shihata, Abdallah, Shahata, Tarwat}","{A, T, S}","[13, 20, 23, 54, 58, 67, 76, 83, 87, 98, 157, 507, 515, 516, 553, 604, 727, 733, 758, 765, 779, 786, 789, 796, 826, 829, 840, 1880, 1883, 1886, 1888, 1892, 1896, 1924, 2193, 2208, 2921, 3080, 3085, 3225, 3341, 3862, 5279, 5357, 5610, 5793, 6101, 6113, 6114, 6116, 6231, 6306, 6496, 6584, 6619, 6625, 6695, 6696, 6916, 6917, 6973, 6982, 7027, 7069, 7077, 7094, 7137, 7166, 7294, 7300, 7336, 7343, 7359, 7361, 7406, 7434, 7483, 7492, 7496, 7504, 7524, 7556, 105424, 106138, 106544, 106548, 107009, 110103, 110164, 112198, 113224, 113334, 113787, 113926, 117506, 118875, 119026, 119200, 119561, 119633, ...]",339,"[Abdallah, Ali, Salah, Tharwat, Shahata]","{Ali, Salah, Tharwat, Thirwat, Shihata, Abdallah, Shahata, Tarwat}",796,97,1.0,"[100.0, 87.0, 95.0, 100.0, 100.0]",97.3,"[100.0, 100.0, 100.0, 100.0, 100.0]",match
4,6901,ISIL (Da'esh) and Al-Qaida,"{Abdul, Chaudhry, Majeed, Majid}","{M, C, A}","[83, 143, 154, 515, 517, 522, 526, 545, 548, 581, 603, 641, 643, 659, 661, 676, 696, 727, 1064, 1086, 1092, 1102, 3185, 3190, 3793, 5271, 5416, 5420, 5499, 5522, 5553, 5555, 5623, 6095, 6130, 6133, 6206, 6211, 6230, 6238, 6303, 6309, 6485, 6506, 6569, 6615, 6616, 6617, 6652, 6695, 6830, 6831, 6908, 6944, 6972, 6974, 6982, 7035, 7069, 7166, 7168, 7206, 7237, 7285, 7307, 7367, 7388, 7446, 7456, 7472, 7483, 7496, 7499, 7508, 105631, 106554, 106732, 109887, 112198, 113282, 115829, 117360, 118362, 118875, 119561, 119637, 121034, 121036, 123615, 124966, 124980, 125453, 125550, 126101, 126106, 126554, 127531, 127538, 128186, 128241, ...]",254,"[Abdul, Majeed, Majid, Chaudhry]","{Abdul, Chaudhry, Majeed, Majid}",641,98,1.0,"[95.0, 100.0, 95.0, 100.0]",98.12,"[100.0, 100.0, 100.0, 100.0]",match


# 3. Summary Statistics

## 3.1 Results

In [6]:
#standard matching label summary
counts_standard=df_standard['Label'].value_counts().sort_index()
proportions_standard=df_standard['Label'].value_counts(normalize=True).sort_index()
summary_standard=pd.DataFrame({'Count': counts_standard, 'Proportion': proportions_standard})

#grouped label summary
counts_grouped=df_grouped['Label'].value_counts().sort_index()
proportions_grouped=df_grouped['Label'].value_counts(normalize=True).sort_index()
summary_grouped=pd.DataFrame({'Count': counts_grouped, 'Proportion': proportions_grouped})

#print results
print("Standard Matching Label Summary:\n", summary_standard)
print("\nGrouped Matching Label Summary:\n", summary_grouped)

Standard Matching Label Summary:
                    Count  Proportion
Label                               
match               2699    0.748060
not match            271    0.075111
preliminary match    638    0.176829

Grouped Matching Label Summary:
                    Count  Proportion
Label                               
match               2390    0.662417
not match            438    0.121397
preliminary match    780    0.216186


## 3.2 Discussion

- The `standard method`is more decisive, assigning 74.8% of cases as matches and only 7.5% as non-matches, with 17.7% marked as preliminary matches. This suggests it tends to commit more often, possibly at the cost of precision in ambiguous cases.

- In contrast, the `grouped method` is more cautious, labeling only 66.2% as matches and increasing both non-matches (12.2%) and especially preliminary matches (21.6%), showing a preference for deferring decisions when confidence is lower.

# 4. Labeling Validation

## 4.1 Methodology

To assess the performance of each name-matching method in attributing labels, we manually evaluated a sample of name pairs. The goal was to determine how accurately each method classified matches, near matches, and non-matches. For each method, the UK IDs of incorrectly labeled pairs were recorded and used to generate confusion matrices and calculate precision metrics, enabling a direct comparison between the two approaches.

#### 4.1.1 *Sample*

To a accomplish this, a random sample of 250 UK IDs was selected and used for both matching methods to ensure a fair comparison. Although the ideal sample size for a ±5% margin of error at 95% confidence is 385, a smaller sample of 250 offered a practical balance between statistical rigor and manual labeling effort.

#### 4.1.2  *Evaluation Metrics*

As outlined in previous notebooks, we adopted a three-tier labeling strategy (Match, Not Match, and Preliminary Match)  designed to maximize confidence in automatic decisions while reserving uncertain cases for human review. This strategy was developed to reduce manual effort in the context of financial sanctions, where accuracy is critical and false positives carry significant consequences.

The most important evaluation metrics are:

- **Precision** in `match` and `not match`: If the system labels a pair as a match, it should truly be a match. Likewise, if it labels a pair as not match, it should be genuinely safe to ignore. These represent the most confident outcomes and must be highly reliable.

- **Recall** in `not match` (important but secondary): This metric indicates how many actual non-matches are confidently identified. Higher recall reduces the number of names falling into the Preliminary Match category, thereby lowering manual review workload. However, ensuring precision remains more critical.

We do not directly optimize for the `Preliminary Match` class and instead rely on it to catch edge cases and potential true matches that lack clear confidence.


We do not directly optimize for the `preliminary match` class. Instead, it serves as a catch-all for edge cases and uncertain true matches that fall below the confidence threshold. According to our strategy, it is perfectly acceptable for a real match to be classified as preliminary match, since these cases will still be manually reviewed and not missed.

#### 4.1.3 *Human Evaluation*

Determining whether two names correspond to the same person is a highly subjective task, complicated by different languages, language structures, spelling variations, and nuances that often require linguistic expertise to fully understand. To maintain consistency when visually evaluating matched names, the following guidelines were established:

- A pair is considered a match only if there are at least three common words; otherwise, it is classified as a preliminary match to reflect lower confidence.

- Since human evaluators may have varying familiarity with different languages, all words were weighted equally, with no distinction made between first names, middle names, or surnames. This approach was chosen to maintain consistency across unknown language structures.

- When only one word matches, the pair is usually classified as a non-match, except in cases where the matching word is a distinctive or unusually long name, in which case it may be considered a preliminary match.

## 4.2 Results

In [7]:
#create samples 
sample_size=250
sample_s=df_standard.sample(n=sample_size, random_state=5).sort_values(by='UK ID', ascending=True).reset_index(drop=True)
sample_g=df_grouped.sample(n=sample_size, random_state=5).sort_values(by='UK ID', ascending=True).reset_index(drop=True)

In [8]:
#inspect standard macthes
sample_s.head(250).sort_values(by='Score',ascending=False)

Unnamed: 0,UK Sanction Programme,UK ID,UK Name,EU Name Match,Score,EU ID,String Length,Label
0,ISIL (Da'esh) and Al-Qaida,6894,Abdul Rahman Mohamad Iqbal,Mohamad Iqbal Abdul Rahman,100.0,1004,26,match
1,Afghanistan,6905,Abdul Bari Akhund,Abdul Bari Akhund,100.0,556,17,match
2,Afghanistan,6912,Abdul Rauf Khadem,Abdul Rauf Khadem,100.0,719,17,match
3,ISIL (Da'esh) and Al-Qaida,6932,Abu Qatada Al Filistini,Abu Qatada Al Filistini,100.0,836,23,match
4,ISIL (Da'esh) and Al-Qaida,7024,Mohamed Ben Belgacem Ben Abdallah Al Aouadi,Mohamed Ben Belgacem Ben Abdallah Al Aouadi,100.0,927,43,match
5,ISIL (Da'esh) and Al-Qaida,7099,Bilal Bin Marwan,Bin Marwan Bilal,100.0,557,16,match
6,Afghanistan,7278,Matiullah,Matiullah,100.0,649,9,match
7,Afghanistan,7296,Akhtar Mohammad Maz Hari,Akhtar Mohammad Maz Hari,100.0,651,24,match
8,Afghanistan,7380,Abdul Manan Nayazi,Abdul Manan Nayazi,100.0,702,18,match
9,Afghanistan,7442,Abdul Wahed Shafiq,Abdul Wahed Shafiq,100.0,758,18,match


In [9]:
#store wrongly labeled UK IDs from the standard sample

change_to_match_s=[14535,16307,13838,14479,14918]	

change_to_not_match_s=[16072,12725,14807,15962,15440,12852,16068,15052,16014,15420,16309,15783,15651,16472,16782]		

change_to_preliminary_match_s=[7278,7558,8258,11778,11933,11934,12216,12418,12507,12469,12769,12769,13333,13413,13497,13505,16595,16542,15862,15626,16381,16367,14329]	

In [10]:
#inspect grouped macthes
sample_g[['UK ID', 'UK Name', 'Name Overlap','EU Name Match','Multi Score','Coverage Ratio','Raw Scores','Label']].sort_values(by='Multi Score', ascending=False).head(250)


Unnamed: 0,UK ID,UK Name,Name Overlap,EU Name Match,Multi Score,Coverage Ratio,Raw Scores,Label
18,8904,"{Vladimirovich, Uladzimiravich, Viktor, Sheiman, Sheyman, Ulazimiravich, Viktar, Uladimiravich}","[Viktor, Sheiman, Vladimirovich]","{Vladimirovich, Uladzimiravich, Sjejman, Uladzimiravitj, Vladimirovitj, Viktor, Sheiman, Sheyman, Viktar}",100,1.0,"[100.0, 100.0, 100.0]",match
16,8258,"{Chadian, Samira, Shahbandar}","[Chadian, Shahbandar, Samira]","{Chadian, Samira, Shahbandar}",100,1.0,"[100.0, 100.0, 100.0]",match
48,12923,"{Konstantinov, Vladimir, Andreevich, Andriyovych, Volodymyr}","[Volodymyr, Andriyovych, Andreevich, Konstantinov, Vladimir]","{Wladimir, Konstantinov, Vladimir, Andrejevitj, Andreevich, Andriyovych, Konstantynov, Andrijovytj, Volodymyr, Konstantynow, Volodomyr}",100,1.0,"[100.0, 100.0, 100.0, 100.0, 100.0]",match
56,13041,"{Degtyaryov, Vladimirovich, Degtyarev, Mikhail}","[Degtyaryov, Mikhail, Vladimirovich]","{Vladimirovich, Degtiarjov, Degtyaryov, Vladimirovitj, Degtyarev, Michail, Mikhail}",100,1.0,"[100.0, 100.0, 100.0]",match
57,13068,"{Alexey, Gromov, Alexeyevich}","[Alexeyevich, Alexey, Gromov]","{Gromov, Aleksei, Alexey, Alekseevich, Alexeyevich}",100,1.0,"[100.0, 100.0, 100.0]",match
51,12953,"{Menyailo, Ivanovich, Sergei}","[Menyailo, Ivanovich, Sergei]","{Menyailo, Ivanovitj, Iwanowitsch, Sergej, Menjajlo, Ivanovich, Menjailo, Sergei}",100,1.0,"[100.0, 100.0, 100.0]",match
72,13394,"{Savchenko, Svetlana, Borisovna}","[Savchenko, Svetlana, Borisovna]","{Savchenko, Svetlana, Borisovna, Savtjenko}",100,1.0,"[100.0, 100.0, 100.0]",match
64,13206,"{Tambov, Ignatov, Sergey, Yurevich, Kuzovliov, Kuzovlev}","[Yurevich, Tambov, Kuzovlev, Ignatov, Sergey]","{Tambov, Ignatov, Sergej, Sergey, Yurevich, Kuzovlev, Jurjevitj}",100,1.0,"[100.0, 100.0, 100.0, 100.0, 100.0]",match
59,13073,"{Malofeev, Valerevich, Konstantin}","[Konstantin, Malofeev, Valerevich]","{Malofeev, Evich, Valer, Valerevich, Konstantin}",100,1.0,"[100.0, 100.0, 100.0]",match
58,13071,"{Vadimovich, Abisov, Vadymovych, Sergey, Sergiy, Serhiy}","[Abisov, Sergiy, Vadymovych]","{Vadimovich, Sergej, Vadymovytj, Abisov, Serhij, Vadimovitj, Vadymovych, Sergey, Sergiy, Serhiy}",100,1.0,"[100.0, 100.0, 100.0]",match


In [11]:
#store wrongly labeled UK IDs from the grouped sample

change_to_match_g=[]	

change_to_not_match_g=[15526,15651,14807]		

change_to_preliminary_match_g=[]	

In [12]:
#create a column with the true labels for each sample

sample_s['True Label']=sample_s['Label'].copy()
sample_g['True Label']=sample_g['Label'].copy()

sample_s.loc[sample_s['UK ID'].isin(change_to_match_s),'True Label']='match'
sample_s.loc[sample_s['UK ID'].isin(change_to_not_match_s),'True Label']='not match'
sample_s.loc[sample_s['UK ID'].isin(change_to_preliminary_match_s),'True Label']='preliminary match'

sample_g.loc[sample_s['UK ID'].isin(change_to_match_g),'True Label']='match'
sample_g.loc[sample_s['UK ID'].isin(change_to_not_match_g),'True Label']='not match'
sample_g.loc[sample_s['UK ID'].isin(change_to_preliminary_match_g),'True Label']='preliminary match'

In [13]:
class_labels=['not match','preliminary match', 'match']


#calculate classification report
report_s=classification_report(
    sample_s['True Label'], 
    sample_s['Label'], 
    labels=class_labels, 
    output_dict=True
)

#compute confusion matrix
confusion_matrix_s=confusion_matrix(sample_s['True Label'],sample_s['Label'], labels=class_labels)

#extract relevant metrics
precision_match_s=report_s['match']['precision']
precision_not_match_s=report_s['not match']['precision']
recall_not_match_s=report_s['not match']['recall']

#print results
print("Confusion Matrix:")
print(confusion_matrix_s)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_s}")
print(f"'Not Match' Precision: {precision_not_match_s}")
print(f"'Not Match' Recall:    {recall_not_match_s}")

Confusion Matrix:
[[ 21  14   1]
 [  0  38  21]
 [  0   5 150]]

Classification Report Summary:
'Match' Precision:     0.872093023255814
'Not Match' Precision: 1.0
'Not Match' Recall:    0.5833333333333334


In [14]:
class_labels=['not match','preliminary match', 'match']


#calculate classification report
report_g=classification_report(
    sample_g['True Label'], 
    sample_g['Label'], 
    labels=class_labels, 
    output_dict=True
)

#compute confusion matrix
confusion_matrix_g=confusion_matrix(sample_g['True Label'],sample_g['Label'], labels=class_labels)

#extract relevant metrics
precision_match_g=report_g['match']['precision']
precision_not_match_g=report_g['not match']['precision']
recall_not_match_g=report_g['not match']['recall']

#print results
print("Confusion Matrix:")
print(confusion_matrix_g)

print("\nClassification Report Summary:")
print(f"'Match' Precision:     {precision_match_g}")
print(f"'Not Match' Precision: {precision_not_match_g}")
print(f"'Not Match' Recall:    {recall_not_match_g}")

Confusion Matrix:
[[ 39   3   0]
 [  0  52   0]
 [  0   0 156]]

Classification Report Summary:
'Match' Precision:     1.0
'Not Match' Precision: 1.0
'Not Match' Recall:    0.9285714285714286


## 4.3 Discussion

- The `grouped method` achieved perfect precision for both match and not match classes, and high not match recall (0.91), confirming it only commits to clear cases, consistent with its more conservative behavior observed earlier.

- The `standard method` showed slightly lower precision for match (0.87) and weak recall for not match (0.58), meaning it’s more willing to classify aggressively but at the cost of mislabeling.

This aligns with prior observations: the `standard method` is more decisive but riskier, while the `grouped method` is cautious and favors accuracy.

# 5. Disagreement Analysis 

In [15]:
standard_map=df_standard[['UK ID', 'UK Name', 'EU ID', 'EU Name Match', 'Label']].rename(
    columns={'EU ID': 'Standard Method', 'Label': 'Standard Label', 'EU Name Match': 'EU Name (Standard)','UK Name':'UK Name (Standard)'}
)

grouped_map=df_grouped[['UK ID','UK Name', 'EU ID', 'Name Overlap','EU Name Match', 'Label']].rename(
    columns={'EU ID': 'Grouped Method', 'Label': 'Grouped Label', 'EU Name Match': 'EU Name (Grouped)','UK Name':'UK Name (Grouped)'}
)

df_comparison=pd.merge(standard_map, grouped_map, on='UK ID', how='inner')

#ensure consistent types
df_comparison['Standard Method']=df_comparison['Standard Method'].astype('Int64')
df_comparison['Grouped Method']=df_comparison['Grouped Method'].astype('Int64')

def compare_ids(row):
    val1=row['Standard Method']
    val2=row['Grouped Method']
    if pd.isna(val1) or pd.isna(val2):
        return False
    return val1==val2

df_comparison['Same EU ID'] = df_comparison.apply(compare_ids, axis=1)

#filter out cases where the names were matched to the same EU ID + cases were a match was not found
df_mismatch=df_comparison[df_comparison['Same EU ID']==False]
df_mismatch=df_mismatch[~((df_mismatch['Standard Label']=='not match') & 
                            (df_mismatch['Grouped Label']=='not match'))]

#reorder columns for clarity
df_mismatch=df_mismatch[[
    'UK ID', 'UK Name (Standard)',
    'Standard Method', 'EU Name (Standard)', 'Standard Label',
    'UK Name (Grouped)','Grouped Method', 'Name Overlap','EU Name (Grouped)', 'Grouped Label'
]]

#print results
print(f"Total meaningful mismatches in EU IDs: {len(df_mismatch)}")
df_mismatch.head(100)

Total meaningful mismatches in EU IDs: 588


Unnamed: 0,UK ID,UK Name (Standard),Standard Method,EU Name (Standard),Standard Label,UK Name (Grouped),Grouped Method,Name Overlap,EU Name (Grouped),Grouped Label
37,6994,Khalid Sheikh Mohammed,191,Khalid Sheikh Mohammed,match,"{Mohammed, Sheikh, Khalid}",151409.0,"[Khalid, Sheikh, Muhammad]","{Sejk, Sheikh, Hanafi, Schejk, Khalid, Chalil, Muhammad}",match
43,7016,Abu Mohammed,4141,Abu Muhammad,preliminary match,"{Abdul, Dhawahri, Salim, Muhammed, Doctor, Nur, Mohammed, Aziz, Ahmed, Deen, Fatma, Zawahry, Eddaouahiri, Rabi, Al, Fuad, Ayman, Mohamed, Moez, Robi, Zawahari, Muaz, Abdel, Zawahiri, Ahmad, Abu, Aiman, Rabie, Qader}",3563.0,"[Abdul, Mohamed, Raji]","{Lebachir, Abdul, Faraj, Raji, Besir, Muhammed, Khalil, Zgby, Ben, El, Libico, Zoghbai, Ila, Lazrag, Fattah, Fredj, Singapore, Mohamed, Abdefattah, F, Khlyl, Mry, Zgbye, Di, Merai, Albdelfattah, Bdfth, Larzg, Il, Zoghbi, Meri, Farag}",preliminary match
105,7354,Mohammad Naim,654,Nik Mohammad,preliminary match,"{Gul, Akhund, Naeem, Mohammed, Kamran, Bareh, Naimullah, Barich, Berich, Zrae, Ghul, Barech, Khudaidad, Baraich, Baric, Spen, Naim, Mohammad, Mullah}",556.0,"[Mullah, Akhund, Bari]","{Abdul, Zakir, Akhund, Mullah, Sahib, Bari, Haji}",preliminary match
196,7860,Ali Barkani,6827,Ali Barakat,preliminary match,"{Mostafa, Fjamel, Djamel, Mustafa, Moustfa, Bekasam, Kalad, Mostefa, Barkani, Damel, Balkasam, Djamal, Belkasam, Ali}",1095.0,"[Ali, Belkassem]","{Blqsm, Nwr, Abou, Belkassem, Drissi, Ldyn, Ly, Faycal, Ldrysy, Bn, Ali, Noureddine, Ben, Al}",not match
207,7886,Usama Hamdan,152920,Usama Ramadan,preliminary match,"{Usama, Hamdan}",,,,not match
209,7888,Abu Umar,7482,Abu Umar,match,"{Sa, Mohammed, Mohamed, Marzouk, Id, Musa, Marzuk, Marzook, Mousa, Umar, Dr, Abu, Marzuq, Abou}",836.0,"[Umar, Mohammed, Abu]","{Mohammed, Ismail, Uthman, Filistini, Takfiri, Omar, Qatada, Mahmoud, Samman, Umar, Abu, Othman, Umr, Al}",preliminary match
218,8248,Ali Barzan Ibrahim Hasan Al Tikriti,1898,Ali Barzan Ibrahim Hasan Al Tikriti,match,"{Hasan, Tikriti, Ibrahim, Barzan, Ali, Al}",101.0,"[Tikriti, Al, Ibrahim, Barzan, Hassan]","{Hassan, Tikriti, Ibrahim, Barzan, Al}",match
254,8709,Jules Mutebutsi,3189,Jules Mutebutsi,match,"{Mutebuzi, Jules, Mutebusi, Mutebutsi}",168504.0,[Jules],"{Mbitshemunda, Habyarimana, Jean, Mulumba, Jules, Claude, Mbitse}",not match
262,8720,Abu Akram,5618,Abu Akram,match,"{Elsebai, Youseff, I, Sayyed, El, Hany, Sabaay, Youssef, Yusif, Siba, Akram, Al, Elsayed, Karim, Sababt, Sebai, Yousef, Tusnin, Hani, Sabai, Sayyid, Yusef, Abu}",6618.0,"[Yusef, I, Al, Abu, Haji]","{Shahlai, Abdul, Hajj, I, Reza, Hajji, Shahla, Yusuf, Yusif, Shalai, Al, Shahlaee, Haj, Shala, Abd, Abdol, Abdolreza, Yusef, Karkh, Yasir, Haji, Abu, Abdorreza}",preliminary match
300,10637,Ali Hoseynitash,5270,Ali Hoseynitash,match,"{Hosseini, Seyyed, Tash, Hoseynitash, Hoseini, Ali}",154125.0,"[Seyyed, Hosseini]","{Hosseini, Abbas, Seyyed}",preliminary match


In [16]:
standard_wins=[8248,8709,11901,11912,12176,12199,12211,12213,12729,12730,12781,13088] #12
grouped_wins=[7354,7860,7888,8720,11635,11647,11800,12221,12651,12725,12765,12872,2971,13084,13301,13369] #16
draw=[6994,7016,7886,10637,10643,10647,10834,10929,11030,11235,11644,11646,11708,11933,12013,12015,12020,12045,12053,12230,12240,12415,12481,
     12510,12650,12852,12894,12995,13086,13233,13252,13347] #32

It is worth further inspecting the cases in which the standard approach clearly picked a better match than the grouped approach for the UK ID to try to comprehend the limitations of the algorithm created to match the names. For thsi, we printed thsoe UK IDs fro mismatch and also the df_grouped for those UK IDs to see if there is some relationship between the metrcis and those errors. 

In [17]:
df_mismatch[df_mismatch['UK ID'].isin(standard_wins)]

Unnamed: 0,UK ID,UK Name (Standard),Standard Method,EU Name (Standard),Standard Label,UK Name (Grouped),Grouped Method,Name Overlap,EU Name (Grouped),Grouped Label
218,8248,Ali Barzan Ibrahim Hasan Al Tikriti,1898,Ali Barzan Ibrahim Hasan Al Tikriti,match,"{Hasan, Tikriti, Ibrahim, Barzan, Ali, Al}",101,"[Tikriti, Al, Ibrahim, Barzan, Hassan]","{Hassan, Tikriti, Ibrahim, Barzan, Al}",match
254,8709,Jules Mutebutsi,3189,Jules Mutebutsi,match,"{Mutebuzi, Jules, Mutebusi, Mutebutsi}",168504,[Jules],"{Mbitshemunda, Habyarimana, Jean, Mulumba, Jules, Claude, Mbitse}",not match
433,11901,Ali Mamlouk,6304,Ali Mamlouk,match,"{Ali, Mamlouk, Mamluk}",508,[Alfi],"{Masri, Saleh, Ahmed, Mohamed, Abdullah, Alfi, Mariam, El, Abu, Al}",not match
442,11912,Amjad Al Abbas,6309,Amjad Al Abbas,match,"{Amjad, Abbas, Al}",7359,"[Al, Abbas]","{Suleiman, Abbas, Al}",preliminary match
486,12176,Hassan Akharian,6583,Hassan Akharian,match,"{Hossein, Hassan, Akharian}",1892,"[Hassan, Hussein]","{Saddam, Hassan, Tikriti, Hussein, Ali, Al}",preliminary match
506,12199,Mohammad Bagher Bagheri,6606,Mohammad Bagher Bagheri,match,"{Mohammad, Bagher, Bagheri}",3808,"[Mohammad, Bagher]","{Zulqader, Mohammad, Zolqadr, Bagher, Zolghadr, Baqer}",preliminary match
517,12211,Ali Abdullah Ayyoub,6623,Ali Abdullah Ayyoub,match,"{Ayoub, Ayob, Ayyub, Abdullah, Ayyoub, Abdallah, Ayub, Ali}",154,[Abdallah],"{Adil, Abdallah, Mahdi}",not match
519,12213,Aous Aslan,6625,Aous Ali Aslan,preliminary match,"{Aslan, Aous}",145712,[Absalan],"{Absalan, Parviz}",not match
643,12729,Issam Hallaq,7092,Issam Hallaq,match,"{Hallak, Issam, Halaq, Barber, Hallaq, Essam}",146470,[Isa],"{Isa, Bayat}",not match
644,12730,Ezzedine Ismail,7093,Ezzedine Ismail,match,"{Ismail, Ezzedine, Hamra, Ismael}",6236,[Esmaili],"{Hossein, Esmaili, Gholam}",not match


In [18]:
df_grouped_errors=df_grouped[df_grouped['UK ID'].isin(standard_wins)]
df_grouped_errors=df_grouped_errors.sort_values('UK ID')
df_grouped_errors.head(30)

Unnamed: 0,UK ID,UK Sanction Programme,UK Name,UK Letters,Candidate EU IDs,Candidate Count,Name Overlap,EU Name Match,EU ID,Multi Score,Coverage Ratio,Length Adjusted Scores,Weighted Score,Raw Scores,Label
218,8248,Iraq,"{Hasan, Tikriti, Ibrahim, Barzan, Ali, Al}","{H, T, I, B, A}","[13, 25, 33, 39, 54, 65, 69, 87, 89, 98, 99, 101, 506, 544, 706, 829, 943, 945, 1016, 1888, 1892, 1894, 1896, 1898, 1900, 1920, 1922, 2325, 3080, 3081, 3082, 3084, 3085, 3225, 3340, 4140, 6047, 6049, 6060, 6063, 6074, 6078, 6079, 6080, 6082, 6177, 6457, 6480, 6612, 6825, 6889, 7150, 7294, 7370, 7477, 7478, 7498, 7601, 7602, 105748, 106534, 106538, 107679, 107693, 109872, 110845, 111553, 112075, 115225, 118547, 121825, 125574, 126488, 126714, 127732, 130197, 131720, 133626, 134079, 134380, 135748, 135879, 136340, 138684, 142836, 142906, 146768, 149736, 152809, 152900, 154528, 155925, 156392, 161670, 162355, 166170, 166475, 169200, 172452, 172959]",100,"[Tikriti, Al, Ibrahim, Barzan, Hassan]","{Hassan, Tikriti, Ibrahim, Barzan, Al}",101.0,97,1.0,"[100.0, 87.0, 100.0, 100.0, 90.9090909090909]",96.686364,"[100.0, 100.0, 100.0, 100.0, 90.9090909090909]",match
254,8709,Democratic Republic of the Congo,"{Mutebuzi, Jules, Mutebusi, Mutebutsi}","{M, J}","[614, 623, 630, 649, 690, 851, 1923, 4141, 5271, 6108, 6211, 6508, 6592, 6593, 6597, 6648, 6656, 6822, 6830, 6871, 7141, 7201, 7206, 7495, 7500, 105596, 106457, 107816, 108458, 109183, 109876, 110276, 115228, 115965, 117521, 117533, 123624, 124315, 124573, 124987, 125486, 127549, 131237, 134007, 134115, 134292, 134388, 134832, 135640, 138155, 138489, 138504, 138841, 140489, 145794, 146410, 148363, 149393, 149904, 149914, 153611, 155933, 156306, 159833, 161870, 164367, 164376, 167488, 168504, 172394]",70,[Jules],"{Mbitshemunda, Habyarimana, Jean, Mulumba, Jules, Claude, Mbitse}",168504.0,50,0.5,[95.0],83.75,[100.0],not match
433,11901,Syria,"{Ali, Mamlouk, Mamluk}","{M, A}","[69, 83, 89, 103, 136, 139, 143, 154, 175, 508, 515, 517, 522, 523, 524, 526, 528, 540, 544, 545, 548, 552, 581, 582, 603, 614, 623, 641, 643, 647, 649, 651, 653, 659, 661, 669, 676, 696, 702, 717, 719, 727, 851, 945, 1064, 1065, 1092, 1102, 1923, 2194, 2700, 3000, 3144, 3782, 3793, 4141, 5271, 5416, 5417, 5499, 5623, 5804, 6079, 6095, 6096, 6108, 6130, 6133, 6206, 6211, 6230, 6238, 6303, 6305, 6309, 6485, 6506, 6555, 6569, 6597, 6605, 6614, 6615, 6616, 6617, 6630, 6656, 6695, 6822, 6830, 6867, 6888, 6908, 6944, 6972, 6974, 6982, 7035, 7069, 7072, ...]",362,[Alfi],"{Masri, Saleh, Ahmed, Mohamed, Abdullah, Alfi, Mariam, El, Abu, Al}",508.0,50,0.5,[81.42857142857143],73.571429,[85.71428571428572],not match
442,11912,Syria,"{Amjad, Abbas, Al}",{A},"[154, 513, 517, 523, 526, 552, 620, 733, 765, 1037, 1064, 1092, 3782, 3788, 5270, 5279, 5280, 5754, 5793, 5794, 6101, 6207, 6208, 6209, 6212, 6215, 6216, 6224, 6229, 6230, 6231, 6234, 6400, 6485, 6569, 6582, 6583, 6615, 6617, 6627, 6650, 6694, 6894, 6944, 6972, 6973, 6988, 7027, 7029, 7146, 7221, 7225, 7229, 7291, 7299, 7300, 7359, 7386, 7431, 7435, 7447, 7461, 7499, 7508, 7510, 7565, 7582, 7583, 105305, 106554, 110161, 113244, 113326, 115829, 117360, 117498, 118917, 119190, 119656, 119660, 121034, 122062, 125453, 125474, 125819, 126263, 126338, 126646, 126806, 126812, 127417, 129843, 130075, 130246, 130362, 132627, 133882, 134031, 134135, 134159, ...]",282,"[Al, Abbas]","{Suleiman, Abbas, Al}",7359.0,85,0.666667,"[87.0, 95.0]",84.916667,"[100.0, 100.0]",preliminary match
486,12176,Iran,"{Hossein, Hassan, Akharian}","{H, A}","[13, 39, 65, 69, 89, 103, 175, 505, 521, 523, 528, 533, 542, 544, 552, 580, 590, 591, 603, 607, 651, 661, 669, 706, 829, 842, 944, 1065, 1888, 1892, 2193, 3782, 4140, 5270, 5352, 5416, 5803, 6047, 6074, 6078, 6079, 6080, 6082, 6130, 6207, 6224, 6240, 6312, 6457, 6495, 6503, 6555, 6583, 6605, 6614, 6616, 6621, 6694, 6887, 6896, 6898, 6960, 7073, 7075, 7080, 7085, 7087, 7132, 7134, 7144, 7146, 7150, 7205, 7262, 7362, 7368, 7386, 7447, 7481, 7498, 7501, 7513, 7540, 7586, 105305, 105748, 106138, 106542, 107163, 108458, 109872, 110138, 110149, 110845, 110994, 111553, 112075, 113244, 115145, 117498, ...]",160,"[Hassan, Hussein]","{Saddam, Hassan, Tikriti, Hussein, Ali, Al}",1892.0,85,0.666667,"[100.0, 85.71428571428572]",86.309524,"[100.0, 85.71428571428572]",preliminary match
506,12199,Iran,"{Mohammad, Bagher, Bagheri}","{M, B}","[557, 649, 653, 702, 941, 3000, 3144, 3784, 3808, 5274, 5417, 6046, 6056, 6066, 6068, 6079, 6089, 6096, 6108, 6118, 6213, 6235, 6309, 6587, 6597, 6604, 6606, 6607, 6656, 6889, 7076, 7084, 7086, 7167, 7223, 7356, 7379, 7472, 7511, 106732, 109896, 110737, 111553, 111564, 111567, 117869, 120797, 121030, 125486, 126526, 126546, 129423, 129766, 130157, 130928, 131559, 132001, 132231, 133454, 134364, 134372, 134608, 134632, 134728, 135536, 137079, 137139, 138091, 138155, 138479, 138717, 138878, 141844, 142653, 145109, 145407, 145684, 145794, 146403, 148363, 149904, 151897, 152854, 152976, 154271, 156306, 156545, 157608, 159913, 162452, 167488, 167824, 167917, 168387, 168472, 169137, 171250, 172367, 172423]",99,"[Mohammad, Bagher]","{Zulqader, Mohammad, Zolqadr, Bagher, Zolghadr, Baqer}",3808.0,85,1.0,"[100.0, 100.0]",100.0,"[100.0, 100.0]",preliminary match
517,12211,Syria,"{Ayoub, Ayob, Ayyub, Abdullah, Ayyoub, Abdallah, Ayub, Ali}",{A},"[154, 513, 517, 523, 526, 552, 620, 733, 765, 1037, 1064, 1092, 3782, 3788, 5270, 5279, 5280, 5754, 5793, 5794, 6101, 6207, 6208, 6209, 6212, 6215, 6216, 6224, 6229, 6230, 6231, 6234, 6400, 6485, 6569, 6582, 6583, 6615, 6617, 6627, 6650, 6694, 6894, 6944, 6972, 6973, 6988, 7027, 7029, 7146, 7221, 7225, 7229, 7291, 7299, 7300, 7359, 7386, 7431, 7435, 7447, 7461, 7499, 7508, 7510, 7565, 7582, 7583, 105305, 106554, 110161, 113244, 113326, 115829, 117360, 117498, 118917, 119190, 119656, 119660, 121034, 122062, 125453, 125474, 125819, 126263, 126338, 126646, 126806, 126812, 127417, 129843, 130075, 130246, 130362, 132627, 133882, 134031, 134135, 134159, ...]",282,[Abdallah],"{Adil, Abdallah, Mahdi}",154.0,50,0.25,[100.0],81.25,[100.0],not match
519,12213,Syria,"{Aslan, Aous}",{A},"[154, 513, 517, 523, 526, 552, 620, 733, 765, 1037, 1064, 1092, 3782, 3788, 5270, 5279, 5280, 5754, 5793, 5794, 6101, 6207, 6208, 6209, 6212, 6215, 6216, 6224, 6229, 6230, 6231, 6234, 6400, 6485, 6569, 6582, 6583, 6615, 6617, 6627, 6650, 6694, 6894, 6944, 6972, 6973, 6988, 7027, 7029, 7146, 7221, 7225, 7229, 7291, 7299, 7300, 7359, 7386, 7431, 7435, 7447, 7461, 7499, 7508, 7510, 7565, 7582, 7583, 105305, 106554, 110161, 113244, 113326, 115829, 117360, 117498, 118917, 119190, 119656, 119660, 121034, 122062, 125453, 125474, 125819, 126263, 126338, 126646, 126806, 126812, 127417, 129843, 130075, 130246, 130362, 132627, 133882, 134031, 134135, 134159, ...]",282,[Absalan],"{Absalan, Parviz}",145712.0,50,0.25,[83.33333333333334],68.75,[83.33333333333334],not match
643,12729,Syria,"{Hallak, Issam, Halaq, Barber, Hallaq, Essam}","{H, I, E, B}","[101, 1003, 1898, 1922, 6044, 6054, 6480, 6825, 6889, 7131, 7136, 7200, 7525, 110491, 118383, 119557, 126320, 130218, 134588, 142906, 146470, 146892, 149379, 159201, 159818, 161670, 162265, 162963, 164357, 165636, 167693]",31,[Isa],"{Isa, Bayat}",146470.0,4,0.166667,[],4.166667,[75.0],not match
644,12730,Syria,"{Ismail, Ezzedine, Hamra, Ismael}","{H, I, E}","[1003, 6236, 7092, 7226, 7369, 110491, 117527, 117545, 124306, 126652, 129877, 130117, 136084, 136649, 138181, 139168, 141712, 148488, 149390, 149758, 159201, 166764]",22,[Esmaili],"{Hossein, Esmaili, Gholam}",6236.0,50,0.166667,[76.92307692307692],61.858974,[76.92307692307692],not match


## 5.1 Discussion

Out of the 3,608 unique UK individuals, the standard and grouped methods matched to different EU IDs in 587 cases. This already excludes instances where both methods selected different matches but classified them as not match.

From a manual review of the first 60 disagreement cases:

- 32/60 involved different but equally reasonable matches.

- 16/60 were cases where the grouped method found a significantly better match than the standard method — as expected.

- 12/60 showed the standard method produced a clearly better match, which was surprising given its simplicity as a baseline method.

Upon further investigation, those 12 unexpected failures revealed several limitations and blind spots in the grouped algorithm, including:

1. **Missing matches in candidate pool**: In some cases, the correct EU ID selected by the standard method wasn’t even included in the grouped method’s candidate list. This likely happened because the similarity threshold in the grouped method was too strict, causing it to overlook valid matches that scored just below the cutoff.
    
    - Example (UK ID 12729): `Issam Hallaq` was correctly matched by the standard method to EU ID 7092 (`Issam Hallaq`). However, the grouped method matched it to “Isa Bayat`, a completely different person. EU ID 7092 wasn’t even part of the candidate list, indicating the threshold filtered it out.
<br>

2. **Confusion within same family**: The grouped method sometimes confused individuals from the same family or name group. This was often due to short first names being ignored or underweighted—yet those are sometimes the only features that differentiate otherwise similar names.

   - Example (UK ID 8248): `Ali Barzan Ibrahim Hasan Al Tikriti` was correctly matched by the standard method. The grouped method picked another `Al Tikriti` family member, likely because it did not give enough importance to the short but distinctive token `Ali`.
<br>

3. **Over-simplification of similar words**: At times, the grouped method treated distinct names as if they were the same, especially when spellings were similar. This led to mismatches between people whose names look alike but are not actually the same.

   - Example (UK ID 12199): `Mohammad Bagher Bagheri` was matched correctly by the standard method. The grouped method, however, selected `Mohammad Bagher` instead, likely treating `Bagher` and `Bagheri` as mere variants of the same name, rather than distinct individuals.
<br>

4. **Too much weight on common words**: The algorithm occasionally gave excessive weight to generic name elements such as “`Mohammad`, `Al`, `Abu` etc. These frequent terms added noise to the comparison and sometimes led the model to overlook more distinctive name components.



# 6. Conclusion

The grouped method demonstrates clear superiority in accurately labeling matches, prioritizing precision and minimizing false positives. This cautious approach inevitably increases the number of cases requiring human review, which is expected given the inherent complexity of name matching across diverse languages and variations.

When it comes to selecting the best match, the two methods are broadly comparable. They disagreed in about 16% of cases, but roughly half of those disagreements involved equally valid alternatives. In the remaining cases, each method outperformed the other on some occasions, reflecting different strengths and weaknesses.

Notably, the few instances where the standard method outperformed the grouped method revealed blind spots in the grouped algorithm—limitations that stem from its design and thresholding, which sometimes cause it to miss the best matches.

Overall, while the grouped method excels in label accuracy and reducing false matches by capturing nuanced name information, fundamental algorithmic improvements are essential to ensure consistent performance. Without these, the method performs well most of the time but can fail drastically in certain cases.

# 7. Future Improvements

- Explore lowering the similarity threshold used in pre-selecting EU candidates to reduce the chance of missing good matches, while keeping computation time reasonable.

- Incorporating linguistic heuristics, especially for Arabic and Russian names, which are common in sanctions datasets. For example:

    - Assigning lower weight to very common names like `Muhamed` and its variants to minimize noise.

    - Accounting for patronymic name structures in Russian names, such as `Alekseyveech`being derived from the given name `Aleksey` (meaning 'son of').


# 8. Key Limitations

1. **Subjectivity of the task:** Despite efforts to maintain consistency and minimize bias, evaluating name matches remains inherently subjective due to linguistic nuances and variations. However, in real-world sanctions screening, name matching is only one part of a broader verification process that includes multiple personal details, which helps mitigate this subjectivity.

3. **Sample Size Constraints:** The full dataset included 3,608 individuals, but manual evaluations were limited to smaller samples of 250 and 60 pairs due to resource constraints. These small sample sizes reduce the statistical robustness of the validation and disagreement analyses.

5. **Dataset Choice and Risk of Overfitting:** To test matching methods effectively, two similar but not identical sanctions lists (UK and EU) were chosen, resulting in substantial overlap of individuals. While this allowed for a meaningful matching task, it may introduce overfitting risks, limiting the generalizability of results to other datasets or contexts.

# 9. Output

Since a validated sample of the grouped matching results has already been created, we can now use it for further analysis. The final datasets were saved as follows:

In [19]:
sample_g.head()

Unnamed: 0,UK ID,UK Sanction Programme,UK Name,UK Letters,Candidate EU IDs,Candidate Count,Name Overlap,EU Name Match,EU ID,Multi Score,Coverage Ratio,Length Adjusted Scores,Weighted Score,Raw Scores,Label,True Label
0,6894,ISIL (Da'esh) and Al-Qaida,"{Fihiruddin, Abdul, Jibril, Fikiruddin, Iqbal, Rahman, Abdurrahman, Mohamad, Abu, Muqti, A}","{M, J, F, I, A, R}","[630, 643, 1004, 3140, 4686, 5240, 5262, 5271, 5623, 6133, 6211, 6478, 6494, 6830, 6974, 7250, 113355, 115714, 117974, 123615, 125562, 126101, 127538, 129864, 130225, 133935, 134828, 135060, 136530, 136975, 138176, 145803, 146560, 147032, 150914, 159126, 162479, 162975, 165652, 166799, 167049, 167477, 171171, 172374, 172394]",45,"[Fikiruddin, Rahman, Mohamad, A, Abdul, Jibril, Iqbal, Muqti, Abdurrahman]","{Fihiruddin, Abdul, Jibril, Fikiruddin, Iqbal, Rahman, Abdurrahman, Mohamad, Abu, Muqti, A}",1004.0,98,1.0,"[100, 100.0, 100.0, 87.0, 95.0, 100.0, 95.0, 95.0, 100]",97.666667,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match
1,6905,Afghanistan,"{Abdul, Zakir, Akhund, Mullah, Sahib, Bari, Haji}","{M, S, H, B, A, Z}","[25, 46, 54, 103, 514, 515, 528, 553, 556, 593, 595, 599, 600, 603, 604, 661, 727, 750, 927, 928, 937, 965, 966, 1016, 1065, 1069, 1085, 1102, 1894, 1896, 2193, 2208, 3084, 3144, 3225, 3341, 3361, 3663, 3808, 4142, 5294, 5295, 5416, 5417, 5529, 5616, 5619, 5625, 6044, 6047, 6049, 6052, 6053, 6054, 6073, 6074, 6078, 6079, 6084, 6085, 6090, 6116, 6130, 6213, 6223, 6238, 6309, 6484, 6494, 6607, 6608, 6616, 6695, 6898, 6947, 6982, 7065, 7069, 7074, 7166, 7199, 7223, 7231, 7443, 7472, 7474, 7475, 7477, 7483, 7484, 7496, 7597, 106138, 106144, 106385, 106389, 106534, 106538, 106584, 106654, ...]",196,"[Mullah, Sahib, Zakir, Bari, Abdul, Akhund, Haji]","{Abdul, Zakir, Akhund, Mullah, Sahib, Bari, Haji}",556.0,97,1.0,"[100.0, 95.0, 95.0, 95.0, 95.0, 100.0, 95.0]",97.321429,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match
2,6912,Afghanistan,"{Aliza, Rauf, Abdul, Khadem}","{K, A, R}","[143, 146, 513, 659, 676, 719, 724, 786, 849, 1096, 2921, 3062, 3140, 3342, 3793, 5266, 5925, 6212, 6215, 6216, 6240, 6582, 6584, 6650, 6945, 6974, 6989, 6993, 7085, 7221, 7229, 7237, 7250, 7291, 7361, 7368, 7405, 7435, 7455, 7456, 7504, 7513, 7519, 7523, 7567, 107818, 108640, 109244, 109980, 110174, 110425, 113326, 117485, 117974, 119548, 119656, 123615, 125538, 125637, 126338, 126507, 126698, 127526, 127528, 127726, 128270, 128279, 129815, 129836, 129884, 130225, 130355, 132522, 133907, 133959, 134035, 134083, 134107, 134171, 134175, 134340, 134484, 134488, 134532, 134556, 134684, 134860, 134872, 134876, 134880, 134884, 134892, 134900, 134920, 134924, 134928, 134932, 134936, 134964, 134976, ...]",248,"[Abdul, Rauf, Khadem, Aliza]","{Abdul, Khadem, Aliza, Mullah, Rauf}",719.0,97,1.0,"[95.0, 95.0, 100.0, 95.0]",97.1875,"[100.0, 100.0, 100.0, 100.0]",match,match
3,6932,ISIL (Da'esh) and Al-Qaida,"{Mohammed, Ismail, Al, Uthman, Filistini, Takfiri, Omar, Qatada, Mahmoud, Samman, Umar, Abu, Umr, Othman}","{M, Q, U, S, F, O, T, I, A}","[23, 58, 157, 550, 836, 3081, 3082, 3083, 3225, 3962, 5609, 6515, 7025, 7478, 7484, 7497, 115714, 117251, 139991, 152885]",20,"[Othman, Omar, Umr, Qatada, Ismail, Al, Filistini, Mahmoud, Samman, Mohammed, Takfiri, Abu]","{Mohammed, Ismail, Uthman, Filistini, Takfiri, Omar, Qatada, Mahmoud, Samman, Umar, Abu, Othman, Umr, Al}",836.0,97,1.0,"[100.0, 95.0, 87.0, 100.0, 100.0, 87.0, 100, 100.0, 100.0, 100.0, 100.0, 87.0]",97.25,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match
4,7024,ISIL (Da'esh) and Al-Qaida,"{Aouadi, Belgacem, Belkacem, Mohamed, Hannachi, Fathi, Abdallah, Ben, Al}","{M, H, F, B, A}","[25, 54, 69, 89, 103, 139, 175, 191, 528, 544, 556, 603, 651, 653, 661, 669, 702, 706, 923, 927, 928, 1016, 1065, 1085, 1894, 2700, 3000, 3060, 3144, 3181, 4140, 5416, 5417, 5500, 5619, 5623, 5625, 6047, 6049, 6052, 6074, 6078, 6079, 6080, 6082, 6096, 6118, 6130, 6133, 6213, 6309, 6457, 6555, 6605, 6613, 6614, 6616, 6889, 6947, 6958, 7065, 7074, 7080, 7086, 7150, 7167, 7199, 7246, 7262, 7356, 7379, 7443, 7472, 7540, 7601, 7602, 105748, 106536, 106538, 106542, 106732, 107693, 107695, 108458, 109872, 109896, 110845, 110994, 111553, 111567, 117539, 117869, 120797, 121825, 122300, 125450, 126101, 126546, 126554, 126837, ...]",128,"[Abdallah, Ben, Al, Mohamed, Fathi, Belkacem, Aouadi, Hannachi]","{Aouadi, Belgacem, Belkacem, Mohamed, Hannachi, Fathi, Abdallah, Ben, Al}",927.0,97,1.0,"[100.0, 87.0, 87.0, 100.0, 95.0, 100.0, 100.0, 100.0]",97.09375,"[100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0, 100.0]",match,match


In [21]:
df_grouped.to_pickle(processed_dir/'matched_grouped_final.pkl')
sample_g.to_pickle(processed_dir/'matched_grouped_labeled_sample.pkl')