The goal of this notebook is to clean and standardize the raw EU Financial Sanctions dataset, preparing it for future name‑matching experiments.

The notebook is structured as follows:

- **Preprocessing**: Selecting relevant columns and filtering the dataset

- **Language Normalization**: Handling inconsistencies across Latin‑ and non‑Latin‑based names

- **Name Cleaning**: Formatting names for consistency 

- **Outlier Analysis**: Removing the top 1% of entities with too many names to reduce noise

- **Data Aggregation**: Creating a condensed version of the cleaned dataset, with a single row per entity


# 1. Imports

In [1]:
#necessary libraries
from pathlib import Path
import pandas as pd  
import numpy as np  
import warnings  
from unidecode import unidecode
import re  
import matplotlib.pyplot as plt
import seaborn as sns


#commands for better output readability
#show every column when a DataFrame is displayed
pd.set_option('display.max_columns', None)   
#pd.set_option('display.max_rows', None)   
#silence UserWarnings whose originating module name matches 'pandas'
warnings.filterwarnings("ignore", category=UserWarning, module='pandas')  

# 2. Configuration

In [2]:
#paths: the project root is taken to be the parent of the current working directory
project_dir=Path.cwd().parent
raw_dir=project_dir/'data'/'raw'
processed_dir=project_dir/'data'/'processed'
#make sure the output folder exists before anything is written to it at the end
processed_dir.mkdir(exist_ok=True)

eu_file=raw_dir/'eu_condensed_List.csv'

#the raw export is semicolon-delimited; low_memory=False lets pandas infer each column's
#dtype from the whole file instead of chunk by chunk, which avoids mixed-type columns
#(and the DtypeWarning that comes with them) on this very wide table
df=pd.read_csv(eu_file, delimiter=';', low_memory=False)

  df=pd.read_csv(eu_file, delimiter=';')


# 3. Preprocessing

In [3]:
#preview the first rows of the raw dataset
df.head()

Unnamed: 0,fileGenerationDate,Entity_LogicalId,Entity_EU_ReferenceNumber,Entity_UnitedNationId,Entity_DesignationDate,Entity_DesignationDetails,Entity_Remark,Entity_SubjectType,Entity_SubjectType_ClassificationCode,Entity_Regulation_Type,Entity_Regulation_OrganisationType,Entity_Regulation_PublicationDate,Entity_Regulation_EntryIntoForceDate,Entity_Regulation_NumberTitle,Entity_Regulation_Programme,Entity_Regulation_PublicationUrl,NameAlias_LastName,NameAlias_FirstName,NameAlias_MiddleName,NameAlias_WholeName,NameAlias_NameLanguage,NameAlias_Gender,NameAlias_Title,NameAlias_Function,NameAlias_LogicalId,NameAlias_RegulationLanguage,NameAlias_Remark,NameAlias_Regulation_Type,NameAlias_Regulation_OrganisationType,NameAlias_Regulation_PublicationDate,NameAlias_Regulation_EntryIntoForceDate,NameAlias_Regulation_NumberTitle,NameAlias_Regulation_Programme,NameAlias_Regulation_PublicationUrl,Address_City,Address_Street,Address_PoBox,Address_ZipCode,Address_Region,Address_Place,Address_AsAtListingTime,Address_ContactInfo,Address_CountryIso2Code,Address_CountryDescription,Address_LogicalId,Address_RegulationLanguage,Address_Remark,Address_Regulation_Type,Address_Regulation_OrganisationType,Address_Regulation_PublicationDate,Address_Regulation_EntryIntoForceDate,Address_Regulation_NumberTitle,Address_Regulation_Programme,Address_Regulation_PublicationUrl,BirthDate_BirthDate,BirthDate_Day,BirthDate_Month,BirthDate_Year,BirthDate_YearRangeFrom,BirthDate_YearRangeTo,BirthDate_Circa,BirthDate_CalendarType,BirthDate_ZipCode,BirthDate_Region,BirthDate_Place,BirthDate_City,BirthDate_CountryIso2Code,BirthDate_CountryDescription,BirthDate_LogicalId,BirthDate_RegulationLanguage,BirthDate_Remark,BirthDate_Regulation_Type,BirthDate_Regulation_OrganisationType,BirthDate_Regulation_PublicationDate,BirthDate_Regulation_EntryIntoForceDate,BirthDate_Regulation_NumberTitle,BirthDate_Regulation_Programme,BirthDate_Regulation_PublicationUrl,Identification_Number,Identification_Diplomatic,Identifi
cation_KnownExpired,Identification_KnownFalse,Identification_ReportedLost,Identification_RevokedByIssuer,Identification_IssuedBy,Identification_IssuedDate,Identification_ValidFrom,Identification_ValidTo,Identification_LatinNumber,Identification_NameOnDocument,Identification_TypeCode,Identification_TypeDescription,Identification_Region,Identification_CountryIso2Code,Identification_CountryDescription,Identification_LogicalId,Identification_RegulationLanguage,Identification_Remark,Identification_Regulation_Type,Identification_Regulation_OrganisationType,Identification_Regulation_PublicationDate,Identification_Regulation_EntryIntoForceDate,Identification_Regulation_NumberTitle,Identification_Regulation_Programme,Identification_Regulation_PublicationUrl,Citizenship_Region,Citizenship_CountryIso2Code,Citizenship_CountryDescription,Citizenship_LogicalId,Citizenship_RegulationLanguage,Citizenship_Remark,Citizenship_Regulation_Type,Citizenship_Regulation_OrganisationType,Citizenship_Regulation_PublicationDate,Citizenship_Regulation_EntryIntoForceDate,Citizenship_Regulation_NumberTitle,Citizenship_Regulation_Programme,Citizenship_Regulation_PublicationUrl
0,10/03/2025,13,EU.27.28,,,,(UNSC RESOLUTION 1483),P,person,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,Hussein Al-Tikriti,Saddam,,Saddam Hussein Al-Tikriti,,M,,,17.0,EN,,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,10/03/2025,13,EU.27.28,,,,(UNSC RESOLUTION 1483),P,person,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,Abu Ali,,,,,19.0,EN,,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,10/03/2025,13,EU.27.28,,,,(UNSC RESOLUTION 1483),P,person,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,Abou Ali,FR,,,,380.0,EN,,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,10/03/2025,13,EU.27.28,,,,(UNSC RESOLUTION 1483),P,person,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1937-04-28,28.0,4.0,1937.0,,,NO,GREGORIAN,,,,"al-Awja, near Tikrit",IQ,IRAQ,14.0,EN,,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,10/03/2025,13,EU.27.28,,,,(UNSC RESOLUTION 1483),P,person,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,IQ,,1.0,EN,,regulation,commission,2003-07-08,2003-07-07,1210/2003 (OJ L169),IRQ,http://eur-lex.europa.eu/LexUriServ/LexUriServ...


The raw dataset contains extensive information about each entity, including names, addresses, birthplaces, citizenship, and other details. Since this project focuses specifically on exploring name matching, the scope was narrowed down to the name fields and relevant sanction-related information.

In [4]:
#raw column -> concise column name; the keys also define which columns are kept
column_names={
    'Entity_LogicalId':'Id',
    'Entity_EU_ReferenceNumber':'EU Reference Number',
    'Entity_SubjectType_ClassificationCode':'Entity Type',
    'Entity_Regulation_EntryIntoForceDate':'Entry Into Force Date',
    'Entity_Regulation_NumberTitle':'Regulation Identifier',
    'Entity_Regulation_Programme':'Sanction Programme',
    'NameAlias_WholeName':'Name',
    'NameAlias_NameLanguage':'Name Language',
}

#extract the relevant columns from the original dataframe and rename them
df_names=df[list(column_names)]
df_names.columns=list(column_names.values())

#the raw file also holds rows that only describe birth dates, citizenships, ...;
#once the other columns are gone those rows carry no name, so they are dropped
df_names=(
    df_names
    .replace('', np.nan)
    .dropna(subset=['Name'])
    .reset_index(drop=True)
)

In [5]:
#preview the name table after column selection and filtering
df_names.head()

Unnamed: 0,Id,EU Reference Number,Entity Type,Entry Into Force Date,Regulation Identifier,Sanction Programme,Name,Name Language
0,13,EU.27.28,person,2003-07-07,1210/2003 (OJ L169),IRQ,Saddam Hussein Al-Tikriti,
1,13,EU.27.28,person,2003-07-07,1210/2003 (OJ L169),IRQ,Abu Ali,
2,13,EU.27.28,person,2003-07-07,1210/2003 (OJ L169),IRQ,Abou Ali,FR
3,20,EU.39.56,person,2003-07-07,1210/2003 (OJ L169),IRQ,Qusay Saddam Hussein Al-Tikriti,
4,20,EU.39.56,person,2003-07-07,1210/2003 (OJ L169),IRQ,Qoussaï Saddam Hussein Al-Tikriti,FR


# 4. Language Normalization

In [6]:
#list the distinct language codes attached to the names (NaN = no language given)
df_names['Name Language'].unique()

array([nan, 'FR', 'ES', 'FI', 'DE', 'EN', 'SV', 'NL', 'MT', 'HU', 'LT',
       'IT', 'HR', 'GA', 'EL', 'ET', 'CS', 'BG', 'SL', 'SK', 'RO', 'PT',
       'PL', 'DA', 'PA', 'TR', 'KU', 'LV', 'AR', 'RU', 'BE', 'FA', 'KO',
       'UK', 'ZH', 'PS', 'UZ', 'SR', 'MO', 'GL', 'HE', 'GD'], dtype=object)

By reviewing the names listed for each language, it was possible to categorize them into two groups: Latin-based and Non-Latin-based languages.

- Latin-based languages: These languages use the Latin alphabet, but may include diacritical marks.
   - **Danish (DA):** æ,å,ø
   - **Polish (PL):** ł,ż,ę
   - **French (FR):** ï,é,œ
- Non-Latin-based languages: These languages have their own unique alphabets.
   - **Arabic (AR):** عبد المنان آغا
   - **Russian (RU):** Виктор Владимирович
   - **Hebrew (HE):** מאיר אטינגר

### 4.1 **Non-Latin Languages**

For simplicity, all rows with names in non‑Latin scripts were removed from the dataset. This is because these names cannot be reliably transliterated into English with a simple rule‑based approach, and matching them accurately would require advanced linguistic expertise. The excluded languages are:
- Greek (EL)
- Bulgarian (BG) 
- Russian (RU)
- Farsi (FA)
- Korean (KO)
- Ukrainian (UK)
- Chinese (ZH)
- Pashto (PS)
- Serbian (SR)
- Hebrew (HE)

In [7]:
#correcting a data error: this name is tagged as Farsi in the raw file;
#clearing the tag keeps the row out of the non-Latin exclusion list
is_mislabelled=df_names['Name']=='Bahram Hosseini MOTLAGH'
df_names.loc[is_mislabelled,'Name Language']=np.nan

In [8]:
#language codes treated as non-Latin scripts
#NOTE(review): other codes present in the data (e.g. 'AR', 'BE') are not listed here --
#confirm that the names carrying them are written in Latin script
non_latin_list=['EL','BG','RU','FA','KO','UK','ZH','PS','SR','HE']

#drop the rows tagged with one of these languages; rows without a language tag are kept
is_non_latin=df_names['Name Language'].isin(non_latin_list)
df_names=df_names[~is_non_latin]
df_names=df_names.reset_index(drop=True)


### 4.2 **Latin-based Languages**

For the remaining languages, diacritics and special characters were handled using `unidecode`. This library intelligently maps non-ASCII Latin characters to their closest ASCII equivalents (e.g., ø → o, æ → ae, ç → c). More details about its limitations can be found at [Unidecode on PyPI](https://pypi.org/project/Unidecode/).


In [9]:
def normalize(text):
    """
    Transliterates text to plain ASCII by passing it through `unidecode`.

    Args:
        text (str): The input text to be normalized.

    Returns:
        str: The value returned by `unidecode(text)`.

    Example:
        >>> normalize('ołá')
        'ola'
    """
    ascii_text=unidecode(text)
    return ascii_text

#transliterate every remaining name with normalize()
df_names['Name']=df_names['Name'].apply(normalize)

In [10]:
#the language tag was only needed for the script filter above; remove the column
df_names=df_names.drop('Name Language', axis='columns')

# 5. Name Cleaning

After inspecting the names in the dataset, we noticed recurring inconsistencies in their formatting. To standardize them, we created two cleaning functions: `clean_name_person` and `clean_name_enterprise`. Both functions apply the same text normalization and cleaning rules, differing only in their handling of numerical characters (relevant for enterprises).

In [11]:
def clean_name_person(text):
    """
    Standardizes the name of an individual.

    Steps:
    - Apostrophes, hyphens and backslashes are turned into spaces
    - Every character other than an ASCII letter or whitespace is deleted
    - The name is split on whitespace, each word is capitalized
      (first letter upper case, the rest lower case) and the words are
      joined back with single spaces

    Args:
        text (str): The raw name.

    Returns:
        str: The cleaned, capitalized name (empty string if nothing is left).
    """
    #separators become word boundaries instead of gluing two words together
    spaced=re.sub(r"[\'\-\\]",' ',text)
    #digits and any remaining punctuation are dropped without leaving a gap
    letters_only=re.sub(r"[^a-zA-Z\s]",'',spaced)

    return ' '.join(token.capitalize() for token in letters_only.split())

#clean the names of the 'person' rows only; rows of other entity types are not touched here
is_person=df_names['Entity Type']=='person'
df_names.loc[is_person,'Name']=df_names.loc[is_person,'Name'].apply(clean_name_person)

In [12]:
def clean_name_enterprise(text):
    """
    Standardizes the name of an enterprise.

    Steps:
    - Apostrophes, hyphens and backslashes are turned into spaces
    - Every character other than an ASCII letter, a digit or whitespace is deleted
    - The name is split on whitespace, each word is capitalized
      (first letter upper case, the rest lower case) and the words are
      joined back with single spaces

    Args:
        text (str): The raw enterprise name.

    Returns:
        str: The cleaned, capitalized name (empty string if nothing is left).
    """
    #separators become word boundaries instead of gluing two words together
    spaced=re.sub(r"[\'\-\\]",' ',text)
    #unlike for persons, digits are kept; remaining punctuation is dropped
    alphanumeric=re.sub(r"[^a-zA-Z0-9\s]",'',spaced)

    return ' '.join(token.capitalize() for token in alphanumeric.split())

#clean the names of the 'enterprise' rows only; rows of other entity types are not touched here
is_enterprise=df_names['Entity Type']=='enterprise'
df_names.loc[is_enterprise,'Name']=df_names.loc[is_enterprise,'Name'].apply(clean_name_enterprise)

In [13]:
#cleaning can turn different raw spellings into the same string, and the language
#column is gone, so fully identical rows may now exist: keep one copy of each
df_names=df_names.drop_duplicates(ignore_index=True)

In [14]:
#preview the names after transliteration and cleaning
df_names.head()

Unnamed: 0,Id,EU Reference Number,Entity Type,Entry Into Force Date,Regulation Identifier,Sanction Programme,Name
0,13,EU.27.28,person,2003-07-07,1210/2003 (OJ L169),IRQ,Saddam Hussein Al Tikriti
1,13,EU.27.28,person,2003-07-07,1210/2003 (OJ L169),IRQ,Abu Ali
2,13,EU.27.28,person,2003-07-07,1210/2003 (OJ L169),IRQ,Abou Ali
3,20,EU.39.56,person,2003-07-07,1210/2003 (OJ L169),IRQ,Qusay Saddam Hussein Al Tikriti
4,20,EU.39.56,person,2003-07-07,1210/2003 (OJ L169),IRQ,Qoussai Saddam Hussein Al Tikriti


# 6. Outlier Analysis

Names associated with the same entity can vary significantly, from a handful of entries to 100+. To prevent disproportionately long or ambiguous records from affecting the quality of name matching, we identified and removed outliers:

- We first grouped names by entity (Id) and counted how many names each entity had.

- We observed a highly skewed distribution, with certain entities having an unusually high number of names.

- To formalize the cutoff, we computed the 99th percentile of counts, separately for people and enterprises.

- All entities with counts above these thresholds were treated as outliers and removed from the dataset.





In [15]:
#number of name rows per Id, computed separately for each entity type
#and listed from the highest to the lowest count

person_counts=(
    df_names[df_names['Entity Type']=='person']
    .groupby('Id')
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count',ascending=False)
)
enterprise_counts=(
    df_names[df_names['Entity Type']=='enterprise']
    .groupby('Id')
    .size()
    .reset_index(name='Count')
    .sort_values(by='Count',ascending=False)
)

print(person_counts)
print(enterprise_counts)

          Id  Count
706     7133    145
589     6826     44
1365  122257     34
842     7387     28
1367  122316     26
...      ...    ...
3196  150654      1
3133  149589      1
3195  150649      1
3194  150643      1
4074  173191      1

[4078 rows x 2 columns]
         Id  Count
469  120128    105
330    7245     95
0       201     92
359    7428     83
504  128875     77
..      ...    ...
240    6557      1
238    6523      1
237    6521      1
542  133473      1
541  133470      1

[1197 rows x 2 columns]


In [16]:
#99th percentile of the per-entity name counts, used as the outlier cutoff

x_p=person_counts['Count']
x_e=enterprise_counts['Count']

#keep both thresholds unrounded: they are compared against integer counts with '>',
#and rounding first could move the cutoff across a count (e.g. 49.96 -> 50.0 would
#stop flagging an entity with exactly 50 names)
upper_p_new=np.percentile(x_p, 99)
upper_e_new=np.percentile(x_e, 99)

#round for display only
print('Upper limit for People:',np.round(upper_p_new, 1))
print('Upper limit for Enterprises:',np.round(upper_e_new, 1))

Upper limit for People: 14.0
Upper limit for Enterprises: 49.1


In [17]:
#remove outliers from dataset

#entities whose name count is above the cutoff of their own entity type
person_above_limit=person_counts['Count']>upper_p_new
enterprise_above_limit=enterprise_counts['Count']>upper_e_new
outliers_person=person_counts[person_above_limit].reset_index(drop=True)
outliers_enterprise=enterprise_counts[enterprise_above_limit].reset_index(drop=True)

#collect the flagged Ids once each, then drop every row belonging to one of them
outliers_Id=pd.concat([outliers_person['Id'], outliers_enterprise['Id']], ignore_index=True).drop_duplicates()
is_outlier=df_names['Id'].isin(outliers_Id)
df_names=df_names[~is_outlier].reset_index(drop=True)

# 7. Data Aggregation

For future experiments, we created a condensed version of the dataset, `df_grouped`, by aggregating the unique words of all names associated with a single ID into a single cell. This approach will potentially allow for more efficient processing and matching.

In [18]:
#start from a copy of the one-row-per-name table; df_names itself is exported separately
df_grouped=df_names.copy()

def aggregate_unique_words(grouped_names):
    """
    Collects the unique words used across all names of one entity.

    Each name is split on whitespace and every word is kept once, in order of
    first appearance. Keeping that order makes the result identical on every
    run; joining a plain `set` of strings would yield an arbitrary order that
    can change between Python sessions.

    Args:
        grouped_names (iterable of str): Names associated with a single entity,
            e.g. a list or the pandas Series handed over by `groupby(...).agg`.

    Returns:
        str: The unique words joined by ', ' (empty string when there are no names).

    Example:
        >>> aggregate_unique_words(['Abu Ali', 'Abou Ali'])
        'Abu, Ali, Abou'
    """

    #dict keys ignore repeats and keep insertion order, i.e. they act as an ordered set
    words={}

    for name in grouped_names:

        #example: 'Abu Ali' -> ['Abu', 'Ali']
        words.update(dict.fromkeys(name.split()))

    return ', '.join(words)

            
#one row per Id: the descriptive columns take the group's first value,
#the names are merged by aggregate_unique_words
first_value_columns=[
    'EU Reference Number',
    'Entity Type',
    'Entry Into Force Date',
    'Regulation Identifier',
    'Sanction Programme',
]
aggregations={column:'first' for column in first_value_columns}
aggregations['Name']=aggregate_unique_words

df_grouped=df_grouped.groupby('Id',as_index=False).agg(aggregations)

# 8. Output

In [19]:
#summary of the cleaned table with one row per name
df_names.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21324 entries, 0 to 21323
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Id                     21324 non-null  int64 
 1   EU Reference Number    21324 non-null  object
 2   Entity Type            21324 non-null  object
 3   Entry Into Force Date  21324 non-null  object
 4   Regulation Identifier  21324 non-null  object
 5   Sanction Programme     21324 non-null  object
 6   Name                   21324 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.1+ MB


In [20]:
#summary of the condensed table with one row per Id
df_grouped.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5227 entries, 0 to 5226
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Id                     5227 non-null   int64 
 1   EU Reference Number    5227 non-null   object
 2   Entity Type            5227 non-null   object
 3   Entry Into Force Date  5227 non-null   object
 4   Regulation Identifier  5227 non-null   object
 5   Sanction Programme     5227 non-null   object
 6   Name                   5227 non-null   object
dtypes: int64(1), object(6)
memory usage: 286.0+ KB


In [21]:
#write both tables to the processed folder, without the index column
exports={
    'cleaned_eu_sanctions.csv':df_names,
    'cleaned_eu_sanctions_grouped.csv':df_grouped,
}
for file_name,frame in exports.items():
    frame.to_csv(processed_dir/file_name, index=False)