In [46]:
import os
import json
import pandas as pd
from bs4 import BeautifulSoup

## Designate input XML file
xml_filepath = "Datasets/IRGC_sanctions.xml"


## Convert XML soup to JSON format
def xml_to_json(element):
    """
    Recursively convert parsed XML (a BeautifulSoup object or tag) into
    plain, JSON-style Python data.

    Returns
    -------
    str
        For a plain string input (returned unchanged) and for an element
        that holds text but no child tags (whitespace-stripped).
    None
        For an element with no contents at all.
    dict
        For an element with child tags: maps each child tag name to its
        converted value. A tag name that occurs more than once maps to a
        list of converted values, in document order. An element that has
        neither child tags nor non-blank text yields an empty dict.
    """

    if isinstance(element, str):
        return element

    if not element.contents:
        return element.string

    result = {}

    for child in element.children:

        # Text nodes (whitespace between tags, etc.) are handled after the loop
        if isinstance(child, str):
            continue

        converted = xml_to_json(child)

        if child.name not in result:
            result[child.name] = converted
        elif isinstance(result[child.name], list):
            result[child.name].append(converted)
        else:
            # Second occurrence of this tag name: promote the value to a list
            result[child.name] = [result[child.name], converted]

    # An element that has child tags is always returned as a dict. The text
    # fallback below must not run in that case: `.string` on a tag whose only
    # child is another tag reports the *child's* text, which would otherwise
    # replace e.g. {"name": "foo"} with just "foo" for XML written without
    # whitespace between tags.
    if result:
        return result

    ### Directly capture text nodes without 'text' key
    text = element.string
    if text and text.strip():
        return text.strip()

    return result

In [91]:
# Order in which an individual's name parts are joined into a display name
_NAME_PART_ORDER = ("First Name", "Middle Name", "Last Name")


def _as_list(value):
    """Return `value` as a list: None -> [], a list -> itself, anything else -> [value]."""
    if value is None:
        return []
    return value if isinstance(value, list) else [value]


def _last_match_or_first(items, key, wanted):
    """Return the last item whose `key` equals `wanted`; if none match, the first item (None when empty)."""
    chosen = None
    for item in items:
        if item.get(key) == wanted:
            chosen = item
    if chosen is None and items:
        chosen = items[0]
    return chosen


def _compose_name(translation, entity_type):
    """Build a display name from one translation's name parts."""
    # A translation with a single part is a dict, with several parts a list
    parts = _as_list(translation["nameParts"]["namePart"])

    if entity_type == "Individual":
        by_type = {part["type"]: part["value"] for part in parts}
        values = [by_type.get(label) for label in _NAME_PART_ORDER]
    else:
        values = [part["value"] for part in parts]

    # Missing or empty parts are skipped so no stray spaces are produced
    return " ".join(value for value in values if value)


def find_relationships(entity):
    """
    Extract the relationships recorded for one converted entity.

    Parameters
    ----------
    entity : dict
        One entity as produced by converting the XML to nested dicts. The
        keys read are "relationships", "generalInfo" -> "entityType" and
        "names" -> "name" -> "translations" -> "translation" ->
        "nameParts" -> "namePart".

    Returns
    -------
    None
        When the entity has no usable "relationships" element (key
        missing, value None/empty, or no "relationship" entry).
    list
        * Exactly one relationship element (a dict): a flat
          ``[entity_name, relationship_type, related_entity]`` list, or
          ``[]`` when its related entity is None.
        * Several relationship elements (a list): a list of such
          triples, skipping those whose related entity is None.

    Notes
    -----
    The name comes from the name flagged ``isPrimary == "true"`` (or the
    only/first name if none is flagged) and from its translation whose
    script is "Latin" (or the only/first translation if none is Latin).
    For "Individual" entities the First/Middle/Last name parts are joined
    in that order; for any other entity type the part values are joined
    as they appear.

    Raises
    ------
    KeyError, TypeError
        If the general-info or name structure described above is missing.
    """

    ## Confirm entity includes usable relationship information
    rel_container = entity.get("relationships")
    if not isinstance(rel_container, dict) or "relationship" not in rel_container:
        return None

    ## Record entity type
    entity_type = entity["generalInfo"]["entityType"]

    ## Collect entity name
    ### A lone name is a dict; a list means aliases are present
    names = _as_list(entity["names"]["name"])
    primary_name = _last_match_or_first(names, "isPrimary", "true")

    ### Prefer the Latin translation when several are present
    translations = _as_list(primary_name["translations"]["translation"])
    translation = _last_match_or_first(translations, "script", "Latin")

    entity_name = _compose_name(translation, entity_type)

    ## Collect relationship information
    relationships = rel_container["relationship"]

    ### Single relationship -> one flat triple (shape kept for existing callers)
    if isinstance(relationships, dict):
        rel_type = relationships["type"]
        rel_entity = relationships["relatedEntity"]
        if rel_entity is None:
            return []
        return [entity_name, rel_type, rel_entity]

    ### Anything that is neither a dict nor a list carries no relationships
    if not isinstance(relationships, list):
        return []

    ### Several relationships -> list of triples
    return [
        [entity_name, rel["type"], rel["relatedEntity"]]
        for rel in relationships
        if rel["relatedEntity"] is not None
    ]

In [92]:
### Execute with main 
def main(input_file):
    """
    Build the entity-relationship table from a sanctions XML file.

    Reads `input_file`, converts it with `xml_to_json`, keeps entities
    whose type is "Individual" or "Entity", collects their relationships
    with `find_relationships`, writes the result to
    "relationship_dataset.csv" in the working directory and returns it.

    Parameters
    ----------
    input_file : str
        Path to the XML file.

    Returns
    -------
    pandas.DataFrame
        One row per relationship, with columns 'entity_1',
        'relationship' and 'entity_2'.
    """

    # Read raw bytes so the parser determines the encoding from the document
    # itself instead of relying on the platform's default text encoding.
    with open(input_file, "rb") as file:
        xml_data = file.read()

    ## Convert XML to JSON, isolate entity data
    soup = BeautifulSoup(xml_data, features='xml')

    entity_json = xml_to_json(soup)
    entity_data = entity_json['sanctionsData']["entities"]["entity"]

    # A file holding a single <entity> converts to a dict rather than a list
    if isinstance(entity_data, dict):
        entity_data = [entity_data]

    entity_data = [entity for entity in entity_data if entity["generalInfo"]["entityType"] in ["Individual", "Entity"]]
    print(f"Entities found: {len(entity_data)}")

    ## Find Relationships
    relationships = []

    for entity in entity_data:

        rel_search = find_relationships(entity)

        # None or an empty list: nothing to record for this entity
        if not rel_search:
            continue

        # find_relationships returns either one flat [entity_1, relationship,
        # entity_2] triple or a list of such triples; tell them apart by the
        # first element.
        if isinstance(rel_search[0], list):
            relationships.extend(rel_search)
        else:
            relationships.append(rel_search)

    df = pd.DataFrame(relationships, columns=['entity_1', 'relationship', 'entity_2'])

    df.to_csv("relationship_dataset.csv")

    return df

## To Do 
Next step: convert the `entity_2` names (e.g. "ATABAKI, Alireza", "ANSAR BANK") into the same "First Last" / title-case format used for `entity_1`.

In [93]:
from pprint import pprint
# Build the relationship table from the XML file; main() also writes it to relationship_dataset.csv
test_df = main(xml_filepath)

# Preview the first 15 rows
test_df.head(15)

Entities found: 249


Unnamed: 0,entity_1,relationship,entity_2
0,Ayatollah Ebrahimi,Acting for or on behalf of,ANSAR BANK
1,Iranian Atlas Company,Owned or Controlled By,ANSAR BANK
2,Ansar Bank Brokerage Company,Owned or Controlled By,ANSAR BANK
3,Ansar Information Technology Company,Owned or Controlled By,ANSAR BANK
4,Ansar Exchange,Owned or Controlled By,ANSAR BANK
5,Ansar Exchange,Providing support to,ISLAMIC REVOLUTIONARY GUARD CORPS
6,Ansar Exchange,Providing support to,ISLAMIC REVOLUTIONARY GUARD CORPS (IRGC)-QODS ...
7,Alireza Atabaki,Acting for or on behalf of,ANSAR EXCHANGE
8,Zagros Pardis Kish,Owned or Controlled By,"ATABAKI, Alireza"
9,Zagros Pardis Kish,Providing support to,MINISTRY OF DEFENSE AND ARMED FORCES LOGISTICS


In [94]:
# Distinct values of the entity_2 column
rel_set = set(test_df["entity_2"])
# NOTE: this is not the cell's last expression, so the count is not displayed
len(rel_set)
pprint(rel_set)

{'ALCHWIKI, Mhd Amer',
 'ANSAR BANK',
 'ANSAR EXCHANGE',
 'ATABAKI, Alireza',
 'BANIAS REFINERY COMPANY',
 'BASIJ RESISTANCE FORCE',
 'BONYAD TAAVON SEPAH',
 'GLOBAL VISION GROUP',
 'HAMAS',
 'HIZBALLAH',
 'IRANIAN ISLAMIC REVOLUTIONARY GUARD CORPS CYBER-ELECTRONIC COMMAND',
 'ISLAMIC REVOLUTION MOSTAZAFAN FOUNDATION',
 'ISLAMIC REVOLUTIONARY GUARD CORPS',
 'ISLAMIC REVOLUTIONARY GUARD CORPS (IRGC)-QODS FORCE',
 'ISLAMIC REVOLUTIONARY GUARD CORPS AEROSPACE FORCE SELF SUFFICIENCY JIHAD '
 'ORGANIZATION',
 'ISLAMIC REVOLUTIONARY GUARD CORPS AIR FORCE',
 'ISLAMIC REVOLUTIONARY GUARD CORPS AL-GHADIR MISSILE COMMAND',
 'ISLAMIC REVOLUTIONARY GUARD CORPS INTELLIGENCE ORGANIZATION',
 'ISLAMIC REVOLUTIONARY GUARD CORPS RESEARCH AND SELF-SUFFICIENCY JEHAD '
 'ORGANIZATION',
 'KAREEM, Aras Habib',
 "KATA'IB HIZBALLAH",
 'KHATIBI AGHADA, Ahmad',
 "KHODA'I, Mohammad Hasan",
 'KIMIA PART SIVAN COMPANY LLC',
 'MEHR EQTESAD BANK',
 'MINISTRY OF DEFENSE AND ARMED FORCES LOGISTICS',
 "NAJAFPUR, Sa'id",

## Time for what they call in the business, a pro move...

In [105]:
import re

def format_name(entity_2):
    """
    Reformat a related-entity name into "First Last" title case.

    * "LAST, First ..." is reordered to "First ... Last". Only the first
      ", " is treated as the separator, so nothing after a second comma
      is dropped.
    * Every word is capitalised. An apostrophe does not start a new word,
      so "KATA'IB" becomes "Kata'ib" (``str.title()`` would give
      "Kata'Ib").
    * Text inside parentheses is upper-cased, e.g. "(IRGC)".

    Parameters
    ----------
    entity_2 : str
        Name as it appears in the source data, e.g. "ATABAKI, Alireza".

    Returns
    -------
    str
        The reformatted name, e.g. "Alireza Atabaki".
    """

    entity_name = entity_2

    if ", " in entity_name:
        last_name, given_names = entity_name.split(", ", 1)
        entity_name = f"{given_names} {last_name}"

    # A word is a run of letters, optionally continued across apostrophes
    word_pattern = r"[^\W\d_]+(?:'[^\W\d_]+)*"

    def capitalize_word(match):
        return match.group(0).capitalize()

    entity_name = re.sub(word_pattern, capitalize_word, entity_name)

    # Function to capitalize text within parentheses
    def capitalize(match):
        return match.group(1) + match.group(2).upper() + match.group(3)

    # Regular expression to find text within parentheses
    pattern = r'(\()([^\)]+)(\))'

    # Substitute the matched text with the capitalized version
    entity_name = re.sub(pattern, capitalize, entity_name)

    return entity_name

# Overwrite the entity_2 column with the reformatted names
test_df["entity_2"] = test_df["entity_2"].apply(format_name)

In [106]:
test_df.head(30)

Unnamed: 0,entity_1,relationship,entity_2
0,Ayatollah Ebrahimi,Acting for or on behalf of,Ansar Bank
1,Iranian Atlas Company,Owned or Controlled By,Ansar Bank
2,Ansar Bank Brokerage Company,Owned or Controlled By,Ansar Bank
3,Ansar Information Technology Company,Owned or Controlled By,Ansar Bank
4,Ansar Exchange,Owned or Controlled By,Ansar Bank
5,Ansar Exchange,Providing support to,Islamic Revolutionary Guard Corps
6,Ansar Exchange,Providing support to,Islamic Revolutionary Guard Corps (IRGC)-Qods ...
7,Alireza Atabaki,Acting for or on behalf of,Ansar Exchange
8,Zagros Pardis Kish,Owned or Controlled By,Alireza Atabaki
9,Zagros Pardis Kish,Providing support to,Ministry Of Defense And Armed Forces Logistics


## Entity Navigator

In [90]:
# with open(xml_filepath, "r") as file:
#     xml_data = file.read()

# ## Convert XML to JSON, isolate entity data 
# soup = BeautifulSoup(xml_data, features='xml')

# entity_json = xml_to_json(soup)
# entity_data = entity_json['sanctionsData']["entities"]["entity"]
# entity_data = [entity for entity in entity_data if entity["generalInfo"]["entityType"] in ["Individual", "Entity"]]
# print(f"Entities found: {len(entity_data)}")

# entity_data[204]

Entities found: 249


{'generalInfo': {'identityId': '38542', 'entityType': 'Individual'},
 'sanctionsLists': {'sanctionsList': 'SDN List'},
 'sanctionsPrograms': {'sanctionsProgram': ['IFSR', 'IRGC', 'NPWMD']},
 'sanctionsTypes': {'sanctionsType': 'Block'},
 'legalAuthorities': {'legalAuthority': 'Executive Order 13382 (Non-proliferation)'},
 'names': {'name': {'isPrimary': 'true',
   'isLowQuality': 'false',
   'translations': {'translation': {'isPrimary': 'true',
     'script': 'Latin',
     'formattedFirstName': 'Agung Surya',
     'formattedLastName': 'DEWANTO',
     'formattedFullName': 'DEWANTO, Agung Surya',
     'nameParts': {'namePart': [{'type': 'First Name', 'value': 'Agung'},
       {'type': 'Last Name', 'value': 'Dewanto'},
       {'type': 'Middle Name', 'value': 'Surya'}]}}}}},
 'addresses': {'address': {'country': 'Indonesia',
   'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin'}}}},
 'features': {'feature': [{'type': 'Additional Sanctions Information -',
    'versionId