In [4]:
import os
import json
from bs4 import BeautifulSoup

## Designate input XML file
## (relative path, so it is resolved against the notebook's working directory)
xml_filepath = "Datasets/IRGC_sanctions.xml"


## Convert XML soup to JSON format
def xml_to_json(element):
    """
    Recursively convert a parsed XML node into plain Python data.

    Despite the name, the result is a nested structure of dicts, lists and
    strings (JSON-like), not a serialized JSON string.

    Args:
        element: A plain string, or a parsed node exposing ``contents``,
            ``children``, ``string`` and (for tags) ``name``. In this notebook
            that is a BeautifulSoup document or tag.

    Returns:
        - the string itself when ``element`` is a string;
        - ``element.string`` (possibly None) when the node has no contents;
        - a dict keyed by child tag name when the node has child tags, with
          repeated tag names collected into a list in document order;
        - the stripped text when the node has no child tags but has
          non-blank text;
        - an empty dict otherwise (for example whitespace-only content).

    Note:
        Tag attributes are never read, so they do not appear in the result.
    """
    if isinstance(element, str):
        return element

    if not element.contents:
        return element.string

    result = {}

    for child in element.children:

        # Bare strings sitting between tags are not structure; skip them.
        if isinstance(child, str):
            continue

        converted = xml_to_json(child)

        if child.name not in result:
            result[child.name] = converted
        elif isinstance(result[child.name], list):
            result[child.name].append(converted)
        else:
            # Second occurrence of this tag name: promote the value to a list.
            result[child.name] = [result[child.name], converted]

    # A node that has child tags is always returned as a mapping. This check
    # must come before the text check below: ``.string`` on a node whose only
    # child is another tag reports that child's text, which would otherwise
    # replace the mapping with a bare string.
    if result:
        return result

    ### Directly capture text nodes without 'text' key
    text = element.string
    if text and text.strip():
        return text.strip()

    return result

In [93]:
def _latin_translation(translations):
    """Pick the translation a name should be read from, or None if there is none."""
    # A lone translation arrives as a dict and is used whatever its script.
    if isinstance(translations, dict):
        return translations

    chosen = None
    if isinstance(translations, list):
        for candidate in translations:
            # No break on purpose: if several entries are Latin, the last wins.
            if isinstance(candidate, dict) and candidate.get("script") == "Latin":
                chosen = candidate
    return chosen


def _name_from_parts(translation, entity_type):
    """Build a display name from a translation's nameParts; None for other entity types."""
    parts = translation["nameParts"]["namePart"]

    # A single name part arrives as a dict rather than a one-item list.
    if isinstance(parts, dict):
        parts = [parts]

    if entity_type == "Entity":
        if len(parts) == 1:
            return parts[0]["value"]
        return " ".join(part["value"] for part in parts if part["value"])

    if entity_type == "Individual":
        by_type = {part["type"]: part["value"] for part in parts}
        # Only these three part types are used, always in this order.
        ordered = [
            by_type[key]
            for key in ("First Name", "Middle Name", "Last Name")
            if key in by_type
        ]
        return " ".join(ordered).strip()

    return None


def find_relationships(entity):
    """
    Extract the relationship rows declared on one entity record.

    Args:
        entity (dict): One entity record as produced by ``xml_to_json``. The
            keys read are ``relationships``, ``generalInfo`` -> ``entityType``
            and ``names`` -> ``name``.

    Returns:
        list | None: One ``[entity_name, relationship_type, related_entity]``
        row per relationship. ``None`` (after printing a notice) when the
        record has no ``relationships`` key or the element is empty.
        ``entity_name`` is ``None`` when no name could be resolved.

    Raises:
        KeyError: If a key that is read unconditionally (for example
            ``generalInfo`` or ``names``) is missing from the record.
    """

    ## Confirm entity includes relationship information
    if "relationships" not in entity:
        print("no relationships found")
        return None

    ## Skip if relationships element is empty
    if not entity["relationships"]:
        print("empty relationships element")
        return None

    ## Record Entity Type
    entity_type = entity["generalInfo"]["entityType"]

    ## Collect entity name
    name_ele = entity["names"]["name"]

    # Stays None when no name can be resolved, so the relationship rows are
    # still produced instead of failing on an unassigned variable.
    entity_name = None

    #### A single name (no aliases): build the name from its name parts
    if isinstance(name_ele, dict):
        translation = _latin_translation(name_ele["translations"]["translation"])
        if translation is not None:
            entity_name = _name_from_parts(translation, entity_type)

    #### A list of names means aliases are present: use the primary one
    elif isinstance(name_ele, list):
        for name in name_ele:
            if name["isPrimary"] == "true":
                translation = _latin_translation(name["translations"]["translation"])
                if translation is not None:
                    # NOTE(review): this branch reads formattedFullName while
                    # the single-name branch assembles the name from
                    # nameParts, so the two can differ in case and word
                    # order. Confirm which form is wanted before unifying.
                    entity_name = translation["formattedFullName"]

    ### Collect relationship information
    relationships = entity["relationships"]["relationship"]

    # One relationship arrives as a dict, several as a list; anything else
    # yields no rows.
    if isinstance(relationships, dict):
        relationships = [relationships]
    elif not isinstance(relationships, list):
        relationships = []

    return [[entity_name, rel["type"], rel["relatedEntity"]] for rel in relationships]

In [15]:
## Read the raw XML text from disk
with open(xml_filepath, "r") as file:
    xml_data = file.read()

## Parse the XML and convert the tree into nested dicts / lists
soup = BeautifulSoup(xml_data, features='xml')
entity_json = xml_to_json(soup)

## Isolate the entity records, keeping only the "Individual" and "Entity" types
entity_data = [
    entity
    for entity in entity_json['sanctionsData']["entities"]["entity"]
    if entity["generalInfo"]["entityType"] in ["Individual", "Entity"]
]

## Report how many records survived the filter and show one sample record
print(f"Entities found: {len(entity_data)}")
print(entity_data[7])



Entities found: 249
{'generalInfo': {'identityId': '18235', 'entityType': 'Individual'}, 'sanctionsLists': {'sanctionsList': 'SDN List'}, 'sanctionsPrograms': {'sanctionsProgram': ['IFSR', 'IRGC', 'SDGT']}, 'sanctionsTypes': {'sanctionsType': 'Block'}, 'legalAuthorities': {'legalAuthority': 'Executive Order 13224 (Terrorism)'}, 'names': {'name': {'isPrimary': 'true', 'isLowQuality': 'false', 'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin', 'formattedFirstName': 'Alireza', 'formattedLastName': 'ATABAKI', 'formattedFullName': 'ATABAKI, Alireza', 'nameParts': {'namePart': [{'type': 'First Name', 'value': 'Alireza'}, {'type': 'Last Name', 'value': 'Atabaki'}]}}}}}, 'addresses': {'address': None}, 'features': {'feature': [{'type': 'Additional Sanctions Information -', 'versionId': '28777', 'value': 'Subject to Secondary Sanctions', 'valueRefId': '91473', 'isPrimary': 'true'}, {'type': 'Birthdate', 'versionId': '28778', 'value': '1961', 'valueDate': {'fromDateBegin': 

In [94]:
## Spot-check find_relationships on a single record (index 7 of the filtered list)
test = find_relationships(entity_data[7])
## Kept for reference: path to the formatted full name of the same record
# entity_data[7]["names"]["name"]["translations"]["translation"]["formattedFullName"]

## Bare expression so the notebook displays the returned rows
test

[['Alireza Atabaki', 'Acting for or on behalf of', 'ANSAR EXCHANGE']]

In [45]:
### Execute with main 
def main(input_file):
    """
    Parse a sanctions XML export and collect the relationship rows it contains.

    Args:
        input_file: Path of the XML file to read.

    Returns:
        list: Flat list of the rows returned by ``find_relationships`` for
        every "Individual" or "Entity" record, in document order.

    Side effects:
        Prints the number of matching records and, when there is at least
        one, the first record. ``find_relationships`` may print its own
        notices for records without relationships.
    """

    # Read bytes rather than text, so decoding is left to the parser and does
    # not depend on the platform's default text encoding.
    with open(input_file, "rb") as file:
        xml_data = file.read()

    ## Convert XML to JSON, isolate entity data
    soup = BeautifulSoup(xml_data, features='xml')

    entity_json = xml_to_json(soup)
    entity_data = entity_json['sanctionsData']["entities"]["entity"]

    # A document with exactly one <entity> yields a dict instead of a list.
    if isinstance(entity_data, dict):
        entity_data = [entity_data]

    entity_data = [
        entity
        for entity in entity_data
        if entity["generalInfo"]["entityType"] in ["Individual", "Entity"]
    ]
    print(f"Entities found: {len(entity_data)}")
    if entity_data:
        print(entity_data[0])

    ## Find Relationships
    relationships = []

    for entity in entity_data:

        rel_search = find_relationships(entity)

        # None (no relationships) and [] (nothing usable) both add no rows.
        if not rel_search:
            continue

        if isinstance(rel_search[0], str):
            # A single flat row rather than a list of rows: keep it whole.
            relationships.append(rel_search)
        else:
            relationships.extend(rel_search)

    return relationships

In [46]:

## Run the pipeline on the configured input file and print the collected rows
print(main(xml_filepath))

Entities found: 249
{'generalInfo': {'identityId': '18423', 'entityType': 'Entity'}, 'sanctionsLists': {'sanctionsList': 'SDN List'}, 'sanctionsPrograms': {'sanctionsProgram': ['FTO', 'IRGC']}, 'sanctionsTypes': {'sanctionsType': 'Block'}, 'legalAuthorities': {'legalAuthority': 'INA'}, 'names': {'name': [{'isPrimary': 'true', 'isLowQuality': 'false', 'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin', 'formattedLastName': 'IRGC GROUND FORCES', 'formattedFullName': 'IRGC GROUND FORCES', 'nameParts': {'namePart': {'type': 'Entity Name', 'value': 'IRGC Ground Forces'}}}}}, {'isPrimary': 'false', 'aliasType': 'A.K.A.', 'isLowQuality': 'false', 'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin', 'formattedLastName': 'ISLAMIC REVOLUTION GUARDS CORPS GROUND FORCE', 'formattedFullName': 'ISLAMIC REVOLUTION GUARDS CORPS GROUND FORCE', 'nameParts': {'namePart': {'type': 'Entity Name', 'value': 'Islamic Revolution Guards Corps Ground Force'}}}}}]}, 'addre

The logic for individual name parts needs to be added to the rest of the parse_entity function (`find_relationships` in this notebook). It should probably be made into its own function to prevent redundancy.