In [1]:
import os
import json
from bs4 import BeautifulSoup

## Designate input XML file
## (relative path; main() below reads this module-level name when opening the file)
xml_filepath = "Datasets/IRGC_sanctions.xml"


## Convert XML soup to JSON format
def xml_to_json(element):
    """
    Recursively convert a parsed XML node into plain, JSON-like Python data.

    Parameters
    ----------
    element : str or parsed XML node
        Either a text node (any ``str``), or an object exposing ``contents``,
        ``children`` and ``string``; child tags must also expose ``name``.

    Returns
    -------
    str, dict or None
        * a string is returned unchanged;
        * a node with no contents returns its ``string`` attribute;
        * a node with child tags returns a dict keyed by child tag name, with
          repeated tag names collected into a list in document order;
        * a node holding only text returns that text with surrounding
          whitespace stripped, or an empty dict when the text is blank.
    """
    if isinstance(element, str):
        return element

    if not element.contents:
        return element.string

    result = {}

    for child in element.children:
        # Text nodes sitting between tags carry no structure of their own.
        if isinstance(child, str):
            continue

        converted = xml_to_json(child)

        if child.name not in result:
            result[child.name] = converted
        else:
            # Second and later occurrences of a tag promote the entry to a list.
            if not isinstance(result[child.name], list):
                result[child.name] = [result[child.name]]
            result[child.name].append(converted)

    # Child tags take precedence over text. A node whose only child is another
    # tag exposes that descendant's text through ``.string``; returning it here
    # would silently discard the child's tag name.
    if result:
        return result

    ### Directly capture text nodes without 'text' key
    text = element.string
    if text and text.strip():
        return text.strip()

    return result

In [44]:
def _primary_full_name(name_ele):
    """Return the formatted full name of the primary name entry, or "" if none is found."""
    # xml_to_json yields a bare dict for a lone <name> and a list only when
    # the tag repeats, so wrap the dict to treat entities with and without
    # aliases the same way.
    if isinstance(name_ele, dict):
        name_ele = [name_ele]
    elif not isinstance(name_ele, list):
        return ""

    full_name = ""

    for name in name_ele:

        #### Only the entity's primary name is of interest
        if name.get("isPrimary") != "true":
            continue

        name_translations = name["translations"]["translation"]

        ##### A single translation is used as-is; among several, keep the Latin script one
        if isinstance(name_translations, dict):
            full_name = name_translations["formattedFullName"]

        elif isinstance(name_translations, list):
            for translation in name_translations:
                if translation.get("script") == "Latin":
                    full_name = translation["formattedFullName"]

    return full_name


def find_relationships(entity):
    """
    Collect the relationships declared on one parsed entity record.

    Parameters
    ----------
    entity : dict
        Entity record in the nested-dict form produced by xml_to_json. Reads
        ``entity["names"]["name"]`` and ``entity["relationships"]["relationship"]``.

    Returns
    -------
    list or None
        * ``None`` when the entity has no ``relationships`` entry, or it is empty;
        * a flat ``[entity_name, type, related_entity]`` triple when the entity
          has exactly one relationship (a dict);
        * a list of such triples when it has several (a list);
        * ``[]`` when the relationship value is neither a dict nor a list.

    Raises
    ------
    KeyError
        If relationship data is present but ``names``/``name``, ``relationship``,
        ``type`` or ``relatedEntity`` is missing.
    """

    ## Confirm entity includes relationship information
    rel_container = entity.get("relationships")
    if not rel_container:
        return None

    ### Collect entity name
    entity_name = _primary_full_name(entity["names"]["name"])

    ### Collect relationship information
    relationships = rel_container["relationship"]

    if isinstance(relationships, dict):
        return [entity_name, relationships["type"], relationships["relatedEntity"]]

    if isinstance(relationships, list):
        return [[entity_name, rel["type"], rel["relatedEntity"]] for rel in relationships]

    return []

In [45]:
### Execute with main 
def main(input_file):
    """
    Parse a sanctions XML file and collect relationship triples for its entities.

    Parameters
    ----------
    input_file : str or path-like
        Path of the XML file to read.

    Returns
    -------
    list of list
        One ``[entity_name, relationship_type, related_entity]`` triple per
        relationship found on entities whose type is "Individual" or "Entity".

    Side effects
    ------------
    Prints the number of matching entities and, when there is at least one,
    the first entity record.
    """

    # Read raw bytes so the XML parser can honour the document's own encoding
    # declaration instead of depending on the platform's default text encoding.
    with open(input_file, "rb") as file:
        xml_data = file.read()

    ## Convert XML to JSON, isolate entity data
    soup = BeautifulSoup(xml_data, features='xml')

    entity_json = xml_to_json(soup)
    entity_data = entity_json['sanctionsData']["entities"]["entity"]

    # xml_to_json yields a bare dict (not a list) when only one <entity> exists.
    if isinstance(entity_data, dict):
        entity_data = [entity_data]

    entity_data = [
        entity for entity in entity_data
        if entity["generalInfo"]["entityType"] in ("Individual", "Entity")
    ]
    print(f"Entities found: {len(entity_data)}")

    # Show a sample record; skipped when the filter leaves nothing to show.
    if entity_data:
        print(entity_data[0])

    ## Find Relationships
    relationships = []

    for entity in entity_data:

        rel_search = find_relationships(entity)

        if not rel_search:
            continue

        # find_relationships returns a flat triple for a single relationship
        # and a list of triples for several; flatten both into one list.
        if isinstance(rel_search[0], str):
            relationships.append(rel_search)
        else:
            relationships.extend(rel_search)

    return relationships

In [46]:

## Run the pipeline on the configured XML file and print the collected relationship triples
print(main(xml_filepath))

Entities found: 249
{'generalInfo': {'identityId': '18423', 'entityType': 'Entity'}, 'sanctionsLists': {'sanctionsList': 'SDN List'}, 'sanctionsPrograms': {'sanctionsProgram': ['FTO', 'IRGC']}, 'sanctionsTypes': {'sanctionsType': 'Block'}, 'legalAuthorities': {'legalAuthority': 'INA'}, 'names': {'name': [{'isPrimary': 'true', 'isLowQuality': 'false', 'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin', 'formattedLastName': 'IRGC GROUND FORCES', 'formattedFullName': 'IRGC GROUND FORCES', 'nameParts': {'namePart': {'type': 'Entity Name', 'value': 'IRGC Ground Forces'}}}}}, {'isPrimary': 'false', 'aliasType': 'A.K.A.', 'isLowQuality': 'false', 'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin', 'formattedLastName': 'ISLAMIC REVOLUTION GUARDS CORPS GROUND FORCE', 'formattedFullName': 'ISLAMIC REVOLUTION GUARDS CORPS GROUND FORCE', 'nameParts': {'namePart': {'type': 'Entity Name', 'value': 'Islamic Revolution Guards Corps Ground Force'}}}}}]}, 'addre

NOTE: Entity ID 18236 is not working 