## Sanctions Data Vault Generator

### Import Sanctions XML File

In [1]:
import os
import json
from bs4 import BeautifulSoup

xml_filepath = "Datasets/IRGC_sanctions.xml"

# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding: the sanctions data contains non-Latin (e.g. Arabic) script names,
# which a locale-dependent default codec may reject or silently mangle.
with open(xml_filepath, "r", encoding="utf-8") as file:
    xml_data = file.read()

# Quick sanity check that the file was read as text and is not empty.
print(type(xml_data))
print(len(xml_data))

<class 'str'>
1942087


## Parse XML File and Convert to JSON
`def xml_to_json(element)`

In [57]:
# Parse the raw XML text into a BeautifulSoup tree using the 'xml' feature set.
soup = BeautifulSoup(xml_data, features='xml')

def xml_to_json(element):
    """
    Recursively convert a parsed XML node into plain Python data.

    Parameters
    ----------
    element : str or tag-like object
        Either a text node (any ``str`` instance) or an object exposing
        ``name``, ``contents``, ``children`` and ``string`` the way a
        BeautifulSoup tag does.

    Returns
    -------
    str, dict or None
        * A text node is returned unchanged.
        * An element with no content at all returns its ``string``
          (``None`` for an empty tag).
        * An element with child elements returns a dict keyed by child tag
          name; a tag name that occurs more than once maps to a list of the
          converted children in document order.
        * An element holding only text returns that text, stripped. If the
          text is whitespace only, an empty dict is returned.
    """
    if isinstance(element, str):
        return element

    if not element.contents:
        return element.string

    result = {}
    for child in element.children:
        # Text nodes sitting between child elements (typically indentation)
        # carry no data and are skipped.
        if isinstance(child, str):
            continue

        converted = xml_to_json(child)
        if child.name not in result:
            result[child.name] = converted
        elif isinstance(result[child.name], list):
            result[child.name].append(converted)
        else:
            # Second occurrence of this tag name: promote the value to a list.
            result[child.name] = [result[child.name], converted]

    # Child elements take precedence over text. BeautifulSoup's ``.string``
    # delegates to a lone child tag, so checking it first would collapse
    # <a><b>text</b></a> into "text" and lose the {"b": ...} level.
    if result:
        return result

    text = element.string
    if text and text.strip():
        return text.strip()

    return result

## Parse Sanctions Entities
`def parse_entity(entity)`

In [66]:
def _as_list(value):
    """Return a list as-is, wrap a lone dict in a list, and map anything else to []."""
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        return [value]
    return []


def _latin_full_name(translation):
    """
    Pick the formatted full name out of a ``translation`` value.

    A single translation (dict) is used as-is, whatever its script. For a list
    of translations only entries whose ``script`` is ``"Latin"`` count; if
    several exist the last one wins. Returns ``None`` when nothing usable is
    found.
    """
    if isinstance(translation, dict):
        return translation["formattedFullName"]

    found = None
    if isinstance(translation, list):
        for candidate in translation:
            if candidate["script"] == "Latin":
                found = candidate["formattedFullName"]
    return found


def parse_entity(entity):
    """
    Flatten one sanctions entity (as produced by ``xml_to_json``) into a dict.

    Parameters
    ----------
    entity : dict
        Nested dict for a single entity. It must contain ``generalInfo``,
        ``names``, ``sanctionsLists``, ``sanctionsTypes``,
        ``sanctionsPrograms`` and ``legalAuthorities``. ``relationships``,
        ``identityDocuments`` and ``features`` are optional and may be ``None``.

    Returns
    -------
    dict
        Always contains ``name``, ``entity_type``, ``sanctions_lists``,
        ``sanctions_programs``, ``sanctions_types`` and ``legal_authorities``.
        The sanctions values are copied as found (a string or a list of
        strings); ``legal_authorities`` is ``""`` when none is given.
        Optional keys, present only when the source has the data:

        * ``aliases``            - list of ``[alias_type, full_name]``
        * ``relationships``      - list of ``[type, related_entity]``
        * ``identity_documents`` - list of
          ``[type, name, document_number, is_valid, issuing_country]``
        * ``features``           - list of ``[type, value]``
        * ``remarks``            - copied from ``generalInfo``

    Raises
    ------
    KeyError
        If a required key is missing from ``entity``.
    """
    entity_dict = {
        "name": "",
        "entity_type": entity["generalInfo"]["entityType"],
        "sanctions_lists": [],
        "sanctions_programs": [],
        "sanctions_types": "",
        "legal_authorities": [],
    }

    ## Name and aliases
    name_ele = entity["names"]["name"]

    if isinstance(name_ele, list):
        # Several names: one primary name plus aliases.
        for name in name_ele:
            if name["isPrimary"] == "true":
                full_name = _latin_full_name(name["translations"]["translation"])
                if full_name is not None:
                    entity_dict["name"] = full_name

            elif name["isPrimary"] == "false":
                alias = _latin_full_name(name["translations"]["translation"])
                # Skip aliases without a Latin-script form instead of failing
                # or re-using the value left over from a previous alias.
                if alias is not None:
                    entity_dict.setdefault("aliases", []).append([name["aliasType"], alias])

    elif isinstance(name_ele, dict):
        # A single name element is the entity's only name.
        full_name = _latin_full_name(name_ele["translations"]["translation"])
        if full_name is not None:
            entity_dict["name"] = full_name

    ## Sanctions list, program, type, legal authority
    entity_dict["sanctions_lists"] = entity["sanctionsLists"]["sanctionsList"]
    entity_dict["sanctions_types"] = entity["sanctionsTypes"]["sanctionsType"]
    entity_dict["sanctions_programs"] = entity["sanctionsPrograms"]["sanctionsProgram"]

    # An empty <legalAuthorities/> or <legalAuthority/> element converts to None.
    legal = entity["legalAuthorities"]
    authority = legal["legalAuthority"] if legal is not None else None
    entity_dict["legal_authorities"] = authority if authority is not None else ""

    ## Relationships
    relationships = entity.get("relationships")
    if relationships is not None:
        entity_dict["relationships"] = [
            [rel["type"], rel["relatedEntity"]]
            for rel in _as_list(relationships["relationship"])
        ]

    ## Identity Documents
    identity_documents = entity.get("identityDocuments")
    if identity_documents is not None:
        entity_dict["identity_documents"] = [
            [
                doc["type"],
                doc["name"],
                doc["documentNumber"],
                doc["isValid"],
                doc.get("issuingCountry", ""),  # issuing country is optional
            ]
            for doc in _as_list(identity_documents["identityDocument"])
        ]

    ## Features
    # Guarded against None like the two sections above; an empty <features/>
    # element would otherwise raise a TypeError on subscripting.
    features = entity.get("features")
    if features is not None:
        entity_dict["features"] = [
            [feature["type"], feature["value"]]
            for feature in _as_list(features["feature"])
        ]

    ## Remarks
    if "remarks" in entity["generalInfo"]:
        entity_dict["remarks"] = entity["generalInfo"]["remarks"]

    return entity_dict


In [59]:
def _markdown_section(heading, header_row, divider_row, rows):
    """Render one section: a blank line, the heading, the table header, the divider and one line per row."""
    lines = ["", heading, header_row, divider_row]
    lines.extend("| " + " | ".join(str(cell) for cell in row) + " |" for row in rows)
    return "\n".join(lines) + "\n"


def populate_template(entity):
    """
    Render a parsed entity as a Markdown note.

    Parameters
    ----------
    entity : dict
        Dict shaped like the return value of ``parse_entity``. ``name``,
        ``entity_type``, ``sanctions_lists``, ``sanctions_programs``,
        ``sanctions_types`` and ``legal_authorities`` are required;
        ``remarks``, ``aliases``, ``relationships``, ``identity_documents``
        and ``features`` are rendered only when present.

    Returns
    -------
    tuple of (str, str)
        ``(file_name, text)``: ``file_name`` is ``"<name>.md"`` and ``text``
        is a ``---``-delimited header block followed by the optional remarks
        and one Markdown table per optional section. Related entities are
        written as ``[[...]]`` links.
    """
    # Single quotes inside the f-string keep this valid on Python < 3.12,
    # where reusing the enclosing quote character is a SyntaxError.
    file_name = f"{entity['name']}.md"

    front_matter = [
        "---",
        f"Entity Type: {entity['entity_type']}",
        f"Sanctions Lists: {entity['sanctions_lists']}",
        f"Sanctions Programs: {entity['sanctions_programs']}",
        f"Sanctions Types: {entity['sanctions_types']}",
        f"Legal Authorities: {entity['legal_authorities']}",
        "---",
    ]
    parts = ["\n".join(front_matter) + "\n"]

    ## Remarks
    # A None value (empty element in the source) is skipped instead of raising
    # a TypeError on string concatenation.
    if "remarks" in entity and entity["remarks"] is not None:
        parts.append(entity["remarks"] + "\n")

    ## Aliases
    if "aliases" in entity:
        parts.append(_markdown_section(
            "## Aliases",
            "| Type  | Name      | ",
            "|-------|-----------|",
            [(alias[0], alias[1]) for alias in entity["aliases"]],
        ))

    ## Relationships
    if "relationships" in entity:
        parts.append(_markdown_section(
            "## Relationships",
            "| Type  | With      | ",
            "|-------|-----------|",
            [(rel[0], f"[[{rel[1]}]]") for rel in entity["relationships"]],
        ))

    ## Identity Documents
    if "identity_documents" in entity:
        parts.append(_markdown_section(
            "## Identity Documents",
            "| Type  | Name      | Document Number | Is valid |",
            "|-------|-----------|-----------------|----------|",
            [(doc[0], doc[1], doc[2], doc[3]) for doc in entity["identity_documents"]],
        ))

    ## Features
    if "features" in entity:
        parts.append(_markdown_section(
            "## Features",
            "| Type  | Value      |",
            "|-------|------------|",
            [(feature[0], feature[1]) for feature in entity["features"]],
        ))

    return (file_name, "".join(parts))


In [60]:
def create_md_files(file_name, file_content, destination_folder="Vault/"):
    """
    Write one Markdown note into the destination folder.

    Parameters
    ----------
    file_name : str
        Target file name. Any "/" is removed so the name cannot point into a
        sub-directory.
    file_content : str
        Text to write. An existing file with the same name is overwritten.
    destination_folder : str, optional
        Folder that receives the file; created if it does not exist.
        Defaults to ``"Vault/"``.
    """
    safe_name = file_name.replace("/", "")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(destination_folder, exist_ok=True)
    file_path = os.path.join(destination_folder, safe_name)

    # Write UTF-8 explicitly so non-Latin text does not depend on (or fail
    # under) the platform's default encoding.
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(file_content)

## Run

In [61]:
entity_json = xml_to_json(soup)
parsed_entities = entity_json['sanctionsData']["entities"]["entity"]

# xml_to_json only builds a list when a tag name repeats, so a file holding a
# single <entity> yields a dict here; wrap it so the filter below iterates
# over entities rather than over that dict's keys.
if isinstance(parsed_entities, dict):
    parsed_entities = [parsed_entities]

# Keep people and organisations only (other entity types are screened out).
entity_data = [
    entity
    for entity in parsed_entities
    if entity["generalInfo"]["entityType"] in ("Individual", "Entity")
]

print(f"Entities found: {len(entity_data)}")
# Show one sample record, if any survived the filter.
if entity_data:
    print(entity_data[0])

Entities found: 249
{'generalInfo': {'identityId': '18423', 'entityType': 'Entity'}, 'sanctionsLists': {'sanctionsList': 'SDN List'}, 'sanctionsPrograms': {'sanctionsProgram': ['FTO', 'IRGC']}, 'sanctionsTypes': {'sanctionsType': 'Block'}, 'legalAuthorities': {'legalAuthority': 'INA'}, 'names': {'name': [{'isPrimary': 'true', 'isLowQuality': 'false', 'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin', 'formattedLastName': 'IRGC GROUND FORCES', 'formattedFullName': 'IRGC GROUND FORCES', 'nameParts': {'namePart': {'type': 'Entity Name', 'value': 'IRGC Ground Forces'}}}}}, {'isPrimary': 'false', 'aliasType': 'A.K.A.', 'isLowQuality': 'false', 'translations': {'translation': {'isPrimary': 'true', 'script': 'Latin', 'formattedLastName': 'ISLAMIC REVOLUTION GUARDS CORPS GROUND FORCE', 'formattedFullName': 'ISLAMIC REVOLUTION GUARDS CORPS GROUND FORCE', 'nameParts': {'namePart': {'type': 'Entity Name', 'value': 'Islamic Revolution Guards Corps Ground Force'}}}}}]}, 'addre

## TO DO

Some entities carry a `title` value inside `generalInfo`; `parse_entity` does not capture it yet.

In [62]:
# test_entity = entity_data[0]
# x = parse_entity(test_entity)
# print(x)

In [63]:
# Flatten every filtered entity, then render each one as a (file_name, text) pair.
entity_dicts = list(map(parse_entity, entity_data))
entity_texts = list(map(populate_template, entity_dicts))

In [64]:
# Write one Markdown file per rendered entity.
for file_name, file_content in entity_texts:
    create_md_files(file_name, file_content)

In [65]:
# counter = 0
# fail_indexes = []

# entity_texts = []

# for entity in entity_dicts:
#     counter += 1
    
#     try:
#         entity_texts.append((entity))
        
#     except:
#         fail_indexes.append(counter)

# print(f"Attempts: {counter}")
# print(f"Failed: {len(fail_indexes)}")
# print(f"Succeeded: {counter - len(fail_indexes)}")
# print(f"Failed on: {fail_indexes}")

```
'names': {'name': [{'isPrimary': 'true',
    'isLowQuality': 'false',
    'translations': {'translation': [{'isPrimary': 'true',
       'script': 'Latin',
       'formattedFirstName': 'Hasan',
       'formattedLastName': 'SABURINEZHAD',
       'formattedFullName': 'SABURINEZHAD, Hasan',
       'nameParts': {'namePart': [{'type': 'First Name', 'value': 'Hasan'},
         {'type': 'Last Name', 'value': 'Saburinezhad'}]}},
      {'isPrimary': 'false',
       'script': 'Arabic',
       'formattedLastName': '\u202bحسن سبری نژاد\u202c',
       'formattedFullName': '\u202bحسن سبری نژاد\u202c',
       'nameParts': {'namePart': [{'type': 'First Name', 'value': 'حسن'},
         {'type': 'Last Name', 'value': 'سبری نژاد'}]}}]}},
   {'isPrimary': 'false',
    'aliasType': 'A.K.A.',
    'isLowQuality': 'false',
    'translations': {'translation': {'isPrimary': 'true',
      'script': 'Latin',
```