# XML to JSON Conversion

## Load XML Data

In [1]:
xml_filepath = "Datasets/full_sanctions_list.xml"

# Decode explicitly so the text read does not depend on the platform's
# default locale encoding.
# NOTE(review): assumes the file is UTF-8 encoded — confirm against its XML declaration.
with open(xml_filepath, "r", encoding="utf-8") as file:
    xml_data = file.read()

# Sanity check: report the type and the character count of what was loaded.
print(type(xml_data))
print(len(xml_data))

<class 'str'>
91631773


## Parse using BS4

In [2]:
from bs4 import BeautifulSoup

# Build the parse tree from the loaded XML text, requesting the "xml" parser feature.
soup = BeautifulSoup(markup=xml_data, features="xml")

def xml_to_dict(element):
    """Recursively convert a parsed XML node into plain Python data.

    Parameters
    ----------
    element : str or tag-like object
        Either a text node (any ``str`` instance) or an object exposing
        ``contents``, ``children`` and ``string``; child tags must also
        expose ``name``.

    Returns
    -------
    str, dict or None
        * ``element`` itself when it is a ``str``;
        * ``element.string`` (returned as-is) when the element has no contents;
        * a dict keyed by child tag name when the element has child tags;
          a name seen more than once maps to a list of the converted
          values in iteration order;
        * the stripped text when there are no child tags and the element's
          ``string`` is non-blank;
        * an empty dict otherwise.

    Notes
    -----
    Only ``contents``, ``children``, ``string`` and ``name`` are read, so
    anything else carried by a node (for example XML attributes) does not
    appear in the result. Text that sits next to child tags is skipped.
    """
    # Text nodes are returned unchanged.
    if isinstance(element, str):
        return element

    # Nothing inside the element at all: hand back whatever .string reports.
    if not element.contents:
        return element.string

    result = {}

    for child in element.children:
        # Skip text nodes between child tags (e.g. indentation whitespace).
        if isinstance(child, str):
            continue

        value = xml_to_dict(child)

        if child.name not in result:
            result[child.name] = value
        else:
            # xml_to_dict never returns a list itself, so a list stored here
            # can only be the accumulator for a repeated tag name.
            if not isinstance(result[child.name], list):
                result[child.name] = [result[child.name]]
            result[child.name].append(value)

    # Child tags take precedence over text. A tag's .string recurses into a
    # sole child tag, so checking it unconditionally would replace
    # {"b": "text"} with "text" for compact markup such as <a><b>text</b></a>.
    if result:
        return result

    # Leaf element: return its text directly, without a wrapping 'text' key.
    text = element.string
    if text and text.strip():
        return text.strip()

    return result

# Convert the whole parsed document, starting from the soup object itself.
xml_dict = xml_to_dict(soup)

In [3]:
# Show the converted dictionary as this cell's output.
# NOTE(review): this renders the entire structure built from the loaded file;
# consider previewing only a subset to keep the notebook output small.
xml_dict

{'sanctionsData': {'publicationInfo': {'dataAsOf': '2024-08-28T00:00:00',
   'filters': {'sanctionsLists': {'sanctionsList': ['SDN List',
      'Non-SDN Palestinian Legislative Council List',
      'FSE List',
      'Sectoral Sanctions Identifications List',
      'Consolidated List',
      'CAPTA List',
      'Non-SDN Menu-Based Sanctions List',
      'Non-SDN CMIC List']},
    'sanctionsPrograms': {'sanctionsProgram': 'IRGC'}}},
  'referenceValues': {'referenceValue': [{'type': 'Additional Sanctions Information -',
     'value': 'Subject to Secondary Sanctions'},
    {'type': 'Additional Sanctions Information -',
     'value': 'Subject to Secondary Sanctions Pursuant to the Hizballah Financial Sanctions Regulations'},
    {'type': 'ALIAS TYPE', 'value': 'A.K.A.'},
    {'type': 'ALIAS TYPE', 'value': 'F.K.A.'},
    {'type': 'ALIAS TYPE', 'value': 'N.K.A.'},
    {'type': 'CAATSA Section 235 Information:',
     'value': 'FOREIGN EXCHANGE. Sec 235(a)(7)'},
    {'type': 'CAATSA Section 23

## Save as JSON

In [3]:
import json

output_file_name = "IRGC_sanctions.json"

# Stream the JSON straight into the file rather than first building the whole
# serialized document as a single in-memory string; the text written matches
# what json.dumps(xml_dict, indent=4) would produce.
# The encoding is explicit so the write does not depend on the platform default.
with open(output_file_name, "w", encoding="utf-8") as json_file:
    json.dump(xml_dict, json_file, indent=4)