In [1]:
%pip install rdflib

import os
import rdflib
from rdflib import Graph
import json

Note: you may need to restart the kernel to use updated packages.


In [2]:
import rdflib
from rdflib import Graph, Namespace
import json
import os
import re

def convert_xml_to_jsonld(input_data, output_file=None):
    """
    Convert RDF/XML to JSON-LD using rdflib, keeping only the namespaces
    declared in the XML itself.

    Namespace prefixes are scraped from the raw text (``xmlns:prefix="uri"``
    attributes, single- or double-quoted), bound on the parsed graph and
    reused as the JSON-LD ``@context``. If the document uses ``adms:`` names
    without declaring the prefix, a declaration for
    ``http://www.w3.org/ns/adms#`` is injected into the ``<rdf:RDF`` root tag
    before parsing.

    Progress and error messages are printed. Failures while reading, parsing,
    serializing or writing are reported that way and yield ``None``.

    Args:
        input_data: Either the path of an existing file or a string holding
            the XML content (recognised by containing ``<rdf:RDF`` or
            ``<?xml``).
        output_file: Optional filename; when given, the JSON-LD is also
            written there as UTF-8.

    Returns:
        The JSON-LD document as Python data (the result of ``json.loads``),
        or ``None`` if the input is rejected or any step fails.
    """
    # Reject anything that is not a string up front.
    if not isinstance(input_data, str):
        print("Input must be a file path or XML content string")
        return None

    # Load XML content: an existing file wins over "looks like XML".
    if os.path.isfile(input_data):
        try:
            with open(input_data, 'r', encoding='utf-8') as f:
                content = f.read()
        except Exception as e:
            print(f"Error reading file: {e}")
            return None
    elif '<rdf:RDF' in input_data or '<?xml' in input_data:
        content = input_data
    else:
        print("Input doesn't appear to be a valid file path or XML content")
        return None

    # Extract namespaces from the XML. Prefixes may contain '_', '-' and '.',
    # whitespace is allowed around '=', and the value may use either quote
    # style, so all of those forms are accepted here.
    namespaces = {}
    ns_pattern = r'''xmlns:([\w.\-]+)\s*=\s*(?:"([^"]+)"|'([^']+)')'''
    for match in re.finditer(ns_pattern, content):
        prefix, dq_uri, sq_uri = match.groups()
        namespaces[prefix] = dq_uri if dq_uri is not None else sq_uri

    print(f"Extracted {len(namespaces)} namespaces from XML: {list(namespaces.keys())}")

    # Fix missing adms namespace if needed.
    if 'adms:' in content and 'adms' not in namespaces:
        print("Found adms: references but no adms namespace declaration")
        # Insert the declaration right after the root tag name. The lookahead
        # accepts any whitespace (including a newline) or '>' after the name,
        # and cannot match the closing '</rdf:RDF>' tag.
        content, injected = re.subn(
            r'<rdf:RDF(?=[\s>])',
            '<rdf:RDF xmlns:adms="http://www.w3.org/ns/adms#"',
            content,
            count=1,
        )
        # Only claim success when the declaration was really inserted.
        if injected:
            namespaces['adms'] = "http://www.w3.org/ns/adms#"
            print("Added missing adms namespace")

    # Parse the content
    g = Graph()
    try:
        g.parse(data=content, format="xml")
        print(f"Successfully parsed XML with {len(g)} triples")
    except Exception as e:
        print(f"Error parsing XML: {e}")
        return None

    # Bind only namespaces found in the source XML
    for prefix, uri in namespaces.items():
        g.bind(prefix, Namespace(uri))

    # Create context using only namespaces from the source XML
    context = dict(namespaces)

    # Serialize to JSON-LD and load it back as Python data
    try:
        jsonld_data = g.serialize(format="json-ld", context=context, indent=2)
        result = json.loads(jsonld_data)
    except Exception as e:
        print(f"Error serializing to JSON-LD: {e}")
        return None

    # Save to file if requested; write errors get their own message so they
    # are not mistaken for serialization failures.
    if output_file:
        try:
            with open(output_file, 'w', encoding='utf-8') as f:
                json.dump(result, f, ensure_ascii=False, indent=2)
        except Exception as e:
            print(f"Error writing output file: {e}")
            return None
        print(f"JSON-LD data written to {output_file}")

    return result

In [8]:
from rdflib import Graph, Namespace

In [11]:
# Start from an empty rdflib graph; the cells below load data into it and serialize it.
g = Graph()

In [12]:
# Load the Enterprise file into `g`. No `format` is passed, so rdflib has to
# infer the syntax itself (presumably RDF/XML from the .xml suffix — confirm).
# NOTE(review): this is an absolute path on one developer's machine, so the
# cell will not run anywhere else; consider a path relative to the notebook.
g.parse("/Users/krista/Development/repos/nordic44/xml_jsonld_converter/Enterprise/N44-ENT-Scheider_AC.xml")

<Graph identifier=Nc46ce1ba71044b42971a4395bb0265ae (<class 'rdflib.graph.Graph'>)>

In [16]:
# Bind readable prefixes on `g` so the serializations below use them instead
# of full IRIs or auto-generated prefixes. Each prefix is bound exactly once
# (the `dct` binding was previously repeated).
g.bind("adms", Namespace("http://www.w3.org/ns/adms#"))
g.bind("dct", Namespace("http://purl.org/dc/terms/"))
g.bind("foaf", Namespace("http://xmlns.com/foaf/0.1/"))
g.bind("owl", Namespace("http://www.w3.org/2002/07/owl#"))
g.bind("rdf", Namespace("http://www.w3.org/1999/02/22-rdf-syntax-ns#"))
g.bind("rdfs", Namespace("http://www.w3.org/2000/01/rdf-schema#"))
g.bind("cim", Namespace("http://iec.ch/TC57/CIM/CIM100#"))

In [17]:
# Write the graph as Turtle. The format is stated explicitly so the .ttl file
# does not depend on the default serializer of the installed rdflib version.
g.serialize(destination="N44-ENT-Scheider_AC.ttl", format="turtle")

<Graph identifier=Nc46ce1ba71044b42971a4395bb0265ae (<class 'rdflib.graph.Graph'>)>

In [19]:
# Write the same graph as JSON-LD (no custom @context is supplied here).
g.serialize(destination="N44-ENT-Scheider_AC_simple.jsonld", format="json-ld")

<Graph identifier=Nc46ce1ba71044b42971a4395bb0265ae (<class 'rdflib.graph.Graph'>)>

In [7]:

# Input RDF/XML file and the JSON-LD file to produce from it
xml_file_path = "Nordic44-HV_EQ.xml"
output_file = "Nordic44-HV_EQ.jsonld"

# Run the conversion; `result` holds the parsed JSON-LD (None on failure)
result = convert_xml_to_jsonld(input_data=xml_file_path, output_file=output_file)




Extracted 4 namespaces from XML: ['cim', 'eu', 'md', 'rdf']
Successfully parsed XML with 15095 triples
JSON-LD data written to Nordic44-HV_EQ.jsonld
