In [0]:
import json
import pandas as pd



In [0]:
!pip install rdflib


[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.[0m


In [0]:
from rdflib import Graph, URIRef, Literal, Namespace

In [0]:
# Load the JSON metadata file from DBFS
def load_json_metadata(json_file):
    """Read a JSON metadata file and return the parsed object.

    Args:
        json_file: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or ``None`` when the file content is not valid
        JSON (the decode error is printed in that case).

    Raises:
        ValueError: If the document parses to a bare string instead of a
            dictionary or list.
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8: the metadata contains non-ASCII text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(json_file, 'r', encoding='utf-8') as f:
        try:
            metadata = json.load(f)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON: {e}")
            return None

    # Debug: print the loaded metadata
    print("Loaded metadata:", metadata)
    if isinstance(metadata, str):
        raise ValueError("Metadata was loaded as a string. Expected a dictionary or list.")
    return metadata



# Load the CSV file containing categories and corresponding placeholders/text
def load_csv_tags(csv_file):
    """Read the tag CSV into a DataFrame with whitespace-trimmed column names.

    Args:
        csv_file: Path of the CSV file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv``; only the column labels are
        altered (leading/trailing whitespace removed), cell values are untouched.
    """
    tag_table = pd.read_csv(csv_file)

    # Only surrounding whitespace is trimmed from the header labels;
    # other characters (e.g. semicolons) are left as they are.
    trimmed_headers = tag_table.columns.str.strip()
    tag_table.columns = trimmed_headers

    # Show the resulting header for debugging
    print("Columns in CSV:", tag_table.columns.tolist())
    return tag_table


In [0]:
# NOTE(review): the rdflib-based variant below is disabled by wrapping it in a
# string literal. Nothing in it is defined at runtime; the cell merely evaluates
# to this string, which the notebook echoes as its output. Consider deleting the
# cell or restoring the code instead of keeping it as a string.
'''
# Function to construct RDF using rdflib
def reconstruct_ontology_to_graph(metadata, csv_data):
    # Initialize RDF graph
    g = Graph()

    # Define namespaces
    EX = Namespace("http://example.org/")
    g.bind("ex", EX)  # Bind the 'ex' prefix to the example namespace
    
    for monument in metadata:
        object_id = monument.get('object_id', 'Unknown_ID')
        location = monument.get('location', 'Unknown Location')
        aanduidingsobject_url = monument.get('aanduidingsobject_url', 'Unknown URL')
        besluiten = monument.get('besluiten', [])
        
        # Create URI for the monument
        monument_uri = URIRef(f"http://example.org/monument/{object_id}")
        
        for besluit in besluiten:
            besluit_id = besluit.get('besluit_id', 'Unknown_Besluit_ID')
            besluit_date = besluit.get('besluit_date', 'Unknown Date')
            besluit_url = besluit.get('besluit_url', 'Unknown Besluit URL')
            besluit_pdf_url = besluit.get('besluit_pdf_url', 'Unknown PDF URL')
            
            # Create URI for the besluit
            besluit_uri = URIRef(f"http://example.org/besluit/{besluit_id}")

            # Add RDF triples based on CSV data
            for index, row in csv_data.iterrows():
                category = row['Categories'].strip()
                text = row['Text'].strip()

                # Populate the RDF graph based on category
                if category == '<Monument>':
                    g.add((monument_uri, EX.hasName, Literal(text)))
                elif category == '<heeftAdres>':
                    g.add((monument_uri, EX.hasAddress, Literal(location)))
                elif category == '<beschermdDoor>':
                    g.add((monument_uri, EX.protectedBy, besluit_uri))
                elif category == '<heeftBesluitDatum>':
                    g.add((besluit_uri, EX.hasDecisionDate, Literal(besluit_date)))
                elif category == '<beschermt>':
                    g.add((besluit_uri, EX.protects, monument_uri))
                # Add more cases as needed...

    return g

# Save the RDF graph to Turtle format
def save_graph_to_ttl(graph, output_file):
    graph.serialize(destination=output_file, format='turtle')
'''

'\n# Function to construct RDF using rdflib\ndef reconstruct_ontology_to_graph(metadata, csv_data):\n    # Initialize RDF graph\n    g = Graph()\n\n    # Define namespaces\n    EX = Namespace("http://example.org/")\n    g.bind("ex", EX)  # Bind the \'ex\' prefix to the example namespace\n    \n    for monument in metadata:\n        object_id = monument.get(\'object_id\', \'Unknown_ID\')\n        location = monument.get(\'location\', \'Unknown Location\')\n        aanduidingsobject_url = monument.get(\'aanduidingsobject_url\', \'Unknown URL\')\n        besluiten = monument.get(\'besluiten\', [])\n        \n        # Create URI for the monument\n        monument_uri = URIRef(f"http://example.org/monument/{object_id}")\n        \n        for besluit in besluiten:\n            besluit_id = besluit.get(\'besluit_id\', \'Unknown_Besluit_ID\')\n            besluit_date = besluit.get(\'besluit_date\', \'Unknown Date\')\n            besluit_url = besluit.get(\'besluit_url\', \'Unknown Besluit U

In [0]:

# Function to reconstruct RDF for each category dynamically
def reconstruct_ontology(metadata, csv_data):
    """Build an RDF/XML-style text fragment for every (monument, besluit) pair.

    The rows of ``csv_data`` act as a template: each row's ``Categories`` value
    selects which line is emitted, in row order. Known categories are filled
    from the monument/besluit metadata (or from the row's ``Text``); unknown
    categories are passed through verbatim as ``"    {category} {text}"``.

    ``<Monument>`` and ``<heeftBesluitDatum>`` rows open the top-level
    ``Monument`` and ``Besluit`` elements. Whichever element is currently open
    is closed before the next one opens and at the end of the fragment, so the
    two elements come out as properly closed siblings and no closing tag is
    written for an element that was never opened.

    Args:
        metadata: A monument dict, or a list of such dicts. Each may carry
            ``object_id``, ``location``, ``aanduidingsobject_url`` and a
            ``besluiten`` list of dicts with ``besluit_id`` / ``besluit_date``.
        csv_data: DataFrame with ``Categories`` and ``Text`` columns.

    Returns:
        str: All fragments joined by newlines (empty string when there is
        nothing to render).

    Raises:
        TypeError: If ``metadata`` is ``None``.
        KeyError: If ``csv_data`` lacks the ``Categories`` or ``Text`` column.
    """
    # Imported locally so the function does not rely on another cell's imports.
    from xml.sax.saxutils import escape

    def as_text(value):
        # Escape &, < and > for use as element content.
        return escape(str(value))

    def as_attr(value):
        # Same as as_text, plus the double quote that delimits attribute values.
        return escape(str(value), {'"': '&quot;'})

    if metadata is None:
        # Fail with a readable message instead of "'NoneType' object is not iterable".
        raise TypeError(
            "metadata is None; expected a monument dict or a list of monument dicts"
        )

    # If metadata is a single monument dictionary, wrap it in a list
    if isinstance(metadata, dict):
        metadata = [metadata]

    # The template rows are the same for every besluit, so read them once.
    tag_rows = list(zip(csv_data['Categories'].tolist(), csv_data['Text'].tolist()))

    rdf_output = []
    for monument in metadata:
        object_id = as_attr(monument.get('object_id', 'Unknown_ID'))
        location = as_text(monument.get('location', 'Unknown Location'))
        aanduidingsobject_url = as_attr(monument.get('aanduidingsobject_url', 'Unknown URL'))

        # Loop over each besluit (legal decree)
        for besluit in monument.get('besluiten', []):
            besluit_id = as_attr(besluit.get('besluit_id', 'Unknown_Besluit_ID'))
            besluit_date = as_text(besluit.get('besluit_date', 'Unknown Date'))

            rdf_parts = []
            open_element = None  # name of the top-level element awaiting its closing tag

            for category, text in tag_rows:
                if category == '<Monument>':
                    if open_element:
                        rdf_parts.append(f'</{open_element}>')
                    rdf_parts.append(f'<Monument rdf:ID="Monument_{object_id}">')
                    open_element = 'Monument'
                elif category == '<heeftNaam>':
                    rdf_parts.append(f'    <heeftNaam>{as_text(text)}</heeftNaam>')
                elif category == '<heeftAdres>':
                    rdf_parts.append(f'    <heeftAdres>{location}</heeftAdres>')
                elif category == '<gevestigdIn>':
                    rdf_parts.append('    <gevestigdIn rdf:resource="#Locatie_Gent"/>')
                elif category == '<ontworpenDoor>':
                    # "[ArchitectID]" is a literal placeholder; no architect id is available here.
                    rdf_parts.append('    <ontworpenDoor rdf:resource="#Architect_[ArchitectID]"/>')
                elif category == '<beschermdDoor>':
                    rdf_parts.append(f'    <beschermdDoor rdf:resource="#Besluit_{besluit_id}"/>')
                elif category == '<heeftBesluitDatum>':
                    # This row starts the Besluit element; close the element before it first.
                    if open_element:
                        rdf_parts.append(f'</{open_element}>')
                    rdf_parts.append(f'<Besluit rdf:ID="Besluit_{besluit_id}">')
                    open_element = 'Besluit'
                    rdf_parts.append(f'    <heeftBesluitDatum>{besluit_date}</heeftBesluitDatum>')
                elif category == '<beschermt>':
                    rdf_parts.append(f'    <beschermt rdf:resource="#Monument_{object_id}"/>')
                elif category == '<heeftURL>':
                    rdf_parts.append(f'    <heeftURL rdf:resource="{aanduidingsobject_url}"/>')
                else:
                    # Unknown category: pass the row through unchanged.
                    rdf_parts.append(f'    {category} {text}')

            # Close whichever top-level element is still open
            if open_element:
                rdf_parts.append(f'</{open_element}>')

            # Append the result to the overall RDF output
            rdf_output.append("\n".join(rdf_parts))

    return "\n".join(rdf_output)


In [0]:
# Save the reconstructed ontology to a file
def save_ontology_to_file(ontology, output_file):
    """Write the ontology text to ``output_file``, replacing any existing file.

    Args:
        ontology: The text to write.
        output_file: Destination path.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Encode explicitly as UTF-8: the text contains non-ASCII characters and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(ontology)

In [0]:

# Function to run the full process
def main(json_file, csv_file, output_file):
    """Run the pipeline end to end: load both inputs, build the RDF text, print it, save it.

    Args:
        json_file: Path of the JSON metadata file.
        csv_file: Path of the CSV file with the category/text rows.
        output_file: Path the generated RDF text is written to.
    """
    # Inputs
    monument_metadata = load_json_metadata(json_file)
    tag_table = load_csv_tags(csv_file)

    # Build the RDF text, echo it to the console, then persist it
    rdf_text = reconstruct_ontology(monument_metadata, tag_table)
    print(rdf_text)
    save_ontology_to_file(rdf_text, output_file)


In [0]:
# Function to run the full process
# NOTE(review): `main` is also defined, identically, in the preceding cell; this
# later definition is the one in effect after a top-to-bottom run. Keep only one.
def main(json_file, csv_file, output_file):
    """Run the pipeline end to end: load both inputs, build the RDF text, print it, save it.

    Args:
        json_file: Path of the JSON metadata file.
        csv_file: Path of the CSV file with the category/text rows.
        output_file: Path the generated RDF text is written to.

    Raises:
        ValueError: If no metadata could be loaded from ``json_file``
            (``load_json_metadata`` returned ``None``). Nothing is written
            in that case.
    """
    # Load metadata and tags
    metadata = load_json_metadata(json_file)
    if metadata is None:
        # Stop here with a clear message rather than failing later while
        # iterating over None inside reconstruct_ontology.
        raise ValueError(
            f"No metadata could be loaded from {json_file!r}; "
            f"{output_file!r} was not written."
        )
    csv_data = load_csv_tags(csv_file)

    # Reconstruct RDF
    ontology = reconstruct_ontology(metadata, csv_data)

    # Output the entire RDF content to console
    print(ontology)

    # Save the RDF structure to a file
    save_ontology_to_file(ontology, output_file)


In [0]:
# Input and output locations on DBFS
json_file = "/dbfs/FileStore/ABB_pdf/metadata_113448.json"  # monument metadata (ids, location, besluiten)
csv_file = "/dbfs/FileStore/ABB_pdf/tags.csv"  # category/text rows extracted from the PDF
output_file = "/dbfs/FileStore/ABB_pdf/reconstructed_ontology.rdf"  # where the generated RDF text is saved

# Run the whole pipeline with the paths above
main(json_file=json_file, csv_file=csv_file, output_file=output_file)

Loaded metadata: {'aanduidingsobject_url': 'https://inventaris.onroerenderfgoed.be/aanduidingsobjecten/113448', 'object_id': 113448, 'location': 'Gérard Willemotlaan 85 (Gent)', 'besluiten': [{'besluit_url': 'https://besluiten.onroerenderfgoed.be/besluiten/14743', 'besluit_pdf_url': 'https://besluiten.onroerenderfgoed.be/besluiten/14743/bestanden/21145', 'besluit_id': 14743, 'pdf_file_id': 21145, 'besluit_date': '2018-12-20'}]}
Columns in CSV: ['Categories', 'Text']
<Monument rdf:ID="Monument_113448">
    <heeftNaam>Architectenwoning Ferdinand Schlich;</heeftNaam>
    <heeftAdres>Gérard Willemotlaan 85 (Gent)</heeftAdres>
    <gevestigdIn rdf:resource="#Locatie_Gent"/>
    <heeftKadastraleInformatie> Gent 29ste afdeling sectie A perceelnummer 770M;
    <ontworpenDoor rdf:resource="#Architect_[ArchitectID]"/>
    <volgtArchitectuurstijl> Modernisme;
    <heeftBouwdatum> 1988;
    <beschermdDoor rdf:resource="#Besluit_14743"/>
    <heeftHistorischeBetekenis>,"De woning van Ferdinand Schl