In [6]:
from lxml import etree

# Path to the full LIDO export that this cell samples from
file_path = "202001-rma-lido-collection.xml"

# Namespace dictionary for LIDO format
namespaces = {
    'lido': 'http://www.lido-schema.org'
}

# Number of records to pull out; this cell is for exploration only
max_records = 10

# Stream the XML one <lido:lido> record at a time. Only 'end' events are
# requested: a record's children are guaranteed to be parsed only once its
# closing tag has been read, so 'start' events would just add empty iterations.
context = etree.iterparse(file_path, events=('end',), tag='{http://www.lido-schema.org}lido')

# Inspect and filter
subset = []
for event, element in context:
    # Debugging: Print out raw element for exploration
    print(etree.tostring(element, pretty_print=True, encoding='unicode'))

    # Extract specific data.
    # The record identifier is the <lido:lidoRecID> child of the record (see
    # the printed sample); 'lido:objectID' is not a direct child, so the
    # previous lookup always produced None.
    object_id = element.find('lido:lidoRecID', namespaces)
    # titleSet is nested below the record root (inside descriptiveMetadata per
    # the LIDO schema -- confirm against the data), so search all descendants
    # rather than only direct children.
    title = element.find('.//lido:titleSet/lido:appellationValue', namespaces)

    subset.append({
        'id': object_id.text if object_id is not None else None,
        'title': title.text if title is not None else None,
    })

    # Clear processed elements to save memory. clear() empties the record but
    # leaves it attached to the root, so also drop the already-processed
    # siblings that precede it.
    element.clear()
    while element.getprevious() is not None:
        del element.getparent()[0]

    if len(subset) >= max_records:  # Stop once enough records were collected
        break

# Inspect the subset
for record in subset:
    print(record)

<lido:lido xmlns:lido="http://www.lido-schema.org" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
      <lido:lidoRecID lido:type="http://terminology.lido-schema.org/identifier_type/local_identifier">NL-AsdRM/lido/704235</lido:lidoRecID>
      <lido:objectPublishedID lido:type="http://terminology.lido-schema.org/identifier_type/local_identifier">http://hdl.handle.net/10934/RM0001.COLLECT.704235</lido:objectPublishedID>
      <lido:descriptiveMetadata xml:lang="en">
        <lido:objectClassificationWrap>
          <lido:objectWorkTypeWrap>
            <lido:objectWorkType>
              <lido:conceptID lido:type="http://terminology.lido-schema.org/identifier_type/local_identifier">RM0001.THESAU.1361</lido:conceptID>
              <lido:term xml:lang="en">book</lido:term>
              <lido:term xml:lang="nl">boek</lido:term>
            </lido:objectWorkType>
          </lido:objectWorkTypeWrap>
          <lido:classificationWrap>
            <lido:classification lido:type="ht

In [30]:
from lxml import etree

# Define namespaces used in the XML file
namespaces = {
    'lido': 'http://www.lido-schema.org',
    'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
    'xml': 'http://www.w3.org/XML/1998/namespace'  # Needed to resolve the xml:lang predicate below
}

# Input and output file paths
input_file = '202001-rma-lido-collection.xml'
output_file = 'photo_prints_subset.xml'

# Define the target term for filtering
target_term = "photomechanical print"

# Path (relative to one <lido:lido> record) of the English work-type terms
work_type_term_path = (
    'lido:descriptiveMetadata/lido:objectClassificationWrap/lido:objectWorkTypeWrap'
    '/lido:objectWorkType/lido:term[@xml:lang="en"]'
)

# Open the output file
with open(output_file, 'wb') as outfile:
    # Write the XML declaration and root element
    outfile.write(b'<?xml version="1.0" encoding="UTF-8"?>\n<records>\n')

    # Parse the input file one record at a time
    context = etree.iterparse(input_file, events=('end',), tag='{http://www.lido-schema.org}lido')
    for event, element in context:
        # A record may carry several objectWorkType entries; keep it when ANY
        # of their English terms equals the target. find() would only look at
        # the first one and miss records where the match is listed later.
        is_target_record = any(
            term.text == target_term
            for term in element.iterfind(work_type_term_path, namespaces)
        )
        if is_target_record:
            # Convert the matching element to a string and write to the output file
            outfile.write(etree.tostring(element, pretty_print=True, encoding='utf-8'))

        # Clear the processed element to save memory. clear() alone leaves the
        # emptied record attached to the root, so also delete the
        # already-processed siblings before it; otherwise the tree keeps
        # growing over the whole collection.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]

    # Write the closing root tag
    outfile.write(b'</records>')

In [76]:
!pip install xmltodict

Collecting xmltodict
  Obtaining dependency information for xmltodict from https://files.pythonhosted.org/packages/d6/45/fc303eb433e8a2a271739c98e953728422fa61a3c1f36077a49e395c972e/xmltodict-0.14.2-py2.py3-none-any.whl.metadata
  Downloading xmltodict-0.14.2-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading xmltodict-0.14.2-py2.py3-none-any.whl (10.0 kB)
Installing collected packages: xmltodict
Successfully installed xmltodict-0.14.2


In [80]:
import xmltodict
import json

# Load XML data and convert it to JSON.
# The subset file is written as UTF-8 (its XML declaration says so), so decode
# it explicitly instead of relying on the platform's default encoding, which
# would corrupt non-ASCII text on systems where that default is not UTF-8.
with open('photo_prints_subset.xml', encoding='utf-8') as xml_file:
    xml_data = xml_file.read()

# Convert XML to a dictionary
json_data = xmltodict.parse(xml_data)

# Save JSON data to a file
json_file_path = 'data.json'  # Specify the file name and path

# ensure_ascii=False keeps non-ASCII characters readable in the output file
with open(json_file_path, 'w', encoding='utf-8') as json_file:
    json.dump(json_data, json_file, ensure_ascii=False, indent=4)

print(f"JSON data saved to {json_file_path}")

JSON data saved to data.json


In [82]:
# Read the converted records back from disk into plain Python objects
json_file_path = 'data.json'
with open(json_file_path, mode='r', encoding='utf-8') as json_file:
    loaded_json_data = json.loads(json_file.read())

# Function to extract LIDO classes
def extract_lido_classes(data, prefix='lido'):
    """Collect every dictionary key that contains ``prefix``.

    The structure is walked through nested dictionaries and lists at any
    depth; values of any other type are ignored. The check is a substring
    test (``prefix in key``), so a key such as ``@xmlns:lido`` matches too.

    Args:
        data: Nested combination of dicts and lists (e.g. parsed JSON).
        prefix: Substring a key must contain to be collected.

    Returns:
        A set of the distinct matching keys.
    """
    matching_keys = set()
    # Explicit work stack: each entry is a value still to be inspected.
    pending = [data]

    while pending:
        node = pending.pop()
        if isinstance(node, list):
            pending.extend(node)
        elif isinstance(node, dict):
            for name, child in node.items():
                if prefix in name:
                    matching_keys.add(name)
                # Children are inspected whether or not their key matched.
                pending.append(child)

    return matching_keys

# Gather the distinct LIDO keys present in the loaded records
lido_classes = extract_lido_classes(loaded_json_data)

# Report them in alphabetical order, one per line
print("Unique LIDO classes:")
sorted_class_names = sorted(lido_classes)
if sorted_class_names:
    print("\n".join(sorted_class_names))

Unique LIDO classes:
@lido:formatResource
@lido:pref
@lido:source
@lido:type
@xmlns:lido
lido:actor
lido:actorID
lido:actorInRole
lido:administrativeMetadata
lido:appellationValue
lido:attributionQualifierActor
lido:classification
lido:classificationWrap
lido:conceptID
lido:creditLine
lido:date
lido:descriptiveMetadata
lido:descriptiveNoteID
lido:descriptiveNoteValue
lido:displayEdition
lido:displayState
lido:displayStateEditionWrap
lido:earliestDate
lido:event
lido:eventActor
lido:eventDate
lido:eventID
lido:eventMaterialsTech
lido:eventMethod
lido:eventName
lido:eventPlace
lido:eventSet
lido:eventType
lido:eventWrap
lido:extentMeasurements
lido:inscriptionDescription
lido:inscriptionTranscription
lido:inscriptions
lido:inscriptionsWrap
lido:latestDate
lido:legalBodyID
lido:legalBodyName
lido:legalBodyWeblink
lido:lido
lido:lidoRecID
lido:linkResource
lido:materialsTech
lido:measurementType
lido:measurementUnit
lido:measurementValue
lido:measurementsSet
lido:nameActorSet
lido:namePlac