# Kalliope SRU Abfrage und parsen von MODS mit Python etree
Quelle: https://github.com/deutsche-nationalbibliothek/dnblab/blob/main/DNB_SRU_Tutorial.ipynb

In [23]:
import requests
from lxml import etree
import pandas as pd

In [24]:
# SRU query
def kalliope_sru(query):
    base_url = "https://kalliope-verbund.info/sru"
    params = {
        'version': '1.2',
        'operation': 'searchRetrieve',
        'recordSchema': 'mods37',
        'maximumRecords': '100',
        'query': query
    }
    
    r = requests.get(base_url, params=params)
    mods_content = r.content
    records_mods = etree.fromstring(mods_content)
    
    # Check if more than 100 records
    if len(records_mods.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'})) < 100:
        return records_mods
    else:
        num_results = 100
        i = 101
        while num_results == 100:
            params.update({'startRecord': i})
            r = requests.get(base_url, params=params)
            new_mods_content = r.content
            new_records_mods = etree.fromstring(new_mods_content)
            records_mods.extend(new_records_mods.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'}))
            i += 100
            num_results = len(new_records_mods.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'}))
        
        return records_mods


In [28]:
# function parse mods
def parse_mods(record):
    ns = {
        'srw': 'http://www.loc.gov/zing/srw/',  # SRW namespace
        'mods': 'http://www.loc.gov/mods/v3'    # MODS namespace
    }
    
    # Prüfen, ob der <title>-Tag leer ist
    empty_title = record.xpath(".//mods:titleInfo/mods:title[not(node())]", namespaces=ns)
    
    # Wenn der <title>-Tag nicht leer ist, diese Record überspringen
    if not empty_title:
        return None  # Überspringen, wenn kein leerer <title>-Tag
    
    # Falls leer, extrahiere andere Daten
    try:
        title = empty_title[0].text
    except IndexError:
        title = "unknown"
    
    # Extract holder //mods/location/physicalLocation/@authority Physischer Standort (bestandshaltende Einrichtung)
    holder = record.xpath(".//mods:location/mods:physicalLocation", namespaces=ns)    
    try:
        holder = holder[0].text
    except IndexError:
        holder = "unknown"
        
    # Extract ISIL //mods/location/physicalLocation/@authority Physischer Standort (bestandshaltende Einrichtung)
    ISILholder = record.xpath(".//mods:location/mods:physicalLocation/@authorityURI", namespaces=ns)
    
    lang_attributes = record.xpath("//@lang")
    try:
        lang_attributes = lang_attributes[:3] #only first 4 entries
    except IndexError:
        lang_attributes = "unknown"
    
    name = record.xpath(".//mods:namePart/text()", namespaces=ns)
    nameID = record.xpath(".//mods:name/@authority", namespaces=ns)
    abstract_content = record.xpath(".//mods:abstract[@type='content']/text()", namespaces=ns)
    abstract_content = abstract_content[0] if abstract_content else 'NaN'

    # Return a dictionary to build the DataFrame
    return {
        "Title": title,
        "holder": holder,
        "ISILholder": ISILholder,
        "langattributes": lang_attributes,
        "name": name,
        "nameID": nameID,
        "abstract_content": abstract_content
    }



In [31]:
# Example query
query = 'ead.repository.isil="DE-1a"'
records_xml = kalliope_sru(query)

print(f'{len(records_xml.xpath("//srw:record", namespaces={"srw": "http://www.loc.gov/zing/srw/"}))} Ergebnisse gefunden')


200 Ergebnisse gefunden


In [32]:
# query that ignores records with no empty title tag

records_xml = kalliope_sru(query)
records = records_xml.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'})
output = [parse_mods(record) for record in records if parse_mods(record) is not None]  # filter of empty title tag
df = pd.DataFrame(output)
df


Unnamed: 0,Title,holder,ISILholder,langattributes,name,nameID,abstract_content
0,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Grønbech, Kaare (1901-1957)]",[DE-588],"Übersetzungen, Vorlesungsmanuskripte, Jakutisc..."
1,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Gurlitt, Wilhelm (1844-1905)]",[DE-588],
2,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Schmidt, Carl]",[DE-588],"Manuskripte, Materialien u.a. zu Carl Schmidt ..."
3,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Harnack, Adolf von (1851-1930)]",[DE-588],"Personalpapiere, Ehrendiplome, Manuskripte, Ko..."
4,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]",[Verschiedene],[DE-611],Dokumente und Materialien zum 2. Weltkrieg
5,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Lembcke, Anders]",[DE-588],Teilnachlass mit Korrespondenzen u.a. Material...
6,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Gurlitt, Wilhelm (1844-1905), Gurlitt, Louis ...","[DE-588, DE-588, DE-588, DE-611, DE-588]","Wilhelm Gurlitt, Mercedes Gurlitt und Familie:..."
7,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Mommsen, Theodor (1817-1903)]",[DE-588],
8,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Galle, Johann Gottfried (1812-1910)]",[DE-588],"Manuskripte, Korrespondenz, Konzepte von Vorle..."
9,,Staatsbibliothek zu Berlin. Handschriftenabtei...,[http://ld.zdb-services.de/resource/organisati...,"[ger, ger, ger]","[Buff, Charlotte (1753-1828)]",[DE-588],70 Briefe von den Nachkommen


In [22]:
# print(etree.tostring(records_xml, pretty_print=True).decode())

<srw:searchRetrieveResponse xmlns:srw="http://www.loc.gov/zing/srw/" xmlns:srw_dc="info:srw/schema/1/dc-v1.1" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:diag="http://www.loc.gov/zing/srw/diagnostic/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:mods="http://www.loc.gov/mods/v3" xmlns:cld="http://www.ukoln.ac.uk/metadata/rslp/schema/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:my="http://www.infolytics.com/kopac" xmlns:eacNS="urn:isbn:1-931666-33-4">
   <srw:version>1.2</srw:version>
   <srw:numberOfRecords>46479</srw:numberOfRecords>
   <srw:records>
      <srw:record>
         <srw:recordSchema>info:srw/schema/1/mods-v3.7</srw:recordSchema>
         <srw:recordPacking>xml</srw:recordPacking>
         <srw:recordData>
            <mods xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" xsi:schemaLocation="http://www.loc.gov/mods/v3 https://www.loc.gov/standards/mods/v3/mods-3-7.xsd">
               <identifier type="uri

#### Parse data and convert to DataFrame
records = records_xml.xpath("//srw:record", namespaces={"srw": "http://www.loc.gov/zing/srw/"})
output = [parse_mods(record) for record in records]
df = pd.DataFrame(output)
df


# function parse mods
def parse_mods(record):
    ns = {
        'srw': 'http://www.loc.gov/zing/srw/',  # SRW namespace
        'mods': 'http://www.loc.gov/mods/v3'    # MODS namespace
    }
    
    # Extract the title (e.g., <titleInfo><title>)
    title = record.xpath(".//mods:titleInfo/mods:title", namespaces=ns)
    try:
        title = title[0].text
    except IndexError:
        title = "unknown"
        
    # Extract holder //mods/location/physicalLocation/@authority Physischer Standort (bestandshaltende Einrichtung)
    holder = record.xpath(".//mods:location/mods:physicalLocation", namespaces=ns)    
    try:
        holder = holder[0].text
    except IndexError:
        holder = "unknown"
        
    # Extract ISIL //mods/location/physicalLocation/@authority Physischer Standort (bestandshaltende Einrichtung)
    ISILholder = record.xpath(".//mods:location/mods:physicalLocation/@authorityURI", namespaces=ns)
    
    lang_attributes = record.xpath("//@lang")
    
    name = record.xpath(".//mods:namePart/text()", namespaces=ns)
    nameID = record.xpath(".//mods:name/@authority", namespaces=ns)
    abstract_content = record.xpath(".//mods:abstract[@type='content']/text()", namespaces=ns)
    abstract_content = abstract_content[0] if abstract_content else 'NaN'
    
    # XPath-Ausdruck zum Finden des leeren <title>-Tags
    empty_titles = record.xpath("//mods:title[not(node())]", namespaces=ns)

    # Ausgabe der leeren Titel
    for title in empty_titles:
        #print(f"Leerer <title>-Tag gefunden: {etree.tostring(title, pretty_print=True).decode()}")
        print(f"Leerer <title>-Tag gefunden")

   

    # Return a dictionary to build the DataFrame
    return {
        "Title": title,
        "holder": holder,
        "ISILholder": ISILholder,
        "langattributes": lang_attributes,
        "name": name,
        "nameID": nameID,
        "abstract_content": abstract_content,
        
        # Add more fields to extract as needed
        
        
    }