# Kalliope SRU Abfrage und parsen von MODS mit Python etree
Quelle: https://github.com/deutsche-nationalbibliothek/dnblab/blob/main/DNB_SRU_Tutorial.ipynb

In [19]:
import requests
from lxml import etree
import pandas as pd

In [20]:
# SRU query
def kalliope_sru(query):
    base_url = "https://kalliope-verbund.info/sru"
    params = {
        'version': '1.2',
        'operation': 'searchRetrieve',
        'recordSchema': 'mods37',
        'maximumRecords': '100',
        'query': query
    }
    
    r = requests.get(base_url, params=params)
    mods_content = r.content
    records_mods = etree.fromstring(mods_content)
    
    # Check if more than 100 records
    if len(records_mods.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'})) < 100:
        return records_mods
    else:
        num_results = 100
        i = 101
        while num_results == 100:
            params.update({'startRecord': i})
            r = requests.get(base_url, params=params)
            new_mods_content = r.content
            new_records_mods = etree.fromstring(new_mods_content)
            records_mods.extend(new_records_mods.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'}))
            i += 100
            num_results = len(new_records_mods.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'}))
        
        return records_mods


In [39]:
def parse_mods(record):
    ns = {
        'srw': 'http://www.loc.gov/zing/srw/',  # SRW namespace
        'mods': 'http://www.loc.gov/mods/v3'    # MODS namespace
    }
    
    # Prüfen, ob der <title>-Tag leer ist
    #empty_title = record.xpath(".//mods:titleInfo/mods:title[not(node())]", namespaces=ns)
    empty_title = record.xpath(".//mods:relatedItem[@type='host']/mods:titleInfo/mods:title[not(node())]", namespaces=ns)
    #//mods:relatedItem[@type="host"]/mods:titleInfo[not(mods:title)]
    
    # Wenn der <title>-Tag nicht leer ist, diese Record überspringen
    if not empty_title:
        return None  # Überspringen, wenn kein leerer <title>-Tag
    
    # Falls leer, extrahiere andere Daten
    title = "empty"  # Da der Titel leer ist, setzen wir einen Platzhalter
    
    # Extract holder //mods/location/physicalLocation/@authority Physischer Standort (bestandshaltende Einrichtung)
    holder_elements = record.xpath(".//mods:location/mods:physicalLocation", namespaces=ns)    
    holder = holder_elements[0].text if holder_elements else "unknown"
        
    # Extract ISIL //mods/location/physicalLocation/@authorityURI
    ISILholder_elements = record.xpath(".//mods:location/mods:physicalLocation/@authorityURI", namespaces=ns)
    ISILholder = ISILholder_elements[0] if ISILholder_elements else "unknown"
    
    lang_attributes = record.xpath("//@lang", namespaces=ns)
    lang_attributes = lang_attributes[:3]  # Nur die ersten 3 Einträge, falls vorhanden
    
    name = record.xpath(".//mods:namePart/text()", namespaces=ns)
    nameID = record.xpath(".//mods:name/@authority", namespaces=ns)
    abstract_content = record.xpath(".//mods:abstract[@type='content']/text()", namespaces=ns)
    abstract_content = abstract_content[0] if abstract_content else 'NaN'
    
    recordID = record.xpath(".//mods:recordInfo/mods:recordIdentifier/text()", namespaces=ns)
   

    # Return a dictionary to build the DataFrame
    return {
        
        "Title": title,
        "holder": holder,
        "ISILholder": ISILholder,
        "langattributes": lang_attributes,
        "name": name,
        "nameID": nameID,
        "abstract_content": abstract_content
        "recordID": recordID
        
    }


SyntaxError: invalid syntax (1810029463.py, line 48)

In [23]:
# Example query
query = 'ead.repository.isil="DE-M36a"'
records_xml = kalliope_sru(query)

print(f'{len(records_xml.xpath("//srw:record", namespaces={"srw": "http://www.loc.gov/zing/srw/"}))} Ergebnisse gefunden')


200 Ergebnisse gefunden


In [28]:
# query that ignores records with no empty title tag

records_xml = kalliope_sru(query)
records = records_xml.xpath("//srw:record", namespaces={'srw': 'http://www.loc.gov/zing/srw/'})
output = [parse_mods(record) for record in records if parse_mods(record) is not None]  # filter of empty title tag
df = pd.DataFrame(output)
df


Unnamed: 0,Title,holder,ISILholder,langattributes,name,nameID,abstract_content
0,empty,Münchner Stadtbibliothek / Monacensia,http://ld.zdb-services.de/resource/organisations/,"[ger, ger, ger]",[Bayerischer Rundfunk],[DE-611],
1,empty,Münchner Stadtbibliothek / Monacensia,http://ld.zdb-services.de/resource/organisations/,"[ger, ger, ger]","[Münchner Stadtbibliothek / Monacensia, Münchn...","[DE-611, DE-588]","Umf. Diplome, Medaillen, Plaketten und Bilder ..."
2,empty,Münchner Stadtbibliothek / Monacensia,http://ld.zdb-services.de/resource/organisations/,"[ger, ger, ger]","[Karlinger, Hans (1882-1944)]",[DE-588],
3,empty,Münchner Stadtbibliothek / Monacensia,http://ld.zdb-services.de/resource/organisations/,"[ger, ger, ger]","[Ille, Eduard (1823-1901)]",[DE-588],
4,empty,Münchner Stadtbibliothek / Monacensia,http://ld.zdb-services.de/resource/organisations/,"[ger, ger, ger]","[Münchner Stadtbibliothek / Monacensia, Münchn...",[DE-588],
5,empty,Münchner Stadtbibliothek / Monacensia,http://ld.zdb-services.de/resource/organisations/,"[ger, ger, ger]","[Reck-Malleczewen, Friedrich Percyval (1884-19...",[DE-588],


In [18]:
print(etree.tostring(records_xml, pretty_print=True).decode())

<srw:searchRetrieveResponse xmlns:srw="http://www.loc.gov/zing/srw/" xmlns:srw_dc="info:srw/schema/1/dc-v1.1" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:diag="http://www.loc.gov/zing/srw/diagnostic/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:mods="http://www.loc.gov/mods/v3" xmlns:cld="http://www.ukoln.ac.uk/metadata/rslp/schema/" xmlns:dcterms="http://purl.org/dc/terms/" xmlns:my="http://www.infolytics.com/kopac" xmlns:eacNS="urn:isbn:1-931666-33-4">
   <srw:version>1.2</srw:version>
   <srw:numberOfRecords>303882</srw:numberOfRecords>
   <srw:records>
      <srw:record>
         <srw:recordSchema>info:srw/schema/1/mods-v3.7</srw:recordSchema>
         <srw:recordPacking>xml</srw:recordPacking>
         <srw:recordData>
            <mods xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="http://www.loc.gov/mods/v3" xsi:schemaLocation="http://www.loc.gov/mods/v3 https://www.loc.gov/standards/mods/v3/mods-3-7.xsd">
               <identifier type="ur