In [1]:
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
import unicodedata
import io
import xml.etree.ElementTree as ET
import requests

In [37]:
# Load the XML result file of an ONB SRU request and keep its root element.
# NOTE(review): the note and query below say "after 2009" / main_pub_date > 2009,
# but the file parsed here is named "...before2009..." - confirm which data set
# this file really contains and update the note accordingly.
#50 first records for fiction in ONB after 2009
#SRU: https://obv-at-oenb.alma.exlibrisgroup.com/view/sru/43ACC_ONB?version=1.2&query=alma.national_bibliography_number=OeBC%20and%20alma.local_field_970=56%20and%20alma.%20main_pub_date%3E2009&operation=searchRetrieve

parsed_xml = ET.parse("LT_MC_160424_onb_request_before2009_xmlresults.xml") # This creates an ElementTree
xml_data = parsed_xml.getroot() # This creates an Element; later cells iterate over its direct children

In [38]:
# Helper and function for extracting the bibliographic fields of one record:
def _first_text(record, path, ns, default="fail"):
    """Return the text of the first element below ``record`` matching ``path``.

    Parameters:
        record: element the (relative) path is evaluated against.
        path: ElementPath expression using the prefixes declared in ``ns``.
        ns: mapping of namespace prefix to namespace URI.
        default: value returned when nothing matches.

    Returns:
        The ``.text`` of the first match (which may itself be ``None`` for an
        empty element), or ``default`` when there is no match.
    """
    match = record.find(path, namespaces=ns)
    # Compare against None explicitly: an element without children is falsy.
    return default if match is None else match.text


def parse_record(record):
    """Extract a fixed set of MARC fields from one record element.

    Parameters:
        record: element whose direct children are ``controlfield`` /
            ``datafield`` elements in the MARC21 slim namespace (all paths
            below are relative to it).

    Returns:
        dict with the keys 'IDN', 'ISBN', 'year', 'author', 'language',
        'original language', 'title', 'uniform title', 'publisher' and
        'country' (in this order). Each value is the text of the first
        matching element, or the string 'fail' when the record has no such
        field.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}

    # Output key -> path of the element to read. The ISBN, title and uniform
    # title paths used to lack the '/' between the datafield and subfield
    # steps; that is not well-formed XPath and only worked through parser
    # leniency.
    paths = {
        'IDN': "marc:controlfield[@tag = '001']",
        'ISBN': "marc:datafield[@tag = '020']/marc:subfield[@code = 'a']",
        'year': "marc:datafield[@tag = '264']/marc:subfield[@code = 'c']",
        'author': "marc:datafield[@tag = '100']/marc:subfield[@code = 'a']",
        'language': "marc:datafield[@tag = '041']/marc:subfield[@code = 'a']",
        'original language': "marc:datafield[@tag = '041']/marc:subfield[@code = 'h']",
        'title': "marc:datafield[@tag = '245']/marc:subfield[@code = 'a']",
        'uniform title': "marc:datafield[@tag = '240']/marc:subfield[@code = 'a']",
        # NOTE(review): 264 $a is the place of publication in MARC 21 (the
        # publisher name is 264 $b). Behaviour is kept as it was - confirm
        # whether this column is meant to hold the place or the publisher.
        'publisher': "marc:datafield[@tag = '264']/marc:subfield[@code = 'a']",
        'country': "marc:datafield[@tag = '044']/marc:subfield[@code = 'c']",
    }

    # A missing field is reported as 'fail' instead of being hidden behind a
    # bare ``except``; an absent match is the only failure the lookups can have.
    gathered = {key: _first_text(record, path, ns) for key, path in paths.items()}
    return gathered
    

In [39]:
# Turn every record into a row dict and collect the rows in a table:
result = list(map(parse_record, xml_data))
df_all = pd.DataFrame(data=result)
df_all

Unnamed: 0,IDN,ISBN,year,author,language,original language,title,uniform title,publisher,country
0,9959032210903338,9788664072113,2022,"Reyer, Sophie",srp,ger,: izabrane pesme,fail,Beograd,XA-RS
1,990056743370603338,9788467582468,octubre de 2015,"Brezina, Thomas",spa,ger,¡Ah del barco fantasma!,Geisterschiff ahoi!,Boadilla del Monte (Madrid),XA-ES
2,990037600490603338,9788483430873,2010,"Nöstlinger, Christine",spa,ger,¡Ojo! ¡Vranek parece totalmente inofensivo!,Achtung! Vranek sieht ganz harmlos aus,[Barcelona],XA-ES
3,990044392180603338,9788492649310,2010,"Zweig, Stefan",spa,ger,¿Fue él?,War er es? <<span.>>,Barcelona,XA-ES
4,990038910480603338,9783844808896,[2011],"Heresch, André",ger,fail,... ein Stück von mir,fail,Norderstedt,XA-DE
...,...,...,...,...,...,...,...,...,...,...
17310,990018577880603338,fail,1951,"Bednarik, Karl",ger,fail,Zwischenfall in Wien,fail,Tübingen,XA-DE
17311,990008242010603338,3423133414,2005,"Vertlib, Vladimir",ger,fail,Zwischenstationen,fail,München,XA-DE
17312,990044509070603338,fail,1958,"Blok, Aleksandr Aleksandrovič",ger,fail,<<Die>> Zwölf,Dvenadcat',Frankfurt a.M.,XA-DE
17313,990012983320603338,9783492050135,2007,"Molnár, Ákos",ger,hun,Zwölf Schritte,Tizenkét lépés,München [u.a.],XA-DE


In [40]:
# Keep only the rows for which an original language was found
# (parse_record stores the string 'fail' when the field is missing):
df2 = df_all.loc[df_all['original language'].ne('fail')]
df2

Unnamed: 0,IDN,ISBN,year,author,language,original language,title,uniform title,publisher,country
0,9959032210903338,9788664072113,2022,"Reyer, Sophie",srp,ger,: izabrane pesme,fail,Beograd,XA-RS
1,990056743370603338,9788467582468,octubre de 2015,"Brezina, Thomas",spa,ger,¡Ah del barco fantasma!,Geisterschiff ahoi!,Boadilla del Monte (Madrid),XA-ES
2,990037600490603338,9788483430873,2010,"Nöstlinger, Christine",spa,ger,¡Ojo! ¡Vranek parece totalmente inofensivo!,Achtung! Vranek sieht ganz harmlos aus,[Barcelona],XA-ES
3,990044392180603338,9788492649310,2010,"Zweig, Stefan",spa,ger,¿Fue él?,War er es? <<span.>>,Barcelona,XA-ES
5,9958505677803338,9782351281475,[2018],"Trakl, Georg",fre,ger,... Et des soleils éternellement recommencés n...,fail,Montélimar (Drôme),XA-FR
...,...,...,...,...,...,...,...,...,...,...
17275,990049196150603338,fail,1952,"Zweig, Stefan",scr,ger,Zvezdani časovi čovečanstva,Sternstunden der Menschheit,Beograd,XA-YUCS
17287,990042606860603338,9783463405223,2008,"Monaldi, Rita",ger,ita,<<Die>> Zweifel des Salaì,<<I>> dubbi di Salaì,Berlin,XA-DE
17295,990005371050603338,3458192352,2002,Michelangelo,ger,ita,Zweiundvierzig Sonette,fail,Frankfurt am Main [u.a.],XA-DE
17313,990012983320603338,9783492050135,2007,"Molnár, Ákos",ger,hun,Zwölf Schritte,Tizenkét lépés,München [u.a.],XA-DE


In [41]:
# Write the filtered table to a CSV file in the working directory
# (to_csv keeps the row index as the first column by default).
df2.to_csv("290424_onb_trans_before2009.csv")

In [None]:
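### ARCHIVE: older cells kept for reference. They read from `gndm`, which is not defined in the cells above, and they reuse the names parse_record, result, df_all and df2. ###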

In [None]:
# Function: count languages
def howmanylanguages(record):
    """Count the language codes given in field 041 of one MARC record.

    Parameters:
        record: object whose ``str()`` is the XML of a single record; the
            string is NFC-normalised and parsed, and the paths below are
            evaluated relative to the parsed root element.

    Returns:
        dict with the keys 'how many original languages' (number of 041 $h
        subfields) and 'how many translated languages' (number of 041 $a
        subfields). Both counts are 0 when the field is absent.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    xml = etree.fromstring(unicodedata.normalize("NFC", str(record)))

    # MARC tags are three characters long, so the language field is '041';
    # the former predicate @tag = '41' could never match it.
    original_languages = [
        subfield.text
        for subfield in xml.findall("marc:datafield[@tag = '041']/marc:subfield[@code = 'h']", namespaces=ns)
    ]
    translated_languages = [
        subfield.text
        for subfield in xml.findall("marc:datafield[@tag = '041']/marc:subfield[@code = 'a']", namespaces=ns)
    ]

    # Count the matches directly. The earlier 'N/A' placeholder string would
    # have been measured with len() and reported as 3 languages.
    counting_languages = {'how many original languages': len(original_languages),
                          'how many translated languages': len(translated_languages)}
    return counting_languages

In [None]:
# Count the languages of every item in gndm and tabulate the counts:
numbers = list(map(howmanylanguages, gndm))
df1 = pd.DataFrame(data=numbers)
df1

In [None]:
# Display the maximum value of each column of df1:
df1.max()

In [None]:
# Function for extracting the author, the ID and the related places of a record:
def parse_record(record):
    """Extract author, ID and up to nine place/relation pairs from one record.

    Parameters:
        record: object whose ``str()`` is the XML of a single record; the
            string is NFC-normalised and parsed, and all paths are evaluated
            relative to the parsed root element.

    Returns:
        dict with the keys 'Author', 'ID' and 'Ort1', 'Art1', ... 'Ort9',
        'Art9' (in this order).
        * 'Author' / 'ID': text of the first 100 $a / controlfield 001, or
          'fail' when missing.
        * 'OrtN' / 'ArtN': $a and $i of the N-th 551 field that has an $a.
          Unused slots hold the string 'None'; a 551 field without $i gets
          'None' as its relation.
        * If more than nine places are found, every slot holds
          'Zuviele Treffer'.

    Places and relations are paired within the same 551 field. Previously
    they were collected in two independent lists, which raised IndexError
    (or mixed up fields) as soon as one 551 field had no $i.
    """
    ns = {"marc": "http://www.loc.gov/MARC21/slim"}
    xml = etree.fromstring(unicodedata.normalize("NFC", str(record)))

    max_places = 9          # number of Ort/Art column pairs in the output
    missing = "None"        # placeholder string the downstream filter compares against
    too_many = "Zuviele Treffer"

    # Author:
    creator = xml.find("marc:datafield[@tag = '100']/marc:subfield[@code = 'a']", namespaces=ns)
    author = "fail" if creator is None else creator.text

    # IDN:
    control = xml.find("marc:controlfield[@tag = '001']", namespaces=ns)
    idn = 'fail' if control is None else control.text

    # Places and their relation, one pair per 551 field:
    pairs = []
    for field in xml.findall("marc:datafield[@tag = '551']", namespaces=ns):
        place = field.find("marc:subfield[@code = 'a']", namespaces=ns)
        if place is None:
            # Only fields that actually name a place were counted before.
            continue
        relation = field.find("marc:subfield[@code = 'i']", namespaces=ns)
        pairs.append((place.text, missing if relation is None else relation.text))

    if len(pairs) > max_places:
        pairs = [(too_many, too_many)] * max_places
    else:
        pairs = pairs + [(missing, missing)] * (max_places - len(pairs))

    # Merge into one row:
    gathered = {'Author': author, 'ID': idn}
    for number, (place_text, relation_text) in enumerate(pairs, start=1):
        gathered['Ort%d' % number] = place_text
        gathered['Art%d' % number] = relation_text
    return gathered

In [None]:
# Turn every item of gndm into a row dict and collect the rows in a table:
result = list(map(parse_record, gndm))
df_all = pd.DataFrame(data=result)
df_all

In [None]:
# Keep only the rows whose first place column ('Ort1') is filled,
# i.e. does not hold the placeholder string 'None':
df2 = df_all.loc[df_all['Ort1'].ne('None')]
df2

In [None]:
# Write the filtered table to a CSV file in the working directory
# (to_csv keeps the row index as the first column by default).
df2.to_csv("20220128_dnball_authors_places.csv")