<a href="https://colab.research.google.com/github/iued-uni-heidelberg/corpusdev/blob/main/bibliography_dataset_download_v05.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting bibliography for the Y1600, Y1700, Y1800 periods from library catalogues

## 1.1 downloading the json dump of vd17
This will last about 2 minutes, the command downloads 307 json files
(such files should be generated on stage 2, see 2.1, etc. -- but there are differences between the dupm and the downloaded version)

In [1]:
!git clone https://git.hab.de/beyer/vd17-dump/

Cloning into 'vd17-dump'...
remote: Enumerating objects: 4739, done.[K
remote: Counting objects: 100% (2862/2862), done.[K
remote: Compressing objects: 100% (660/660), done.[K
remote: Total 4739 (delta 2202), reused 2862 (delta 2202), pack-reused 1877[K
Receiving objects: 100% (4739/4739), 646.03 MiB | 18.37 MiB/s, done.
Resolving deltas: 100% (4033/4033), done.
Updating files: 100% (309/309), done.


now we try to filter the json objects, where

"langOrig":"eng",

e.g., /content/vd17-dump/json/vd17-290.json

	Line  6361:   "langOrig":"eng",
	Line 39193:   "langOrig":"eng",

and write this to a separate file

(ideally we try to create a separate language file for each langOrig value)

https://stackoverflow.com/questions/27189892/how-to-filter-json-array-in-python


### class to read the directory with json files and extract the needed language


In [3]:
import json
import re, os, sys

class clJsonDirFindFilter(object):
    '''
    @author Bogdan Babych, IÜD, Heidelberg University, 2023
    @email bogdan [dot] babych [at] iued [dot] uni-heidelberg [dot] de
    '''
    def __init__(self, SDirName, output_file = 'vdExtracted-all.json', debug_file = 'vdExtracted-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng'):
        self.FOut = open(output_file, 'w')
        self.FDebug = open(debug_file, 'w')
        self.FDebugCounts = open(debug_file_02, 'w')

        self.output_data = []
        self.dictVals = {}

        self.mainDirWalk(SDirName, findKey, filtVal)
        output_list = self.filtByLang(self.output_data, findKey, filtVal)

        self.dumpOutput(output_list, self.FOut)
        print(len(self.dictVals))
        self.printFrqDict(self.dictVals, self.FDebug)



    def mainDirWalk(self, path, findKey, filtVal):

        for root,d_names,f_names in os.walk(path):
            for f in f_names:
                # if not re.match('^[0-9]+$', f):
                if not re.search('json$', f):
                    print(f'Skipped: {f}')
                    continue
                fullpath = os.path.join(root, f)
                self.procFile(fullpath, findKey, filtVal)




    def procFile(self, SFIn, findKey, filtVal):
        # output_data is a json list of dictionaries, which is updated with every json file read from the directory

        with open(SFIn, 'r', encoding='utf-8') as input_json:
            input_list = json.load(input_json)
            countLO = 0
            output_data_one_file = []
            for i in input_list:
                if findKey in i:
                    countLO += 1

                    langOrigValue = i[findKey]

                    if langOrigValue not in self.dictVals:
                        self.dictVals[langOrigValue] = 0
                    self.dictVals[langOrigValue] += 1

                    self.output_data.append(i)
                    output_data_one_file.append(i)

            output_list_one_file = list(filter(lambda x: x[findKey] == filtVal, output_data_one_file))
            output_list_len_one_file = len(output_list_one_file)
            self.FDebugCounts.write(f'{SFIn}\t{countLO}\t{output_list_len_one_file}\n')

    def filtByLang(self, output_data2filter, findKey, filtVal):
        # output_list = [x for x in input_list if x['langOrig'] == 'eng']
        output_list = list(filter(lambda x: x[findKey] == filtVal, output_data2filter))
        return output_list


    def dumpOutput(self, output_data2print, output_dict_file):
        json.dump(output_data2print, output_dict_file, indent=4, ensure_ascii=False)
        output_dict_file.flush()


    def printFrqDict(self, DFrq2print, FOutput):
        for key, val in sorted(DFrq2print.items(), key=lambda item: item[1], reverse=True):
            FOutput.write(f'{key}\t{val}\n')
        FOutput.flush()




    # output_json_string = json.dumps(output_data)
    ## json.dump(output_data, output_dict_file, indent=4, ensure_ascii=False)

# end: class clJsonDirFindFilter

In [None]:
OJsonDirFindFilter = clJsonDirFindFilter('/content/vd17-dump/json')

In [4]:
'''
OJsonDirFindFilter0 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-eng-all.json', debug_file = 'vdExtracted-eng-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-eng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng')
OJsonDirFindFilter1 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-engFre-all.json', debug_file = 'vdExtracted-engFre-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-engFre-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng;fre')
OJsonDirFindFilter2 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-engDut-all.json', debug_file = 'vdExtracted-engDut-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-engDut-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng;dut')
OJsonDirFindFilter3 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-engIta-all.json', debug_file = 'vdExtracted-engIta-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-engIta-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng;ita')
OJsonDirFindFilter4 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-latEng-all.json', debug_file = 'vdExtracted-latEng-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-latEng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'lat;eng')
'''

OJsonDirFindFilter01 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-eng-all.json', debug_file = 'vdExtracted-eng-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-eng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng')

OJsonDirFindFilter02 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-engFre-all.json', debug_file = 'vdExtracted-engFre-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-engFre-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng;fre')
OJsonDirFindFilter03 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-engDut-all.json', debug_file = 'vdExtracted-engDut-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-engDut-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng;dut')
OJsonDirFindFilter04 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-FreEng-all.json', debug_file = 'vdExtracted-FreEng-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-FreEng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'fre;eng')
OJsonDirFindFilter05 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-DutEng-all.json', debug_file = 'vdExtracted-DutEng-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-DutEng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'dut;eng')

OJsonDirFindFilter06 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-engIta-all.json', debug_file = 'vdExtracted-engIta-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-engIta-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng;ita')
OJsonDirFindFilter07 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-latEng-all.json', debug_file = 'vdExtracted-latEng-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-latEng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'lat;eng')
OJsonDirFindFilter08 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-FreDutEng-all.json', debug_file = 'vdExtracted-FreDutEng-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-FreDutEng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'fre;dut;eng')
OJsonDirFindFilter09 = clJsonDirFindFilter('/content/vd17-dump/json', output_file = 'vdExtracted-EngFreDut-all.json', debug_file = 'vdExtracted-EngFreDut-debug-oriLangsAll.txt', debug_file_02 = 'vdExtracted-EngFreDut-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng;fre;dut')

# fre;dut;eng
# eng;fre;dut


81
81
81
81
81
81
81
81
81


In [None]:
!tar -cvzf vdExtracted-langs.tgz *.json

In [5]:
!zip vdExtracted-langs.zip *.json

  adding: vdExtracted-DutEng-all.json (deflated 80%)
  adding: vdExtracted-eng-all.json (deflated 86%)
  adding: vdExtracted-engDut-all.json (deflated 81%)
  adding: vdExtracted-engFre-all.json (deflated 80%)
  adding: vdExtracted-EngFreDut-all.json (deflated 51%)
  adding: vdExtracted-engIta-all.json (deflated 87%)
  adding: vdExtracted-FreDutEng-all.json (deflated 68%)
  adding: vdExtracted-FreEng-all.json (deflated 86%)
  adding: vdExtracted-latEng-all.json (deflated 74%)


## 2.1 setting up the environment for download xml and converting to json

We will try to download again the original xml for 17, check if it agrees with Dump, and then try to download 18, etc.

In [7]:
!git clone https://github.com/hbeyer/pylib

Cloning into 'pylib'...
remote: Enumerating objects: 694, done.[K
remote: Counting objects: 100% (270/270), done.[K
remote: Compressing objects: 100% (193/193), done.[K
remote: Total 694 (delta 183), reused 162 (delta 77), pack-reused 424[K
Receiving objects: 100% (694/694), 398.78 KiB | 2.68 MiB/s, done.
Resolving deltas: 100% (448/448), done.


In [8]:
%cd /content/pylib/
!pwd

/content/pylib
/content/pylib


In [9]:
from lib import bookwheel as bw

In [10]:
import logging
import pickle
from lib import sru
from lib import isil
from lib import xmlreader as xr
from lib import pica

In [11]:
%cd /content
!pwd

/content
/content


In [12]:
# testing
# from lib import bookwheel as bw
cat = bw.Catalogue
sec = cat.get_section(2589)
print(sec)


{'start': 2511, 'end': 2738, 'group': 'Libri Varii', 'dateBegin': '1634', 'year': 1634, 'writer': 'Herzog August'}


In [13]:
!mkdir xmlbibliography
!mkdir xmlbibliographyac

In [14]:
!mkdir jsonbibliography

## 2.2 running the download script for RecordVD17

In [None]:
# import logging
# import pickle
# from lib import sru
# from lib import isil
# from lib import xmlreader as xr
# from lib import pica

logging.basicConfig(level=logging.ERROR)

# Festlegen der Speicherpfade und der Datensätze pro JSON-Datei
# source_folder = "{Ordner mit den PICAXML-Dateien}/"
source_folder = "/content/xmlbibliography/"
# source_folder_ac = "{Ordner mit den PICAXML-Dateien für Gesamtaufnahmen mehrbändiger Werke (Ac-Sätze)}/"
source_folder_ac = "/content/xmlbibliographyac/"
# target_folder = "{Ordner zum Speichern der JSON-Dateien}/"
target_folder = "/content/jsonbibliography/"
size = 1000
limit = 350000

# Laden der Ac-Sätze und Extrahieren der Gattungsbegriffe
req = sru.Request_VD17()
num = req.prepare("pica.bbg=Ac*")
print(req.url)
print(req.numFound)
req.download(source_folder_ac)

res = {}
reader = xr.DownloadReader(source_folder_ac, "record", "info:srw/schema/5/picaXML-v1.0")

for count, node in enumerate(reader):
    rec = pica.RecordVD17(node)
    gatt = [gat for gat in rec.gatt]
    if gatt == []:
        continue
    res[rec.ppn] = gatt
    if count > 100000:
        break

with open('gattungen-ac', 'wb') as file:
    pickle.dump(res, file)

# Download der PICA-XML-Daten
req = sru.Request_VD17()
num = req.prepare("pica.bbg=(Aa* or Af*)")
print(req.url)
print(req.numFound)
req.download(source_folder)

# Auslesen und Abspeichern in JSON
with open('gattungen-ac','rb') as file:
    gatt_ac = pickle.load(file)

reader = xr.DownloadReader(source_folder, "record", "info:srw/schema/5/picaXML-v1.0")

content = []
fnn = []
setn = 1
count = 0

for no, node in enumerate(reader):
    rec = pica.RecordVD17(node)
    if rec.get_rec_type() in ["Teilband", "Teilband mit eigenem Titel"]:
        try:
            rec.gatt = gatt_ac[rec.ppn_sup]
        except:
            logging.info(f"Keine Gattungsbegriffe bei PPN {rec.ppn_sup}")
    content.append(rec)
    count += 1
    if count >= size:
        recl = pica.RecordList(content)
        fn = f"vd17-{str(setn).zfill(3)}"
        recl.to_json(target_folder + fn)
        content = []
        setn += 1
        count = 0
    if no > limit:
        break
if content != []:
    recl = pica.RecordList(content)
    fn = f"vd17-{str(setn).zfill(3)}"
    recl.to_json(target_folder + fn)


In [None]:
!tar -cvzf xmlbibliography.tgz ./xmlbibliography

In [None]:
!tar -cvzf jsonbibliography.tgz ./jsonbibliography

## Extracting from downloaded json files (not the dump)
now we need to extract data from the downloader repository, and check the statistics...

In [16]:
OJsonDirFindFilter10 = clJsonDirFindFilter('/content/jsonbibliography', output_file = 'veExtracted-eng-all.json', debug_file = 'veExtracted-eng-debug-oriLangsAll.txt', debug_file_02 = 'veExtracted-eng-debug-sourceFileCounts.txt', findKey = 'langOrig', filtVal = 'eng')


80


## Testing VD18 & VD16


In [None]:
!mkdir xmlbibliography18
!mkdir xmlbibliographyac18
!mkdir jsonbibliography18

In [None]:
# import logging
# import pickle
# from lib import sru
# from lib import isil
# from lib import xmlreader as xr
# from lib import pica

logging.basicConfig(level=logging.ERROR)

# Festlegen der Speicherpfade und der Datensätze pro JSON-Datei
# source_folder = "{Ordner mit den PICAXML-Dateien}/"
source_folder = "/content/xmlbibliography18/"
# source_folder_ac = "{Ordner mit den PICAXML-Dateien für Gesamtaufnahmen mehrbändiger Werke (Ac-Sätze)}/"
source_folder_ac = "/content/xmlbibliographyac18/"
# target_folder = "{Ordner zum Speichern der JSON-Dateien}/"
target_folder = "/content/jsonbibliography18/"
size = 1000
limit = 350000

# Laden der Ac-Sätze und Extrahieren der Gattungsbegriffe
req = sru.Request_VD18()
num = req.prepare("pica.bbg=Ac*")
print(req.url)
print(req.numFound)
req.download(source_folder_ac)

res = {}
reader = xr.DownloadReader(source_folder_ac, "record", "info:srw/schema/5/picaXML-v1.0")

for count, node in enumerate(reader):
    rec = pica.RecordVD18(node)
    gatt = [gat for gat in rec.gatt]
    if gatt == []:
        continue
    res[rec.ppn] = gatt
    if count > 100000:
        break

with open('gattungen-ac', 'wb') as file:
    pickle.dump(res, file)

# Download der PICA-XML-Daten
req = sru.Request_VD18()
num = req.prepare("pica.bbg=(Aa* or Af*)")
print(req.url)
print(req.numFound)
req.download(source_folder)

# Auslesen und Abspeichern in JSON
with open('gattungen-ac','rb') as file:
    gatt_ac = pickle.load(file)

reader = xr.DownloadReader(source_folder, "record", "info:srw/schema/5/picaXML-v1.0")

content = []
fnn = []
setn = 1
count = 0

for no, node in enumerate(reader):
    rec = pica.RecordVD18(node)
    if rec.get_rec_type() in ["Teilband", "Teilband mit eigenem Titel"]:
        try:
            rec.gatt = gatt_ac[rec.ppn_sup]
        except:
            logging.info(f"Keine Gattungsbegriffe bei PPN {rec.ppn_sup}")
    content.append(rec)
    count += 1
    if count >= size:
        recl = pica.RecordList(content)
        fn = f"vd18-{str(setn).zfill(3)}"
        recl.to_json(target_folder + fn)
        content = []
        setn += 1
        count = 0
    if no > limit:
        break
if content != []:
    recl = pica.RecordList(content)
    fn = f"vd18-{str(setn).zfill(3)}"
    recl.to_json(target_folder + fn)

### VD16 -- error (?)


In [19]:
!mkdir xmlbibliography16
!mkdir xmlbibliographyac16
!mkdir jsonbibliography16

In [20]:
# import logging
# import pickle
# from lib import sru
# from lib import isil
# from lib import xmlreader as xr
# from lib import pica

logging.basicConfig(level=logging.ERROR)

# Festlegen der Speicherpfade und der Datensätze pro JSON-Datei
# source_folder = "{Ordner mit den PICAXML-Dateien}/"
source_folder = "/content/xmlbibliography16/"
# source_folder_ac = "{Ordner mit den PICAXML-Dateien für Gesamtaufnahmen mehrbändiger Werke (Ac-Sätze)}/"
source_folder_ac = "/content/xmlbibliographyac16/"
# target_folder = "{Ordner zum Speichern der JSON-Dateien}/"
target_folder = "/content/jsonbibliography16/"
size = 1000
limit = 350000

# Laden der Ac-Sätze und Extrahieren der Gattungsbegriffe
req = sru.Request_VD16()
num = req.prepare("pica.bbg=Ac*")
print(req.url)
print(req.numFound)
req.download(source_folder_ac)

res = {}
reader = xr.DownloadReader(source_folder_ac, "record", "info:srw/schema/5/picaXML-v1.0")

for count, node in enumerate(reader):
    rec = pica.RecordVD16(node)
    gatt = [gat for gat in rec.gatt]
    if gatt == []:
        continue
    res[rec.ppn] = gatt
    if count > 100000:
        break

with open('gattungen-ac', 'wb') as file:
    pickle.dump(res, file)

# Download der PICA-XML-Daten
req = sru.Request_VD16()
num = req.prepare("pica.bbg=(Aa* or Af*)")
print(req.url)
print(req.numFound)
req.download(source_folder)

# Auslesen und Abspeichern in JSON
with open('gattungen-ac','rb') as file:
    gatt_ac = pickle.load(file)

reader = xr.DownloadReader(source_folder, "record", "info:srw/schema/5/picaXML-v1.0")

content = []
fnn = []
setn = 1
count = 0

for no, node in enumerate(reader):
    rec = pica.RecordVD16(node)
    if rec.get_rec_type() in ["Teilband", "Teilband mit eigenem Titel"]:
        try:
            rec.gatt = gatt_ac[rec.ppn_sup]
        except:
            logging.info(f"Keine Gattungsbegriffe bei PPN {rec.ppn_sup}")
    content.append(rec)
    count += 1
    if count >= size:
        recl = pica.RecordList(content)
        fn = f"vd16-{str(setn).zfill(3)}"
        recl.to_json(target_folder + fn)
        content = []
        setn += 1
        count = 0
    if no > limit:
        break
if content != []:
    recl = pica.RecordList(content)
    fn = f"vd16-{str(setn).zfill(3)}"
    recl.to_json(target_folder + fn)

AttributeError: ignored

## testing commands - to be removed

In [None]:
%cd xmlbibliography/

/content/xmlbibliography


In [None]:
!echo $PYTHONPATH

"$/env/python"


In [None]:
!echo $PATH

"/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/content/pylib/lib"


In [None]:
%env PATH="/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/content/pylib/lib"

env: PATH="/opt/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/tools/node/bin:/tools/google-cloud-sdk/bin:/content/pylib/lib"


In [None]:
%env PYTHONPATH="$/env/python"

env: PYTHONPATH="$/env/python"


In [None]:
%env PYTHONPATH="$/env/python:/content/pylib/lib/:/content/pylib/lib/bookwheel.py"
# !PYTHONPATH=. ./comet/cli/score.py

env: PYTHONPATH="$/env/python:/content/pylib/lib/:/content/pylib/lib/bookwheel.py"


In [None]:
%cd /content/
!pwd

/content
/content


In [None]:
!wget http://sru.k10plus.de/vd17?version=2.0&operation=searchRetrieve&query=pica.bbg=%28Aa*%20or%20Af*%29&maximumRecords=500&startRecord=1&recordSchema=picaxml

In [None]:
!mv vd17?version=2.0 vd17_500.xml

In [None]:
%cd /content/

/content
