# Access and transform XML metadata and integrate with fulltexts to JSON

<h1><span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Prerequisites" data-toc-modified-id="Prerequisites-0"><span class="toc-item-num">0&nbsp;&nbsp;</span>Prerequisites</a></span></li><li><span><a href="#Access-XML-metadata-from-e-periodica-and-e-rara" data-toc-modified-id="Access-XML-metadata-from-e-periodica-and-e-rara-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Access XML metadata from e-periodica and e-rara</a></span><ul class="toc-item"><li><span><a href="#Access-XML-metadata-from-e-periodica-and-save-to-local" data-toc-modified-id="Access-XML-metadata-from-e-periodica-and-save-to-local-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Access XML metadata from e-periodica and save to local</a></span></li><li><span><a href="#Access-XML-metadata-from-e-rara-(Bernensia-set)-and-save-to-local" data-toc-modified-id="Access-XML-metadata-from-e-rara-(Bernensia-set)-and-save-to-local-1.2"><span class="toc-item-num">1.2&nbsp;&nbsp;</span>Access XML metadata from e-rara (Bernensia set) and save to local</a></span></li><li><span><a href="#Upload-to-AWS-S3" data-toc-modified-id="Upload-to-AWS-S3-1.3"><span class="toc-item-num">1.3&nbsp;&nbsp;</span>Upload to AWS S3</a></span></li></ul></li><li><span><a href="#Transform-XML-file-to-JSON" data-toc-modified-id="Transform-XML-file-to-JSON-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Transform XML file to JSON</a></span><ul class="toc-item"><li><span><a href="#Transform-e-rara-XML-to-JSON-with-inserting-full-texts---local" data-toc-modified-id="Transform-e-rara-XML-to-JSON-with-inserting-full-texts---local-2.1"><span class="toc-item-num">2.1&nbsp;&nbsp;</span>Transform e-rara XML to JSON with inserting full texts - local</a></span></li><li><span><a href="#Transform-e-periodica-XML-to-JSON-with-inserting-full-texts---remote:-S3-->-S3" data-toc-modified-id="Transform-e-periodica-XML-to-JSON-with-inserting-full-texts---remote:-S3-->-S3-2.2"><span class="toc-item-num">2.2&nbsp;&nbsp;</span>Transform e-periodica XML to JSON with inserting full texts - remote: S3 -&gt; S3</a></span></li><li><span><a href="#Transform-e-periodica-XML-to-JSON-without-fulltexts---local" data-toc-modified-id="Transform-e-periodica-XML-to-JSON-without-fulltexts---local-2.3"><span class="toc-item-num">2.3&nbsp;&nbsp;</span>Transform e-periodica XML to JSON without fulltexts - local</a></span></li></ul></li></ul></div>

## Prerequisites

In [26]:
# Load the necessary libraries
import requests                                 # request URLs
from bs4 import BeautifulSoup as soup           # webscrape and parse HTML and XML
!pip install lxml
import lxml                                     # XML parser supported by bs4
                                                # call with soup(markup, 'lxml-xml' OR 'xml')
import os                                       # navigate and manipulate file directories
import time                                     # work with time stamps
import pandas as pd                             # pandas is the Python standard library to work with dataframes
import math                                     # work with mathematical functions
import re                                       # work with regular expressions
!pip install xmltodict
import xmltodict                                # read and transform XML
import json                                     # work with rjson
from IPython.display import IFrame              # embed website views in Jupyter Notebook
from IPython.utils import io                    # for capturing output

!pip install awscli
!pip install boto3
import boto3

print("Succesfully imported necessary libraries")

Succesfully imported necessary libraries


## Access XML metadata from e-periodica and e-rara

### Access XML metadata from e-periodica and save to local

In [6]:
oai = 'https://www.e-periodica.ch/oai/'

def load_xml(params):
    '''
    Accesses the OAI interface according to given parameters and scrapes its content.
    Parameters:
    All available native OAI verbs and parameter/value pairs.
    '''
    base_url = oai
    response = requests.get(base_url, params=params)
    output_soup = soup(response.content, "lxml")
    return output_soup

def download_record(ID, filename=None):
    '''
    Downloads a certain metadata record from OAI to a single XML file.
    Throws a notice if metadata file already exists and leaves the existing one.
    Parameters:
    ID = E-periodica ID of the desired record.
    filename = File name to choose for the downloaded record. Without a filename one is derived from ID.
    '''
    path = os.getcwd()
    output_soup = load_xml({'verb': 'GetRecord', 'metadataPrefix': 'oai_dc', 'identifier': 'oai:agora.ch:' + str(ID)})
    if filename == None:
        filename=re.sub(':', '_', ID)
    else: pass
    outfile = path + '/{}.xml'.format(filename) 
    try:
        with open(outfile, mode='x', encoding='utf-8') as f:
            f.write(output_soup.decode())
            #print("Metadata file {}.xml saved".format(filename))
    except FileExistsError:
            print("Metadata file {}.xml exists already".format(filename))
    finally:
            pass

In [13]:
# Test
download_record('zgh-001:1977:39::177', 'test')

In [4]:
with open('content/eperiodica_files.csv', 'r') as f:
    files = pd.read_csv(f, encoding='utf-8')
files.head()

Unnamed: 0,Column 1,Column 2,size,id_intern,file
0,2021-09-26,12:03:57,9.9 MiB,zgh-001:1939:1::293,zgh-001:1939:1::293.pdf
1,2021-09-26,12:03:58,3.5 MiB,zgh-001:1939:1::294,zgh-001:1939:1::294.pdf
2,2021-09-26,12:04:01,11.0 MiB,zgh-001:1939:1::295,zgh-001:1939:1::295.pdf
3,2021-09-26,12:04:03,9.1 MiB,zgh-001:1939:1::296,zgh-001:1939:1::296.pdf
4,2021-09-26,12:04:05,1.4 MiB,zgh-001:1939:1::297,zgh-001:1939:1::297.pdf


In [2]:
for i in files.index:
    download_record(files.id_intern[i])

### Access XML metadata from e-rara (Bernensia set) and save to local

In [51]:
def retrieve_set_metadata(Set, foldername, metadataPrefix='mods'):
    '''
    Downloads metadata records of a given set and in a given format from OAI to XML files
    in a designated folder.
    Therefore it
    * requests e-rara OAI-PMH interface according to a set 
    * creates a folder for the records according to parameter foldername
    * retrieves the set's e-rara IDs
    * retrieves metadata according to IDs and given metadata format (default: MODS)
    * saves metadata to single <e-rara ID>.xml files in the folder.
    Parameters:
    Set = The 'setSpec' short cut of the desired OAI set.
    foldername = The name of the folder which will be created to hold the metadata files.
    metadataPrefix = Metadata format to be delivered. Default value is MODS.
    '''
    start = time.perf_counter()

    # Set parameters to the interface
    base_url = oai
    recordsearch_term = {'verb': 'GetRecord', 'metadataPrefix': metadataPrefix}
    listsearch_term = {'verb': 'ListIdentifiers', 'metadataPrefix': metadataPrefix, 'set': Set}
    
    # Make a folder <metadata> with subfolder named like the set to store files in it
    path = os.getcwd() + '/' + foldername
    try:
        os.makedirs(path, exist_ok = True)
        print("Path {} is already available or created successfully".format(path))
    except OSError as error:
        print("Path {} can not be created".format(path))
    
        
    # Basic functions
    def load_xml(params):
        '''
        Accesses the OAI interface according to given parameters and scrapes its content.
        '''
        response = requests.get(base_url, params=params)
        output_soup = soup(response.content, "lxml")
        return output_soup

    def download_record(ID):
        '''
        Downloads a certain metadata record from OAI to a single XML file.
        Throws a notice if metadata file already exists and leaves the existing one.
        Parameter:
        ID = E-rara ID of the desired record.
        '''
        output_soup = load_xml({'verb': 'GetRecord', 'metadataPrefix': metadataPrefix, 'identifier': ID})
        outfile = path + '/{}.xml'.format(ID) 
        try:
            with open(outfile, mode='x', encoding='utf-8') as f:
                    f.write(output_soup.decode())
        except FileExistsError:
                print("Metadata file {}.xml exists already".format(ID))
        finally:
                pass

    # Start with the first access to OAI interface - get the item IDs of a set
    xml_soup = load_xml(listsearch_term)

    # Calculate how many accesses it takes to go through all the pages of the results list, print notice
    splits = math.ceil(int(xml_soup.resumptiontoken['completelistsize']) // 10) + 1
    print(xml_soup.resumptiontoken['completelistsize'], 'identifiers to request in ', splits, 'data splits')
    

    for i in range(splits):
        if i == 0:
            # First access for item IDs - first page + information about whole length of results list
            xml_soup_new = load_xml(listsearch_term)      
        else:
            # Following accesses for item IDs
            xml_soup_new = load_xml({'verb': 'ListIdentifiers', 'resumptionToken': resumption_token})

        # Scraping out the e-rara IDs
        ids = [] 
        for ID in [(i.contents[0]) for i in xml_soup_new.find_all('identifier')]:
            match = re.search('oai:www.e-rara.ch:(\d+)', ID)      # extract the number following 'oai:www.e-rara.ch:'
            if match:
                ids.append(match.group(1))     # first parenthesized subgroup of group() = number

        # Download the MODS metadata records according to retrieved e-rara IDs
        print('Start retrieving metadata for e-rara IDs ', ids)  
        for ID in ids:
            download_record(ID)
        ids = []

        # Actualize the resumtpion token to retrieve the the next page
        try:
            new_token = xml_soup_new.find('resumptiontoken').get_text()
            resumption_token = new_token
            print('New resumption token:', resumption_token)
        except AttributeError:
            print('Reached end of IDs/results list')       # notice when last page is done
        finally:
            pass

    with os.scandir(path) as entries:
        count = 0
        for entry in entries:
            count += 1       
    print("{} metadata files in {}".format(count, path))
    finish = time.perf_counter()
    print("Finished in {} second(s)".format(round(finish - start, 2)))

In [53]:
retrieve_set_metadata('bernensia', 'xml-from_oai-e-rara', metadataPrefix='oai_dc')

Path /home/jovyan/xml-from_oai-e-rara is already available or created successfully
571 identifiers to request in  58 data splits
Start retrieving metadata for e-rara IDs  ['1395833', '1396731', '1397203', '1757425', '1757509', '1757592', '1757931', '1758267', '2069554', '4709578']
New resumption token: 0x928e8fa96dcd710c248f7aebd1f02740-cursor_p_3D10_p_26set_p_3Dbernensia_p_26metadataPrefix_p_3Doai_dc_p_26batch_size_p_3D11
Start retrieving metadata for e-rara IDs  ['4711794', '5706340', '5708393', '5709367', '5709444', '5710407', '5710598', '5710847', '5711104', '5711626']
New resumption token: 0x737c57ec5788b68c3a9d0ad3df432805-cursor_p_3D20_p_26set_p_3Dbernensia_p_26metadataPrefix_p_3Doai_dc_p_26batch_size_p_3D11
Start retrieving metadata for e-rara IDs  ['5711988', '5712774', '5713078', '5713424', '5713843', '6800516', '6800520', '6800524', '6800528', '6805629']
New resumption token: 0x737c57ec5788b68c3a9d0ad3df432805-cursor_p_3D30_p_26set_p_3Dbernensia_p_26metadataPrefix_p_3Doai_dc

### Upload to AWS S3

In [27]:
s3 = boto3.resource('s3')
for bucket in s3.buckets.all():
    print(bucket.name)

bgd-content
bgd-test-content


In [37]:
# from: https://boto3.amazonaws.com/v1/documentation/api/latest/guide/s3-examples.html
import logging
from botocore.exceptions import ClientError

def upload_file(file_name, bucket, object_name=None):
    """Upload a file to an S3 bucket

    :param file_name: File to upload
    :param bucket: Bucket to upload to
    :param object_name: S3 object name. If not specified then file_name is used
    :return: True if file was uploaded, else False
    """

    # If S3 object_name was not specified, use file_name
    if object_name is None:
        object_name = os.path.basename(file_name)

    # Upload the file
    s3_client = boto3.client('s3')
    try:
        response = s3_client.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        logging.error(e)
        return False
    return True

In [183]:
# Test: Upload a file to S3 with boto3
upload_file(file_name='xml-from_oai-e-rara/10179500.xml', bucket='bgd-content', \
            object_name='bernensia-xml/10179500.xml')

True

In [38]:
for file in os.listdir('xml-from_oai-e-rara'):
    upload_file(file_name='xml-from_oai-e-rara/{}'.format(file), bucket='bgd-content', \
                object_name='bernensia-xml/{}'.format(file))

True

In [43]:
response = s3.list_objects(Bucket='bgd-content', MaxKeys=1000, \
                           Prefix='bernensia-xml/')    #bernensia-json-fulltext/
print(len(response['Contents']))
keys = []
for i in response['Contents'][0:5]:
    keys.append(i['Key'])
keys

572


['bernensia-xml/',
 'bernensia-xml/10179500.xml',
 'bernensia-xml/10347968.xml',
 'bernensia-xml/10381638.xml',
 'bernensia-xml/10722710.xml']

## Transform XML file to JSON

### Transform e-rara XML to JSON with inserting full texts - local

In [52]:
# make JSON file from e-rara XML file via dict
path = 'xml-from_oai-e-rara/'

for file in os.listdir(path):
    if file.endswith('.xml'):
        with open(path + file, 'r') as file:
            obj = xmltodict.parse(file.read())
            identifier = obj['html']['body']['oai-pmh']['getrecord']['record']['header']['identifier']
           
            # identifier for e-periodica XML
            #identifier = identifier.replace(':', r'_')
            
            # identifier for e-rara XML
            match = re.search('oai:www.e-rara.ch:(\d+)', identifier)      # extract the number following 'oai:www.e-rara.ch:'
            if match:
                identifier = match.group(1)     # first parenthesized subgroup of group() = number
                
            # read out metadata, delete certain fields and add'id-intern' field
            metadata = obj['html']['body']['oai-pmh']['getrecord']['record']['metadata']['oai_dc:dc']
            del_fields = ['@xmlns:dc', '@xmlns:oai_dc', '@xmlns:xsi', '@xsi:schemalocation']
            [metadata.pop(field) for field in del_fields]
            metadata.update({"id_intern": identifier})
            
            dic_new = {}
            for key in metadata.keys():
                match = re.search('dc:(\w+)', key)      # extract field names without 'dc_' beginning
                if match:
                    key_new = match.group(1)     # first parenthesized subgroup of group() = content
                else: 
                    key_new = key
                #key_new = key.replace(':', r'_')    # easier alternative to get rid of ':'
                value = metadata.get(key)
                dic_new.update({key_new: value})
                
            # insert fulltext
            try:
                with open('content/raw/fulltext/mix_erara_bernensia/' + identifier + '.txt', 'r') as file:
                    fulltext = file.read()
                dic_new['fulltext'] = fulltext
            except: pass
            
            dump = json.dumps(dic_new)
            with open('content/processed/bernensia_json_fulltext/' + identifier +'.json', 'w', encoding='UTF-8') as file:
                file.write(dump)
    else: pass


In [53]:
# Check e-rara JSON example without fulltext
with open('content/processed/bernensia_json_fulltext/23563758.json') as f:
  dic = json.load(f)
dic

{'title': 'Flora von Bern : systematische Uebersicht der in der Gegend von Bern wildwachsenden und allgemein cultivirten Phanerogamen und GefÃ¤sskryptogamen',
 'creator': 'Fischer, Ludwig',
 'subject': 'Pflanzen',
 'description': ['von Dr. L. Fischer',
  'Mit einer Karte (Abdruck aus der topographischen Karte der Schweiz von G.H. Dufour'],
 'publisher': 'KÃ¶rber',
 'date': '1897',
 'type': ['Text', 'Book'],
 'format': 'XXXVI, 309 Seiten, 1 gefaltetes Blatt mit Bildtafel : 1 Karte',
 'identifier': ['doi:10.3931/e-rara-83285',
  'https://www.e-rara.ch/bes_1/doi/10.3931/e-rara-83285',
  'system:99116798113805511'],
 'relation': 'vignette : https://www.e-rara.ch/bes_1/titlepage/doi/10.3931/e-rara-83285/128',
 'language': 'ger',
 'coverage': '580',
 'rights': 'pdm',
 'id_intern': '23563758'}

In [65]:
len(os.listdir('content/processed/bernensia_json_fulltexts/'))

571

In [61]:
# Check e-rara JSON example with fulltext
with open('content/processed/bernensia_json_fulltext/13217374.json') as f:
  dic = json.load(f)
#dic
#print(dic['fulltext'])
dic['identifier'][1]

'https://www.e-rara.ch/bes_1/doi/10.3931/e-rara-47605'

In [38]:
# Check e-rara JSON example with fulltext
with open('content/processed/bernensia_json_fulltexts/2069554.json') as f:
  dic = json.load(f, encoding='utf-8')
dic

{'title': 'Der  Familie Ziegler demÃ¼thige und unterthÃ¤nigste Bittschrift',
 'creator': '[s.n.]',
 'description': ['Beylag No. 1 : Extract aus dem Raths-Manual der Stadt Bern (1636). Beylag No. 2 : ; Copia Schreibens Mrghhrn. der RÃ¤the an die Stadt MÃ¼llhausen (1644). Beylag No. 3 : ; Artikel in der gedrukten Abzug-Ordnung (1644). Beylag No. 4 : ; Auszug aus dem deutschen Missiven-Buch der Stadt Bern: Der Stadt MÃ¼llhausen reciprocierliches Antwort-Schreiben des Abzugs halben ; Vortrag der Familie Ziegler an meine ... Herren der Burger-Cammer',
  'Antrag der Familie Ziegler an die Berner Obrigkeit, um das Berner Burgerrecht zu erlangen'],
 'publisher': '[Verlag nicht ermittelbar]',
 'date': '1775',
 'type': ['Text', 'Book'],
 'format': '16 S. ; 36 cm',
 'identifier': ['doi:10.3931/e-rara-7826',
  'https://www.e-rara.ch/bes_1/doi/10.3931/e-rara-7826',
  'system:99116894928805511'],
 'relation': 'vignette : https://www.e-rara.ch/bes_1/titlepage/doi/10.3931/e-rara-7826/128',
 'language'

### Transform e-periodica XML to JSON with inserting full texts - remote: S3 -> S3

In [75]:
# from local
! aws s3 cp s3://bgd-content/eperiodica-txt/zgh-001:1939:1::294.txt content/test3.txt

Completed 12.8 KiB/12.8 KiB (55.4 KiB/s) with 1 file(s) remaining
download: s3://bgd-content/eperiodica-txt/zgh-001:1939:1::294.txt to content\test3.txt


In [15]:
def transform_to_json(id_intern):
    '''
    Takes XML files from S3, reads defined contents out to a JSON file, and inserts fulltexts in this JSON file
    by dedicated TXT files.
    
    :param id_intern: internal identifier of e-periodica item
    '''
    id_underline = id_intern.replace(':', r'_')
    ! aws s3 cp s3://bgd-content/eperiodica-xml/zgh${id_underline}.xml metadata.xml
    with open('metadata.xml', 'r', encoding='UTF-8') as f:        
        obj = xmltodict.parse(f.read())
    if 'getrecord' in obj['html']['body']['oai-pmh']:
        #identifier = obj['html']['body']['oai-pmh']['getrecord']['record']['header']['identifier']

        # identifier for e-periodica XML
        #match = re.search('oai:agora.ch:(\S+)', identifier)       # extract the identifier
        #if match:
            #identifier = match.group(1)     # first parenthesized subgroup of group() = number
            #identifier = identifier.replace(':', r'_')

        # identifier for e-rara XML
        #match = re.search('oai:www.e-rara.ch:(\d+)', identifier)     # extract the identifier number
        #if match:
            #identifier = match.group(1)     # first parenthesized subgroup of group() = number

        # read out metadata, delete certain fields and add 'id-intern' field
        metadata = obj['html']['body']['oai-pmh']['getrecord']['record']['metadata']['oai_dc:dc']
        del_fields = ['@xmlns:dc', '@xmlns:oai_dc', '@xmlns:xsi', '@xsi:schemalocation']
        [metadata.pop(field) for field in del_fields]
        metadata.update({"id_intern": id_intern})

        dic_new = {}
        for key in metadata.keys():
            match = re.search('dc:(\w+)', key)      # extract field names without 'dc_' beginning
            if match:
                key_new = match.group(1)     # first parenthesized subgroup of group() = content
            else: 
                key_new = key
                    #key_new = key.replace(':', r'_')    # easier alternative to get rid of ':'
            value = metadata.get(key)
            dic_new.update({key_new: value})
                
        # write full text into JSON file while omiting the cover page content
        try:
            ! aws s3 cp s3://bgd-content/eperiodica-txt/zgh${id_intern}.txt fulltext.txt
            with open('fulltext.txt', 'r', encoding='UTF-8') as f:
                text = f.read()
            match = re.search(r'(\S+)\n\nhttp://www.e-periodica.(\w{2})\n\n\n\n\n', text) #.ch\n\n\n\n\n
            fulltext = text.split(match.group())[1]
            dic_new['fulltext'] = fulltext
        except: pass
            
        dump = json.dumps(dic_new)
        with open('metadata.json', 'w') as f:
            f.write(dump)
        ! aws s3 cp metadata.json s3://bgd-content/eperiodica-json-fulltext/zgh${id_intern}.json
    else: pass


In [83]:
transform_to_json('zgh-002:2018:80::405')

download: s3://bgd-content/eperiodica-xml/zgh-002_2018_80__405.xml to ./metadata.xml
download: s3://bgd-content/eperiodica-txt/zgh-002:2018:80::405.txt to ./fulltext.txt
upload: ./test.json to s3://bgd-content/eperiodica-json-fulltext/zgh-002:2018:80::405.json


In [22]:
# with preventing print output
with io.capture_output() as captured:
    transform_to_json('zgh-002:2018:80::405')

In [23]:
with open('eperiodica_files.csv', 'r') as f:
    df = pd.read_csv(f)
df.tail()

Unnamed: 0,Column 1,Column 2,size,id_intern,file
915,2021-09-26,14:17:46,0 Bytes,zgh-002:2020:82::295,zgh-002:2020:82::295.pdf
916,2021-09-26,14:17:47,0 Bytes,zgh-002:2020:82::296,zgh-002:2020:82::296.pdf
917,2021-09-26,14:17:48,0 Bytes,zgh-002:2020:82::303,zgh-002:2020:82::303.pdf
918,Total Objec,ts: 919,,,
919,Total Si,ze: 8.9 G,iB,,


In [29]:
with io.capture_output() as captured:
    for i in df.index:
        id_intern = df.id_intern[i]
        transform_to_json(id_intern)

In [36]:
# Check amount of objects in AWS S3
s3 = boto3.client('s3')
response = s3.list_objects(Bucket='bgd-content', MaxKeys=1000, \
                           Prefix='eperiodica-json-fulltext/')    #bernensia-json-fulltext/
print(len(response['Contents']))
keys = []
for i in response['Contents'][0:5]:
    keys.append(i['Key'])
keys

917


['eperiodica-json-fulltext/',
 'eperiodica-json-fulltext/zgh-001:1939:1::293.json',
 'eperiodica-json-fulltext/zgh-001:1939:1::294.json',
 'eperiodica-json-fulltext/zgh-001:1939:1::295.json',
 'eperiodica-json-fulltext/zgh-001:1939:1::296.json']

In [21]:
# XML file 'zgh-001_1977_39__176' without content stems from source platform problem
IFrame('https://www.e-periodica.ch/oai?verb=GetRecord&metadataPrefix=oai_dc&identifier=oai:agora.ch:zgh-001:1977:39::176', \
       width=970, height=300)

### Transform e-periodica XML to JSON without fulltexts - local

In [18]:
# make JSON file from e-periodica XML file via dict
path = 'content/raw/metadata/xml_eperiodica/'

for file in os.listdir(path):
    if file.endswith('.xml'):
        with open(path + file, 'r') as file:
            obj = xmltodict.parse(file.read())
            if 'getrecord' in obj['html']['body']['oai-pmh']:
                identifier = obj['html']['body']['oai-pmh']['getrecord']['record']['header']['identifier']

                # identifier for e-periodica XML
                match = re.search('oai:agora.ch:(\S+)', identifier)      # extract the identifer following 'oai:www.e-rara.ch:'
                if match:
                    identifier = match.group(1)     # first parenthesized subgroup of group() = number
                identifier = identifier.replace(':', r'_')

                # identifier for e-rara XML
                #match = re.search('oai:www.e-rara.ch:(\d+)', identifier)      # extract the number following 'oai:www.e-rara.ch:'
                #if match:
                    #identifier = match.group(1)     # first parenthesized subgroup of group() = number

                # read out metadata, delete certain fields and add 'id-intern' field
                metadata = obj['html']['body']['oai-pmh']['getrecord']['record']['metadata']['oai_dc:dc']
                del_fields = ['@xmlns:dc', '@xmlns:oai_dc', '@xmlns:xsi', '@xsi:schemalocation']
                [metadata.pop(field) for field in del_fields]
                metadata.update({"id_intern": identifier})

                dic_new = {}
                for key in metadata.keys():
                    match = re.search('dc:(\w+)', key)      # extract field names without 'dc_' beginning
                    if match:
                        key_new = match.group(1)     # first parenthesized subgroup of group() = content
                    else: 
                        key_new = key
                    #key_new = key.replace(':', r'_')    # easier alternative to get rid of ':'
                    value = metadata.get(key)
                    dic_new.update({key_new: value})

                dump = json.dumps(dic_new)
                with open('content/processed/eperiodica_json/' + identifier +'.json', 'w') as file:
                    file.write(dump)
            else: pass
    else: pass


In [19]:
print(len(os.listdir('content/raw/metadata/xml_eperiodica/')))
len(os.listdir('content/processed/eperiodica_json/'))

918


916

In [12]:
with open('content/raw/metadata/xml_eperiodica/zgh-001_1977_39__176.xml', 'r') as f:
    md = f.read()
md

'<?xml version="1.0" encoding="UTF-8"?><html><body><oai-pmh xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemalocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">\n<responsedate>2021-09-28T13:35:26Z</responsedate>\n<request>https://www.e-periodica.ch/oai/dataprovider</request>\n<error code="idDoesNotExist"></error>\n</oai-pmh></body></html>'

In [119]:
# Check e-periodica JSON example
with open('content/processed/eperiodica_json/zgh-001_1939_1__294.json') as f:
  dic = json.load(f, encoding='utf-8')
dic

{'title': 'Aus der Ur- und FrÃ¼hgeschichte von Spiez : ein Beitrag zur alpinen Siedlungsgeschichte',
 'creator': 'Tschumi, O.',
 'subject': None,
 'description': None,
 'publisher': 'Paul Haupt Bern',
 'contributor': None,
 'date': '1939',
 'type': ['Text', 'Journal Article'],
 'source': ['Berner Zeitschrift fÃ¼r Geschichte und Heimatkunde',
  '280461-x',
  '0005-9420',
  '1',
  '1939',
  '1',
  None,
  '21'],
 'language': None,
 'relation': None,
 'coverage': None,
 'rights': None,
 'format': ['text/html', 'application/pdf', 'text/html'],
 'identifier': ['https://www.e-periodica.ch/digbib/view?pid=zgh-001:1939:1::294',
  'https://www.e-periodica.ch/cntmng?type=pdf&pid=zgh-001:1939:1::294',
  'doi:10.5169/seals-237635'],
 'id_intern': 'zgh-001_1939_1__294'}