In [1]:
import ast
import json
import os
import sys
import warnings

import boto3
import matplotlib.pyplot as plt
import pandas as pd
import pysolr
import requests

# Make the project-local helper modules importable from this notebook.
sys.path.append("../helpers/")
# NOTE(review): the star import hides which helpers end up in scope; the get_* / is_front_page
# functions called further down presumably come from impresso_id — confirm and import them explicitly.
from impresso_id import *

# Directory receiving every file exported by this notebook; created if it does not exist yet.
data_path = "data"
os.makedirs(data_path, exist_ok=True)

%load_ext autoreload
%autoreload 2

# Newspaper dataset of the National Library of Luxembourg

This notebook shows how to create a dataset containing metadata on every item listed as a table by the National Library of Luxembourg.

### Impresso Middle Layer (IML)
The IML returns the following fields:
'uid', 'type', 'title', 'size', 'nbPages', 'pages', 'isCC', 'excerpt', 'language', 'issue', 'newspaper', 'collections', 'tags', 'country', 'year', 'date', 'isFront', 'accessRight', 'labels', 'contentLineBreaks', 'regionBreaks', 'regions', 'matches'.

In [2]:
# The IML access token is read from the environment so that it never ends up in the notebook.
iml_token = os.environ['IML_TOKEN']
headers = {
    "Accept": "application/json",
    "Authorization": f"Bearer {iml_token}",
}

In [3]:
# Page through the IML search API and collect, for every table item, its regions grouped by page id:
#   dataset_iml[item uid][page uid] = {'coords': {...}, 'iiifFragmentURL': [...], 'iiifURL': ...}
with warnings.catch_warnings(record=True):  # hides SSL Certificate error due to verify=False in requests.get()
    dataset_iml = dict()

    step_size = 1000  # page size requested from the API, also used to detect the last page
    offset = 0
    while True:
        # The limit is derived from step_size so that the request and the stop test below cannot drift apart.
        url = f'https://dev.impresso-project.ch/api/search?group_by=articles&filters[0][type]=type&filters[0][q]=tb&offset={offset}&limit={step_size}'
        x = requests.get(url, headers=headers, verify=False)  # NOT ADVISED: verify=False
        x.raise_for_status()  # fail on HTTP errors instead of on a confusing JSON/KeyError further down
        rows = json.loads(x.content)['data']  # parsed once per page

        for row in rows:

            metadata = dict()
            for table in row['regions']:
                pid = table['pageUid']

                if pid not in metadata:
                    metadata[pid] = dict()
                    metadata[pid]['coords'] = {'x': [], 'y': [], 'width': [], 'height': []}
                    metadata[pid]['iiifFragmentURL'] = []
                    # Full-page URL: same as the fragment URL with its 4th segment from the end
                    # (the region, in IIIF Image API URLs) replaced by 'full'.
                    iiifURL = table['iiifFragment'].split('/')
                    iiifURL[-4] = 'full'
                    iiifURL = '/'.join(iiifURL)
                    metadata[pid]['iiifURL'] = iiifURL

                metadata[pid]['iiifFragmentURL'].append(table['iiifFragment'])
                # table['coords'] is read as [x, y, width, height]
                metadata[pid]['coords']['x'].append(table['coords'][0])
                metadata[pid]['coords']['y'].append(table['coords'][1])
                metadata[pid]['coords']['width'].append(table['coords'][2])
                metadata[pid]['coords']['height'].append(table['coords'][3])

            dataset_iml[row['uid']] = metadata

        # A page shorter than the requested size is the last one.
        if len(rows) < step_size:
            break
        offset += step_size

print(f"{len(dataset_iml)} results have been retrieved.")

62619 results have been retrieved.


In [4]:
# Items whose page mapping is empty had no region at all in the API response.
empty_tables_iml = {item_id for item_id, pages in dataset_iml.items() if not pages}
print(f"{len(empty_tables_iml)} table items do not contain any coordinate information.")

12 table items do not contain any coordinate information.


Some table items do not contain any coordinate information; we can therefore safely discard them.

In [5]:
# Discard the items without coordinates.
# pop() with a default (instead of del) does not raise KeyError when this cell is run a second time.
for k in empty_tables_iml:
    dataset_iml.pop(k, None)

In [6]:
# An item with more than one page id has its table regions spread over several pages.
items_over_multiple_pages = [
    (item_id, pages) for item_id, pages in dataset_iml.items() if len(pages) > 1
]
print(f"{len(items_over_multiple_pages)} tables are spread over multiple pages:")
for item_id, pages in items_over_multiple_pages:
    print(f"{item_id} over {len(pages)} pages")

2 tables are spread over multiple pages:
luxwort-1896-02-06-a-i0006 over 2 pages
luxland-2006-05-05-a-i0111 over 2 pages


Since a table is spread over multiple pages in only two cases, these tables can be ignored without much impact on later steps, while greatly simplifying the upcoming processing. The dataset representation can therefore be simplified.

In [7]:
# Discard the multi-page items.
# pop() with a default (instead of del) does not raise KeyError when this cell is run a second time.
for a, _ in items_over_multiple_pages:
    dataset_iml.pop(a, None)

In [8]:
# Every remaining item lives on exactly one page: replace the {page id: page data} mapping
# by the page data itself and keep the page id under the 'pid' key.
for item_id, pages in dataset_iml.items():
    if 'pid' in pages:
        # Already flattened by a previous run of this cell; flattening again would
        # mistake the 'coords' entry for a page and corrupt the record.
        continue
    page_id, page_dict = list(pages.items())[0]
    page_dict['pid'] = page_id
    dataset_iml[item_id] = page_dict  # replaces the value of an existing key, safe while iterating

In [9]:
# Number of IML table items left after removing the empty and the multi-page ones.
len(dataset_iml)

62605

In [10]:
# Base name (without extension) of the JSON file storing the IML metadata.
filename = "metadata_IML"

In [11]:
# Save the IML metadata so that it can be reloaded without querying the API again.
with open(os.path.join(data_path, f"{filename}.json"), "w") as f:
    json.dump(dataset_iml, f)

In [12]:
# Reload the IML metadata from the file written above.
with open(os.path.join(data_path, f"{filename}.json")) as f:
    dataset_iml = json.load(f)

### SOLR impresso_dev

In [13]:
# Base URL of the Solr server, credentials included: https://<username>:<password>@solr.dhlab.epfl.ch/solr
solr_url = os.environ['SOLR_URL_DEV']
# Client bound to the impresso_dev collection.
solr = pysolr.Solr(f"{solr_url}/impresso_dev")

In [14]:
# https://solr.apache.org/guide/8_9/pagination-of-results.html (see Deep Pagination)
# https://lucidworks.com/post/coming-soon-to-solr-efficient-cursor-based-iteration-of-large-result-sets/

# Walk through every table item of the Solr collection with a cursor and keep, per item,
# its text, its language and the table/text coordinates grouped by page id:
#   dataset_solr[item id] = {'text': ..., 'language': ..., <page id>: {'tb_coords': ..., 'text_coords': ...}}
dataset_solr = dict()
done = False
params = {'rows': 1000,
          'fl': 'id, rc_plains, pp_plain, content_txt_*',
          'fq': 'item_type_s:tb',
          'sort': 'id asc',
          'cursorMark': '*'}

content_prefix = 'content_txt_'

while not done:
    results = solr.search('*:*', **params)

    for doc in results.docs:
        metadata = dict()
        # The text is stored in at most one 'content_txt_<suffix>' field; the suffix is kept as the language.
        content_keys = [k for k in doc.keys() if k.startswith(content_prefix)]
        assert len(content_keys) <= 1
        metadata['text'] = doc[content_keys[0]] if len(content_keys) == 1 else ''
        metadata['language'] = content_keys[0][len(content_prefix):] if len(content_keys) == 1 else ''

        # rc_plains is a list of strings, pp_plain a single JSON string holding a list.
        for row_rc, row_pp in zip(doc['rc_plains'], json.loads(doc['pp_plain'])):
            # Each rc_plains entry is a dict stored as a string. It is parsed with ast.literal_eval
            # rather than eval so that content coming from the index can never execute code.
            row_rc = ast.literal_eval(row_rc)

            assert row_rc['pid'] == row_pp['id']
            pid = row_rc['pid']

            if pid not in metadata:
                metadata[pid] = dict()
                metadata[pid]['tb_coords'] = {'x': [], 'y': [], 'width': [], 'height': []}
                metadata[pid]['text_coords'] = {'x': [], 'y': [], 'width': [], 'height': [], 's': [], 'l': []}

            # Table regions: each entry of 'c' is read as [x, y, width, height].
            for coord in row_rc['c']:
                metadata[pid]['tb_coords']['x'].append(coord[0])
                metadata[pid]['tb_coords']['y'].append(coord[1])
                metadata[pid]['tb_coords']['width'].append(coord[2])
                metadata[pid]['tb_coords']['height'].append(coord[3])

            # Text tokens: box under 'c', plus the 's' and 'l' fields copied as they are.
            for coord in row_pp['t']:
                metadata[pid]['text_coords']['x'].append(coord['c'][0])
                metadata[pid]['text_coords']['y'].append(coord['c'][1])
                metadata[pid]['text_coords']['width'].append(coord['c'][2])
                metadata[pid]['text_coords']['height'].append(coord['c'][3])
                metadata[pid]['text_coords']['s'].append(coord['s'])
                metadata[pid]['text_coords']['l'].append(coord['l'])

        dataset_solr[doc['id']] = metadata

    # The cursor stops moving once every result has been returned.
    if params['cursorMark'] == results.nextCursorMark:
        done = True
    params['cursorMark'] = results.nextCursorMark

print(f"{len(dataset_solr)} results have been retrieved.")

58026 results have been retrieved.


In [15]:
# An item is empty as soon as one of its pages has no table coordinates.
# 'text' and 'language' are item-level fields, not pages, and are skipped.
item_level_keys = {'text', 'language'}
empty_tables_solr = {
    item_id
    for item_id, item in dataset_solr.items()
    for key, page in item.items()
    if key not in item_level_keys and len(page['tb_coords']['x']) == 0
}

print(f"{len(empty_tables_solr)} table items do not contain any coordinate information")

12 table items do not contain any coordinate information


In [16]:
# Discard the items without coordinates.
# pop() with a default (instead of del) does not raise KeyError when this cell is run a second time.
for a in empty_tables_solr:
    dataset_solr.pop(a, None)

In [17]:
# Each item holds the 'text' and 'language' entries plus one entry per page,
# so more than 3 entries means more than one page.
items_over_multiple_pages = [
    (item_id, item) for item_id, item in dataset_solr.items() if len(item) > 3
]
print(f"{len(items_over_multiple_pages)} tables are spread over multiple pages:")
for item_id, item in items_over_multiple_pages:
    print(f"{item_id} over {len(item) - 2} pages")

2 tables are spread over multiple pages:
luxland-2006-05-05-a-i0111 over 2 pages
luxwort-1896-02-06-a-i0006 over 2 pages


In [18]:
# Discard the multi-page items.
# pop() with a default (instead of del) does not raise KeyError when this cell is run a second time.
for a, _ in items_over_multiple_pages:
    dataset_solr.pop(a, None)

In [19]:
# Every remaining item lives on exactly one page: replace the item by the data of that page,
# and move the page id, the text and the language into it.
for item_id, item in dataset_solr.items():
    if 'pid' in item:
        # Already flattened by a previous run of this cell; flattening again would
        # mistake the 'tb_coords' entry for a page and corrupt the record.
        continue
    page_id, page_dict = [(k, v) for k, v in item.items() if k not in {'text', 'language'}][0]
    page_dict['pid'] = page_id
    page_dict['text'] = item['text']
    page_dict['language'] = item['language']
    dataset_solr[item_id] = page_dict  # replaces the value of an existing key, safe while iterating

In [20]:
# Number of SOLR table items left after removing the empty and the multi-page ones.
len(dataset_solr)

58012

In [21]:
# Base name (without extension) of the JSON file storing the SOLR metadata.
filename = "metadata_SOLR"

In [22]:
# Save the SOLR metadata so that it can be reloaded without querying the index again.
with open(os.path.join(data_path, f"{filename}.json"), "w") as f:
    json.dump(dataset_solr, f)

In [23]:
# Reload the SOLR metadata from the file written above.
with open(os.path.join(data_path, f"{filename}.json")) as f:
    dataset_solr = json.load(f)

### Join

In [24]:
# True when every item retrieved from SOLR was also retrieved from the IML.
set(dataset_solr).issubset(dataset_iml)

True

The data from SOLR is a subset of the data from IML. Since the data from SOLR contains additional information that might be relevant later on (such as the text), SOLR is considered the main source, and the IML items that are missing from SOLR are discarded. The origin of this difference is unknown and has not been investigated further.

In [25]:
# Remove from the IML data every item unknown to SOLR, so that both datasets share the same keys.
difference = set(dataset_iml) - set(dataset_solr)

for item_id in difference:
    del dataset_iml[item_id]

print(len(dataset_solr))
dataset_solr.keys() == dataset_iml.keys()

58012


True

NB: To whoever reads this, this difference should be investigated.

### Export
The datasets are merged and exported. Each image can now be retrieved from the IIIF of the National Library of Luxembourg, and additional information about the OCR and the location of the tables is stored.

In [26]:
# Merge both sources item by item:
#   - NLL_metadata      : SOLR fields plus the IIIF URLs taken from the IML
#   - NLL_metadata_lite : same record without the text-related fields
NLL_metadata = dict()
NLL_metadata_lite = dict()
for k in dataset_iml:

    table_metadata = {
        **dataset_solr[k],
        'iiifURL': dataset_iml[k]['iiifURL'],
        'iiifFragmentURL': dataset_iml[k]['iiifFragmentURL'],
    }

    # Shallow copy: the nested lists and dicts are shared with the full record.
    table_metadata_lite = dict(table_metadata)
    for text_key in ('text_coords', 'text', 'language'):
        del table_metadata_lite[text_key]

    NLL_metadata[k] = table_metadata
    NLL_metadata_lite[k] = table_metadata_lite

In [27]:
# Export the full merged metadata.
with open(os.path.join(data_path, "NLL_metadata.json"), "w") as f:
    json.dump(NLL_metadata, f)

In [28]:
# Export the merged metadata without the text-related fields.
with open(os.path.join(data_path, "NLL_metadata_lite.json"), "w") as f:
    json.dump(NLL_metadata_lite, f)

In [29]:
# Reload the full merged metadata from disk.
with open(os.path.join(data_path, "NLL_metadata.json")) as f:
    NLL_metadata = json.load(f)

In [30]:
# Reload the lite merged metadata from disk.
with open(os.path.join(data_path, "NLL_metadata_lite.json")) as f:
    NLL_metadata_lite = json.load(f)

### Additional exports
Specific datasets, each containing only a subset of the data, are then created for the different steps of the pipeline used in [impresso-images](https://github.com/impresso/impresso-images).

#### Step 1: ID-IIIF pairs for *extract_images_iiif.py*
This dataset is necessary to indicate which images to download and under what name they must be referred to.

In [31]:
filename = "NLL_id_iiif_pairs"
# Keyed by page id: tables located on the same page collapse into a single entry.
NLL_id_iiif_pairs = {table['pid']: table['iiifURL'] for table in NLL_metadata_lite.values()}

In [32]:
# JSON Lines export: one {'id': page id, 'iiif_url': full-page URL} object per line.
with open(os.path.join(data_path, f"{filename}.jsonl"), "w") as f:
    for page_id, iiif_url in NLL_id_iiif_pairs.items():
        f.write(json.dumps({'id': page_id, 'iiif_url': iiif_url}) + '\n')

#### Step 2: coordinates for *crop_images.py*
This dataset is necessary to give the coordinates where to crop the full images.

In [33]:
filename = "NLL_cropping_coordinates"
# Layout: NLL_cropping_coordinates[page id][table id] = [(x, y, width, height), ...]
NLL_cropping_coordinates = dict()
for table_id, table in NLL_metadata_lite.items():
    # Tables are grouped under the page they belong to.
    page_tables = NLL_cropping_coordinates.setdefault(table['pid'], dict())

    boxes = table['tb_coords']
    page_tables[table_id] = list(zip(boxes['x'], boxes['y'], boxes['width'], boxes['height']))

In [34]:
# Export the cropping coordinates.
with open(os.path.join(data_path, f"{filename}.json"), "w") as f:
    json.dump(NLL_cropping_coordinates, f)

#### Step 3: additional metadata for *import_visual_signatures.py*
This dataset contains metadata to be stored on SOLR alongside the visual signatures of the images.

In [35]:
filename = "NLL_metadata_solr"

# One record per table item. The date-related fields are derived from the item id
# and the page-related ones from the page id, using the impresso_id helpers.
NLL_metadata_solr = dict()
for item_id, table in NLL_metadata_lite.items():
    boxes = table['tb_coords']
    page_id = table['pid']

    NLL_metadata_solr[item_id] = {
        # SOLR doesn't accept list of list of int, hence the use of a list of strings
        "coords_ss": [
            str(box) for box in zip(boxes['x'], boxes['y'], boxes['width'], boxes['height'])
        ],
        "iiif_link_s": table['iiifURL'],
        "iiif_fragment_link_ss": table['iiifFragmentURL'],
        "meta_day_i": get_day(item_id),
        "meta_month_i": get_month(item_id),
        "meta_year_i": get_year(item_id),
        "meta_journal_s": get_journal(item_id),
        "meta_date_dt": get_date(item_id).strftime('%Y-%m-%dT%H:%M:%SZ'),
        "item_type_s": "tb",
        "meta_ed_s": get_edition(item_id),
        "meta_issue_id_s": get_meta_issue_id(item_id),
        "page_nb_i": get_page(page_id),
        "front_b": is_front_page(page_id),
    }
    

In [36]:
# Export the metadata to be stored on SOLR.
with open(os.path.join(data_path, f"{filename}.json"), "w") as f:
    json.dump(NLL_metadata_solr, f)

#### Step 4: S3 storage metadata for the GUI
This dataset maps each table image and each page image to the file that stores it in the dedicated S3 bucket.

In [37]:
# Endpoint and credentials come from the environment; only the endpoint has a default value.
s3_endpoint = os.environ.get("S3_ENDPOINT", 'os.zhdk.cloud.switch.ch')
s3 = boto3.resource(
    's3',
    endpoint_url=f'https://{s3_endpoint}',
    aws_access_key_id=os.environ.get("SE_ACCESS_KEY"),
    aws_secret_access_key=os.environ.get("SE_SECRET_KEY"),
)

bucket = s3.Bucket("impresso-tables")

In [38]:
def _load_jsonl_object(bucket, key):
    """Read a JSON Lines object stored in an S3 bucket into a DataFrame.

    Parameters
    ----------
    bucket : boto3 Bucket resource holding the object.
    key : str, key of the object inside the bucket.

    Returns
    -------
    pandas.DataFrame with one row per line of the object.
    """
    body = bucket.Object(key).get()['Body']
    return pd.DataFrame([json.loads(line) for line in body.iter_lines()])


# Same loader for both listings, instead of two copies of the same loop.
images_metadata = _load_jsonl_object(bucket, "images-packed/image-ids-metadata-0.json")
table_images_metadata = _load_jsonl_object(bucket, "table-images-packed/image-ids-metadata-0.json")

In [39]:
# Attach to every table item the S3 file holding its cropped image ('id_loc')
# and the S3 file holding the full image of its page ('pid_loc').
# NOTE(review): both merges are inner joins, so items without a match are silently dropped;
# the recorded output shows 58011 rows for 58012 merged items — worth confirming which one is lost.
NLL_metadata_s3 = (
    pd.DataFrame.from_dict(NLL_metadata_lite, orient='index')
    .drop(columns=['tb_coords', 'iiifURL', 'iiifFragmentURL'])
    # table id (index) -> location of the table image
    .merge(table_images_metadata, left_index=True, right_on='id')
    .rename(columns={'path': 'id_loc', 'id': 'tb_id'})
    # page id -> location of the page image
    .merge(images_metadata, left_on='pid', right_on='id')
    .rename(columns={'path': 'pid_loc'})
    .drop(columns='id')
    .rename(columns={'tb_id': 'id'})
)

In [40]:
# Drop the first 16 characters of both locations.
# NOTE(review): 16 is the length of "impresso-tables/", so this presumably removes the bucket name — confirm.
# Running this cell twice strips the paths twice.
prefix_length = 16
for loc_column in ('id_loc', 'pid_loc'):
    NLL_metadata_s3[loc_column] = NLL_metadata_s3[loc_column].apply(lambda path: path[prefix_length:])

In [41]:
# Sanity check: size of the merged table and its first rows.
print(NLL_metadata_s3.shape)
NLL_metadata_s3.head()

(58011, 4)


Unnamed: 0,pid,id,id_loc,pid_loc
0,luxzeit1858-1858-03-10-a-p0001,luxzeit1858-1858-03-10-a-i0018,table-images-packed/image-data-00711.jsonl.gz,images-packed/image-data-00711.jsonl.gz
1,luxzeit1858-1858-03-10-a-p0001,luxzeit1858-1858-03-10-a-i0019,table-images-packed/image-data-00711.jsonl.gz,images-packed/image-data-00711.jsonl.gz
2,avenirgdl-1871-07-25-a-p0004,avenirgdl-1871-07-25-a-i0033,table-images-packed/image-data-00079.jsonl.gz,images-packed/image-data-00079.jsonl.gz
3,avenirgdl-1871-07-25-a-p0004,avenirgdl-1871-07-25-a-i0026,table-images-packed/image-data-00079.jsonl.gz,images-packed/image-data-00079.jsonl.gz
4,avenirgdl-1871-07-25-a-p0004,avenirgdl-1871-07-25-a-i0031,table-images-packed/image-data-00079.jsonl.gz,images-packed/image-data-00079.jsonl.gz


In [42]:
# Export the S3 location table as a Parquet file in the data directory.
NLL_metadata_s3.to_parquet(os.path.join(data_path, "NLL_metadata_s3.parquet"))