## Import

In [1]:
from dask_k8 import DaskCluster
from impresso_commons.utils.s3 import fixed_s3fs_glob, IMPRESSO_STORAGEOPT, read_jsonlines
from impresso_commons.utils.kube import (make_scheduler_configuration,
                                         make_worker_configuration)
from sanity_check.contents.s3_data import list_newspapers
from dask import bag as db
import os
import pandas as pd
from dask.distributed import Client, progress
import json

## Create dask k8 cluster

In [2]:
cluster = DaskCluster(
    namespace="dhlab",
    cluster_id="impresso-sanitycheck",
    scheduler_pod_spec=make_scheduler_configuration(),
    worker_pod_spec=make_worker_configuration(
        docker_image="ic-registry.epfl.ch/dhlab/impresso_data-sanity-check:v1",
        memory="5G"
    )
)

In [3]:
cluster.create()
cluster.scale(150, blocking=False)

Scheduler: tcp://10.90.47.5:16365
Dashboard: http://10.90.47.5:12674


In [4]:
dask_client = cluster.make_dask_client()

In [20]:
dask_client.get_versions(check=True)

{'scheduler': {'host': {'python': '3.7.4.final.0',
   'python-bits': 64,
   'OS': 'Linux',
   'OS-release': '4.15.0-99-generic',
   'machine': 'x86_64',
   'processor': '',
   'byteorder': 'little',
   'LC_ALL': 'C.UTF-8',
   'LANG': 'C.UTF-8'},
  'packages': {'python': '3.7.4.final.0',
   'dask': '2.15.0',
   'distributed': '2.15.0',
   'msgpack': '1.0.0',
   'cloudpickle': '1.4.0',
   'tornado': '6.0.4',
   'toolz': '0.10.0',
   'numpy': '1.18.1',
   'lz4': '3.0.2',
   'blosc': '1.9.1'}},
 'workers': {'tcp://10.233.100.252:42849': {'host': {'python': '3.7.4.final.0',
    'python-bits': 64,
    'OS': 'Linux',
    'OS-release': '4.15.0-91-generic',
    'machine': 'x86_64',
    'processor': '',
    'byteorder': 'little',
    'LC_ALL': 'C.UTF-8',
    'LANG': 'C.UTF-8'},
   'packages': {'python': '3.7.4.final.0',
    'dask': '2.15.0',
    'distributed': '2.15.0',
    'msgpack': '1.0.0',
    'cloudpickle': '1.4.0',
    'tornado': '6.0.4',
    'toolz': '0.10.0',
    'numpy': '1.18.1',
    '

In [5]:
dask_client

0,1
Client  Scheduler: tcp://10.90.47.5:16365  Dashboard: http://10.90.47.5:8787/status,Cluster  Workers: 36  Cores: 36  Memory: 180.00 GB


## Functions

In [6]:
from typing import Tuple, List

def parse_iiif_link(iiif_link: str) -> Tuple[str, List[int]]:
    coords_field_position = [
        n
        for n, bit in enumerate(iiif_link.split('/'))
        if "," in bit
    ][0]
    iiif_manifest = "/".join(iiif_link.split('/')[:coords_field_position] + ['info.json'])
    coords = [
        int(x)
        for x in iiif_link.split('/')[coords_field_position].split(',')
    ]
    return (iiif_manifest, coords)

In [7]:
def fix_iiif_link(issue):
    for item in issue['i']:
        if item['m']['tp'] == 'image':
            
            # try to catch possible weirdnesses
            if item['m'] is None:
                continue
                
            if 'iiif_link' in item['m'] and item['m']['iiif_link'] is None:
                continue
            
            # first of all, move the field iiif_link 
            # when it is wrongly positioned (i.e. not inside _m_ field)
            if 'iiif_link' in item and 'iiif_link' not in item['m']:
                item['m']['iiif_link'] = item['iiif_link']
                del item['iiif_link']
            else:
                pass
            
            if 'iiif_link' not in item['m']:
                continue
                
            
            if '.jpg'in item['m']['iiif_link']:
                if 'c' in item['m']:
                    coords_str = ','.join([str(i) for i in item['m']['c']])
                    suffix = os.path.join(coords_str, 'full/0/default.jpg')
                    base_iiif = item['m']['iiif_link'].replace(suffix, '')
                    iiif_manifest_link = os.path.join(base_iiif, 'info.json')
                    item['m']['iiif_link'] = iiif_manifest_link
                    item['c'] = item['m']['c']
                    del item['m']['c']
                else:
                    iiif_manifest_link, coords = parse_iiif_link(item['m']['iiif_link'])
                    print(iiif_manifest_link, coords)
                    item['m']['iiif_link'] = iiif_manifest_link
                    item['c'] = coords
    return issue

In [8]:
def fix_coordinates(issue):
    for item in issue['i']:
        if item['m']['tp'] == 'image':
            if 'c'in item['m']:
                item['c'] = item['m']['c']
                del item['m']['c']
            else:
                continue
    return issue

In [9]:
def debug(issue):
    try:
        fix_iiif_link(issue)
        return {
            'id': issue['id'],
            'success': True,
            'error': None
        }
    except Exception as e:
        return {
            'id': issue['id'],
            'success': False,
            'error': e
        }

In [8]:
parse_iiif_link('https://gallica.bnf.fr/iiif/ark:/12148/bpt6k46000007/f9/3279,4693,2581,3285/full/0/default.jpg')

('https://gallica.bnf.fr/iiif/ark:/12148/bpt6k46000007/f9/info.json',
 [3279, 4693, 2581, 3285])

In [37]:
fix_iiif_link(example[0])

{'cdt': '2020-06-28 08:14:44',
 'i': [{'m': {'id': 'oeuvre-1943-07-01-a-i0001',
    'tp': 'article',
    'pp': [1],
    't': "Le clergé anglican l'archevêque d'York en tête préconise le bombardement de Rome !"},
   'l': {'parts': [{'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000021',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000022',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000023',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000024',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000025',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'insideheading',
      'comp_id': 'PAG_1_TB000026',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',


In [48]:
for item in fix_coordinates(example[0])['i']:
    if item['m']['tp'] == 'image' and item['m']['iiif_link'] is not None:
        print(item['m']['iiif_link'], item['m'].keys(), item.keys(), item['c'])

https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link']) dict_keys(['m', 'l', 'c']) [4177, 1524, 762, 872]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'pOf', 'iiif_link']) dict_keys(['m', 'l', 'c']) [4186, 5832, 777, 1321]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'pOf', 'iiif_link']) dict_keys(['m', 'l', 'c']) [5753, 7264, 767, 1076]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f2/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link']) dict_keys(['m', 'l', 'c']) [2910, 7242, 572, 627]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link']) dict_keys(['m', 'l', 'c']) [1058, 7657, 762, 1113]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link']) dict_keys(['m', 'l', 'c']) [1021, 3560, 781, 1247

## Fetch canonical issues

In [9]:
source_bucket = "s3://original-canonical-staging"
target_bucket = "s3://original-canonical-testing"

In [10]:
newspapers = list_newspapers(source_bucket)

Fetching list of newspapers from s3://original-canonical-staging
original-canonical-staging contains 70 newspapers


In [11]:
len(newspapers)

70

In [12]:
files = [
    file
    for np in newspapers
    for file in fixed_s3fs_glob(os.path.join(source_bucket, np, 'issues', '*.bz2'))
]

In [13]:
len(files)

2386

In [14]:
issue_bag = db.read_text(
    files,
    storage_options=IMPRESSO_STORAGEOPT
).map(json.loads)

In [23]:
# issue_bag.take(1)[0]

In [23]:
# dask_client.cancel(corrected_issues_bag)

### First patch

In [23]:
corrected_issues_bag = issue_bag.map(fix_iiif_link).persist()

In [26]:
issue_bag.pluck('id').count().compute()

362603

In [24]:
out_files = [
    file.replace(source_bucket, target_bucket)
    for file in files
]

In [19]:
out_files[:10]

['s3://original-canonical-testing/avenirgdl/issues/avenirgdl-1868-issues.jsonl.bz2',
 's3://original-canonical-testing/avenirgdl/issues/avenirgdl-1869-issues.jsonl.bz2',
 's3://original-canonical-testing/avenirgdl/issues/avenirgdl-1870-issues.jsonl.bz2',
 's3://original-canonical-testing/avenirgdl/issues/avenirgdl-1871-issues.jsonl.bz2',
 's3://original-canonical-testing/kommmit/issues/kommmit-1884-issues.jsonl.bz2',
 's3://original-canonical-testing/GAV/issues/GAV-1855-issues.jsonl.bz2',
 's3://original-canonical-testing/GAV/issues/GAV-1856-issues.jsonl.bz2',
 's3://original-canonical-testing/GAV/issues/GAV-1857-issues.jsonl.bz2',
 's3://original-canonical-testing/GAV/issues/GAV-1858-issues.jsonl.bz2',
 's3://original-canonical-testing/GAV/issues/GAV-1859-issues.jsonl.bz2']

In [26]:
for file in out_files:
    print(file)

s3://original-canonical-testing/luxembourg1935/issues/luxembourg1935-1935-issues.jsonl.bz2
s3://original-canonical-testing/luxembourg1935/issues/luxembourg1935-1936-issues.jsonl.bz2
s3://original-canonical-testing/luxembourg1935/issues/luxembourg1935-1937-issues.jsonl.bz2
s3://original-canonical-testing/luxembourg1935/issues/luxembourg1935-1938-issues.jsonl.bz2
s3://original-canonical-testing/luxembourg1935/issues/luxembourg1935-1939-issues.jsonl.bz2
s3://original-canonical-testing/luxembourg1935/issues/luxembourg1935-1940-issues.jsonl.bz2
s3://original-canonical-testing/oeuvre/issues/oeuvre-1915-issues.jsonl.bz2
s3://original-canonical-testing/oeuvre/issues/oeuvre-1916-issues.jsonl.bz2
s3://original-canonical-testing/oeuvre/issues/oeuvre-1917-issues.jsonl.bz2
s3://original-canonical-testing/oeuvre/issues/oeuvre-1918-issues.jsonl.bz2
s3://original-canonical-testing/oeuvre/issues/oeuvre-1919-issues.jsonl.bz2
s3://original-canonical-testing/oeuvre/issues/oeuvre-1920-issues.jsonl.bz2
s3:/

s3://original-canonical-testing/SMZ/issues/SMZ-1970-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1971-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1972-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1973-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1974-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1975-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1976-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1977-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1978-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1979-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1980-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1981-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1982-issues.jsonl.bz2
s3://original-canonical-testing/SMZ/issues/SMZ-1983-issues.jsonl.bz2
s3://original-canonical-testing/SM

In [25]:
corrected_issues_bag.map(
    json.dumps
).to_textfiles(out_files, storage_options=IMPRESSO_STORAGEOPT)

['original-canonical-testing/luxembourg1935/issues/luxembourg1935-1935-issues.jsonl.bz2',
 'original-canonical-testing/luxembourg1935/issues/luxembourg1935-1936-issues.jsonl.bz2',
 'original-canonical-testing/luxembourg1935/issues/luxembourg1935-1937-issues.jsonl.bz2',
 'original-canonical-testing/luxembourg1935/issues/luxembourg1935-1938-issues.jsonl.bz2',
 'original-canonical-testing/luxembourg1935/issues/luxembourg1935-1939-issues.jsonl.bz2',
 'original-canonical-testing/luxembourg1935/issues/luxembourg1935-1940-issues.jsonl.bz2',
 'original-canonical-testing/oeuvre/issues/oeuvre-1915-issues.jsonl.bz2',
 'original-canonical-testing/oeuvre/issues/oeuvre-1916-issues.jsonl.bz2',
 'original-canonical-testing/oeuvre/issues/oeuvre-1917-issues.jsonl.bz2',
 'original-canonical-testing/oeuvre/issues/oeuvre-1918-issues.jsonl.bz2',
 'original-canonical-testing/oeuvre/issues/oeuvre-1919-issues.jsonl.bz2',
 'original-canonical-testing/oeuvre/issues/oeuvre-1920-issues.jsonl.bz2',
 'original-canon

In [25]:
corrected_issues_bag.pluck('id').count().compute()

362603

### Second patch

In [15]:
corrected_issues_bag = issue_bag.map(fix_coordinates).persist()

In [16]:
out_files = [
    file.replace(source_bucket, target_bucket)
    for file in files
]

In [17]:
out_files[:10]

['s3://original-canonical-testing/avenirgdl/issues/avenirgdl-1868-issues.jsonl.bz2',
 's3://original-canonical-testing/avenirgdl/issues/avenirgdl-1869-issues.jsonl.bz2',
 's3://original-canonical-testing/avenirgdl/issues/avenirgdl-1870-issues.jsonl.bz2',
 's3://original-canonical-testing/avenirgdl/issues/avenirgdl-1871-issues.jsonl.bz2',
 's3://original-canonical-testing/dunioun/issues/dunioun-1944-issues.jsonl.bz2',
 's3://original-canonical-testing/dunioun/issues/dunioun-1945-issues.jsonl.bz2',
 's3://original-canonical-testing/dunioun/issues/dunioun-1946-issues.jsonl.bz2',
 's3://original-canonical-testing/dunioun/issues/dunioun-1947-issues.jsonl.bz2',
 's3://original-canonical-testing/dunioun/issues/dunioun-1948-issues.jsonl.bz2',
 's3://original-canonical-testing/NTS/issues/NTS-1856-issues.jsonl.bz2']

In [18]:
example = corrected_issues_bag.filter(lambda x: x['id'] == 'oeuvre-1943-07-01-a').compute()

In [52]:
example

[{'cdt': '2020-06-28 08:14:44',
  'i': [{'m': {'id': 'oeuvre-1943-07-01-a-i0001',
     'tp': 'article',
     'pp': [1],
     't': "Le clergé anglican l'archevêque d'York en tête préconise le bombardement de Rome !"},
    'l': {'parts': [{'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000021',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000022',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000023',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000024',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000025',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'insideheading',
       'comp_id': 'PAG_1_TB000026',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
   

In [19]:
corrected_issues_bag.map(
    json.dumps
).to_textfiles(out_files, storage_options=IMPRESSO_STORAGEOPT)

['original-canonical-testing/avenirgdl/issues/avenirgdl-1868-issues.jsonl.bz2',
 'original-canonical-testing/avenirgdl/issues/avenirgdl-1869-issues.jsonl.bz2',
 'original-canonical-testing/avenirgdl/issues/avenirgdl-1870-issues.jsonl.bz2',
 'original-canonical-testing/avenirgdl/issues/avenirgdl-1871-issues.jsonl.bz2',
 'original-canonical-testing/dunioun/issues/dunioun-1944-issues.jsonl.bz2',
 'original-canonical-testing/dunioun/issues/dunioun-1945-issues.jsonl.bz2',
 'original-canonical-testing/dunioun/issues/dunioun-1946-issues.jsonl.bz2',
 'original-canonical-testing/dunioun/issues/dunioun-1947-issues.jsonl.bz2',
 'original-canonical-testing/dunioun/issues/dunioun-1948-issues.jsonl.bz2',
 'original-canonical-testing/NTS/issues/NTS-1856-issues.jsonl.bz2',
 'original-canonical-testing/NTS/issues/NTS-1857-issues.jsonl.bz2',
 'original-canonical-testing/NTS/issues/NTS-1858-issues.jsonl.bz2',
 'original-canonical-testing/NTS/issues/NTS-1859-issues.jsonl.bz2',
 'original-canonical-testing

## S3 shell commands

```bash

# mileage might vary depending on machine
cd impresso-processing/2020-release-v2

# read a list of s3 file paths and delete them
cat patch-canonical/canonical-issues-staging.txt | while read line; do s3cmd rm $line; done

# move bz2 files from testing to staging bucket
cat patch-canonical/canonical-issues-staging.txt | while read line; do newfile="$(echo $line |sed -e 's/staging/testing/')" ; s3cmd mv $newfile $line; done
 ```

## Debug

In [None]:
errors = issue_bag.map(debug).filter(lambda x: not x['success']).compute()

In [None]:
len(errors)

In [None]:
errors

## Dev

In [39]:
example = issue_bag.filter(lambda x: x['id'] == 'oeuvre-1943-07-01-a').compute()

In [43]:
for item in fix_iiif_link(example[0])['i']:
    if item['m']['tp'] == 'image' and item['m']['iiif_link'] is not None:
        print(item['m']['iiif_link'], item['m'].keys())

https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link', 'c'])
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'pOf', 'iiif_link', 'c'])
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'pOf', 'iiif_link', 'c'])
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f2/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link', 'c'])
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link', 'c'])
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'iiif_link', 'c'])
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json dict_keys(['id', 'tp', 'pp', 't', 'pOf', 'iiif_link', 'c'])
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f2/info.json dict_keys(['id', 'tp', 'pp', 't', 'pOf', 'iiif_link', 'c'])


In [27]:
corrected_example = corrected_issues_bag.filter(lambda x: x['id'] == 'oeuvre-1943-07-01-a').compute()

In [28]:
corrected_example

[{'cdt': '2020-06-28 08:14:44',
  'i': [{'m': {'id': 'oeuvre-1943-07-01-a-i0001',
     'tp': 'article',
     'pp': [1],
     't': "Le clergé anglican l'archevêque d'York en tête préconise le bombardement de Rome !"},
    'l': {'parts': [{'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000021',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000022',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000023',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000024',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'paragraph',
       'comp_id': 'PAG_1_TB000025',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
      {'comp_role': 'insideheading',
       'comp_id': 'PAG_1_TB000026',
       'comp_fileid': 'ocr.1',
       'comp_page_no': 1},
   

In [22]:
example1 = example[0]

In [194]:
example2 = example[1]

In [23]:
example1

{'cdt': '2020-06-28 08:14:44',
 'i': [{'m': {'id': 'oeuvre-1943-07-01-a-i0001',
    'tp': 'article',
    'pp': [1],
    't': "Le clergé anglican l'archevêque d'York en tête préconise le bombardement de Rome !"},
   'l': {'parts': [{'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000021',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000022',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000023',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000024',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000025',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'insideheading',
      'comp_id': 'PAG_1_TB000026',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',


In [26]:
fix_iiif_link(example1)

https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json [4177, 1524, 762, 872]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json [4186, 5832, 777, 1321]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json [5753, 7264, 767, 1076]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f2/info.json [2910, 7242, 572, 627]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json [1058, 7657, 762, 1113]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json [1021, 3560, 781, 1247]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f1/info.json [2610, 2849, 1553, 1155]
https://gallica.bnf.fr/iiif/ark:/12148/bpt6k4622491m/f2/info.json [5115, 2888, 297, 224]


{'cdt': '2020-06-28 08:14:44',
 'i': [{'m': {'id': 'oeuvre-1943-07-01-a-i0001',
    'tp': 'article',
    'pp': [1],
    't': "Le clergé anglican l'archevêque d'York en tête préconise le bombardement de Rome !"},
   'l': {'parts': [{'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000021',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000022',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000023',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000024',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',
      'comp_id': 'PAG_1_TB000025',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'insideheading',
      'comp_id': 'PAG_1_TB000026',
      'comp_fileid': 'ocr.1',
      'comp_page_no': 1},
     {'comp_role': 'paragraph',


In [206]:
corrected_issues_bag = issue_bag.map(fix_iiif_link).persist()



In [201]:
dask_client.get_futures_error(corrected_issues_bag)

(<function distributed.worker.execute_task(task)>,
 ((<function dask.bag.core.reify(seq)>,
   (<function dask.bag.core.map_chunk(f, iters, iter_kwarg_keys=None, kwargs=None)>,
    <function __main__.fix_iiif_link(issue)>,
    [(<function dask.bag.core.map_chunk(f, iters, iter_kwarg_keys=None, kwargs=None)>,
      <function json.loads(s, *, encoding=None, cls=None, object_hook=None, parse_float=None, parse_int=None, parse_constant=None, object_pairs_hook=None, **kw)>,
      [(functools.partial(<function file_to_blocks at 0x7ff4b1a69d40>, False),
        <OpenFile 'original-canonical-staging/armeteufel/issues/armeteufel-1919-issues.jsonl.bz2'>)],
      None,
      {})],
    None,
    {})),),
 {},
 ())

In [202]:
examples = corrected_issues_bag.filter(
    lambda x: x['id'] == 'DTT-1978-01-03-a' or x['id'] == 'excelsior-1910-11-16-a'
).compute()



KilledWorker: ("('bag-from-delayed-file_to_blocks-list-loads-fix_iiif_link-22d6b4f5d2e6bbbed3f9c12ce580f69d', 1185)", <Worker 'tcp://10.233.95.129:45267', name: tcp://10.233.95.129:45267, memory: 0, processing: 3>)

## Release resources

In [20]:
cluster.close()

distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError
distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
concurrent.futures._base.CancelledError
_GatheringFutu