# Dev notebook for patch 7: Rescaling the coordinates of RERO 1 canonical data

Patch 7 [link](https://docs.google.com/spreadsheets/d/1m-EaqsYpclDuUzE4vcl2DRgyrPQ6aauoMUAwpthx8Y4/edit?pli=1#gid=1323940846) concerns the coordinates of the various regions for some of the RERO 1 (Olive) data.

This is in particular due to the conversion of images to the jp2 format, where the "png_highest" strategy sometimes did not work as intended, leaving a mismatch between the expected and actual dimensions of the image and leading to an incorrect conversion of the coordinates.

The information about such conversion was logged in files named image-info.json for each issue, which can be used to identify which ones had an incorrect conversion. 

At the time of the identification of this issue, the proposed two-step fix was:
1. identifying all the issues concerned by this problem (i.e. the source image used for the jp2 conversion is not the largest one available)
2. patching the concerned issues by rescaling all coordinates by the factor (dest_res/curr_res), where dest_res is the smaller resolution (the one of the jp2 files) and curr_res is the largest resolution, which should have been selected initially.

This notebook aims at identifying which issues need patching, and subsequently correcting the coordinates in all the necessary issue and page files.

#### Imports

In [None]:
from bs4 import BeautifulSoup, element
import os
from text_importer.importers.mets_alto import alto, mets
from text_importer.importers.bl import classes, detect
from IPython.display import display
import cv2 as cv
from PIL import Image, ImageDraw, ImageFont
import json
import jsonlines
import git
import dask.bag as db
from zipfile import ZipFile
import logging
from text_importer.utils import init_logger
from impresso_commons.images import img_utils
from collections import defaultdict
import re
from impresso_commons.utils.s3 import fixed_s3fs_glob, IMPRESSO_STORAGEOPT, alternative_read_text
from impresso_commons.path.path_s3 import fetch_files
from text_importer.scripts.patching.canonical_patch_1_uzh import write_jsonlines_file, title_year_pair_to_issues, write_upload_issues, to_issue_id_pages_dict, nzz_write_upload_pages

### Functions

In [None]:
def read_json(file_path):
    """Parse a JSON-lines file into a list of decoded records.

    Each line of the file at `file_path` is decoded with `json.loads`;
    the records are returned in file order.
    """
    with open(file_path, "r") as handle:
        return [json.loads(raw_line) for raw_line in handle]

In [None]:
def coords_to_xy(coords):
    """Turn an `[x, y, width, height]` box into `[x1, y1, x2, y2]` corners."""
    left, top, box_w, box_h = coords[0], coords[1], coords[2], coords[3]
    return [left, top, left + box_w, top + box_h]

In [None]:
def draw_box_on_img(base_img_path, coords_xy, img = None, width=10):
    """Draw a red rectangle on an image and return the image.

    If `img` is falsy, the image is opened from `base_img_path`; otherwise
    the rectangle is drawn directly on the given `img` object.
    `coords_xy` and `width` are passed on to `ImageDraw.rectangle`.
    """
    canvas = img if img else Image.open(base_img_path)
    painter = ImageDraw.Draw(canvas)
    painter.rectangle(coords_xy, outline ="red", width=width)
    return canvas

In [None]:
def read_xml(file_path):
    """Read the file at `file_path` as bytes and parse it with BeautifulSoup's 'xml' parser."""
    with open(file_path, 'rb') as xml_file:
        xml_bytes = xml_file.read()
    soup = BeautifulSoup(xml_bytes, 'xml')
    return soup

In [None]:
def scale_coords(coords, curr_res, des_res):
    """Rescale every value of `coords` by the ratio `des_res / curr_res`.

    Both resolutions are converted with `int()`; each scaled value is
    truncated to an int.
    """
    rescaled = []
    for value in coords:
        numerator = value * int(des_res)
        rescaled.append(int(numerator / int(curr_res)))
    return rescaled

In [None]:
def get_regions_for_ci(canonical_page, ci_id):
    """Collect the coordinates ('c') of the page regions whose 'pOf' field contains `ci_id`."""
    matching_coords = []
    for region in canonical_page['r']:
        if ci_id in region['pOf']:
            matching_coords.append(region['c'])
    return matching_coords

In [None]:
def read_pages_from_s3(issue_id, bucket = 'canonical-data'):
    """Read the pages file of an issue from S3 and return its lines as parsed JSON objects.

    The S3 key is built from the first two dash-separated parts of `issue_id`
    (used as title and as second path component) and the full issue id.
    """
    id_parts = issue_id.split('-')
    title = id_parts[0]
    s3_path = f"s3://{bucket}/{title}/pages/{title}-{id_parts[1]}/{issue_id}-pages.jsonl.bz2"
    pages = []
    for raw_page in alternative_read_text(s3_path, IMPRESSO_STORAGEOPT):
        pages.append(json.loads(raw_page))
    return pages

In [None]:
def test_scale_coords(issue_id, page_nums, curr_res, dest_res):
    """Print and return the original and rescaled coordinates of the first region of two pages.

    Args:
        issue_id: canonical id of the issue whose pages are read from S3.
        page_nums: 0-based indices of the two pages to inspect (only the
            first two entries are used).
        curr_res: resolution the current coordinates are expressed in.
        dest_res: resolution the coordinates should be rescaled to.

    Returns:
        tuple: `(pages, first_coords, second_coords, scaled_first_coords,
        scaled_second_coords)`. The order matches the way every caller in
        this notebook unpacks the result (originals first, then scaled).
    """
    i_pages = read_pages_from_s3(issue_id)
    i_first_page = i_pages[page_nums[0]]
    i_second_page = i_pages[page_nums[1]]

    # IIIF links are printed so the regions can be checked visually
    f_pg_iiif = i_first_page['iiif']
    s_pg_iiif = i_second_page['iiif']
    print(f"iiif pg_{page_nums[0]+1}: ", f_pg_iiif, f", iiif pg_{page_nums[1]+1}: ", s_pg_iiif)

    # coordinates of the first region of each page
    f_page_r1 = i_first_page['r'][0]['c']
    s_page_r1 = i_second_page['r'][0]['c']

    print("first_page_r1 current coords: ", f_page_r1)
    print("second_page_r1 current coords: ", s_page_r1)

    scaled_f_page_r1 = scale_coords(f_page_r1, curr_res, dest_res)
    scaled_s_page_r1 = scale_coords(s_page_r1, curr_res, dest_res)

    print("scaled_first_page_r1 updated coords: ", scaled_f_page_r1)
    print("scaled_second_page_r1 updated coords: ", scaled_s_page_r1)
    # originals first, then scaled: this is the order the callers unpack
    return i_pages, f_page_r1, s_page_r1, scaled_f_page_r1, scaled_s_page_r1

# Part 1: Find specific newspaper titles to fix for patch 7

The code for this was put into a script: impresso-text-acquisition/text_importer/scripts/patching/canonical_patch_7_find_issues.py

# Part 2: Find the exact issues to fix for patch 7 and identify which can be fixed (enough information) and which can't

### Once the necessary information is fetched, create the conversion dict, and convert the coordinates

For each of the issues:
- Read in its image-info.json file: which strategy was used and which file was used
- If the strategy was 'png_highest', and resolutions higher than the one used are present in the Document.zip, then:
  - write to a dict with issue (page?) ID as key: the file used, the strategy, the dest_res=resolution of the files used and the curr_res=largest resolution available.
  

In [None]:
# Root logger, used by all the functions defined in this notebook.
logger = logging.getLogger()
log_file = '/home/piconti/impresso-text-acquisition/text_importer/data/patch_logs/patch_7_issues_to_patch_4.log'
# Remove any log file left by a previous run so this run starts with a fresh one.
if os.path.isfile(log_file):
    os.remove(log_file)
init_logger(logger, logging.INFO, log_file)

In [None]:
# Directory holding the JSON files read and written for this patch.
info_files_base_path = '/scratch/piconti/impresso/patch_7'
# File-name templates: the `{}` placeholder is filled with a title (or 'all') via str.format.
info_file = os.path.join(info_files_base_path, "{}_img_res_info.json")
issues_to_patch_file = os.path.join(info_files_base_path, "{}_issues_to_patch_4.json")
issues_to_inv_file = os.path.join(info_files_base_path, "{}_issues_to_investigate_4.json")
issues_not_to_touch_file = os.path.join(info_files_base_path, "{}_issues_not_to_touch_4.json")

In [None]:
def get_resolutions(res_file_dict, issue_id):
    """Collect the image resolutions available for an issue.

    Each entry of `res_file_dict['original']['resolutions']` is expected to
    look like `<page>/<name>_<res>.<ext>`: the page number is parsed from the
    first path component and the resolution from the part of the file name
    after the first underscore.

    Args:
        res_file_dict: info dict of the issue (needs `['original']['resolutions']`).
        issue_id: id of the issue, only used in log messages.

    Returns:
        list[int] | dict[int, list[int]]: the list of resolutions when every
        page offers the same set of resolutions, otherwise a dict mapping each
        page number to its own list of resolutions.
    """
    all_res = []
    res_by_page = {}
    for f in res_file_dict['original']['resolutions']:
        pg = int(f.split("/")[0])
        res = int(os.path.basename(f).split(".")[0].split("_")[1])
        if res not in all_res:
            all_res.append(res)
        res_by_page.setdefault(pg, []).append(res)

    # Every page must offer exactly the resolutions seen across the whole issue.
    # (Checking that page values are contained in `all_res` would always hold,
    # since each value was added to `all_res` in the loop above.)
    if all(set(page_res) == set(all_res) for page_res in res_by_page.values()):
        logger.debug("   - %s: All page images have the same possible resolutions: %s", issue_id, all_res)
        return all_res

    logger.warning("    - %s: Possible resolutions vary with the page: %s", issue_id, res_by_page)
    return res_by_page

In [None]:
def check_if_rescale(p, scaling, possible_res, issue_id):
    """Decide whether the coordinates of page `p` need to be rescaled.

    Args:
        p: page number; used as key when `possible_res` is a dict, and only
            page 1 triggers the info-level log messages.
        scaling: image-info entry of the page, with keys 'source_used' (file
            used for the conversion) and 'strat' (conversion strategy).
        possible_res: resolutions available for the issue, either a list or a
            dict mapping page numbers to lists.
        issue_id: id of the issue, only used in log messages.

    Returns:
        tuple: `(to_rescale, {'dest_res': ..., 'curr_res': ...})` where
        `to_rescale` is True (rescale), False (leave as-is) or None (to be
        investigated by hand). `curr_res` is the largest available resolution
        and `dest_res` the one parsed from the source file name (None when the
        file name carries no resolution).
    """
    pos_res = possible_res[p] if isinstance(possible_res, dict) else possible_res

    # The resolution is encoded in the file name as `<name>_<res>.<ext>`; look
    # for the underscore in the file name only, so that an underscore in a
    # directory name is not mistaken for a resolution suffix.
    source_name = os.path.basename(scaling['source_used']).split(".")[0]
    if '_' not in source_name:
        if scaling['strat'] == 'png_highest':
            logger.warning("   %s: No resolution information in the file used to rescale, but should based on strategy: %s", issue_id, scaling)
            return None, {'dest_res': None, 'curr_res': max(pos_res)}
        logger.debug("   %s: No resolution information in the file used to rescale, but not strategy: %s", issue_id, scaling)
        return False, {'dest_res': None, 'curr_res': max(pos_res)}

    source_res = int(source_name.split("_")[1])
    dest_res = source_res
    curr_res = max(pos_res)
    res_info = {'dest_res': dest_res, 'curr_res': curr_res}

    # the largest resolution was used: nothing to do
    if source_res == curr_res:
        return False, res_info

    if scaling['strat'] == 'png_highest':
        # strategy asked for the highest resolution but another one was used
        if p == 1:
            logger.info("   %s: Had strat 'png_highest', but used %s instead out of possibilities %s", issue_id, source_res, pos_res)
        logger.debug("   %s: to_rescale: %s, dest_res: %s, curr_res: %s", issue_id, True, dest_res, curr_res)
        return True, res_info

    # another strategy was used with a non-maximal resolution: check by hand
    if p == 1:
        logger.info("   %s: Had strat %s, but used %s instead out of possibilities %s", issue_id, scaling['strat'], source_res, pos_res)
    logger.debug("   %s: to investigate, dest_res: %s, curr_res: %s", issue_id, dest_res, curr_res)
    return None, res_info

In [None]:
def write_to_disk(title, contents, filename, log_msg):
    """Dump `contents` as indented UTF-8 JSON to `filename` formatted with `title`.

    `filename` is a template containing a `{}` placeholder; `log_msg` is only
    used to describe the written list in the log message.
    """
    target_path = filename.format(title)
    logger.info("%s: Wirting the list of issues %s to disk: %s", title, log_msg, target_path)
    with open(target_path, "w", encoding="utf-8") as out_file:
        json.dump(contents, out_file, ensure_ascii=False, indent=4)

In [None]:
def handle_missing_img_info(issue_id, title, issue_info, no_to_touch, to_inv):
    """Classify an issue that has no image-info file.

    The issue id is appended to `no_to_touch` when tif images are present in
    the zip, when the title is 'LES' and the zip holds no jpg image, or when
    the title is 'LCG' and the year in the issue id is below 1892. Otherwise
    an entry with the smallest ('dest_res') and largest ('curr_res') available
    resolutions, plus the zip contents, is stored in `to_inv[issue_id]`.

    Both containers are modified in place and returned as `(no_to_touch, to_inv)`.
    """
    zip_files = issue_info['original']['zip_img_contents']

    if any('.tif' in name for name in zip_files):
        logger.warning("   %s: No scaling info present, but tif images present in zip", issue_id)
        no_to_touch.append(issue_id)
        return no_to_touch, to_inv

    if title == 'LES' and not any('.jpg' in name for name in zip_files):
        logger.debug("   %s: No scaling info present, but LES and no jpg images present in zip.", issue_id)
        no_to_touch.append(issue_id)
        return no_to_touch, to_inv

    if title == 'LCG' and int(issue_id.split('-')[1])<1892:
        logger.debug("   %s: No scaling info present, but LCG and earlier than 1891.", issue_id)
        no_to_touch.append(issue_id)
        return no_to_touch, to_inv

    # no rule applies: record the candidate resolutions for investigation
    available = get_resolutions(issue_info, issue_id)
    to_inv[issue_id] = {}
    if isinstance(available, dict):
        # resolutions differ between pages: one entry per page
        for page_num, page_res in available.items():
            to_inv[issue_id][page_num] = {'dest_res': min(page_res), 'curr_res':max(page_res)}
    else:
        to_inv[issue_id] = {'dest_res':min(available), 'curr_res':max(available)}
    to_inv[issue_id]['zip_contents'] = issue_info['original']['zip_img_contents']
    logger.warning("   %s: No scaling info present, but multiple resolutions available: %s", issue_id, issue_info['original']['zip_img_contents'])

    return no_to_touch, to_inv

In [None]:
def get_issues_to_patch_for_title(title: str, info_file=info_file, issues_to_patch_file=issues_to_patch_file) -> tuple[dict, dict, list]:
    """Sort the issues of a title into "to patch", "to investigate" and "leave as-is".

    The per-title info file (`info_file` formatted with `title`) is read and
    each issue is classified from its image-info contents and the resolutions
    found in its zip. When at least one issue needs patching, the "to patch"
    dict is written to `issues_to_patch_file` formatted with `title`.

    Args:
        title: title (newspaper alias) to process.
        info_file: path template of the per-title info file.
        issues_to_patch_file: path template of the output file.

    Returns:
        tuple: `(issues_to_patch, issues_to_investigate, issues_not_to_rescale)`.
        The first two map issue ids to resolution info (either one
        `{'dest_res', 'curr_res'}` dict, or a dict of such dicts keyed by page
        number); the third is a list of issue ids.
    """
    info_file_path = info_file.format(title)

    with open(info_file_path, mode ='r', encoding='utf-8') as f:
        title_info = json.load(f)
    logger.info(f"----- Reading the info file for {title}: {len(title_info)} issues -----")

    # dict of issues to patch for title: issue_id -> {resolutions}
    issues_to_patch = {}
    issues_to_investigate = {}
    issues_not_to_rescale = []

    logger.info("Starting to identify the issues to patch...")
    for issue_id, info in title_info.items():
        has_resolutions = 'original' in info and 'resolutions' in info['original']

        # check if the image-info file is present and non-empty
        if not (info['img']['file_present'] and len(info['img']['info_f_contents'])!=0):
            # if the file is not present, we cannot know which approach was chosen.
            if has_resolutions:
                issues_not_to_rescale, issues_to_investigate = handle_missing_img_info(issue_id, title, info, issues_not_to_rescale, issues_to_investigate)
            else:
                issues_not_to_rescale.append(issue_id)
            continue

        if not has_resolutions:
            # if the files don't have their resolution, we have no way of knowing how to scale if there is an issue
            issues_not_to_rescale.append(issue_id)
            continue
        issue_possible_res = get_resolutions(info, issue_id)

        patch_d, inv_d = {}, {}
        # take note of the needed action (rescaling or not) for each page
        for idx, scaling in info['img']['info_f_contents'].items():
            p_num = int(idx)+1
            to_rescale, res_dict = check_if_rescale(p_num, scaling, issue_possible_res, issue_id)

            if to_rescale is None:
                inv_d[p_num] = res_dict
            elif to_rescale:
                patch_d[p_num] = res_dict

        # once all pages have been traversed, add the information to the final dicts/lists
        if len(inv_d) != 0:
            issues_to_investigate[issue_id] = inv_d
        elif len(patch_d) == 0:
            # no rescaling needed
            issues_not_to_rescale.append(issue_id)
        else:
            # Compare against the first recorded page rather than page 1:
            # page 1 is absent from patch_d when only later pages need patching.
            first_res = next(iter(patch_d.values()))
            same_for_all = all(first_res == v for v in patch_d.values())
            if same_for_all and len(info['img']['info_f_contents']) == len(patch_d):
                # same rescaling for all pages of the issue
                issues_to_patch[issue_id] = first_res
            else:
                logger.warning(f"  -->> {issue_id}: not all pages have the same patching: {patch_d}!!!")
                issues_to_patch[issue_id] = patch_d

    # sanity check: every issue must end up in exactly one of the three groups
    if not len(title_info) == len(issues_to_patch) + len(issues_to_investigate) +len(issues_not_to_rescale):
        logger.warning(f"Problem: counts not matching: {len(title_info)}: {len(issues_to_patch) + len(issues_to_investigate) +len(issues_not_to_rescale)}")

    logger.info((
        f" Done for {title} : {len(issues_to_patch)}/{len(title_info)} need to be rescaled, "
        f" {len(issues_to_investigate)}/{len(title_info)} need to be investigated, "
        f" and {len(issues_not_to_rescale)}/{len(title_info)} can be left as-is. "
    ))

    if len(issues_to_patch) != 0:
        write_to_disk(title, issues_to_patch, issues_to_patch_file, 'needing rescaling')

    return issues_to_patch, issues_to_investigate, issues_not_to_rescale

In [None]:
# Sub-directories directly under the RERO original-data folder.
_, rero_journal_dirs, _ = next(os.walk("/mnt/project_impresso/original/RERO/"))
# Fixed list of titles, extended with the directory names found above.
rero_titles = ["LCG", "DLE", "LNF", "LBP", "LSE", "EXP"]
rero_titles.extend(rero_journal_dirs)
# De-duplicate through a set (this does not preserve the order of the titles).
rero_titles = list(set(rero_titles))
logger.info("Will process titles: %s", rero_titles)

In [None]:
# Accumulators over all titles, filled by the loop in the next cell:
# dict of issues to patch for title: issue_id -> {resolutions}
all_issues_to_patch = {}
# title -> {issue_id -> resolution info} for issues needing manual investigation
all_issues_to_investigate = {}
# flat list of issue ids that are left untouched
all_issues_not_to_rescale = []

In [None]:
for title in rero_titles:
    # identify the exact issues to scale for the title and get the rescaling values
    issues_to_patch, issues_to_investigate, issues_not_to_rescale = get_issues_to_patch_for_title(title)

    # add this information to the information collected for previous titles &  write the updated files to disk
    # (the 'all' files are rewritten after each title, so partial results survive an interruption)
    if len(issues_to_patch) != 0:
        all_issues_to_patch[title] = issues_to_patch
        write_to_disk('all', all_issues_to_patch, issues_to_patch_file, 'needing rescaling')
    if len(issues_to_investigate) != 0:
        all_issues_to_investigate[title] = issues_to_investigate
        write_to_disk('all', all_issues_to_investigate, issues_to_inv_file, 'needing investigating')
    if len(issues_not_to_rescale) != 0:
        all_issues_not_to_rescale.extend(issues_not_to_rescale)
        write_to_disk('all', all_issues_not_to_rescale, issues_not_to_touch_file, 'not needing rescaling.')


In [None]:
# Titles having at least one issue to patch / to investigate.
all_issues_to_patch.keys(), all_issues_to_investigate.keys()


## Investigation of issues with missing information

### LES

In [None]:
# Load the image/resolution info file of the LES title.
LES_info_file_path = info_file.format('LES')

with open(LES_info_file_path, mode ='r', encoding='utf-8') as f:
    LES_title_info = json.load(f)

In [None]:
# LES issues inspected by hand in the cells below.
les_ok_1 = 'LES-2009-02-01-a'
les_jpg_1 = 'LES-2011-05-01-a'
les_jpg_2 = 'LES-2006-02-01-a'

In [None]:
# Display the info entry of the first LES test issue.
LES_title_info[les_ok_1]

#### LES-2009-02-01-a

Comments/Conclusions:
- Page 1:
    - [218,1208,720,1236] – original coords cannot be displayed.
    - [142,790,471,809] - new coordinates can be displayed (and seem to display the correct region) but not perfect.
- Page 5:
    - [65,77,456,126] – original coords can be displayed, but don't display the correct region of text
    - [42,50,298,82] - new coordinates can be displayed, display the correct region, but not perfect (too large on the right)

In [None]:
# Fetch the canonical pages of the issue and pick pages 1 and 5 (0-based indices 0 and 4).
les_ok_pages = read_pages_from_s3(les_ok_1)
les_ok_page_1 = les_ok_pages[0]
les_ok_page_5 = les_ok_pages[4]

# IIIF links of both pages, printed to check the regions visually.
pg_1_iiif = les_ok_page_1['iiif']
pg_5_iiif = les_ok_page_5['iiif']
print("iiif pg_1: ", pg_1_iiif, ", iiif pg_5: ", pg_5_iiif)

# Coordinates of the first region of each page.
les_ok_page_1_r1 = les_ok_page_1['r'][0]['c']
les_ok_page_5_r1 = les_ok_page_5['r'][0]['c']

print("les_ok_page_1_r1 current coords: ", les_ok_page_1_r1)
print("les_ok_page_5_r1 current coords: ", les_ok_page_5_r1)

# Rescale from resolution 110 down to 72.
dest_res, curr_res = 72, 110

scaled_les_ok_page_1_r1 = scale_coords(les_ok_page_1_r1, curr_res, dest_res)
scaled_les_ok_page_5_r1 = scale_coords(les_ok_page_5_r1, curr_res, dest_res)

print("scaled_les_ok_page_1_r1 updated coords: ", scaled_les_ok_page_1_r1)
print("scaled_les_ok_page_5_r1 updated coords: ", scaled_les_ok_page_5_r1)

#### LES-2011-05-01-a

Comments/Conclusions:
- Page 1:
    - [79,52,468,81] – Display a cropped part of the "L'essor" title.
    - [51,34,306,53] - new coordinates can be displayed (and seem to display the correct region) but not perfect.
- Page 5:
    - [65, 291, 761, 339] – original coords can be displayed, but don't display the correct region of text
    - [42, 190, 498, 221] - new coordinates can be displayed, and look good

In [None]:
# Fetch the canonical pages of the issue and pick pages 1 and 5 (0-based indices 0 and 4).
les_jpg1_pages = read_pages_from_s3(les_jpg_1)
les_jpg1_page_1 = les_jpg1_pages[0]
les_jpg1_page_5 = les_jpg1_pages[4]

# IIIF links of both pages, printed to check the regions visually.
jpg1_pg_1_iiif = les_jpg1_page_1['iiif']
jpg1_pg_5_iiif = les_jpg1_page_5['iiif']
print("iiif pg_1: ", jpg1_pg_1_iiif, ", iiif pg_5: ", jpg1_pg_5_iiif)

# Coordinates of the first region of each page.
les_jpg1_page_1_r1 = les_jpg1_page_1['r'][0]['c']
les_jpg1_page_5_r1 = les_jpg1_page_5['r'][0]['c']

print("les_jpg1_page_1_r1 current coords: ", les_jpg1_page_1_r1)
print("les_jpg1_page_5_r1 current coords: ", les_jpg1_page_5_r1)

# Rescale from resolution 110 down to 72.
curr_res, dest_res = 110, 72

scaled_les_jpg1_page_1_r1 = scale_coords(les_jpg1_page_1_r1, curr_res, dest_res)
scaled_les_jpg1_page_5_r1 = scale_coords(les_jpg1_page_5_r1, curr_res, dest_res)

print("scaled_les_jpg1_page_1_r1 updated coords: ", scaled_les_jpg1_page_1_r1)
print("scaled_les_jpg1_page_5_r1 updated coords: ", scaled_les_jpg1_page_5_r1)

#### LES-2006-02-01-a

Comments/Conclusions:
- Page 1:
    - [218,1208,720,1236] – original coords cannot be displayed.
    - [142,790,471,809] - new coordinates can be displayed (and seem to display the correct region) but not perfect.
- Page 5:
    - [65,77,456,126] – original coords can be displayed, but don't display the correct region of text
    - [42,50,298,82] - new coordinates can be displayed, display the correct region, but not perfect (too large on the right)

In [None]:
# Fetch the canonical pages of the issue and pick pages 1 and 5 (0-based indices 0 and 4).
les_jpg2_pages = read_pages_from_s3(les_jpg_2)
les_jpg2_page_1 = les_jpg2_pages[0]
les_jpg2_page_5 = les_jpg2_pages[4]

# IIIF links of both pages, printed to check the regions visually.
jpg2_pg_1_iiif = les_jpg2_page_1['iiif']
jpg2_pg_5_iiif = les_jpg2_page_5['iiif']
print("iiif pg_1: ", jpg2_pg_1_iiif, ", iiif pg_5: ", jpg2_pg_5_iiif)

# Coordinates of the first region of each page.
les_jpg2_page_1_r1 = les_jpg2_page_1['r'][0]['c']
les_jpg2_page_5_r1 = les_jpg2_page_5['r'][0]['c']

print("les_jpg2_page_1_r1 current coords: ", les_jpg2_page_1_r1)
print("les_jpg2_page_5_r1 current coords: ", les_jpg2_page_5_r1)

# Rescale from resolution 110 down to 72.
curr_res, dest_res = 110, 72

scaled_les_jpg2_page_1_r1 = scale_coords(les_jpg2_page_1_r1, curr_res, dest_res)
scaled_les_jpg2_page_5_r1 = scale_coords(les_jpg2_page_5_r1, curr_res, dest_res)

print("scaled_les_jpg2_page_1_r1 updated coords: ", scaled_les_jpg2_page_1_r1)
print("scaled_les_jpg2_page_5_r1 updated coords: ", scaled_les_jpg2_page_5_r1)

### DLE

#### DLE-1914-01-28-a

Comments/Conclusions:
- Page 1:
    - [303, 208, 141, 41] – Word appears, but wrong one.
    - [162, 111, 75, 22] - Correctly displayed.
- Page 3:
    - [167, 59, 553, 64] – original coords can be displayed, but don't display the correct region of text
    - [89, 31, 296, 34] - new coordinates can be displayed, display the correct region.

In [None]:
# Rescaling test on pages at 0-based indices 0 and 2 (pages 1 and 3), from resolution 108 to 58.
# NOTE(review): the variables are named "page_5" although index 2 is requested, and the
# unpacking order must match the return order of test_scale_coords — verify.
dle_inv_1 = 'DLE-1914-01-28-a'
dle_inv_pages, dle_inv_page_1_r1, dle_inv_page_5_r1, scaled_dle_inv_page_1_r1, scaled_dle_inv_page_5_r1 = test_scale_coords(dle_inv_1, [0, 2], 108, 58)
dle_inv_page_1_r1, dle_inv_page_5_r1, scaled_dle_inv_page_1_r1, scaled_dle_inv_page_5_r1

### EXP

In [None]:
# Load the image/resolution info file of the EXP title.
EXP_info_file_path = info_file.format('EXP')

with open(EXP_info_file_path, mode ='r', encoding='utf-8') as f:
    EXP_title_info = json.load(f)

In [None]:
# Display what was recorded for one EXP issue flagged for investigation.
all_issues_to_investigate['EXP']['EXP-2015-07-08-a']

In [None]:
# Display the info entry of the EXP test issue.
EXP_title_info['EXP-2016-04-22-a']

In [None]:
# Fetch the canonical pages of the EXP test issue from S3.
exp_test_issue = 'EXP-2016-04-22-a'
exp_i_pages = read_pages_from_s3(exp_test_issue)

In [None]:
# Display all pages of the EXP test issue.
exp_i_pages

In [None]:
# First page of the EXP test issue.
exp_i_p1 = exp_i_pages[0]
exp_i_p1

In [None]:
# Hand-picked region coordinates, tested as-is and rescaled from resolution 160 to 108.
neuch_coords = [57, 480, 567, 612]

scaled_neuch_coords = scale_coords(neuch_coords, 160, 108)
# Also express both boxes as [x1, y1, x2, y2] corners.
neuch_coords_xy  = coords_to_xy(neuch_coords)
scaled_neuch_coords_xy  = coords_to_xy(scaled_neuch_coords)

# None of them work, 57,530,567,120 works
neuch_coords_xy, scaled_neuch_coords, scaled_neuch_coords_xy

In [None]:
# Display the info entry of the early EXP issue tested in the next cell.
# The literal id is used because `exp_test_2_issue` is only defined in that next cell,
# so referencing it here fails when the notebook is run from top to bottom.
EXP_title_info['EXP-1902-09-20-a']

In [None]:
# test with an early issue with the scaling issue identified
exp_test_2_issue = 'EXP-1902-09-20-a'
exp_2_i_pages = [json.loads(p) for p in read_pages_from_s3(exp_test_2_issue)]
exp_2_i_p1 = exp_2_i_pages[0]
#exp_2_i_p1 = json.loads(exp_2_i_p1)
exp_2_i_p1

In [None]:
# Coordinates of the first region of the page, rescaled from resolution 144 to 72.
r_coords = exp_2_i_p1['r'][0]['c']

scaled_r_coords = scale_coords(r_coords, 144, 72)
# Also express both boxes as [x1, y1, x2, y2] corners.
r_coords_xy  = coords_to_xy(r_coords)
scaled_r_coords_xy  = coords_to_xy(scaled_r_coords)

# scaled_r_coords work as ints [12, 1090, 348, 1118]
r_coords_xy, scaled_r_coords, scaled_r_coords_xy

In [None]:
# test with an early issue without img info
exp_test_3_issue = 'EXP-1902-09-19-a'
exp_3_i_pages = [json.loads(p) for p in read_pages_from_s3(exp_test_3_issue)]
exp_3_i_p1 = exp_3_i_pages[0]
#exp_2_i_p1 = json.loads(exp_2_i_p1)
print(exp_3_i_p1['r'][0])

r2_coords = exp_3_i_p1['r'][0]['c']

scaled_r2_coords = scale_coords(r2_coords, 144, 72)
r2_coords_xy  = coords_to_xy(r2_coords)
scaled_r2_coords_xy  = coords_to_xy(scaled_r2_coords)

# scaled_r_coords work as ints [14,303,164,206]
r2_coords, r2_coords_xy, scaled_r2_coords, scaled_r2_coords_xy

In [None]:
# Fetch another EXP issue and display its page at 0-based index 6.
exp_issue = 'EXP-2010-11-29-a'

exp_pages = read_pages_from_s3(exp_issue)
exp_pages[6]

### LCG

#### LCG-1892-07-20-a

Comments/Conclusions:
- Page 1:
    - [106, 511, 288, 22] – Word appears, but wrong one.
    - [71, 344, 194, 14] - Correctly displayed.
- Page 4: first region is not text, but second one works
    - [298, 166, 537, 45] – original coords can be displayed, but don't display the correct region of text
    - [200, 111, 361, 30] - new coordinates can be displayed, display the correct region.

In [None]:
# Rescaling test on pages at 0-based indices 0 and 3 (pages 1 and 4), from resolution 144 to 97.
# NOTE(review): the unpacking order must match the return order of test_scale_coords — verify.
lcg_inv_issue = 'LCG-1892-07-20-a'
lcg_inv_pages, lcg_inv_page_1_r1, lcg_inv_page_4_r1, scaled_lcg_inv_page_1_r1, scaled_lcg_inv_page_4_r1 = test_scale_coords(lcg_inv_issue, [0, 3], 144, 97)
lcg_inv_page_1_r1, lcg_inv_page_4_r1, scaled_lcg_inv_page_1_r1, scaled_lcg_inv_page_4_r1

In [None]:
# First three regions of the page at 0-based index 3.
lcg_inv_pages[3]['r'][:3]

In [None]:
# Load the image/resolution info file of the LCG title.
LCG_info_file_path = info_file.format('LCG')

with open(LCG_info_file_path, mode ='r', encoding='utf-8') as f:
    LCG_title_info = json.load(f)

In [None]:
# Display the info entry of one LCG issue from 1892.
LCG_title_info['LCG-1892-06-01-a']

### LBP

#### LBP-1881-05-18-a

Comments/Conclusions:
- Page 1:
    - [330, 439, 1134, 78] – region appears, but wrong one.
    - [183, 243, 630, 43] - Correctly displayed.
- Page 4: first region is not text, but second one works
    - [104, 116, 937, 121] – original coords can be displayed, but don't display the correct region of text
    - [57, 64, 520, 67] - new coordinates can be displayed, display the correct region.

In [None]:
# Rescaling test on pages at 0-based indices 0 and 3 (pages 1 and 4), from resolution 108 to 60.
# NOTE(review): the unpacking order must match the return order of test_scale_coords — verify.
lbp_inv_issue = 'LBP-1881-05-18-a'
lbp_inv_pages, lbp_inv_page_1_r1, lbp_inv_page_4_r1, scaled_lbp_inv_page_1_r1, scaled_lbp_inv_page_4_r1 = test_scale_coords(lbp_inv_issue, [0, 3], 108, 60)
lbp_inv_page_1_r1, lbp_inv_page_4_r1, scaled_lbp_inv_page_1_r1, scaled_lbp_inv_page_4_r1

In [None]:
# First region of the page at 0-based index 3.
lbp_inv_pages[3]['r'][0]

In [None]:
# Load the image/resolution info file of the LBP title.
LBP_info_file_path = info_file.format('LBP')

with open(LBP_info_file_path, mode ='r', encoding='utf-8') as f:
    LBP_title_info = json.load(f)

### LTF

#### LTF-1905-08-09-a

Comments/Conclusions:
- Page 1:
    - [83, 139, 224, 40] – region appears cropped.
    - [37, 62, 99, 17] - Correctly displayed.
- Page 4: first region is not text, but second one works
    - [78, 134, 348, 37] – original coords can be displayed, but don't display the correct region of text
    - [34, 59, 155, 16] - new coordinates can be displayed, display the correct region.

In [None]:
# Rescaling test on pages at 0-based indices 0 and 3 (pages 1 and 4), from resolution 130 to 58.
# NOTE(review): the unpacking order must match the return order of test_scale_coords — verify.
ltf_inv_issue = 'LTF-1905-08-09-a'
ltf_inv_pages, ltf_inv_page_1_r1, ltf_inv_page_4_r1, scaled_ltf_inv_page_1_r1, scaled_ltf_inv_page_4_r1 = test_scale_coords(ltf_inv_issue, [0, 3], 130, 58)
ltf_inv_page_1_r1, ltf_inv_page_4_r1, scaled_ltf_inv_page_1_r1, scaled_ltf_inv_page_4_r1

In [None]:
# First region of the page at 0-based index 3.
ltf_inv_pages[3]['r'][0]

# Part 3: Implement the patching code functions to be used in the patching script

In [None]:
# Reload from disk the 'all' files written in Part 2.
# NOTE: after the JSON round-trip every dict key (including page numbers) is a string.
all_info_file_path = issues_to_patch_file.format('all')
all_to_inv_path = issues_to_inv_file.format('all')

with open(all_info_file_path, mode ='r', encoding='utf-8') as f:
    all_to_patch = json.load(f)

with open(all_to_inv_path, mode ='r', encoding='utf-8') as f:
    all_to_inv = json.load(f)

In [None]:
# LTF is used as an example as it's relatively small
LTF_issues, LTF_pages = fetch_files('canonical-data', False, 'both', ['LTF'])

In [None]:
def convert_issue_coords(issue, res):
    """Rescale in place the coordinates of the content items of an issue.

    For every item of `issue['i']`, the coordinates found in its metadata
    (`item['m']['c']`) or, failing that, on the item itself (`item['c']`) are
    rescaled from `res['curr_res']` to `res['dest_res']`. Items without
    coordinates but with an IIIF link only trigger a warning.

    Returns:
        tuple: the (modified) issue and a bool telling whether at least one
        set of coordinates was rescaled.
    """
    any_scaled = False
    for item in issue['i']:
        meta = item['m']
        if 'c' in meta:
            meta['c'] = scale_coords(meta['c'], res['curr_res'], res['dest_res'])
            any_scaled = True
            continue
        if 'c' in item:
            item['c'] = scale_coords(item['c'], res['curr_res'], res['dest_res'])
            any_scaled = True
            continue
        if 'iiif_link' in meta or 'iiif_link' in item:
            link = meta['iiif_link'] if 'iiif_link' in meta else item['iiif_link']
            logger.warning("%s: No coordinates but a IIIF link for item %s: %s", issue['id'], meta['id'], link)
    # the issue object itself was modified; return it with the flag
    return issue, any_scaled

In [None]:
def convert_page_coords(page, res):
    """Rescale in place the coordinates of all regions, lines and tokens of a page.

    Coordinates are rescaled from `res['curr_res']` to `res['dest_res']`.

    Returns:
        tuple: the (modified) page and a bool comparing the number of rescaled
        coordinates with the number of regions + lines + tokens counted while
        traversing the page.
    """
    rescaled = 0
    # expected number of coordinates, starting with one per region
    expected = len(page['r'])
    for region in page['r']:
        region['c'] = scale_coords(region['c'], res['curr_res'], res['dest_res'])
        rescaled += 1
        for paragraph in region["p"]:
            text_lines = paragraph["l"]
            expected += len(text_lines)
            for text_line in text_lines:
                text_line['c'] = scale_coords(text_line['c'], res['curr_res'], res['dest_res'])
                rescaled += 1
                tokens = text_line["t"]
                expected += len(tokens)
                for tok in tokens:
                    tok['c'] = scale_coords(tok['c'], res['curr_res'], res['dest_res'])
                    rescaled += 1
    all_rescaled = rescaled == expected
    return page, all_rescaled

In [None]:
def find_convert_coords(elem, title, to_patch, to_inv, is_issue: bool = True):
    """Rescale the coordinates of an issue or page if it is listed for patching.

    Args:
        elem: canonical issue (when `is_issue`) or page dict; modified in place
            when patched.
        title: title (newspaper alias) the element belongs to.
        to_patch: issue_id -> resolution info, for issues identified from
            their image-info file.
        to_inv: issue_id -> resolution info, for issues identified without it.
        is_issue: whether `elem` is an issue (True) or a page (False).

    Returns:
        tuple: `(elem, patched)` where `patched` is a record describing what
        was done: the issue id, a `*_patching_done` flag, the number of pages
        or the page id and, when patched, the resolution info used.
    """
    if is_issue:
        issue_id = elem['id']
        key = 'issue_patching_done'
        patched = {'issue_id': issue_id, key: False, 'num_pages': len(elem['pp'])}
    else:
        # a page id is the issue id followed by a dash-separated page suffix
        issue_id = '-'.join(elem['id'].split('-')[:-1])
        key = 'page_patching_done'
        patched = {'issue_id': issue_id, key: False, 'page_id': elem['id']}

    # for LCG, only years later than 1891 need to be fixed (same cut-off as
    # the `<1892` rule in handle_missing_img_info)
    if title == 'LCG' and int(issue_id.split('-')[1]) <= 1891:
        return elem, patched

    # Work on a copy of the resolution info: the same entry is shared by the
    # issue and all its pages, and must not accumulate keys across calls.
    if issue_id in to_patch:
        res = dict(to_patch[issue_id])
        # keep trace of whether or not we fetched the information from the image info file
        res['used_image_info_file'] = True
    elif issue_id in to_inv:
        res = dict(to_inv[issue_id])
        res['used_image_info_file'] = False
    else:
        # issue not listed anywhere: nothing to patch
        return elem, patched

    if is_issue:
        elem, scaled = convert_issue_coords(elem, res)
        # there may be no coordinates to scale in an issue
        res['scaled'] = scaled
    else:
        elem, scaled = convert_page_coords(elem, res)
        # sanity check that number of regions+lines+tokens=coords scaled
        res['all_scaled'] = scaled

    # keep trace of information about the patching performed.
    patched[key] = True
    patched.update(res)

    return elem, patched

In [None]:
# NOTE: `np` is the newspaper title alias here, not the usual numpy alias.
np = 'LTF'
# Apply the patching function to every issue, partition by partition.
patched_ltf_issues = LTF_issues.map_partitions(
    lambda i_list: [find_convert_coords(i, np, all_to_patch[np], all_to_inv[np]) for i in i_list]
    ).persist()
# Same for every page.
patched_ltf_pages = LTF_pages.map_partitions(
    lambda p_list: [find_convert_coords(p, np, all_to_patch[np], all_to_inv[np], is_issue=False) for p in p_list]
    ).persist()

# extract only the "patched" 
# (second element of each (elem, patched) tuple returned by find_convert_coords)
patched_issues_ltf = patched_ltf_issues.map_partitions(lambda i_l: [i[1] for i in i_l])
patched_pages_ltf = patched_ltf_pages.map_partitions(lambda i_l: [i[1] for i in i_l])

In [None]:
# Turn the issue-level patching records into a dataframe, one row per issue.
patched_issues_df = patched_issues_ltf.to_dataframe(meta={'issue_id': str, 'issue_patching_done': bool, 
                            'num_pages': "Int64", 'dest_res': "Int64", 
                            "curr_res":  "Int64", 'zip_contents': str,
                            'used_image_info_file': bool}).compute()#.set_index('issue_id').compute()
patched_issues_df

In [None]:
# Aggregate the page-level patching records per issue: pages sharing the same patching
# information are counted together into `num_pages`.
# NOTE(review): rows with missing values in the grouping columns may be dropped by
# groupby — confirm whether unpatched pages are expected in this dataframe.
patched_pages_df = (
    patched_pages_ltf.to_dataframe(meta={'issue_id': str, 'page_patching_done': bool, 
                            'page_id': str, 'dest_res': "Int64", 
                            "curr_res":  "Int64", 'zip_contents': str,
                            'used_image_info_file': bool, "all_scaled": bool})
        #.groupby(by='issue_id')
        .groupby(by=['issue_id', 'page_patching_done', 'dest_res', 'curr_res', 'used_image_info_file', 'all_scaled'])
        .agg({'page_id': 'count'})
        .rename(columns={'page_id': 'num_pages'})
        .reset_index()#.set_index('issue_id')
).compute()
patched_pages_df

In [None]:
# Outer merge of the issue-level and page-level summaries (on their common columns).
patched_issues__merged_df = patched_issues_df.merge(patched_pages_df, how='outer')
patched_issues__merged_df

In [None]:
# Save the merged summary next to the other patch 7 files.
patched_issues__merged_df.to_csv(os.path.join(info_files_base_path, f'{np}_patched_issues.csv'))

In [None]:
# Check whether every page row and every issue row is flagged as patched.
all(patched_pages_df.page_patching_done), all(patched_issues_df.issue_patching_done)