# Dev notebook for patching code

Related to issue [#117](https://github.com/impresso/impresso-text-acquisition/issues/117)

### Imports

In [None]:
import os
import boto3
import json
import logging
import jsonlines
from impresso_commons.utils import s3
from impresso_commons.path.path_s3 import fetch_issues, list_issues, list_newspapers
from impresso_commons.utils.s3 import fixed_s3fs_glob
from impresso_commons.versioning.data_manifest import DataManifest
from text_importer.importers.core import upload_issues, write_error
from smart_open import open as smart_open_function
from impresso_commons.versioning.helpers import counts_for_canonical_issue
import dask.bag as db
from typing import Any, Callable
import git
from text_importer.utils import init_logger

In [None]:
# Storage options as returned by the impresso_commons s3 helper.
# NOTE(review): not referenced by the cells visible in this part of the
# notebook - confirm it is still needed.
IMPRESSO_STORAGEOPT = s3.get_storage_options()

In [None]:
# Root logger (no name is passed to getLogger); the functions and cells below
# all log through this object.
logger = logging.getLogger()

## Functions

In [None]:
def add_property(
    object_dict: dict[str, Any],
    prop_name: str,
    prop_function: Callable[[str], str],
    function_input: str,
) -> dict[str, Any]:
    """Compute a property value and set it on the given dict, in place.

    Args:
        object_dict (dict[str, Any]): Object to patch. It must contain an `id`
            key, which is used in the debug log message.
        prop_name (str): Name of the property to add (or overwrite).
        prop_function (Callable[[str], str]): Function computing the value of
            the property from `function_input`.
        function_input (str): Single argument passed to `prop_function`.

    Returns:
        dict[str, Any]: The same `object_dict` instance, with `prop_name` set.
    """
    object_dict[prop_name] = prop_function(function_input)
    logger.debug("%s -> Added property %s: %s", object_dict['id'], prop_name, object_dict[prop_name])
    return object_dict

In [None]:
def write_upload_issues(
    key: tuple[str, str],
    issues: list[dict[str, Any]],
    output_dir: str,
    bucket_name: str,
    failed_log: str | None = None,
) -> tuple[tuple[str, str], str]:
    """Compress issues for a Journal-year in a json file and upload them to s3.

    The compressed ``.bz2`` output file is a JSON-line file, where each line
    corresponds to an individual issue document in the canonical format.

    Args:
        key (tuple[str, str]): Newspaper ID and year of the input issues,
            e.g. `('GDL', '1900')`. Its hyphen-joined form (`GDL-1900`) is
            passed as first argument to `upload_issues`.
        issues (list[dict[str, Any]]): A list of issues as dicts.
        output_dir (str): Local output directory.
        bucket_name (str): Name of S3 bucket where to upload the file.
        failed_log (str | None, optional): Path to the log file used when an
            instantiation was not successful. Currently unused: write errors
            are only reported through the logger. Defaults to None.

    Returns:
        tuple[tuple[str, str], str]: The `key` exactly as it was received and
            the path to the compressed `.bz2` file.
    """
    newspaper, year = key
    filename = f'{newspaper}-{year}-issues.jsonl.bz2'
    filepath = os.path.join(output_dir, newspaper, filename)
    logger.info(f'Compressing {len(issues)} JSON files into {filepath}')

    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    try:
        # The file is opened in append mode: if it already exists locally
        # (e.g. from a previous run), the issues are added after its content.
        with smart_open_function(filepath, 'ab') as fout:
            # The context manager guarantees the writer is closed even when
            # writing one of the issues fails.
            with jsonlines.Writer(fout) as writer:
                writer.write_all(issues)
        logger.info(f'Written {len(issues)} issues to {filepath}')
    except Exception as e:
        logger.error(f"Error for {filepath}")
        logger.exception(e)

    # NOTE(review): the upload is attempted even when writing failed above;
    # confirm this best-effort behaviour is intended.
    upload_issues('-'.join(key), filepath, bucket_name)

    return key, filepath


# SWA - Patch 6

The patch adds a new `iiif_manifest_uri` property to each issue, holding the URI of the issue's manifest in the IIIF Presentation API.

In [None]:
# Patch 6 parameters: name of the property added to every issue, base URI of
# the IIIF presentation endpoint, and the titles concerned by the patch.
PROP_NAME = 'iiif_manifest_uri'
SWA_IIIF_BASE_URI = 'https://ub-iiifpresentation.ub.unibas.ch/impresso_sb'
SWA_TITLES = ['arbeitgeber', 'handelsztg']

# Path handed to `write_upload_issues` as its `failed_log` argument.
error_log = '/home/piconti/impresso-text-acquisition/text_importer/data/patch_logs/patch_6_swa_errors.log'

# Configure the shared logger for this patch, then record what is being done.
init_logger(
    logger,
    logging.DEBUG,
    '/home/piconti/impresso-text-acquisition/text_importer/data/patch_logs/patch_6_swa.log',
)
logger.info(
    "Patching titles %s: adding %s property at issue level",
    SWA_TITLES,
    PROP_NAME,
)

In [None]:
# define patch function
def swa_manifest_uri(issue_id: str, swa_iiif: str = SWA_IIIF_BASE_URI) -> str:
    """Build the IIIF presentation manifest URI of an SWA issue.

    The URI follows the template
    `<swa_iiif>/<issue canonical ID>-issue/manifest`, e.g.
    https://ub-iiifpresentation.ub.unibas.ch/impresso_sb/[issue canonical ID]-issue/manifest

    Args:
        issue_id (str): Canonical ID of the issue.
        swa_iiif (str, optional): Base URI of the IIIF presentation endpoint.
            A trailing slash is tolerated. Defaults to `SWA_IIIF_BASE_URI`.

    Returns:
        str: URI of the IIIF manifest for the given issue.
    """
    # A URI always uses '/' as separator, so join explicitly rather than with
    # os.path.join, whose separator depends on the platform.
    return '/'.join([swa_iiif.rstrip('/'), f'{issue_id}-issue', 'manifest'])

In [None]:
# Set up the manifest used to keep track of the updates made by this patch.
canonical_repo = git.Repo('/home/piconti/impresso-text-acquisition')

# S3 buckets and local working directory.
s3_input_bucket = 'canonical-data'
s3_output_bucket = 'canonical-sandbox'
temp_dir = '/scratch/piconti/impresso/patches_temp'

# JSON schema later used to validate the exported manifest.
schema_path = '/home/piconti/impresso-text-acquisition/text_importer/impresso-schemas/json/versioning/manifest.schema.json'

# Only the property added by the patch is declared as patched.
patched_fields = [PROP_NAME]

swa_patch_6_manifest = DataManifest(
    data_stage='canonical',
    s3_output_bucket=s3_output_bucket,
    git_repo=canonical_repo,
    temp_dir=temp_dir,
    patched_fields=patched_fields,
    staging=True,
)

Perform the patch, keep track of the updates, and upload the results.

In [None]:
# download the issues of interest for this patch from the input bucket
swa_issues = fetch_issues(s3_input_bucket, True, SWA_TITLES)

# patch them keeping track of the data that's been modified;
# maps a "<title>-<year>" key to the list of patched issues for that year
yearly_patched_issues = {}

# NOTE(review): only the first 200 fetched issues are processed - this looks
# like a development-time limit; confirm before running the full patch.
for issue in swa_issues[:200]:
    # key is title-year, both taken from the first two parts of the issue ID
    title, year = issue['id'].split('-')[:2]
    key = '-'.join([title, year])

    # add_property modifies `issue` in place and returns it
    patched_issue = add_property(issue, PROP_NAME, swa_manifest_uri, issue['id'])
    yearly_patched_issues.setdefault(key, []).append(patched_issue)

    swa_patch_6_manifest.add_by_title_year(title, year, counts_for_canonical_issue(issue))

# write and upload the updated issues to s3
for key, issues in yearly_patched_issues.items():
    # write_upload_issues expects the key as a (title, year) pair
    write_upload_issues(tuple(key.split('-')), issues, temp_dir, s3_output_bucket, error_log)

# finalize the manifest and export it
note = f"Patching titles {SWA_TITLES}: adding {PROP_NAME} property at issue level"
swa_patch_6_manifest.append_to_notes(note)
swa_patch_6_manifest.compute(export_to_git_and_s3=False)
swa_patch_6_manifest.validate_and_export_manifest(path_to_schema=schema_path, push_to_git=False)
    

## FedGaz Patch