# Canonical manifest recomputation

In [None]:
import os
import shutil
import json
import boto3
import pandas as pd
import git
from dask import bag as db
from dask import dataframe as dd
import numpy as np
from pathlib import Path
from time import strftime
import copy
from typing import Any

from impresso_commons.utils.s3 import (fixed_s3fs_glob, alternative_read_text, upload,
                                       get_storage_options, get_boto3_bucket, IMPRESSO_STORAGEOPT, upload_to_s3)
from impresso_commons.versioning.helpers import (DataStage, read_manifest_from_s3, 
                                                 validate_stage, clone_git_repo,
                                                 write_and_push_to_git, write_dump_to_fs)
from impresso_commons.path import parse_canonical_filename
from impresso_commons.path.path_fs import IssueDir
from impresso_commons.path.path_s3 import read_s3_issues, list_newspapers, fetch_files
from impresso_commons.versioning.data_statistics import NewspaperStatistics, POSSIBLE_GRANULARITIES
from impresso_commons.versioning.data_manifest import DataManifest
from collections import defaultdict

## Functions

In [None]:
def compute_canonical_stats_for_manifest(
    s3_canonical_issues: db.core.Bag,
) -> list[dict[str, Any]]:
    """Compute the yearly canonical statistics of each newspaper title.

    Every issue of the bag is first reduced to its own counts (pages, content
    items, images); these counts are then summed per title and year.

    :param db.core.Bag s3_canonical_issues: Bag of canonical issue dicts. Each
        issue is read through its ``id`` (split on ``-``, the first two parts
        being used as title and year), ``pp`` (pages) and ``i`` (content items,
        whose ``m.tp`` value is compared to ``"image"``) keys.
    :return: One dict per (title, year) pair, with the keys ``np_id``, ``year``,
        ``pages``, ``issues``, ``content_items_out`` and ``images``.
    :rtype: list[dict[str, Any]]

    """

    def count_issue_contents(issue: dict[str, Any]) -> dict[str, Any]:
        # split the issue ID only once; parts 0 and 1 are used as title and year
        id_parts = issue["id"].split("-")
        return {
            "np_id": id_parts[0],
            "year": id_parts[1],
            "id": issue["id"],
            "issue_id": issue["id"],
            # pages are de-duplicated before being counted
            "n_pages": len(set(issue["pp"])),
            "n_content_items": len(issue["i"]),
            "n_images": sum(1 for item in issue["i"] if item["m"]["tp"] == "image"),
        }

    print("Fetched all issues, gathering desired information.")
    pages_count_df = (
        s3_canonical_issues.map(count_issue_contents)
        .to_dataframe(
            meta={
                "np_id": str,
                "year": str,
                "id": str,
                "issue_id": str,
                "n_pages": int,
                "n_images": int,
                "n_content_items": int,
            }
        )
        .set_index("id")
        .persist()
    )

    # sum the counts (and count the issues) for each title and year
    aggregated_df = (
        pages_count_df.groupby(by=["np_id", "year"])
        .agg(
            {
                "n_pages": "sum",
                "issue_id": "count",
                "n_content_items": "sum",
                "n_images": "sum",
            }
        )
        .rename(
            columns={
                "issue_id": "issues",
                "n_pages": "pages",
                "n_content_items": "content_items_out",
                "n_images": "images",
            }
        )
        .reset_index()
    )

    print("Finished grouping and aggregating by title and year.")
    # return as a list of dicts
    return aggregated_df.to_bag(format="dict").compute()



## Code

### BNF 

Recomputing the manifest after the BNF data update, since the manifest was not updated correctly at the time.

In [None]:
# Bucket to read from, and the titles whose statistics are recomputed.
s3_bucket = "canonical-staging"
newspapers = [
    "excelsior",
    "lafronde",
    "marieclaire",
    "oeuvre",
]

In [None]:
# Fetch the issues of the selected titles from the bucket; the second returned value is discarded.
# compute=False is passed — presumably the issues stay lazy (a dask bag), since they are
# later handed to compute_canonical_stats_for_manifest. TODO confirm against fetch_files.
s3_bnf_issues, _= fetch_files(s3_bucket, compute=False, newspapers_filter=newspapers)

In [None]:
# Names and locations related to the v0-1-0 canonical manifest.
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are not referenced
# by any of the visible cells — confirm whether DataManifest should receive them.
manifest_out_name = "canonical_v0-1-0.json"
previous_manifest_path = "s3://canonical-staging/canonical_v0-0-4.json"
temp_dir = "/home/piconti/temp_canonical_v0-1-0"

# Buckets handed to the manifest; no input bucket is given.
mft_s3_input_bucket = None
mft_s3_output_bucket = "canonical-staging"

# Local git repository handed to the manifest.
repo = git.Repo("/home/piconti/impresso-text-acquisition")

canonical_patch_bnf = DataManifest(
    data_stage="canonical",
    s3_output_bucket=mft_s3_output_bucket,
    s3_input_bucket=mft_s3_input_bucket,
    git_repo=repo,
    temp_dir=temp_dir,
    staging=True,
)

In [None]:
# Aggregate the fetched issues into one statistics dict per title and year.
bnf_stats = compute_canonical_stats_for_manifest(s3_bnf_issues)

In [None]:
# Work on a deep copy so that the dicts in bnf_stats keep their 'np_id' and 'year' keys.
bnf_stats_copy = copy.deepcopy(bnf_stats)

print("Populating the manifest with the resulting yearly statistics...")
for stats in bnf_stats_copy:
    # take the grouping keys out; what remains are the counts for that title and year
    title = stats.pop("np_id")
    year = stats.pop("year")
    canonical_patch_bnf.add_by_title_year(title, year, stats)

print("Finalizing the manifest, and computing the result...")

note = "Ingestion of 16236 Newspaper issues into canonical format."
canonical_patch_bnf.append_to_notes(note)
# the export to git and S3 is switched off for this manifest
canonical_patch_bnf.compute(export_to_git_and_s3=False)

In [None]:
# Show the manifest data held by the DataManifest instance, for inspection.
canonical_patch_bnf.manifest_data

In [None]:
# Local path of the manifest JSON file that is uploaded to S3 below.
updated_manifest_path = '/home/piconti/impresso-data-release/data-processing-versioning/data-preparation/canonical_v0-1-0.json'

In [None]:
# Bucket name joined with the manifest file name.
# NOTE(review): not referenced by the visible upload call, which repeats both literals.
mft_filename = os.path.join('canonical-staging', 'canonical_v0-1-0.json')

In [None]:
# Upload the local manifest file to S3.
# Arguments are presumably (local path, key within the bucket, bucket name) — confirm
# against impresso_commons.utils.s3.upload_to_s3.
upload_to_s3(updated_manifest_path, 'canonical_v0-1-0.json', 'canonical-staging')

### BNL

The first BNL ingestion stopped before completing, so no manifest was generated.
Since several newspapers had already been ingested in full, the manifest is computed retroactively from the ingested data.

In [None]:
# Bucket to read from, and the titles covered by this manifest.
s3_bucket = "canonical-staging"
bnl_newspapers = [
    "actionfem",
    "armeteufel",
    "avenirgdl",
    "buergerbeamten",
    "courriergdl",
    "gazgrdlux",
    "landwortbild",
    "luxwort",
    "schmiede",
]

# Years (as strings) that are skipped for the given title when the manifest is populated.
luxwort_excluded_years = ["1860", "1908", "1910"] + [str(y) for y in range(1935, 1951)]
actionfem_excluded_years = [str(y) for y in range(1935, 1941)]

In [None]:
# Fetch the issues of the selected BNL titles; the second returned value is discarded.
# compute=False is passed — presumably the issues stay lazy until the statistics are computed (TODO confirm).
s3_bnl_issues, _= fetch_files(s3_bucket, compute=False, newspapers_filter=bnl_newspapers)

In [None]:
# Aggregate the fetched issues into one statistics dict per title and year.
bnl_stats = compute_canonical_stats_for_manifest(s3_bnl_issues)

In [None]:
# Names and locations related to the v0-3-0 canonical manifest.
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are not referenced
# by any of the visible cells — confirm whether DataManifest should receive them.
manifest_out_name = "canonical_v0-3-0.json"
previous_manifest_path = "s3://canonical-staging/canonical_v0-2-0.json"
temp_dir = "/home/piconti/temp_canonical_v0-3-0"

# Buckets handed to the manifest; no input bucket is given.
mft_s3_input_bucket = None
mft_s3_output_bucket = "canonical-staging"

# Local git repository handed to the manifest.
repo = git.Repo("/home/piconti/impresso-text-acquisition")

canonical_patch_bnl = DataManifest(
    data_stage="canonical",
    s3_output_bucket=mft_s3_output_bucket,
    s3_input_bucket=mft_s3_input_bucket,
    git_repo=repo,
    temp_dir=temp_dir,
    staging=True,
)

In [None]:
# Work on a deep copy so that the dicts in bnl_stats keep their 'np_id' and 'year' keys.
bnl_stats_copy = copy.deepcopy(bnl_stats)

# Years (as strings) to leave out of this manifest, per title.
excluded_years_by_title = {
    "luxwort": luxwort_excluded_years,
    "actionfem": actionfem_excluded_years,
}

print("Populating the manifest with the resulting yearly statistics...")
# populate the manifest with these statistics
for stats in bnl_stats_copy:
    title = stats["np_id"]
    year = stats["year"]
    if year in excluded_years_by_title.get(title, ()):
        # report the title actually being skipped (the message used to always say "luxwort")
        print(f"Skipping year {year} for {title}.")
        continue
    # the grouping keys are removed; what remains are the counts for that title and year
    del stats["np_id"]
    del stats["year"]
    print(f"Adding the stats for {title} {year}.")
    canonical_patch_bnl.add_by_title_year(title, year, stats)

In [None]:
print("Finalizing the manifest, and computing the result...")

# Record which titles this manifest covers, then compute it with the export to git and S3 enabled.
note = f"Ingestion of Newspaper issues into canonical format for titles {bnl_newspapers}."
canonical_patch_bnl.append_to_notes(note)
canonical_patch_bnl.compute(export_to_git_and_s3 = True)

In [None]:
# Show the manifest data held by the DataManifest instance, for inspection.
canonical_patch_bnl.manifest_data

### BNL - Second run

In [None]:
s3_bucket = 'canonical-staging'
# NOTE(review): 'kommmit' was listed twice; the duplicate is dropped so the title is
# only requested once. Confirm the second entry was not meant to be another title.
bnl_newspapers_2 = ['deletz1893', 'diekwochen', 'kommmit', 'indeplux', 'lunion', 'luxzeit1858', 'luxwort']

# luxwort: only these years (as strings) are added when the manifest is populated.
luxwort_included_years = ["1860","1908","1910"]
# indeplux: years strictly greater than this value are skipped when the manifest is populated.
indeplux_excluded_first_year = 1919

In [None]:
# Fetch the issues of the second batch of BNL titles; the second returned value is discarded.
# compute=False is passed — presumably the issues stay lazy (TODO confirm).
s3_bnl_issues_2, _= fetch_files(s3_bucket, compute=False, newspapers_filter=bnl_newspapers_2)
# The statistics are computed in a separate cell.

In [None]:
# Aggregate the second batch into one statistics dict per title and year.
# NOTE: this rebinds `bnl_stats`, replacing the statistics of the first BNL run.
bnl_stats = compute_canonical_stats_for_manifest(s3_bnl_issues_2)

In [None]:
# Names and locations related to the v0-4-0 canonical manifest.
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are not referenced
# by any of the visible cells — confirm whether DataManifest should receive them.
manifest_out_name = "canonical_v0-4-0.json"
previous_manifest_path = "s3://canonical-staging/canonical_v0-3-0.json"
temp_dir = "/home/piconti/temp_canonical_v0-4-0"

# Buckets handed to the manifest; no input bucket is given.
mft_s3_input_bucket = None
mft_s3_output_bucket = "canonical-staging"

# Local git repository handed to the manifest.
repo = git.Repo("/home/piconti/impresso-text-acquisition")

canonical_patch_bnl_2 = DataManifest(
    data_stage="canonical",
    s3_output_bucket=mft_s3_output_bucket,
    s3_input_bucket=mft_s3_input_bucket,
    git_repo=repo,
    temp_dir=temp_dir,
    staging=True,
)

In [None]:
# Work on a deep copy so that the dicts in bnl_stats keep their 'np_id' and 'year' keys.
bnl_stats_copy = copy.deepcopy(bnl_stats)

print("Populating the manifest with the resulting yearly statistics...")
# populate the manifest with these statistics
for stats in bnl_stats_copy:
    title = stats["np_id"]
    year = stats["year"]

    # luxwort: only the explicitly listed years are added
    skip_luxwort = title == "luxwort" and year not in luxwort_included_years
    # indeplux: years strictly greater than the threshold are left out.
    # NOTE(review): the name `indeplux_excluded_first_year` suggests the threshold year
    # itself may be meant to be excluded too (`>=`); the original `>` is kept — confirm.
    skip_indeplux = title == "indeplux" and int(year) > indeplux_excluded_first_year

    if skip_luxwort or skip_indeplux:
        # report the title actually being skipped (the message used to always say "luxwort")
        print(f"Skipping year {year} for {title}.")
        continue

    # the grouping keys are removed; what remains are the counts for that title and year
    del stats["np_id"]
    del stats["year"]
    print(f"Adding the stats for {title} {year}.")
    canonical_patch_bnl_2.add_by_title_year(title, year, stats)

In [None]:
print("Finalizing the manifest, and computing the result...")

# Record which titles this manifest covers, then compute it with the export to git and S3 enabled.
note = f"Ingestion of Newspaper issues into canonical format for titles {bnl_newspapers_2}."
canonical_patch_bnl_2.append_to_notes(note)
canonical_patch_bnl_2.compute(export_to_git_and_s3 = True)

In [None]:
# Show the manifest data held by the DataManifest instance, for inspection.
canonical_patch_bnl_2.manifest_data

### BNL - Third run

In [None]:
# Bucket to read from, and the titles covered by this manifest.
s3_bucket = "canonical-staging"
bnl_newspapers_3 = [
    "actionfem",
    "onsjongen",
    "demitock",
    "luxzeit1844",
    "dunioun",
    "obermosel",
    "luxembourg1935",
    "volkfreu1869",
    "waechtersauer",
    "waeschfra",
    "luxwort",
    "luxland",
    "tageblatt",
]

# luxwort: years 1935 to 1944 (as strings); 1945 to 1950 are not part of the list.
luxwort_included_years = [str(y) for y in range(1935, 1945)]

# Year bounds compared against each title's years when the manifest is populated.
actionfem_min_year = 1935
dunioun_max_year = obermosel_max_year = 1944
tageblatt_max_year = 1946
onsjongen_max_year = 1947
luxland_max_year = 1964

In [None]:
# Fetch the issues of the third batch of BNL titles (second returned value discarded),
# then aggregate them into one statistics dict per title and year.
# compute=False is passed — presumably the issues stay lazy until the aggregation (TODO confirm).
s3_bnl_issues_3, _= fetch_files(s3_bucket, compute=False, newspapers_filter=bnl_newspapers_3)
bnl_stats_3 = compute_canonical_stats_for_manifest(s3_bnl_issues_3)

In [None]:
# Names and locations related to the v0-5-0 canonical manifest.
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are not referenced
# by any of the visible cells — confirm whether DataManifest should receive them.
manifest_out_name = "canonical_v0-5-0.json"
previous_manifest_path = "s3://canonical-staging/canonical_v0-4-0.json"
temp_dir = "/home/piconti/temp_canonical_v0-5-0"

# Buckets handed to the manifest; no input bucket is given.
mft_s3_input_bucket = None
mft_s3_output_bucket = "canonical-staging"

# Local git repository handed to the manifest.
repo = git.Repo("/home/piconti/impresso-text-acquisition")

canonical_patch_bnl_3 = DataManifest(
    data_stage="canonical",
    s3_output_bucket=mft_s3_output_bucket,
    s3_input_bucket=mft_s3_input_bucket,
    git_repo=repo,
    temp_dir=temp_dir,
    staging=True,
)

In [None]:
# Year bounds per title, compared against each title's years when the manifest is populated.
# These repeat the values already assigned in the title-selection cell of this run.
actionfem_min_year = 1935
dunioun_max_year = obermosel_max_year = 1944
tageblatt_max_year = 1946
onsjongen_max_year = 1947
luxland_max_year = 1964

In [None]:
# Work on a deep copy so that the dicts in bnl_stats_3 keep their 'np_id' and 'year' keys.
bnl_stats_copy = copy.deepcopy(bnl_stats_3)

# Upper year bound per title: years strictly greater than the bound are skipped.
max_year_by_title = {
    "luxland": luxland_max_year,
    "dunioun": dunioun_max_year,
    "obermosel": obermosel_max_year,
    "onsjongen": onsjongen_max_year,
    "tageblatt": tageblatt_max_year,
}
# Lower year bound per title: years strictly smaller than the bound are skipped.
min_year_by_title = {"actionfem": actionfem_min_year}

print("Populating the manifest with the resulting yearly statistics...")
# populate the manifest with these statistics
for stats in bnl_stats_copy:
    title = stats["np_id"]
    year = stats["year"]

    if title == "luxwort":
        # luxwort: only the explicitly listed years (strings) are added
        skip = year not in luxwort_included_years
    elif title in max_year_by_title:
        skip = int(year) > max_year_by_title[title]
    elif title in min_year_by_title:
        skip = int(year) < min_year_by_title[title]
    else:
        skip = False

    if skip:
        # report the title actually being skipped (several messages used to say "luxland")
        print(f"Skipping year {year} for {title}.")
        continue

    # the grouping keys are removed; what remains are the counts for that title and year
    del stats["np_id"]
    del stats["year"]
    print(f"Adding the stats for {title} {year}.")
    canonical_patch_bnl_3.add_by_title_year(title, year, stats)

In [None]:
print("Finalizing the manifest, and computing the result...")

# Record which titles this manifest covers, then compute it with the export to git and S3 enabled.
note = f"Ingestion of Newspaper issues into canonical format for titles {bnl_newspapers_3}."
canonical_patch_bnl_3.append_to_notes(note)
canonical_patch_bnl_3.compute(export_to_git_and_s3 = True)

In [None]:
# Show the manifest data held by the DataManifest instance, for inspection.
canonical_patch_bnl_3.manifest_data