# Canonical manifest recomputation

In [1]:
import os
import shutil
import json
import boto3
import pandas as pd
import git
from dask import bag as db
from dask import dataframe as dd
import numpy as np
from pathlib import Path
from time import strftime
import copy
from typing import Any

from impresso_commons.utils.s3 import (fixed_s3fs_glob, alternative_read_text, upload,
                                       get_storage_options, get_boto3_bucket, IMPRESSO_STORAGEOPT, upload_to_s3)
from impresso_commons.versioning.helpers import (DataStage, read_manifest_from_s3, 
                                                 validate_stage, clone_git_repo,
                                                 write_and_push_to_git, write_dump_to_fs)
from impresso_commons.path import parse_canonical_filename
from impresso_commons.path.path_fs import IssueDir
from impresso_commons.path.path_s3 import read_s3_issues, list_newspapers, fetch_files
from impresso_commons.versioning.data_statistics import NewspaperStatistics, POSSIBLE_GRANULARITIES
from impresso_commons.versioning.data_manifest import DataManifest
from collections import defaultdict

## Functions

In [2]:
def _issue_counts(issue: dict[str, Any]) -> dict[str, Any]:
    """Reduce one canonical issue to the identifiers and counts used for the manifest.

    :param dict issue: Canonical issue; its `id` is split on "-" to obtain the
        title ID (first part) and the year (second part), `pp` holds its pages
        and `i` its content items.
    :return: Title ID, year, issue ID and the page, content-item and image counts.
    :rtype: dict[str, Any]

    """
    issue_id = issue["id"]
    id_parts = issue_id.split("-")
    return {
        "np_id": id_parts[0],
        "year": id_parts[1],
        "id": issue_id,
        "issue_id": issue_id,
        # pages are de-duplicated before being counted
        "n_pages": len(set(issue["pp"])),
        "n_content_items": len(issue["i"]),
        # images are the content items whose `m.tp` value is "image"
        "n_images": sum(1 for item in issue["i"] if item["m"]["tp"] == "image"),
    }


def compute_canonical_stats_for_manifest(
    s3_canonical_issues: db.core.Bag,
) -> list[dict[str, Any]]:
    """Computes yearly issue, page, content-item and image counts per newspaper.

    :param db.core.Bag s3_canonical_issues: Bag of canonical issues (one dict per
        issue, with at least the keys `id`, `pp` and `i`).
    :return: One dict per (newspaper, year) pair with the keys `np_id`, `year`,
        `issues`, `pages`, `content_items_out` and `images`.
    :rtype: list[dict[str, Any]]

    """

    print("Fetched all issues, gathering desired information.")
    pages_count_df = (
        s3_canonical_issues.map(_issue_counts)
        .to_dataframe(
            meta={
                "np_id": str,
                "year": str,
                "id": str,
                "issue_id": str,
                "n_pages": int,
                "n_images": int,
                "n_content_items": int,
            }
        )
        .set_index("id")
        .persist()
    )

    # sum the counts for all values collected; aggregations are given by name
    # ("sum") rather than as the Python builtin, which is deprecated in pandas
    aggregated_df = (
        pages_count_df.groupby(by=["np_id", "year"])
        .agg(
            {
                "issue_id": "count",
                "n_pages": "sum",
                "n_content_items": "sum",
                "n_images": "sum",
            }
        )
        .rename(
            columns={
                "n_pages": "pages",
                "issue_id": "issues",
                "n_content_items": "content_items_out",
                "n_images": "images",
            }
        )
        .reset_index()
    )

    print("Finished grouping and aggregating by title and year.")
    # return as a list of dicts
    return aggregated_df.to_bag(format="dict").compute()



## Code

### BNF 

Recomputing the manifest after the BNF data update, since it was not updated correctly.

In [None]:
# Bucket to read the canonical data from, and the BNF titles whose statistics are recomputed.
s3_bucket = "canonical-staging"
newspapers = [
    "excelsior",
    "lafronde",
    "marieclaire",
    "oeuvre",
]

In [None]:
# Fetch the canonical issues of the selected BNF titles from the bucket.
# The second value returned by `fetch_files` is not needed here and is discarded.
# NOTE(review): `compute=False` presumably keeps the issues as a lazy dask bag — confirm.
s3_bnf_issues, _= fetch_files(s3_bucket, compute=False, newspapers_filter=newspapers)

In [None]:
# Set up the manifest object for the BNF recomputation (canonical stage, staging bucket).
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are assigned below but are
# not passed to `DataManifest` in this cell — confirm whether the previous manifest
# should be linked.
manifest_out_name = 'canonical_v0-1-0.json'

repo = git.Repo('/home/piconti/impresso-text-acquisition')
mft_s3_input_bucket = None # no input bucket is set for this manifest
mft_s3_output_bucket = 'canonical-staging' #'rebuilt-data' #'rebuilt-sandbox'
# NOTE(review): absolute, user-specific paths — adapt before running on another machine
temp_dir = '/home/piconti/temp_canonical_v0-1-0'
previous_manifest_path = 's3://canonical-staging/canonical_v0-0-4.json'

canonical_patch_bnf = DataManifest(
    data_stage = 'canonical', # presumably DataStage.CANONICAL also works (as noted for the other runs) — confirm
    s3_output_bucket = mft_s3_output_bucket,
    s3_input_bucket = mft_s3_input_bucket,
    git_repo = repo,
    temp_dir = temp_dir,
    staging=True,

)

In [None]:
# Aggregate the issue, page, content-item and image counts per title and year.
bnf_stats = compute_canonical_stats_for_manifest(s3_bnf_issues)

In [None]:
# Work on a deep copy: the loop below strips the identifying keys from every
# record, and `bnf_stats` itself is left untouched.
bnf_stats_copy = copy.deepcopy(bnf_stats)

print("Populating the manifest with the resulting yearly statistics...")
for stats in bnf_stats_copy:
    # `np_id` and `year` address the manifest entry; what remains are the counts.
    title = stats.pop("np_id")
    year = stats.pop("year")
    canonical_patch_bnf.add_by_title_year(title, year, stats)

print("Finalizing the manifest, and computing the result...")

note = "Ingestion of 16236 Newspaper issues into canonical format."
canonical_patch_bnf.append_to_notes(note)
# The export flag is off for this run; the manifest is uploaded by hand further down.
canonical_patch_bnf.compute(export_to_git_and_s3=False)

In [None]:
# Display the contents of the computed manifest for a visual check.
canonical_patch_bnf.manifest_data

In [None]:
# Local manifest file that is passed to the upload call further down.
# NOTE(review): absolute, user-specific path — adapt before running on another machine.
updated_manifest_path = '/home/piconti/impresso-data-release/data-processing-versioning/data-preparation/canonical_v0-1-0.json'

In [None]:
# NOTE(review): `mft_filename` is not read by the upload call in the following cell,
# which passes the file name and the bucket as separate literals — possibly unused.
mft_filename = os.path.join('canonical-staging', 'canonical_v0-1-0.json')

In [None]:
# Upload the local manifest file by hand.
# NOTE(review): the arguments are presumably (local path, key name, bucket name) — confirm
# against `upload_to_s3` before re-running, since this writes to the bucket.
upload_to_s3(updated_manifest_path, 'canonical_v0-1-0.json', 'canonical-staging')

### BNL

The first ingestion for BNL stopped before the end, so the manifest was not generated. 
Since several newspapers had been entirely ingested, the manifest is computed retroactively on the data. 

In [None]:
s3_bucket = "canonical-staging"

# BNL titles covered by this retroactive manifest.
bnl_newspapers = [
    "actionfem",
    "armeteufel",
    "avenirgdl",
    "buergerbeamten",
    "courriergdl",
    "gazgrdlux",
    "landwortbild",
    "luxwort",
    "schmiede",
]

# Years left out of this manifest for two of the titles (as strings, like the
# `year` values in the computed statistics).
luxwort_excluded_years = [str(y) for y in (1860, 1908, 1910, *range(1935, 1951))]
actionfem_excluded_years = [str(y) for y in range(1935, 1941)]

In [None]:
# Fetch the canonical issues of the selected BNL titles; the second returned value is discarded.
# NOTE(review): `compute=False` presumably keeps the issues as a lazy dask bag — confirm.
s3_bnl_issues, _= fetch_files(s3_bucket, compute=False, newspapers_filter=bnl_newspapers)

In [None]:
# Aggregate the counts per title and year; years to leave out are filtered when populating the manifest.
bnl_stats = compute_canonical_stats_for_manifest(s3_bnl_issues)

In [None]:
# Set up the manifest object for the first BNL run (canonical stage, staging bucket).
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are assigned below but are
# not passed to `DataManifest` in this cell — confirm whether the previous manifest
# should be linked.
manifest_out_name = 'canonical_v0-3-0.json'

repo = git.Repo('/home/piconti/impresso-text-acquisition')
mft_s3_input_bucket = None # no input bucket is set for this manifest
mft_s3_output_bucket = 'canonical-staging' #'rebuilt-data' #'rebuilt-sandbox'
# NOTE(review): absolute, user-specific paths — adapt before running on another machine
temp_dir = '/home/piconti/temp_canonical_v0-3-0'
previous_manifest_path = 's3://canonical-staging/canonical_v0-2-0.json'

canonical_patch_bnl = DataManifest(
    data_stage = 'canonical', # DataStage.CANONICAL also works
    s3_output_bucket = mft_s3_output_bucket,
    s3_input_bucket = mft_s3_input_bucket,
    git_repo = repo,
    temp_dir = temp_dir,
    staging=True,
)

In [None]:
# Work on a deep copy so that `bnl_stats` keeps its `np_id`/`year` keys.
bnl_stats_copy = copy.deepcopy(bnl_stats)

# Years to leave out of this manifest, per title; titles not listed are kept in full.
excluded_years_by_title = {
    'luxwort': luxwort_excluded_years,
    'actionfem': actionfem_excluded_years,
}

print("Populating the manifest with the resulting yearly statistics...")
# populate the manifest with these statistics
for stats in bnl_stats_copy:
    title = stats['np_id']
    year = stats['year']
    if year in excluded_years_by_title.get(title, ()):
        # Report the title actually skipped (the actionfem branch used to print "luxwort").
        print(f"Skipping year {year} for {title}.")
        continue
    # The manifest entry is addressed by title and year; only the counts are passed on.
    del stats["np_id"]
    del stats["year"]
    print(f"Addind the stats for {title} {year}.")
    canonical_patch_bnl.add_by_title_year(title, year, stats)

In [None]:
print("Finalizing the manifest, and computing the result...")

# Record in the manifest notes which titles were requested for this run.
note = f"Ingestion of Newspaper issues into canonical format for titles {bnl_newspapers}."
canonical_patch_bnl.append_to_notes(note)
# NOTE(review): with `export_to_git_and_s3 = True` the result is presumably pushed to git
# and S3 — confirm before re-running this cell.
canonical_patch_bnl.compute(export_to_git_and_s3 = True)

In [None]:
# Display the contents of the computed manifest for a visual check.
canonical_patch_bnl.manifest_data

### BNL - Second run

In [None]:
s3_bucket = 'canonical-staging'
# Titles handled in the second BNL run. 'kommmit' appeared twice in this list; the
# duplicate is dropped so that each title is requested only once.
# NOTE(review): if the second 'kommmit' was a typo for another title, add that title here.
bnl_newspapers_2 = ['deletz1893', 'diekwochen', 'kommmit', 'indeplux', 'lunion', 'luxzeit1858', 'luxwort']

# Only these luxwort years are added in this run (as strings, like the `year` values
# in the computed statistics).
luxwort_included_years = ["1860","1908","1910"]
# Year bound used to filter indeplux when populating the manifest.
indeplux_excluded_first_year = 1919

In [None]:
# Fetch the canonical issues of the second batch of BNL titles; the second returned
# value is discarded. The statistics are computed in the following cell.
s3_bnl_issues_2, _= fetch_files(s3_bucket, compute=False, newspapers_filter=bnl_newspapers_2)

In [None]:
# NOTE(review): this rebinds `bnl_stats`, replacing the statistics of the first BNL run
# in the kernel; cells of the first run must not be re-run after this one.
bnl_stats = compute_canonical_stats_for_manifest(s3_bnl_issues_2)

In [None]:
# Set up the manifest object for the second BNL run (canonical stage, staging bucket).
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are assigned below but are
# not passed to `DataManifest` in this cell — confirm whether the previous manifest
# should be linked.
manifest_out_name = 'canonical_v0-4-0.json'

repo = git.Repo('/home/piconti/impresso-text-acquisition')
mft_s3_input_bucket = None # no input bucket is set for this manifest
mft_s3_output_bucket = 'canonical-staging' #'rebuilt-data' #'rebuilt-sandbox'
# NOTE(review): absolute, user-specific paths — adapt before running on another machine
temp_dir = '/home/piconti/temp_canonical_v0-4-0'
previous_manifest_path = 's3://canonical-staging/canonical_v0-3-0.json'

canonical_patch_bnl_2 = DataManifest(
    data_stage = 'canonical', # DataStage.CANONICAL also works
    s3_output_bucket = mft_s3_output_bucket,
    s3_input_bucket = mft_s3_input_bucket,
    git_repo = repo,
    temp_dir = temp_dir,
    staging=True,
)

In [None]:
# Work on a deep copy so that `bnl_stats` keeps its `np_id`/`year` keys.
bnl_stats_copy = copy.deepcopy(bnl_stats)

print("Populating the manifest with the resulting yearly statistics...")
# populate the manifest with these statistics
for stats in bnl_stats_copy:
    title = stats['np_id']
    year = stats['year']
    if title == 'luxwort':
        # luxwort: only the explicitly listed years are part of this run
        skip = year not in luxwort_included_years
    elif title == 'indeplux':
        # NOTE(review): with `>` the year `indeplux_excluded_first_year` itself is kept;
        # the variable name suggests it may have been meant to be excluded (`>=`) — confirm.
        skip = int(year) > indeplux_excluded_first_year
    else:
        skip = False
    if skip:
        # Report the title actually skipped (the indeplux branch used to print "luxwort").
        print(f"Skipping year {year} for {title}.")
        continue
    # The manifest entry is addressed by title and year; only the counts are passed on.
    del stats["np_id"]
    del stats["year"]
    print(f"Addind the stats for {title} {year}.")
    canonical_patch_bnl_2.add_by_title_year(title, year, stats)

In [None]:
print("Finalizing the manifest, and computing the result...")

# Record in the manifest notes which titles were requested for this run.
note = f"Ingestion of Newspaper issues into canonical format for titles {bnl_newspapers_2}."
canonical_patch_bnl_2.append_to_notes(note)
# NOTE(review): with `export_to_git_and_s3 = True` the result is presumably pushed to git
# and S3 — confirm before re-running this cell.
canonical_patch_bnl_2.compute(export_to_git_and_s3 = True)

In [None]:
# Display the contents of the computed manifest for a visual check.
canonical_patch_bnl_2.manifest_data

### BNL 3rd Run

In [None]:
s3_bucket = "canonical-staging"

# Titles handled in the third BNL run.
bnl_newspapers_3 = [
    "actionfem",
    "onsjongen",
    "demitock",
    "luxzeit1844",
    "dunioun",
    "obermosel",
    "luxembourg1935",
    "volkfreu1869",
    "waechtersauer",
    "waeschfra",
    "luxwort",
    "luxland",
    "tageblatt",
]

# luxwort: only 1935-1944 are added in this run (1945-1950 are deliberately left out).
luxwort_included_years = [str(y) for y in range(1935, 1945)]

# Year bounds applied per title when populating the manifest.
actionfem_min_year = 1935
luxland_max_year = 1964
dunioun_max_year = 1944
obermosel_max_year = 1944
onsjongen_max_year = 1947
tageblatt_max_year = 1946

In [None]:
# Fetch the canonical issues of the third batch of BNL titles (second returned value
# discarded), then aggregate their counts per title and year.
s3_bnl_issues_3, _= fetch_files(s3_bucket, compute=False, newspapers_filter=bnl_newspapers_3)
bnl_stats_3 = compute_canonical_stats_for_manifest(s3_bnl_issues_3)

In [None]:
# Set up the manifest object for the third BNL run (canonical stage, staging bucket).
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are assigned below but are
# not passed to `DataManifest` in this cell — confirm whether the previous manifest
# should be linked.
manifest_out_name = 'canonical_v0-5-0.json'

repo = git.Repo('/home/piconti/impresso-text-acquisition')
mft_s3_input_bucket = None # no input bucket is set for this manifest
mft_s3_output_bucket = 'canonical-staging' #'rebuilt-data' #'rebuilt-sandbox'
# NOTE(review): absolute, user-specific paths — adapt before running on another machine
temp_dir = '/home/piconti/temp_canonical_v0-5-0'
previous_manifest_path = 's3://canonical-staging/canonical_v0-4-0.json'

canonical_patch_bnl_3 = DataManifest(
    data_stage = 'canonical', # DataStage.CANONICAL also works
    s3_output_bucket = mft_s3_output_bucket,
    s3_input_bucket = mft_s3_input_bucket,
    git_repo = repo,
    temp_dir = temp_dir,
    staging=True,
)

In [None]:
# Year bounds read by the filtering loop of the third BNL run.
# These repeat the values already assigned in this run's configuration cell.
actionfem_min_year = 1935
luxland_max_year = 1964
tageblatt_max_year = 1946
onsjongen_max_year = 1947
dunioun_max_year = obermosel_max_year = 1944

In [None]:
# Work on a deep copy so that `bnl_stats_3` keeps its `np_id`/`year` keys.
bnl_stats_copy = copy.deepcopy(bnl_stats_3)

# Last year kept for each title that has an upper year bound in this run.
max_year_by_title = {
    'luxland': luxland_max_year,
    'dunioun': dunioun_max_year,
    'obermosel': obermosel_max_year,
    'onsjongen': onsjongen_max_year,
    'tageblatt': tageblatt_max_year,
}

print("Populating the manifest with the resulting yearly statistics...")
# populate the manifest with these statistics
for stats in bnl_stats_copy:
    title = stats['np_id']
    year = stats['year']
    if title == 'luxwort':
        # luxwort: only the explicitly listed years are part of this run
        skip = year not in luxwort_included_years
    elif title == 'actionfem':
        # actionfem: years before the lower bound are left out
        skip = int(year) < actionfem_min_year
    elif title in max_year_by_title:
        skip = int(year) > max_year_by_title[title]
    else:
        skip = False
    if skip:
        # Report the title actually skipped (four branches used to print "luxland").
        print(f"Skipping year {year} for {title}.")
        continue
    # The manifest entry is addressed by title and year; only the counts are passed on.
    del stats["np_id"]
    del stats["year"]
    print(f"Addind the stats for {title} {year}.")
    canonical_patch_bnl_3.add_by_title_year(title, year, stats)

In [None]:
print("Finalizing the manifest, and computing the result...")

# Record in the manifest notes which titles were requested for this run.
note = f"Ingestion of Newspaper issues into canonical format for titles {bnl_newspapers_3}."
canonical_patch_bnl_3.append_to_notes(note)
# NOTE(review): with `export_to_git_and_s3 = True` the result is presumably pushed to git
# and S3 — confirm before re-running this cell.
canonical_patch_bnl_3.compute(export_to_git_and_s3 = True)

In [None]:
# Display the contents of the computed manifest for a visual check.
canonical_patch_bnl_3.manifest_data

### BCUL 2nd, 3rd and 4th Runs

In [3]:
s3_bucket = "canonical-staging"

# BCUL titles covered by this manifest.
bcul_nps_3 = [
    "CL",
    "JH",
    "Guepe1851",
    "Guepe1887",
    "PJ",
    "NV",
    "MB",
    "TouSuIl",
    "JVE",
    "OBS",
    "ouistiti",
    "Moniteur",
    "JV",
    "CharivariCH",
    "PDL",
]

# For these three titles only the listed years are added to the manifest
# (as strings, like the `year` values in the computed statistics).
moniteur_included_years = ["1879"]
jv_included_years = [str(y) for y in range(1875, 1880)]
charivari_included_years = ["1875", "1876"]


In [4]:
# Fetch the canonical issues of the selected BCUL titles (second returned value
# discarded), then aggregate their counts per title and year.
s3_bcul_issues_2, _= fetch_files(s3_bucket, compute=False, newspapers_filter=bcul_nps_3)
bcul_stats_2 = compute_canonical_stats_for_manifest(s3_bcul_issues_2)

Fetching list of newspapers from canonical-staging
canonical-staging contains 114 newspapers
canonical-staging contains 161 .bz2 issue files for the provided newspapers ['CL', 'JH', 'Guepe1851', 'Guepe1887', 'PJ', 'NV', 'MB', 'TouSuIl', 'JVE', 'OBS', 'ouistiti', 'Moniteur', 'JV', 'CharivariCH', 'PDL']
Fetched all issues, gathering desired information.
Finished grouping and aggregating by title and year.


In [12]:
# Set up the manifest object for the BCUL runs (canonical stage, staging bucket).
# NOTE(review): `manifest_out_name` and `previous_manifest_path` are assigned below but are
# not passed to `DataManifest` in this cell, and the displayed manifest shows
# `input_mft_s3_path: None` — confirm whether the previous manifest should be linked.
manifest_out_name = 'canonical_v2-0-0.json'

repo = git.Repo('/home/piconti/impresso-text-acquisition')
mft_s3_input_bucket = None # no input bucket is set for this manifest
mft_s3_output_bucket = 'canonical-staging' #'rebuilt-data' #'rebuilt-sandbox'
# NOTE(review): absolute, user-specific paths — adapt before running on another machine
temp_dir = '/home/piconti/temp_canonical_v2-0-0'
previous_manifest_path = 's3://canonical-staging/canonical_v1-0-1.json'

canonical_patch_bcul_2= DataManifest(
    data_stage = 'canonical', # DataStage.CANONICAL also works
    s3_output_bucket = mft_s3_output_bucket,
    s3_input_bucket = mft_s3_input_bucket,
    git_repo = repo,
    temp_dir = temp_dir,
    staging=True,
)

In [13]:
# Work on a deep copy so that `bcul_stats_2` keeps its `np_id`/`year` keys.
bcul_stats_copy = copy.deepcopy(bcul_stats_2)

# Titles for which only the listed years are added; titles not listed are kept in full.
included_years_by_title = {
    'Moniteur': moniteur_included_years,
    'JV': jv_included_years,
    'CharivariCH': charivari_included_years,
}

print("Populating the manifest with the resulting yearly statistics...")
# populate the manifest with these statistics
for stats in bcul_stats_copy:
    title = stats['np_id']
    year = stats['year']
    if title in included_years_by_title and year not in included_years_by_title[title]:
        print(f"Skipping year {year} for {title}.")
        continue
    # The manifest entry is addressed by title and year; only the counts are passed on.
    del stats["np_id"]
    del stats["year"]
    print(f"Addind the stats for {title} {year}.")
    canonical_patch_bcul_2.add_by_title_year(title, year, stats)

Populating the manifest with the resulting yearly statistics...
Addind the stats for CL 1882.
Addind the stats for CL 1883.
Addind the stats for CL 1884.
Addind the stats for CL 1885.
Addind the stats for CL 1886.
Addind the stats for CharivariCH 1875.
Addind the stats for CharivariCH 1876.
Skipping year 1877 for CharivariCH.
Addind the stats for Guepe1851 1851.
Addind the stats for Guepe1851 1852.
Addind the stats for Guepe1851 1853.
Addind the stats for Guepe1851 1854.
Addind the stats for Guepe1887 1887.
Addind the stats for Guepe1887 1888.
Addind the stats for Guepe1887 1897.
Addind the stats for JH 1738.
Addind the stats for JH 1739.
Addind the stats for JH 1740.
Addind the stats for JH 1741.
Addind the stats for JH 1742.
Addind the stats for JH 1743.
Addind the stats for JH 1744.
Addind the stats for JH 1745.
Addind the stats for JH 1746.
Addind the stats for JH 1747.
Addind the stats for JH 1748.
Addind the stats for JH 1749.
Addind the stats for JH 1750.
Addind the stats for JH

In [14]:
print("Finalizing the manifest, and computing the result...")

# Record in the manifest notes which titles were requested for this run.
note = f"Ingestion of Newspaper issues into canonical format for titles {bcul_nps_3}."
canonical_patch_bcul_2.append_to_notes(note)
# NOTE(review): with `export_to_git_and_s3 = True` the result is presumably pushed to git
# and S3 — confirm before re-running this cell.
canonical_patch_bcul_2.compute(export_to_git_and_s3 = True)

Finalizing the manifest, and computing the result...
CL 1882
CL 1883
CL 1884
CL 1885
CL 1886
CharivariCH 1875
CharivariCH 1876
Guepe1851 1851
Guepe1851 1852
Guepe1851 1853
Guepe1851 1854
Guepe1887 1887
Guepe1887 1888
Guepe1887 1897
JH 1738
JH 1739
JH 1740
JH 1741
JH 1742
JH 1743
JH 1744
JH 1745
JH 1746
JH 1747
JH 1748
JH 1749
JH 1750
JH 1751
JH 1752
JH 1753
JH 1754
JH 1755
JH 1756
JH 1757
JH 1758
JH 1759
JH 1760
JH 1761
JH 1762
JH 1763
JH 1764
JH 1765
JH 1766
JH 1767
JH 1768
JH 1769
JH 1770
JH 1771
JH 1772
JH 1773
JH 1774
JH 1775
JH 1776
JH 1777
JH 1778
JH 1779
JH 1780
JH 1781
JH 1782
JV 1875
JV 1876
JV 1877
JV 1878
JV 1879
JVE 1891
JVE 1892
JVE 1893
JVE 1894
MB 1748
MB 1749
MB 1750
MB 1751
MB 1752
MB 1753
MB 1754
MB 1755
MB 1756
MB 1757
MB 1758
MB 1759
MB 1760
MB 1761
MB 1762
MB 1763
MB 1764
MB 1766
MB 1767
MB 1768
MB 1769
MB 1770
MB 1771
MB 1772
MB 1773
MB 1774
MB 1775
MB 1776
MB 1777
MB 1778
MB 1779
MB 1780
MB 1781
MB 1782
MB 1783
MB 1784
MB 1785
MB 1786
MB 1787
MB 1788
MB 1789
MB 1

In [15]:
# Display the contents of the computed manifest for a visual check.
canonical_patch_bcul_2.manifest_data

{'mft_version': 'v2.0.0',
 'mft_generation_date': '2024-03-18 11:45:56',
 'mft_s3_path': 's3://canonical-staging/canonical_v2-0-0.json',
 'input_mft_s3_path': None,
 'input_mft_git_path': None,
 'code_git_commit': 'https://github.com/impresso/impresso-text-acquisition/commit/25667221615e379ea9d20bb69158da6ce1f3b877',
 'media_list': [{'media_title': '0002088',
   'last_modification_date': '2024-01-24 23:06:54',
   'update_type': None,
   'update_level': 'title',
   'updated_years': [],
   'updated_fields': [],
   'code_git_commit': None,
   'media_statistics': [{'stage': 'canonical',
     'granularity': 'title',
     'element': '0002088',
     'nps_stats': {'content_items_out': 603, 'issues': 11, 'pages': 88}},
    {'stage': 'canonical',
     'granularity': 'year',
     'element': '0002088-1832',
     'nps_stats': {'pages': 88, 'issues': 11, 'content_items_out': 603}}]},
  {'media_title': '0002244',
   'last_modification_date': '2024-01-24 23:06:54',
   'update_type': None,
   'update_l