**Logic**:
- compare current and new canonical data
    - verify new CI IDs as well as deleted ones (a trace of the merging should be found in metadata)
- cross-check with user collections in MySQL
    - how many of the patched CIs are found there?

## Imports

In [1]:
import os, sys
sys.path.append("../")
import pandas as pd
import json
from dask import bag as db
from dask_k8 import DaskCluster
from dask.distributed import Client
from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT, fixed_s3fs_glob
from impresso_commons.utils.s3 import alternative_read_text
from impresso_commons.utils.kube import (make_scheduler_configuration,
                                         make_worker_configuration)
from sanity_check.contents.s3_data import list_files_rebuilt, list_pages

## Functions

In [4]:
S3_CANONICAL_DATA_BUCKET = "s3://original-canonical-fixed"
S3_REBUILT_DATA_BUCKET = "s3://canonical-rebuilt"

In [8]:
from typing import List

def list_issues(bucket_name: str = S3_CANONICAL_DATA_BUCKET, newspapers : List = []):
    if newspapers:
        pass
    else:
        if bucket_name:
            newspapers = list_newspapers(bucket_name)
        else:
            newspapers = list_newspapers()
    print(f'Issues for these newspapers will be listed: {newspapers}')
    issue_files = [
        file
        for np in newspapers
        for file in fixed_s3fs_glob(f"{os.path.join(bucket_name, f'{np}/issues/*')}")
    ]
    print(f"{bucket_name} contains {len(issue_files)} .bz2 files with issues")
    return issue_files

In [24]:
def fetch_issues(bucket_name=S3_CANONICAL_DATA_BUCKET, newspapers=[], compute=True):
    """
    Fetch issue JSON docs from an s3 bucket with impresso canonical data.
    """
    if newspapers:
        issue_files = list_issues(bucket_name, newspapers)
    else:
        issue_files = list_issues(bucket_name)

    print(
        (
            f"Fetching issue ids from {len(issue_files)} .bz2 files "
            f"(compute={compute})"
        )
    )
    issue_bag = db.read_text(issue_files, storage_options=IMPRESSO_STORAGEOPT).map(
        json.loads
    )

    if compute:
        return issue_bag.compute()
    else:
        return issue_bag

In [20]:
def start_cluster(n_workers : int = 10, worker_memory : str = '1G', blocking : bool = False):
    cluster = DaskCluster(
        namespace="dhlab",
        cluster_id="impresso-sanitycheck",
        scheduler_pod_spec=make_scheduler_configuration(),
        worker_pod_spec=make_worker_configuration(
            docker_image="ic-registry.epfl.ch/dhlab/impresso_pycommons:v1",
            memory=worker_memory
        )
    )
    cluster.create()
    cluster.scale(n_workers, blocking=False)
    return cluster, cluster.make_dask_client()

## Config

In [2]:
new_canonical_bucket = "s3://original-canonical-staging"
current_canonical_bucket = "s3://original-canonical-release"

In [95]:
lux_nps = [
    'actionfem',
    'armeteufel',
    'avenirgdl',
    'buergerbeamten',
    'courriergdl',
    'deletz1893',
    'demitock',
    'diekwochen',
    'dunioun',
    'gazgrdlux',
    'indeplux',
    'kommmit',
    'landwortbild',
    'lunion',
    'luxembourg1935',
    'luxland',
    'luxwort',
    'luxzeit1844',
    'luxzeit1858',
    'obermosel',
    'onsjongen',
    'schmiede',
    'tageblatt',
    'volkfreu1869',
    'waechtersauer',
    'waeschfra',
    'actionfem',
    'armeteufel',
    'avenirgdl',
    'buergerbeamten',
    'courriergdl',
    'deletz1893',
    'demitock',
    'diekwochen',
    'dunioun',
    'gazgrdlux',
    'indeplux',
    'kommmit',
    'landwortbild',
    'lunion',
    'luxembourg1935',
    'luxland',
    'luxwort',
    'luxzeit1844',
    'luxzeit1858',
    'obermosel',
    'onsjongen',
    'schmiede',
    'tageblatt',
    'volkfreu1869',
    'waechtersauer',
    'waeschfra',
]

In [96]:
len(lux_nps)

52

## dask k8 cluster

In [97]:
dask_cluster, dask_client = start_cluster(n_workers=100, worker_memory='5G')

Scheduler: tcp://10.90.49.4:9001
Dashboard: http://10.90.49.4:9078


## Read in new and current canonical

In [98]:
new_canonical_files = list_issues(new_canonical_bucket, newspapers=lux_nps)

Issues for these newspapers will be listed: ['actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra', 'actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra']
s3://original-canonical-staging contains 1050 .bz2 files with issues


In [99]:
current_canonical_files = list_issues(current_canonical_bucket, newspapers=lux_nps)

Issues for these newspapers will be listed: ['actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra', 'actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra']
s3://original-canonical-release contains 1050 .bz2 files with issues


In [100]:
newcanonical_issues_bag = fetch_issues(new_canonical_bucket, newspapers=lux_nps, compute=False)

Issues for these newspapers will be listed: ['actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra', 'actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra']
s3://original-canonical-staging contains 1050 .bz2 files with issues
Fetching issue ids from 1050 .bz2 files (compute=False)


In [101]:
currentcanonical_issues_bag = fetch_issues(current_canonical_bucket, newspapers=lux_nps, compute=False)

Issues for these newspapers will be listed: ['actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra', 'actionfem', 'armeteufel', 'avenirgdl', 'buergerbeamten', 'courriergdl', 'deletz1893', 'demitock', 'diekwochen', 'dunioun', 'gazgrdlux', 'indeplux', 'kommmit', 'landwortbild', 'lunion', 'luxembourg1935', 'luxland', 'luxwort', 'luxzeit1844', 'luxzeit1858', 'obermosel', 'onsjongen', 'schmiede', 'tageblatt', 'volkfreu1869', 'waechtersauer', 'waeschfra']
s3://original-canonical-release contains 1050 .bz2 files with issues
Fetching issue ids from 1050 .bz2 files (compute=False)


In [102]:
newcanonical_issues_bag.count().compute()

194650

In [103]:
currentcanonical_issues_bag.count().compute()

194650

## Create dataframes

### From new canonical data

In [33]:
example = newcanonical_issues_bag.take(1)[0]

In [63]:
example

{'cdt': '2020-02-10 11:38:54',
 'i': [{'m': {'id': 'luxwort-1848-03-23-a-i0001',
    'pp': [1, 2],
    'tp': 'article',
    't': 'Die Regierung hat folgende Proklamation  erlassen:',
    'l': 'fr'},
   'l': {'id': 'MODSMD_ARTICLE1',
    'parts': [{'comp_role': 'heading',
      'comp_id': 'P1_TB00009',
      'comp_fileid': 'ALTO00001',
      'comp_page_no': 1},
     {'comp_role': 'heading',
      'comp_id': 'P1_TB00011',
      'comp_fileid': 'ALTO00001',
      'comp_page_no': 1},
     {'comp_role': 'body',
      'comp_id': 'P1_TB00010',
      'comp_fileid': 'ALTO00001',
      'comp_page_no': 1},
     {'comp_role': 'body',
      'comp_id': 'P1_TB00008',
      'comp_fileid': 'ALTO00001',
      'comp_page_no': 1},
     {'comp_role': 'body',
      'comp_id': 'P1_TB00007',
      'comp_fileid': 'ALTO00001',
      'comp_page_no': 1},
     {'comp_role': 'body',
      'comp_id': 'P1_TB00016',
      'comp_fileid': 'ALTO00001',
      'comp_page_no': 1},
     {'comp_role': 'body',
      'comp_id': 

In [104]:
newcanonical_cis_bag = newcanonical_issues_bag.map(
    lambda i: i['i']
).flatten().map(
    lambda ci: {
    'id': ci['m']['id'],
    'title': ci['m']['t'] if 't' in ci['m'] else None,
    'pages': ci['m']['pp'],
    'path': new_canonical_bucket
    }
 )

In [68]:
#newcanonical_cis_bag.take(1)[0]

{'id': 'luxwort-1848-03-23-a-i0001',
 'title': 'Die Regierung hat folgende Proklamation  erlassen:',
 'pages': [1, 2],
 'path': 's3://original-canonical-staging'}

In [105]:
newcanonical_df = newcanonical_cis_bag.to_dataframe().set_index('id').persist()

In [106]:
newcanonical_df.head()

Unnamed: 0_level_0,title,pages,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-staging
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-staging
actionfem-1927-10-15-a-i0003,Unsere Vice-Präsidentin,"[6, 7]",s3://original-canonical-staging
actionfem-1927-10-15-a-i0003,Unsere Vice-Präsidentin,"[6, 7]",s3://original-canonical-staging
actionfem-1927-10-15-a-i0004,Glück.,"[7, 8]",s3://original-canonical-staging


In [107]:
newcanonical_df.shape[0].compute()

8748580

### From current canonical data

In [108]:
currentanonical_cis_bag = currentcanonical_issues_bag.map(
    lambda i: i['i']
).flatten().map(
    lambda ci: {
    'id': ci['m']['id'],
    'title': ci['m']['t'] if 't' in ci['m'] else None,
    'pages': ci['m']['pp'],
    'path': current_canonical_bucket
    }
 )

In [109]:
currentanonical_df = currentanonical_cis_bag.to_dataframe().set_index('id').persist()

In [110]:
currentanonical_df.head()

Unnamed: 0_level_0,title,pages,path
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-release
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-release
actionfem-1927-10-15-a-i0002,3. Die Form der Nahrung.,[6],s3://original-canonical-release
actionfem-1927-10-15-a-i0002,3. Die Form der Nahrung.,[6],s3://original-canonical-release
actionfem-1927-10-15-a-i0003,Unsere Vice-Präsidentin,"[6, 7]",s3://original-canonical-release


In [111]:
currentanonical_df.shape[0].compute()

9173874

## Combine the two dataframes

In [112]:
lux_canonical_df = currentanonical_df.join(
    newcanonical_df,
    how='outer', 
    lsuffix='_current', 
    rsuffix='_new'
).persist()

In [113]:
lux_canonical_df.head()

Unnamed: 0_level_0,title_current,pages_current,path_current,title_new,pages_new,path_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-release,,[1],s3://original-canonical-staging
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-release,,[1],s3://original-canonical-staging
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-release,,[1],s3://original-canonical-staging
actionfem-1927-10-15-a-i0001,,[1],s3://original-canonical-release,,[1],s3://original-canonical-staging
actionfem-1927-10-15-a-i0002,3. Die Form der Nahrung.,[6],s3://original-canonical-release,,,


In [116]:
deleted_ci_ids = lux_canonical_df[lux_canonical_df.path_new.isnull()].persist()

In [117]:
deleted_ci_ids.head()

Unnamed: 0_level_0,title_current,pages_current,path_current,title_new,pages_new,path_new
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
actionfem-1927-10-15-a-i0002,3. Die Form der Nahrung.,[6],s3://original-canonical-release,,,
actionfem-1927-10-15-a-i0002,3. Die Form der Nahrung.,[6],s3://original-canonical-release,,,
actionfem-1927-10-15-a-i0024,1. Von den einzelnen Nahrungsmitteln sind die ...,"[5, 6]",s3://original-canonical-release,,,
actionfem-1927-10-15-a-i0024,1. Von den einzelnen Nahrungsmitteln sind die ...,"[5, 6]",s3://original-canonical-release,,,
actionfem-1927-10-15-a-i0025,2. Zahl der Mahlzeiten und Quantum der Rahrung.,[6],s3://original-canonical-release,,,


In [118]:
deleted_ci_ids.shape[0].compute()

1155482

In [119]:
lux_canonical_df.shape[0].compute()

17922454

In [120]:
newcanonical_df.shape[0].compute()

8748580

In [121]:
deleted_ci_ids_df = deleted_ci_ids.compute()

In [122]:
type(deleted_ci_ids_df)

pandas.core.frame.DataFrame

In [None]:
deleted_ci_ids_df['newspaper'] = deleted_ci_ids_df.index.map(lambda x: x.split('-')[0])

In [131]:
deleted_ci_ids_df.to_pickle('../../impresso-processing/2020-release-v2/deleted_ci_ids.pkl')

In [132]:
deleted_ci_ids_df.to_csv('../../impresso-processing/2020-release-v2/deleted_ci_ids.csv')

In [130]:
deleted_ci_ids_df.head(5)

Unnamed: 0_level_0,title_current,pages_current,path_current,title_new,pages_new,path_new,newspaper
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
actionfem-1927-10-15-a-i0002,3. Die Form der Nahrung.,[6],s3://original-canonical-release,,,,actionfem
actionfem-1927-10-15-a-i0002,3. Die Form der Nahrung.,[6],s3://original-canonical-release,,,,actionfem
actionfem-1927-10-15-a-i0024,1. Von den einzelnen Nahrungsmitteln sind die ...,"[5, 6]",s3://original-canonical-release,,,,actionfem
actionfem-1927-10-15-a-i0024,1. Von den einzelnen Nahrungsmitteln sind die ...,"[5, 6]",s3://original-canonical-release,,,,actionfem
actionfem-1927-10-15-a-i0025,2. Zahl der Mahlzeiten und Quantum der Rahrung.,[6],s3://original-canonical-release,,,,actionfem


In [133]:
deleted_ci_ids_df.newspaper.value_counts().to_frame()

Unnamed: 0,newspaper
tageblatt,433190
indeplux,203960
luxland,155674
luxwort,117218
obermosel,85488
luxembourg1935,60006
dunioun,26242
lunion,25840
luxzeit1858,9940
courriergdl,9602


## Release resources

In [135]:
cluster.close()