Other statistics to add (per newspaper); ✓ marks the ones already covered:
- ✓ number of issues 
- number of pages
- ✓ number of content items
- number of images
- number of tokens

## Imports

In [178]:
import os, sys
sys.path.append("../")
import pandas as pd
import json
from dask import bag as db
from dask_k8 import DaskCluster
from dask.distributed import Client
from impresso_commons.utils.s3 import IMPRESSO_STORAGEOPT, fixed_s3fs_glob
from impresso_commons.utils.s3 import alternative_read_text
from impresso_commons.utils.kube import (make_scheduler_configuration,
                                         make_worker_configuration)
from sanity_check.contents.s3_data import list_files_rebuilt, list_pages

## Set up dask k8 cluster

In [52]:
# Pod specifications for the scheduler and the workers. Workers use the
# impresso_pycommons image and are configured with memory="10G".
scheduler_spec = make_scheduler_configuration()
worker_spec = make_worker_configuration(
    docker_image="ic-registry.epfl.ch/dhlab/impresso_pycommons:v1",
    memory="10G",
)

# Cluster handle for the "impresso-sanitycheck" deployment in the "dhlab"
# namespace. NOTE(review): presumably nothing is started until
# cluster.create() is called - confirm against dask_k8.
cluster = DaskCluster(
    namespace="dhlab",
    cluster_id="impresso-sanitycheck",
    scheduler_pod_spec=scheduler_spec,
    worker_pod_spec=worker_spec,
)

In [116]:
# Close the cluster.
# NOTE(review): in file order this cell comes before cluster.create(); confirm
# that calling close() on a cluster that was never created is safe when the
# notebook is run top to bottom on a fresh kernel.
cluster.close()

In [119]:
# Create the cluster, then request the workers without blocking on the
# scale-up (blocking=False).
n_workers = 150
cluster.create()
cluster.scale(n_workers, blocking=False)

Scheduler: tcp://10.90.47.30:30514
Dashboard: http://10.90.47.30:15661


In [132]:
# Obtain a dask client from the cluster object.
dask_client = cluster.make_dask_client()

In [133]:
# Display the client summary (scheduler address, dashboard URL, number of
# workers, cores and memory).
dask_client

0,1
Client  Scheduler: tcp://10.90.47.30:30514  Dashboard: http://10.90.47.30:8787/status,Cluster  Workers: 150  Cores: 150  Memory: 1.50 TB


In [83]:
# Compare package versions between the client and the cluster. With
# check=True a mismatch raises ValueError, as in the stored output below where
# the remote address reports no pandas version.
dask_client.get_versions(check=True)

ValueError: Mismatched versions found

pandas
+---------------------------+---------+
|                           | version |
+---------------------------+---------+
| client                    | 0.24.2  |
| tcp://10.233.107.21:36029 | None    |
+---------------------------+---------+

## Gather data for stats from rebuilt

In [124]:
# S3 bucket from which the rebuilt data for the statistics is gathered.
rebuilt_bucket = "s3://canonical-rebuilt-release"

In [127]:
# List the rebuilt files in the bucket; the log printed below reports the
# number of newspapers and of .bz2 files found.
rebuilt_files = list_files_rebuilt(rebuilt_bucket)

Fetching list of newspapers from s3://canonical-rebuilt-release
canonical-rebuilt-release contains 76 newspapers
s3://canonical-rebuilt-release contains 3040 .bz2 files


In [134]:
def _to_stats_record(item):
    """Keep only the fields of a rebuilt content item needed for the statistics.

    Returns a dict with the item's id, its type (key 'tp'), the part of its
    date string 'd' before the first '-' as "year", and the part of its id
    before the first '-' as "newspaper".
    """
    item_id = item['id']
    return {
        "id": item_id,
        "type": item['tp'],
        "year": item['d'].split('-')[0],
        "newspaper": item_id.split('-')[0],
        # Token counts are not collected yet; a possible extra field would be
        # len(item['ft'].split()) if "ft" in item else 0
    }


# One bag element per rebuilt file, five files per partition.
rebuilt_bag = db.from_sequence(rebuilt_files, partition_size=5)

# alternative_read_text is applied to every file together with
# IMPRESSO_STORAGEOPT; its results are flattened and each element is parsed as
# JSON. NOTE(review): presumably one JSON document per line - confirm.
parsed_items = rebuilt_bag.map(
    alternative_read_text, IMPRESSO_STORAGEOPT
).flatten().map(json.loads)

# Reduce every item to its statistics record, index the resulting dask
# dataframe by content item id and persist it for the aggregations below.
contentitems_df = parsed_items.map(
    _to_stats_record
).to_dataframe().set_index('id').persist()

In [141]:
# Number of content items per (newspaper, year) pair, computed on the cluster
# and brought back to the notebook.
group_keys = ['newspaper', 'year']
ci_grouped = contentitems_df.groupby(by=group_keys).size().compute()

In [149]:
# Wrap the grouped sizes in a DataFrame; the unnamed value column is labelled 0.
df = ci_grouped.to_frame()

In [153]:
# Move the (newspaper, year) group keys out of the index into regular columns.
# Reassign rather than mutate in place, so the cell reads as a plain
# transformation of the frame built in the previous cell.
df = df.reset_index()

In [158]:
# Inspect the index of the frame (a plain RangeIndex in the stored output).
df.index

RangeIndex(start=0, stop=3040, step=1)

In [159]:
# Preview the first rows: newspaper, year and the still unnamed size column 0.
df.head()

Unnamed: 0,newspaper,year,0
0,BDC,1839,146
1,BLB,1845,2
2,BLB,1846,172
3,BLB,1847,255
4,BNN,1885,47


In [165]:
# Build a "<newspaper>-<year>" key for every row. Vectorised string
# concatenation gives the same values as a row-wise apply, without one
# Python-level call per row.
df["id"] = df["newspaper"].astype(str) + "-" + df["year"].astype(str)

In [167]:
# Use the "<newspaper>-<year>" key as the index. Reassign rather than mutate
# in place, so the cell reads as a plain transformation of the frame.
df = df.set_index('id')

In [173]:
# Give the unnamed size column (label 0) a meaningful name. Renaming by label
# leaves 'newspaper' and 'year' untouched, so a change in column order cannot
# silently attach the wrong header to the wrong data.
df = df.rename(columns={0: 'count'})

In [176]:
# Write only the counts to CSV; the "<newspaper>-<year>" index is written as
# the first column.
stats_csv_path = '../../impresso-processing/data/contentitems_stats.csv'
df.loc[:, ['count']].to_csv(stats_csv_path)