**Rationale**: code to re-package the canonical data in a more compressed format; see [impresso-text-acquisition issue \#26](https://github.com/impresso/impresso-text-acquisition/issues/26).

In [1]:
from impresso_commons.path.path_fs import (KNOWN_JOURNALS,
                                           detect_canonical_issues)
import dask.bag as db
import jsonlines
from dask.distributed import Client, progress
import os
from smart_open import smart_open

In [48]:
from impresso_commons.text.rebuilder import upload

In [51]:
# IPython help lookup for `upload` (interactive use only; not needed to run the notebook).
upload?

In [50]:
# Root directory that is scanned for canonical issues by `detect_canonical_issues`.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
input_dir = "/scratch/matteo/ingested/"

In [3]:
# Detect the issues found under `input_dir` for the known journals.
local_issues = detect_canonical_issues(input_dir, KNOWN_JOURNALS)

In [4]:
# Number of issues detected under `input_dir`.
len(local_issues)

100251

In [5]:
# Inspect one detected issue to see which fields it exposes (journal, date, edition, path).
local_issues[100]

IssueDirectory(journal='GDL', date=datetime.date(1964, 7, 16), edition='a', path='/scratch/matteo/ingested/GDL/1964/07/16/a')

In [6]:
# Connect to the Dask distributed scheduler at this address.
# NOTE(review): hard-coded cluster host — adjust before running elsewhere.
dask_client = Client('iccluster036.iccluster.epfl.ch:8786')

In [7]:
# Display the client summary (scheduler address, dashboard, workers, cores, memory).
dask_client

0,1
Client  Scheduler: tcp://iccluster036.iccluster.epfl.ch:8786  Dashboard: http://iccluster036.iccluster.epfl.ch:8787/status,Cluster  Workers: 36  Cores: 36  Memory: 360.00 GB


In [8]:
# Wrap the list of detected issues in a Dask bag so it can be processed with Dask.
issue_bag = db.from_sequence(local_issues)

In [9]:
# Group the issues by a "<journal>-<year>" key (e.g. GDL-1908); each group
# later becomes one compressed archive.
grouped_bag = issue_bag.groupby(lambda issue: f'{issue.journal}-{issue.date.year}')

In [11]:
def find_issue_files(key, issues):
    """Collects the path of the issue JSON file for each issue of a group.

    :param key: grouping key of the issues (e.g. GDL-1900); returned unchanged.
    :type key: str
    :param issues: objects exposing a ``path`` attribute that points to the
        directory of an issue.
    :type issues: iterable
    :return: a tuple with: the key [0] and the list of issue file paths [1].
    :rtype: tuple

    .. note::

        Best effort: an issue whose directory cannot be listed, or which
        contains no file with ``issue.json`` in its name, is skipped.
    """
    issue_files = []
    for issue in issues:
        basedir = issue.path
        try:
            # os.listdir returns entries in arbitrary order; sorting makes the
            # selected file deterministic when several names match.
            entries = sorted(os.listdir(basedir))
        except OSError:
            # Missing or unreadable directory: skip this issue. Only
            # filesystem errors are ignored, so interrupts and programming
            # errors are no longer silently swallowed.
            continue
        filename = next(
            (entry for entry in entries if 'issue.json' in entry),
            None
        )
        if filename is not None:
            issue_files.append(os.path.join(basedir, filename))
    return (key, issue_files)
    

In [12]:
def find_page_files(key, issues):
    """Collects the paths of the page files for each issue of a group.

    :param key: grouping key of the issues (e.g. GDL-1900); returned unchanged.
    :type key: str
    :param issues: objects exposing a ``path`` attribute that points to the
        directory of an issue.
    :type issues: iterable
    :return: a tuple with: the key [0] and the list of page file paths [1].
    :rtype: tuple

    .. note::

        Best effort: an issue whose directory cannot be listed is skipped.
        A file counts as a page file when its name contains ``-p``.
    """
    page_files = []
    for issue in issues:
        basedir = issue.path
        try:
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # order of the pages reproducible across runs.
            entries = sorted(os.listdir(basedir))
        except OSError:
            # Missing or unreadable directory: skip this issue. Only
            # filesystem errors are ignored, so interrupts and programming
            # errors are no longer silently swallowed.
            continue
        page_files.extend(
            os.path.join(basedir, entry)
            for entry in entries
            if '-p' in entry
        )
    return (key, page_files)

In [54]:
# TODO: move this function to the codebase, it's generic enough!
def compress(key, json_files, output_dir, prefix=""):
    """Merges a set of JSON line files into a single compressed archive.

    :param key: signature of the newspaper issue (e.g. GDL-1900)
    :type key: str
    :param json_files: input JSON line files
    :type json_files: list
    :param output_dir: directory where to write the output file
    :type output_dir: str
    :param prefix: optional label inserted in the archive name after the key
        (e.g. ``pages`` gives ``GDL-1900-pages.jsonl.bz2``); when empty, the
        archive is named ``<key>.jsonl.bz2``.
    :type prefix: str
    :return: a tuple with: sorting key [0] and path to serialized file [1].
    :rtype: tuple

    .. note::

        `key` is expected to be the concatenation of newspaper ID and year
        (e.g. GDL-1900).
    """
    # Split on the last hyphen only, so that a newspaper ID which itself
    # contains a hyphen does not break the unpacking.
    newspaper, year = key.rsplit('-', 1)
    prefix_string = "" if prefix == "" else f"-{prefix}"
    filename = f'{newspaper}-{year}{prefix_string}.jsonl.bz2'
    filepath = os.path.join(output_dir, filename)
    print(f'Compressing {len(json_files)} JSON files into {filepath}')

    # NOTE(review): the `.bz2` suffix is presumably what makes smart_open
    # compress the stream — confirm against the installed smart_open version.
    with smart_open(filepath, 'wb') as fout:
        writer = jsonlines.Writer(fout)
        try:
            for json_file in json_files:
                n_docs = 0
                with open(json_file, 'r') as inpf:
                    # Stream the documents one by one instead of loading the
                    # whole input file in memory first.
                    for item in jsonlines.Reader(inpf):
                        writer.write(item)
                        n_docs += 1
                print(
                    f'Written {n_docs} docs from {json_file} to {filepath}'
                )
        finally:
            # Close the writer even when reading an input file fails.
            writer.close()

    return (key, filepath)

In [44]:
# For every journal-year group: locate the issue files, merge them into one
# compressed archive, and keep the (key, archive path) results on the cluster.
# NOTE: this rebinds `issue_bag`, which an earlier cell bound to the bag of
# detected issues.
issue_bag = (
    grouped_bag
    .starmap(find_issue_files)
    .starmap(
        compress,
        prefix="issues",
        output_dir='/scratch/matteo/impresso-canonical-compressed/issues/'
    )
    .persist()
)

In [45]:
# Show a progress widget for the persisted issue-archive computation.
progress(issue_bag)

VBox()

In [41]:
# Show a progress widget for the persisted page-archive computation.
# NOTE: `pages_bag` is assigned in a cell further down in this notebook; that
# cell must be run first, otherwise this line raises a NameError.
progress(pages_bag)

VBox()

In [17]:
# Check the state of the connection to the Dask scheduler.
dask_client.status

'running'

In [33]:
# For every journal-year group: locate the page files, merge them into one
# compressed archive, and keep the (key, archive path) results on the cluster.
pages_bag = (
    grouped_bag
    .starmap(find_page_files)
    .starmap(
        compress,
        prefix="pages",
        output_dir='/scratch/matteo/impresso-canonical-compressed/pages/'
    )
    .persist()
)

In [43]:
# Peek at one (key, archive path) result of the page-archive computation.
pages_bag.take(1)

(('GDL-1908',
  '/scratch/matteo/impresso-canonical-compressed/pages/GDL-1908-pages.jsonl.bz2'),)

In [46]:
# Peek at one (key, archive path) result of the issue-archive computation.
issue_bag.take(1)

(('GDL-1908',
  '/scratch/matteo/impresso-canonical-compressed/issues/GDL-1908-issues.jsonl.bz2'),)

In [52]:
# Call `upload` on every (key, archive path) tuple of the issue archives,
# passing the target bucket name; presumably this pushes each archive to that
# bucket — see `upload` in impresso_commons.text.rebuilder to confirm.
issue_bag.starmap(upload, bucket_name='original-canonical-compressed').compute()

[(True,
  '/scratch/matteo/impresso-canonical-compressed/issues/GDL-1908-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/JDG-1997-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/GDL-1862-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/GDL-1890-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/JDG-1901-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/GDL-1900-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/JDG-1870-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/JDG-1965-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/JDG-1880-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/GDL-1839-issues.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/issues/GDL-18

In [53]:
# Call `upload` on every (key, archive path) tuple of the page archives,
# passing the target bucket name; presumably this pushes each archive to that
# bucket — see `upload` in impresso_commons.text.rebuilder to confirm.
pages_bag.starmap(upload, bucket_name='original-canonical-compressed').compute()

[(True,
  '/scratch/matteo/impresso-canonical-compressed/pages/GDL-1908-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/JDG-1997-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/GDL-1862-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/GDL-1890-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/JDG-1901-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/GDL-1900-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/JDG-1870-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/JDG-1965-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/JDG-1880-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/GDL-1839-pages.jsonl.bz2'),
 (True,
  '/scratch/matteo/impresso-canonical-compressed/pages/GDL-1860-pages.jsonl.bz2'),