In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import jsonlines
from smart_open import smart_open
from dask import bag as db
from dask.distributed import Client, progress

In [3]:
from text_importer.importers.lux.detect import detect_issues as lux_detect_issues
from text_importer.importers.lux.classes import LuxNewspaperIssue
from text_importer.importers.lux.core import mets2issue

In [4]:
from impresso_commons.text.rebuilder import upload

In [12]:
client = Client("localhost:8786")

In [13]:
client

0,1
Client  Scheduler: tcp://localhost:8786  Dashboard: http://localhost:8787/status,Cluster  Workers: 8  Cores: 16  Memory: 67.30 GB


In [14]:
input_dir = "/mnt/project_impresso/original/BNL/"

In [25]:
issues = lux_detect_issues(input_dir)
issue_bag = db.from_sequence(issues)

In [26]:
issue_bag

dask.bag<from_se..., npartitions=101>

## end-to-end processing of one newspaper

In [17]:
from text_importer.importers.lux.core import serialize_page, upload_issues, issue2pages
from text_importer.importers.lux.core import compress_issues, issue2pages, process_page
from impresso_commons.path.path_fs import canonical_path
from impresso_commons.text.rebuilder import cleanup
from impresso_commons.utils.s3 import get_s3_resource

In [18]:
out_dir = "/media/romanell/4T/matteo/impresso-canonical/"
s3_bucket = 'original-canonical-data'

In [27]:
issue_bag =  issue_bag.filter(lambda i: i.journal == 'onsjongen')\
        .repartition(8)\
        .map(mets2issue)\
        .filter(lambda i: i is not None)\
        .persist()

In [32]:
progress(issue_bag)

VBox()

In [33]:
issue_bag.count().compute()

139

In [None]:
result = issue_bag.groupby(lambda i: (i.journal, i.date.year))\
    .starmap(compress_issues, output_dir=out_dir)\
    .starmap(upload_issues, bucket_name=s3_bucket)\
    .compute()

In [None]:
pages_bag = issue_bag\
    .map(issue2pages)\
    .flatten()\
    .persist()

print(f'Pages to process: {pages_bag.count().compute()}')

In [None]:
pages_bag = pages_bag\
    .repartition(1500)\
    .map(process_page)\
    .map(serialize_page, output_dir=out_dir)\
    .persist()

In [None]:
progress(pages_bag)

In [105]:
x = pages_bag.groupby(lambda x: canonical_path(x[0], path_type='dir').replace('/', '-'))\
    .starmap(compress_pages, prefix='pages', output_dir=out_dir)\
    .starmap(upload_pages, bucket_name='original-canonical-data')\
    .starmap(cleanup).compute()

In [106]:
x

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [12]:
# TODO: move this function to the codebase, it's generic enough!
import json

def compress_pages(key, json_files, output_dir, prefix=""):
    """Merges a set of JSON line files into a single compressed archive.

    :param key: signature of the newspaper issue (e.g. GDL-1900)
    :type key: str
    :param json_files: input JSON line files
    :type json_files: list
    :param output_dir: directory where to write the output file
    :type outp_dir: str
    :return: a tuple with: sorting key [0] and path to serialized file [1].
    :rytpe: tuple

    .. note::

        `sort_key` is expected to be the concatenation of newspaper ID and year
        (e.g. GDL-1900).
    """

    
    newspaper, year, month, day, edition = key.split('-')
    prefix_string = "" if prefix == "" else f"-{prefix}"
    filename = f'{newspaper}-{year}-{month}-{day}-{edition}{prefix_string}.jsonl.bz2'
    filepath = os.path.join(output_dir, filename)
    print(f'Compressing {len(json_files)} JSON files into {filepath}')

    with smart_open(filepath, 'wb') as fout:
        writer = jsonlines.Writer(fout)

        items = []
        for issue, json_file in json_files:
            
            with open(json_file, 'r') as inpf:
                item = json.load(inpf)
                items.append(item)
            
        writer.write_all(items)
        print(
            f'Written {len(items)} docs from {json_file} to {filepath}'
        )

        writer.close()

    return (key, filepath)
    print(len(json_files))

In [13]:
def upload_pages(sort_key, filepath, bucket_name=None):
    """Uploads a file to a given S3 bucket.
    :param sort_key: the key used to group articles (e.g. "GDL-1900")
    :type sort_key: str
    :param filepath: path of the file to upload to S3
    :type filepath: str
    :param bucket_name: name of S3 bucket where to upload the file
    :type bucket_name: str
    :return: a tuple with [0] whether the upload was successful (boolean) and
        [1] the path of the uploaded file (string)
    .. note::
        `sort_key` is expected to be the concatenation of newspaper ID and year
        (e.g. GDL-1900).
    """
    # create connection with bucket
    # copy contents to s3 key
    newspaper, year, month, day, edition = sort_key.split('-')
    key_name = "{}/{}/{}".format(
        newspaper,
        f'{newspaper}-{year}',
        os.path.basename(filepath)
    )
    s3 = get_s3_resource()
    try:
        bucket = s3.Bucket(bucket_name)
        bucket.upload_file(filepath, key_name)
        print(f'Uploaded {filepath} to {key_name}')
        return True, filepath
    except Exception as e:
        #logger.error(e)
        #logger.error(f'The upload of {filepath} failed with error {e}')
        return False, filepath

## Notes

In [None]:
issue = issue_bag.take(1)[0]

In [92]:
LuxNewspaperIssue(issue)

<text_importer.importers.lux.classes.LuxNewspaperIssue at 0x7ff33fbbac18>

In [8]:
issue_bag.count().compute()

97326

In [9]:
issues_by_np = issue_bag.groupby(lambda i: i.journal).compute()

In [10]:
for np, issues in issues_by_np:
    print(np, len(issues))

courriergdl 4367
diekwochen 444
onsjongen 139
deletz1893 887
lunion 3144
armeteufel 916
schmiede 159
demitock 134
landwortbild 39
waechtersauer 2199
luxzeit1844 215
waeschfra 660
dunioun 891
obermosel 12563
avenirgdl 995
indeplux 18655
buergerbeamten 3008
gazgrdlux 141
luxembourg1935 1540
volkfreu1869 1087
actionfem 101
kommmit 22
luxzeit1858 538
luxwort 30372
luxland 2772
tageblatt 11338


In [11]:
issue_sample = issue_bag.filter(
    lambda i: i.journal=='luxzeit1858' and i.date.year==1858 and i.date.day == 1 
).persist()

In [23]:
issue_sample = issue_bag.filter(
    lambda i: i.journal=='indeplux' and i.date.year==1871 
).persist()

In [24]:
issue_sample.compute()[:100]

[IssueDirectory(journal='indeplux', date=datetime.date(1871, 10, 5), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_007/3025392_newspaper_indeplux_1871-10-05_01', rights='o'),
 IssueDirectory(journal='indeplux', date=datetime.date(1871, 10, 3), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_007/3025395_newspaper_indeplux_1871-10-03_01', rights='o'),
 IssueDirectory(journal='indeplux', date=datetime.date(1871, 10, 1), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_007/3025398_newspaper_indeplux_1871-10-01_01', rights='o'),
 IssueDirectory(journal='indeplux', date=datetime.date(1871, 10, 6), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_007/3025402_newspaper_indeplux_1871-10-06_01', rights='o'),
 IssueDirectory(journal='indeplux', date=datetime.date(1871, 10, 4), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_007/3025404_newspaper_indeplux_1871-10-04_01', rights='o'),
 IssueDirectory

In [26]:
issue_sample.count().compute()

77

In [27]:
issue_sample = issue_sample.map(mets2issue).filter(lambda i: i is not None).persist()

In [28]:
progress(issue_sample)

VBox()

In [29]:
issues = issue_sample.compute()

In [30]:
issues

[]

In [106]:
def mets2issue(issue):
    try:
        return LuxNewspaperIssue(issue)
    except Exception as e:
        print(
            f'When instantiating issue {issue}',
            f'the following error was raised: {e}'
        )
        return None

In [60]:
from smart_open import smart_open as smart_open_function

def compress_issues(key, issues, output_dir=None):
    newspaper, year = key
    filename = f'{newspaper}-{year}-issues.jsonl.bz2'
    filepath = os.path.join(output_dir, filename)
    print(f'Compressing {len(issues)} JSON files into {filepath}')

    with smart_open_function(filepath, 'wb') as fout:
        writer = jsonlines.Writer(fout)
        items = [
            issue._issue_data
            for issue in issues
        ]
        writer.write_all(items)
        print(
            f'Written {len(items)} docs from to {filepath}'
        )
        writer.close()

    return (f'{newspaper}-{year}', filepath)

In [69]:
from impresso_commons.utils.s3 import get_s3_resource

def upload_issues(sort_key, filepath, bucket_name=None):
    """Uploads a file to a given S3 bucket.
    :param sort_key: the key used to group articles (e.g. "GDL-1900")
    :type sort_key: str
    :param filepath: path of the file to upload to S3
    :type filepath: str
    :param bucket_name: name of S3 bucket where to upload the file
    :type bucket_name: str
    :return: a tuple with [0] whether the upload was successful (boolean) and
        [1] the path of the uploaded file (string)
    .. note::
        `sort_key` is expected to be the concatenation of newspaper ID and year
        (e.g. GDL-1900).
    """
    # create connection with bucket
    # copy contents to s3 key
    newspaper, year = sort_key.split('-')
    key_name = "{}/{}/{}".format(
        newspaper,
        f'{newspaper}-{year}',
        os.path.basename(filepath)
    )
    s3 = get_s3_resource()
    try:
        bucket = s3.Bucket(bucket_name)
        bucket.upload_file(filepath, key_name)
        logger.info(f'Uploaded {filepath} to {key_name}')
        return True, filepath
    except Exception as e:
        logger.error(e)
        logger.error(f'The upload of {filepath} failed with error {e}')
        return False, filepath

In [15]:
issue_sample = issue_sample.map(mets2issue).filter(lambda i: i is not None).persist()

In [16]:
progress(issue_sample)

VBox()

In [17]:
issue_sample = issue_bag.filter(
    lambda i: i.journal=='luxzeit1858' and i.date.year==1858 and i.date.day == 1 
).map(mets2issue)\
.filter(lambda i: i is not None)\
.persist()

In [18]:
od = "/home/romanell/Documents/impresso/text-acquisition-dev/text_importer/data/out/"

In [70]:
result = issue_sample.groupby(lambda i: (i.journal, i.date.year))\
    .starmap(compress_issues, output_dir=od)\
    .starmap(upload_issues, bucket_name='original-canonical-data')\
    .compute()

NameError: name 'logger' is not defined

In [67]:
result[0]

(True,
 '/home/romanell/Documents/impresso/text-acquisition-dev/text_importer/data/out/luxzeit1858-1858-issues.jsonl.bz2')

In [20]:
def issue2pages(issue):
    pages = []
    for page in issue.pages:
        page.add_issue(issue)
        pages.append(page)
    return pages

In [21]:
def process_page(page):
    page.parse()
    return page

In [50]:
import os
import codecs
import json
from impresso_commons.path.path_fs import canonical_path

def serialize_page(luxpage, output_dir=None):
    
    issue_dir = luxpage.issue.issuedir
    
    out_dir = os.path.join(
        output_dir,
        canonical_path(issue_dir, path_type="dir")
    )
    
    if not os.path.exists(out_dir):
            os.makedirs(out_dir)
            
    canonical_filename = canonical_path(
        issue_dir,
        "p" + str(luxpage.number).zfill(4),
        ".json"
    )
    
    out_file = os.path.join(out_dir, canonical_filename)
    
    with codecs.open(out_file, 'w', 'utf-8') as jsonfile:
        json.dump(luxpage.data, jsonfile)
        print(
            "Written page \'{}\' to {}".format(luxpage.number, out_file)
        )
    return (issue_dir, out_file)

In [22]:
pages_bag = issue_sample.map(issue2pages).flatten().persist()

In [51]:
x = pages_bag.map(process_page).map(serialize_page, output_dir=od).compute()

In [56]:
x

[(IssueDirectory(journal='luxzeit1858', date=datetime.date(1858, 4, 1), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_006/2380730_newspaper_luxzeit1858_1858-04-01_01'),
  '/home/romanell/Documents/impresso/text-acquisition-dev/text_importer/data/out/luxzeit1858/1858/04/01/a/luxzeit1858-1858-04-01-a-p0001.json'),
 (IssueDirectory(journal='luxzeit1858', date=datetime.date(1858, 4, 1), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_006/2380730_newspaper_luxzeit1858_1858-04-01_01'),
  '/home/romanell/Documents/impresso/text-acquisition-dev/text_importer/data/out/luxzeit1858/1858/04/01/a/luxzeit1858-1858-04-01-a-p0002.json'),
 (IssueDirectory(journal='luxzeit1858', date=datetime.date(1858, 4, 1), edition='a', path='/mnt/project_impresso/original/BNL/public_domain_006/2380730_newspaper_luxzeit1858_1858-04-01_01'),
  '/home/romanell/Documents/impresso/text-acquisition-dev/text_importer/data/out/luxzeit1858/1858/04/01/a/luxzeit1858-1858-04-01-a-p0003.

**TODO**: in `LuxNewspaperIssue._find_pages()`: use `(.*?)(\d{5})(.*)` and raise an Error if no seq is matched

In [32]:
import re

In [54]:
test = '1935-03-02_01-00001.xml'

In [55]:
exp = r'(.*?)(\d{5})(.*)'

In [56]:
g = re.match(exp, test)

In [58]:
int(g.group(2))

1