In [47]:
import os
import pickle
import pdb
import logging
from boto.s3.connection import Key, Bucket

from impresso_commons.path import KNOWN_JOURNALS
from impresso_commons.path import detect_canonical_issues
from impresso_commons.path import s3_detect_issues, pair_issue
from impresso_commons.utils.s3 import get_s3_connection

import dask
from dask import compute, delayed
from dask.diagnostics import ProgressBar
from dask.multiprocessing import get as mp_get

In [2]:
# Notebook-wide logger; its level and file handler are configured in a later
# cell, and s3_upload_issue writes its per-file/per-issue messages to it.
logger = logging.getLogger(__name__)

In [3]:
# Root directory that is scanned for canonical issues. s3_upload_issue strips
# this exact string from each local file path to build the S3 key name, so the
# trailing slash is significant (without it keys would start with "/").
input_dir = "/scratch/matteo/impresso-canonical/"

In [4]:
# Collect the issues found under `input_dir`, restricted to KNOWN_JOURNALS.
local_issues = detect_canonical_issues(input_dir, KNOWN_JOURNALS)

In [5]:
# How many issues were detected under `input_dir`.
len(local_issues)

20927

In [6]:
# Open the S3 connection through the project helper.
# NOTE(review): endpoint and credentials are resolved inside
# get_s3_connection and are not visible here — confirm how they are supplied.
conn = get_s3_connection()

In [7]:
# Select the target bucket by name among all buckets the connection lists.
# The trailing [0] raises IndexError when no bucket carries that name.
bucket = list(
    filter(lambda b: b.name == "canonical-json", conn.get_all_buckets())
)[0]

In [8]:
# Sanity check: show which bucket was selected.
bucket

<Bucket: canonical-json>

In [22]:
%%time
# Query the bucket for issues under the "IMP/1901" key prefix and time the
# call. `remote_issues` is not used again in the visible part of the notebook.
# NOTE(review): presumably this lists issues already uploaded — confirm
# against s3_detect_issues.
remote_issues = s3_detect_issues(bucket, prefix="IMP/1901")

CPU times: user 149 ms, sys: 89.4 ms, total: 238 ms
Wall time: 5.32 s


In [9]:
# Preview only the first few issues: the full list holds thousands of
# entries and dumping its repr bloats the notebook.
local_issues[:5]

[IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 2), edition='a', path='text_importer/data/out/GDL/1900/01/02/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 3), edition='a', path='text_importer/data/out/GDL/1900/01/03/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 4), edition='a', path='text_importer/data/out/GDL/1900/01/04/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 5), edition='a', path='text_importer/data/out/GDL/1900/01/05/a'),
 IssueDirectory(journal='IMP', date=datetime.date(1901, 9, 3), edition='a', path='text_importer/data/out/IMP/1901/09/03/a')]

In [28]:
def s3_upload_issue(local_issue, ouput_bucket, overwrite=False):
    """Upload a canonical newspaper issue to an S3 bucket.

    Every entry found directly inside the issue's directory is uploaded under
    a key equal to its local path with the module-level `input_dir` string
    removed.

    :param local_issue: the issue to upload; only its `path` attribute is read
    :type local_issue: an instance of `IssueDir`
    :param ouput_bucket: the target bucket (the misspelled parameter name is
        kept so that existing keyword callers keep working)
    :type ouput_bucket: `boto.s3.connection.Bucket`
    :param overwrite: when `False` (the default) keys that already exist in
        the bucket are left untouched; when `True` they are replaced
    :type overwrite: bool
    :return: a tuple `t` where `t[0]` contains the issue, and `t[1]` is a
        boolean indicating whether the upload was successful or not.
    :rtype: tuple
    """
    try:
        issue_dir = local_issue.path
        # Listing happens inside the `try` so that a missing or unreadable
        # directory is reported as a failed upload instead of raising.
        for entry in os.listdir(issue_dir):
            local_path = os.path.join(issue_dir, entry)
            # Bind the key to the bucket passed in, not to a notebook global.
            k = Key(ouput_bucket)
            try:
                # remove the input_dir when setting the key's name
                k.key = local_path.replace(input_dir, "")

                if overwrite or not k.exists():
                    # copy the content of the file into the key
                    k.set_contents_from_filename(local_path)
                    logger.info(
                        f'Uploaded {local_path} '
                        f'to s3://{ouput_bucket.name}/{k.key}'
                    )
            finally:
                # Release the key even when the upload raised.
                k.close()
        return (local_issue, True)
    except Exception as e:
        # Log the exception itself so the failure can be diagnosed.
        logger.error(f'Failed uploading {local_issue} with error = {e}')
        return (local_issue, False)
    

In [10]:
# Send this notebook's log records (INFO and above) to a file that is
# truncated each time the handler is created (mode='w').
logger.setLevel(logging.INFO)
log_path = "/home/romanell/s3_upload.log"

# Re-running this cell must not stack a second handler on the same file,
# otherwise every record would be written twice; detach stale ones first.
for stale_handler in [
    h for h in logger.handlers
    if isinstance(h, logging.FileHandler)
    and getattr(h, "baseFilename", None) == os.path.abspath(log_path)
]:
    logger.removeHandler(stale_handler)
    stale_handler.close()

handler = logging.FileHandler(filename=log_path, mode='w')
logger.addHandler(handler)

In [23]:
# One lazy upload task per issue. Only the first 100 issues are scheduled
# here (trial run); iterate over all of `local_issues` for a full upload.
tasks = [
    delayed(s3_upload_issue)(issue, bucket, overwrite=False)
    for issue in local_issues[:100]
]

In [24]:
# Run all upload tasks with the multiprocessing scheduler while showing a
# progress bar; `result` is a tuple with one entry per task.
# NOTE(review): newer dask releases replaced the `get=` keyword with
# `scheduler=` (e.g. scheduler="processes") — confirm against the installed
# dask version before upgrading.
with ProgressBar():
    result = compute(*tasks, get=mp_get)

[########################################] | 100% Completed |  2min 23.1s


In [25]:
# Peek at the first ten (issue, success) pairs returned by the upload tasks.
result[:10]

((IssueDirectory(journal='IMP', date=datetime.date(1965, 10, 16), edition='a', path='/scratch/matteo/impresso-canonical/IMP/1965/10/16/a'),
  True),
 (IssueDirectory(journal='IMP', date=datetime.date(1965, 10, 1), edition='a', path='/scratch/matteo/impresso-canonical/IMP/1965/10/01/a'),
  True),
 (IssueDirectory(journal='IMP', date=datetime.date(1965, 10, 7), edition='a', path='/scratch/matteo/impresso-canonical/IMP/1965/10/07/a'),
  True),
 (IssueDirectory(journal='IMP', date=datetime.date(1965, 10, 27), edition='a', path='/scratch/matteo/impresso-canonical/IMP/1965/10/27/a'),
  True),
 (IssueDirectory(journal='IMP', date=datetime.date(1965, 10, 20), edition='a', path='/scratch/matteo/impresso-canonical/IMP/1965/10/20/a'),
  True),
 (IssueDirectory(journal='IMP', date=datetime.date(1965, 10, 19), edition='a', path='/scratch/matteo/impresso-canonical/IMP/1965/10/19/a'),
  True),
 (IssueDirectory(journal='IMP', date=datetime.date(1965, 10, 26), edition='a', path='/scratch/matteo/impress

In [26]:
# Keep the results whose upload failed. The filter tests the `success` flag
# (second element of each result tuple), not the issue object itself.
# Whole (issue, success) pairs are kept because the cell that pickles the
# failures unpacks each element of `errors` as a pair.
errors = [
    (issue, success)
    for issue, success in result
    if not success
]

In [44]:
# Persist the paths of the issues that failed to upload so they can be
# retried later. An explicit check is used rather than `assert` inside
# `try/except`, since assertions are skipped when Python runs with -O and
# the failures would then be silently dropped.
if errors:
    with open('failed_s3_uploads.pkl', 'wb') as pickle_file:
        pickle.dump(
            [issue.path for issue, _ in errors],
            pickle_file
        )