In [1]:
import os
import pdb
import logging
from boto.s3.connection import Key

from impresso_commons.path import KNOWN_JOURNALS
from impresso_commons.path import detect_canonical_issues
from impresso_commons.path import s3_detect_issues, pair_issue
from impresso_commons.utils.s3 import get_s3_connection

import dask
from dask import compute, delayed
from dask.diagnostics import ProgressBar
from dask.multiprocessing import get as mp_get

In [2]:
# Module-level logger; s3_upload_issue reports uploads and failures through it.
logger = logging.getLogger(__name__)

In [3]:
# Root directory of the locally stored canonical issues.
# The trailing slash matters: s3_upload_issue removes this exact string from
# each file path to build the S3 key.
input_dir = "text_importer/data/out/"

In [4]:
# Scan input_dir for issue directories belonging to the known journals.
local_issues = detect_canonical_issues(input_dir, KNOWN_JOURNALS)

In [5]:
# Show the issues detected under input_dir.
local_issues

[IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 2), edition='a', path='text_importer/data/out/GDL/1900/01/02/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 3), edition='a', path='text_importer/data/out/GDL/1900/01/03/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 4), edition='a', path='text_importer/data/out/GDL/1900/01/04/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 5), edition='a', path='text_importer/data/out/GDL/1900/01/05/a'),
 IssueDirectory(journal='IMP', date=datetime.date(1901, 9, 3), edition='a', path='text_importer/data/out/IMP/1901/09/03/a')]

In [6]:
# Open the S3 connection used below to look up the target bucket.
# NOTE(review): where credentials come from is decided inside
# get_s3_connection — confirm before running on another machine.
conn = get_s3_connection()

In [7]:
# Select the target bucket by name among the buckets visible to the connection.
# A distinct loop variable avoids reusing the name being assigned, and a
# missing bucket is reported explicitly instead of failing with a bare
# IndexError from `[0]`.
bucket_name = "canonical-json"
bucket = next(
    (b for b in conn.get_all_buckets() if b.name == bucket_name),
    None
)
if bucket is None:
    raise LookupError(f'Bucket "{bucket_name}" not found on the S3 connection')

In [8]:
# Show the selected bucket to confirm the lookup succeeded.
bucket

<Bucket: canonical-json>

In [22]:
%%time
# Detect the issues stored in the bucket under the "IMP/1901" prefix.
# NOTE(review): remote_issues is not used by any later cell visible here.
remote_issues = s3_detect_issues(bucket, prefix="IMP/1901")

CPU times: user 149 ms, sys: 89.4 ms, total: 238 ms
Wall time: 5.32 s


In [9]:
# Show the local issues again (same value as displayed earlier) before uploading.
local_issues

[IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 2), edition='a', path='text_importer/data/out/GDL/1900/01/02/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 3), edition='a', path='text_importer/data/out/GDL/1900/01/03/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 4), edition='a', path='text_importer/data/out/GDL/1900/01/04/a'),
 IssueDirectory(journal='GDL', date=datetime.date(1900, 1, 5), edition='a', path='text_importer/data/out/GDL/1900/01/05/a'),
 IssueDirectory(journal='IMP', date=datetime.date(1901, 9, 3), edition='a', path='text_importer/data/out/IMP/1901/09/03/a')]

In [14]:
def s3_upload_issue(local_issue, ouput_bucket, overwrite=False):
    """Upload every file of a local issue directory to an S3 bucket.

    The S3 key of each file is its local path with the module-level
    ``input_dir`` string removed.

    :param local_issue: issue to upload; only its ``path`` attribute (a local
        directory) is read.
    :param ouput_bucket: boto bucket the files are written to.
    :param overwrite: when False (default), keys that already exist in the
        bucket are left untouched; when True, every file is uploaded without
        checking for an existing key.
    :return: True when all files were processed without error, False as soon
        as one operation raises (the error is logged).
    """
    my_dir = local_issue.path
    files = [os.path.join(my_dir, f) for f in os.listdir(my_dir)]
    try:
        for f in files:
            # Bind the key to the bucket passed in, not to a notebook global.
            k = Key(ouput_bucket)
            try:
                # remove the input_dir when setting the key's name
                k.key = f.replace(input_dir, "")

                # The existence check is skipped entirely when overwriting.
                if overwrite or not k.exists():
                    # copy the content of the file into the key
                    k.set_contents_from_filename(f)
                    logger.info(
                        f'Uploaded {f} to s3://{ouput_bucket.name}/{k.key}'
                    )
            finally:
                # Release the key even when the upload raised.
                k.close()
        return True
    except Exception as e:
        # Log the exception itself (not the file path) so failures can be diagnosed.
        logger.error(f'Failed uploading {local_issue} with error = {e}')
        return False
    

In [15]:
# Send INFO-level upload messages to a log file.
logger.setLevel(logging.INFO)

# Build the path from the current user's home directory instead of a
# machine-specific absolute path, so the notebook also runs for other users.
log_path = os.path.join(os.path.expanduser("~"), "Downloads", "s3_upload.log")
os.makedirs(os.path.dirname(log_path), exist_ok=True)

# Re-running this cell must not stack a second handler on the logger (every
# record would then be written twice): drop any handler already pointing at
# this file before attaching a fresh one.
for old_handler in list(logger.handlers):
    if (
        isinstance(old_handler, logging.FileHandler)
        and old_handler.baseFilename == os.path.abspath(log_path)
    ):
        logger.removeHandler(old_handler)
        old_handler.close()

handler = logging.FileHandler(filename=log_path, mode='w')
logger.addHandler(handler)

In [16]:
# One lazy upload job per local issue; nothing runs until the list is
# handed to `compute`. Existing keys are overwritten (overwrite=True).
tasks = [
    delayed(s3_upload_issue)(issue, bucket, overwrite=True)
    for issue in local_issues
]

In [17]:
# Run all upload tasks through the multiprocessing `get` while a progress bar
# tracks completion; `result` collects what s3_upload_issue returned per task.
# NOTE(review): newer dask releases select the scheduler via `scheduler=`
# rather than `get=` — confirm against the installed dask version.
progress_bar = ProgressBar()
with progress_bar:
    result = compute(*tasks, get=mp_get)

[########################################] | 100% Completed |  9.5s
