In [177]:
import json
import sys
import smart_open
from datetime import date

In [53]:
from impresso_commons.path import IssueDir
from smart_open import s3_iter_bucket
from impresso_commons.utils.s3 import get_s3_connection

In [9]:
# Open the S3 connection through the impresso_commons helper imported above.
conn = get_s3_connection()

In [13]:
# Pick the bucket named "canonical-json" among those the connection can list
# (raises IndexError if no bucket carries that name).
bucket = [
    candidate
    for candidate in conn.get_all_buckets()
    if candidate.name == "canonical-json"
][0]

In [192]:
def s3_detect_issues(input_bucket, prefix=None):
    """
    Detect issues stored in an S3 drive/bucket.

    Only keys whose name ends with `-issue.json` are considered; the part of
    the key name after the last `/` is expected to look like
    `<journal>-<year>-<month>-<day>-<edition>-issue.json`.

    The path in `issue.path` is just the key name.

    :param input_bucket: bucket object, passed unchanged to `s3_iter_bucket`.
    :param prefix: optional key prefix restricting the listing (e.g.
        "IMP/1900"); when `None`, no prefix is passed to `s3_iter_bucket`.
    :return: a list of `IssueDir` instances, one per matching key.
    :raises ValueError: if a matching key name does not follow the expected
        pattern (wrong number of `-`-separated fields or an invalid date).
    """
    suffix = "-issue.json"

    def _key_to_issue(key):
        """Instantiate an IssueDir from a (canonical) key name."""
        name_no_prefix = key.name.split('/')[-1]
        # The accept_key filter below guarantees the name ends with `suffix`,
        # so slicing removes it from the end only.
        canon_name = name_no_prefix[:-len(suffix)]
        fields = canon_name.split('-')
        if len(fields) != 5:
            raise ValueError(
                "Cannot parse issue key {!r}: expected "
                "<journal>-<year>-<month>-<day>-<edition>{}".format(
                    key.name, suffix
                )
            )
        journal, year, month, day, edition = fields
        return IssueDir(
            journal,
            date(int(year), int(month), int(day)),
            edition,
            key.name
        )

    # Forward `prefix` only when the caller supplied one, so that
    # s3_iter_bucket keeps its own default otherwise.
    iter_kwargs = {
        'accept_key': lambda key_name: key_name.endswith(suffix)
    }
    if prefix is not None:
        iter_kwargs['prefix'] = prefix

    # NOTE(review): s3_iter_bucket yields (key, content) pairs and the content
    # is discarded here -- listing keys without fetching their content would
    # presumably be cheaper; confirm against the bucket API in use.
    return [
        _key_to_issue(key)
        for key, _content in s3_iter_bucket(input_bucket, **iter_kwargs)
    ]

In [196]:
def s3_read_issue(issue, bucket):
    """
    Read the JSON document of an issue from S3 and return it parsed.

    :param issue: object whose `path` attribute is the key name of the
        issue's JSON file (as set by `s3_detect_issues`).
    :param bucket: bucket object, passed unchanged to `s3_iter_bucket`.
    :return: the decoded JSON content (UTF-8) of the key `issue.path`.
    :raises IndexError: if the bucket holds no key named `issue.path`.

    TODO: perform schema validation before returning the object.
    """
    # `prefix` alone would also match longer key names that merely start with
    # `issue.path`; the accept_key filter keeps the exact key only.
    matches = list(
        s3_iter_bucket(
            bucket,
            prefix=issue.path,
            accept_key=lambda key_name: key_name == issue.path
        )
    )
    if not matches:
        # Same exception type as indexing an empty list, but naming the key.
        raise IndexError(
            "No S3 object found for key {!r}".format(issue.path)
        )
    _key, issue_data = matches[0]
    return json.loads(issue_data.decode('utf-8'))

In [198]:
# TODO: implement
def s3_read_page():
    """Placeholder for reading a page from S3; not implemented, returns None."""
    return None

In [194]:
%%time
# Detect every issue whose key starts with "IMP/1900"; the %%time magic above
# reports how long the listing takes.
issues = s3_detect_issues(bucket, prefix="IMP/1900")

CPU times: user 91.4 ms, sys: 97 ms, total: 188 ms
Wall time: 2.59 s


In [189]:
# Number of issues returned by the detection step.
len(issues)

2097

In [188]:
# Inspect the first detected issue.
# NOTE(review): the recorded output shows an 1881 issue, which does not match
# the "IMP/1900" prefix used for the listing; the execution counts suggest this
# cell was last run before that listing -- re-run to refresh the output.
issues[0]

IssueDirectory(journal='IMP', date=datetime.date(1881, 7, 16), edition='a', path='IMP/1881/07/16/a/IMP-1881-07-16-a-issue.json')

In [195]:
# Fetch and parse the JSON document of the first detected issue.
s3_read_issue(issues[0], bucket)

{'i': [{'l': {'id': 'Ar00105', 'source': '113-IMP-1900-01-16-0001.pdf'},
   'm': {'id': 'IMP-1900-01-16-a-i0001',
    'l': 'french',
    'pp': [1],
    'pub': 'IMP',
    't': "PRIX D'AB ONNEMENT Franco pour la Suisse...",
    'tp': 'article'}},
  {'l': {'id': 'Ar00106', 'source': '113-IMP-1900-01-16-0001.pdf'},
   'm': {'id': 'IMP-1900-01-16-a-i0002',
    'l': 'french',
    'pp': [1],
    'pub': 'IMP',
    't': 'PRIX DES ANNONCES 10 cent, la ligne Pour...',
    'tp': 'article'}},
  {'l': {'id': 'Ar00103', 'source': '113-IMP-1900-01-16-0001.pdf'},
   'm': {'id': 'IMP-1900-01-16-a-i0003',
    'l': 'french',
    'pp': [1],
    'pub': 'IMP',
    't': '- Bu 1" Octobre 1899 Départs p> GARE CHA...',
    'tp': 'article'}},
  {'l': {'id': 'Ar00104', 'source': '113-IMP-1900-01-16-0001.pdf'},
   'm': {'id': 'IMP-1900-01-16-a-i0004',
    'l': 'french',
    'pp': [1],
    'pub': 'IMP',
    't': "/'ïMDaûBTÏAT dece J°u p P araîten U UV-l...",
    'tp': 'article'}},
  {'l': {'id': 'Ar00107', 'source':