In [1]:
import json
import sys
import smart_open
from datetime import date

In [2]:
from impresso_commons.path import IssueDir
from smart_open import s3_iter_bucket
from impresso_commons.utils.s3 import get_s3_connection

In [3]:
from impresso_commons.path import s3_detect_issues

In [4]:
from itertools import starmap
from functools import reduce

In [5]:
# Open the S3 connection that the following cells use to look up the bucket.
conn = get_s3_connection()

In [6]:
# Display the connection object as a quick check of what we are connected to.
conn

S3Connection:os.zhdk.cloud.switch.ch

In [7]:
# Select the bucket by name. `next(..., None)` plus an explicit check gives a
# readable error when the connection exposes no bucket called
# "canonical-json", instead of a bare IndexError from `[...][0]`.
bucket = next(
    (b for b in conn.get_all_buckets() if b.name == "canonical-json"),
    None,
)
if bucket is None:
    raise LookupError('S3 bucket "canonical-json" not found on this connection')

In [8]:
# Show the selected bucket to confirm the lookup picked the expected one.
bucket

<Bucket: canonical-json>

In [24]:
def s3_get_articles(issue, bucket):
    """
    Read an issue from S3 and return the list of articles it contains.

    The issue JSON is fetched with `s3_iter_bucket`, using `issue.path` as
    the key prefix; the first object found is decoded as UTF-8 and parsed
    as JSON.

    NB: only content items whose type (`item["m"]["tp"]`) is exactly
    "article" are returned; every other type (advertisements included) is
    dropped.

    :param issue: object exposing a `path` attribute holding the S3 key of
        the issue JSON file.
    :param bucket: S3 bucket, passed through to `s3_iter_bucket`.
    :return: list of content items (dicts) of type "article".
    :raises IndexError: if no object is found under `issue.path`.
    """
    matches = list(s3_iter_bucket(bucket, prefix=issue.path))
    if not matches:
        # Same exception type the bare `[0]` lookup would raise, but with a
        # message that says which issue could not be found.
        raise IndexError("No S3 object found for issue path %r" % issue.path)
    # Each match is indexed as (key, content); only the content is needed.
    issue_data = matches[0][1]
    issue_json = json.loads(issue_data.decode('utf-8'))
    return [
        item
        for item in issue_json["i"]
        if item["m"]["tp"] == "article"
    ]

In [48]:
def rebuild_text(regions):
    """
    Placeholder for rebuilding text from a list of regions.

    Not implemented yet: the logic is still to be written, so the function
    currently ignores `regions` and returns None.
    """
    return None

In [51]:
# TODO: implement
def rebuild_article(article_metadata, bucket_id):
    """
    Placeholder for rebuilding the full text of one article.

    Planned logic:
    - for each page listed in ['m']['pp'], fetch the page JSON from S3
    - for each page, keep only the regions in which the article ID occurs
    - pass the resulting list of regions to `rebuild_text()`

    Intended to return a single string with the article's fulltext; until
    it is implemented it ignores its arguments and returns None.
    """
    return None

In [54]:
def serialize_article(article_text, article_metadata, out_dir=None, s3_bucket_id=None):
    """
    Placeholder for serializing one article (text plus metadata).

    Not implemented yet: all arguments are ignored and None is returned.
    NOTE(review): `out_dir` and `s3_bucket_id` presumably select a local
    directory or an S3 bucket as the destination -- confirm once the body
    is written.
    """
    return None

In [9]:
%%time
# Detect the issues stored under the "IMP/1990" key prefix of the bucket;
# the %%time cell magic reports how long the detection takes.
issues = s3_detect_issues(bucket, prefix="IMP/1990")

CPU times: user 116 ms, sys: 98.2 ms, total: 214 ms
Wall time: 3.44 s


In [10]:
# Sanity check: how many issues were detected for the prefix.
len(issues)

23

In [11]:
# Inspect the detected issues (journal, date, edition and S3 path of each).
# NOTE(review): this prints the whole list; a slice would keep the output short.
issues

[IssueDirectory(journal='IMP', date=datetime.date(1990, 1, 12), edition='a', path='IMP/1990/01/12/a/IMP-1990-01-12-a-issue.json'),
 IssueDirectory(journal='IMP', date=datetime.date(1990, 3, 22), edition='a', path='IMP/1990/03/22/a/IMP-1990-03-22-a-issue.json'),
 IssueDirectory(journal='IMP', date=datetime.date(1990, 2, 27), edition='a', path='IMP/1990/02/27/a/IMP-1990-02-27-a-issue.json'),
 IssueDirectory(journal='IMP', date=datetime.date(1990, 1, 16), edition='a', path='IMP/1990/01/16/a/IMP-1990-01-16-a-issue.json'),
 IssueDirectory(journal='IMP', date=datetime.date(1990, 8, 27), edition='a', path='IMP/1990/08/27/a/IMP-1990-08-27-a-issue.json'),
 IssueDirectory(journal='IMP', date=datetime.date(1990, 2, 7), edition='a', path='IMP/1990/02/07/a/IMP-1990-02-07-a-issue.json'),
 IssueDirectory(journal='IMP', date=datetime.date(1990, 3, 15), edition='a', path='IMP/1990/03/15/a/IMP-1990-03-15-a-issue.json'),
 IssueDirectory(journal='IMP', date=datetime.date(1990, 5, 10), edition='a', path='I

In [33]:
# Pair every detected issue with the bucket, then map `s3_get_articles` over
# the pairs. `starmap` is lazy: nothing is read until `x` is consumed.
issue_bucket_pairs = [(issue, bucket) for issue in issues]
x = starmap(s3_get_articles, issue_bucket_pairs)

In [34]:
# Consume the lazy `starmap` iterator: this is the step that actually calls
# `s3_get_articles` for every issue. `y` holds one list of articles per issue.
y = list(x)

In [35]:
# One entry per issue, so this should equal len(issues).
len(y)

23

In [41]:
# Flatten the per-issue article lists into a single list. The nested
# comprehension is linear in the number of articles (repeated `a + b` copies
# the growing list at every step), does not shadow the notebook-level `x`/`y`
# names, and yields [] for an empty `y`, where `reduce` without an initial
# value raises TypeError.
all_articles = [article for issue_articles in y for article in issue_articles]

In [42]:
# Total number of articles across all detected issues.
len(all_articles)

3867

In [43]:
# Peek at the first article records to see what the metadata looks like.
# NOTE(review): 100 records make for a very long output; a handful would do.
all_articles[:100]

[{'l': {'id': 'Ar00104', 'source': '113-IMP-1990-03-15-0001.pdf'},
  'm': {'id': 'IMP-1990-03-15-a-i0001',
   'l': 'french',
   'pp': [1],
   'pub': 'IMP',
   't': "' mff",
   'tp': 'article'}},
 {'l': {'id': 'Ar00101', 'source': '113-IMP-1990-03-15-0001.pdf'},
  'm': {'id': 'IMP-1990-03-15-a-i0003',
   'l': 'french',
   'pp': [1],
   'pub': 'IMP',
   't': 'Voie étroite',
   'tp': 'article'}},
 {'l': {'id': 'Ar00103', 'source': '113-IMP-1990-03-15-0001.pdf'},
  'm': {'id': 'IMP-1990-03-15-a-i0004',
   'l': 'french',
   'pp': [1],
   'pub': 'IMP',
   't': "A S'aide des victimes de violences",
   'tp': 'article'}},
 {'l': {'id': 'Ar00100', 'source': '113-IMP-1990-03-15-0001.pdf'},
  'm': {'id': 'IMP-1990-03-15-a-i0006',
   'l': 'french',
   'pp': [1],
   'pub': 'IMP',
   't': 'Le sacre de Gorbatchev',
   'tp': 'article'}},
 {'l': {'id': 'Ar00102', 'source': '113-IMP-1990-03-15-0001.pdf'},
  'm': {'id': 'IMP-1990-03-15-a-i0008',
   'l': 'french',
   'pp': [1],
   'pub': 'IMP',
   't': 'La

In [49]:
# Look at a single article record: 'l' and 'm' sub-dicts, with the page
# numbers under ['m']['pp'] and the content type under ['m']['tp'].
all_articles[0]

{'l': {'id': 'Ar00104', 'source': '113-IMP-1990-03-15-0001.pdf'},
 'm': {'id': 'IMP-1990-03-15-a-i0001',
  'l': 'french',
  'pp': [1],
  'pub': 'IMP',
  't': "' mff",
  'tp': 'article'}}

In [52]:
# Rebuild the full text of every article, in the same order as `all_articles`.
# NOTE: `rebuild_article` is still a stub, so every entry is None for now.
rebuilt_articles = [
    rebuild_article(article_metadata, bucket_id='canonical-json')
    for article_metadata in all_articles
]

In [59]:
# `serialize_article` expects the text first and the metadata second; the
# arguments are passed by keyword so the two cannot be swapped by accident.
serialization_result = [
    serialize_article(
        article_text=fulltext,
        article_metadata=metadata,
        out_dir="",
    )
    for metadata, fulltext in zip(all_articles, rebuilt_articles)
]