In [1]:
import pdb
import json
import sys
import smart_open
from datetime import date

In [2]:
from impresso_commons.path import IssueDir, detect_issues
from smart_open import s3_iter_bucket
from impresso_commons.utils.s3 import get_s3_connection

In [3]:
from impresso_commons.path import s3_detect_issues

In [4]:
from itertools import starmap
from functools import reduce

In [5]:
# issues = detect_issues('/home/romanell/canonical-json/', journal_filter=["GDL", "IMP"])

In [6]:
conn = get_s3_connection()

In [7]:
# Select the bucket named "canonical-json" among the buckets listed by the
# connection; the trailing [0] raises IndexError when no bucket has that name.
bucket = [b for b in conn.get_all_buckets() if b.name=="canonical-json"][0]

In [7]:
bucket

<Bucket: canonical-json>

In [None]:
def get_articles(issue, base_dir):
    """
    Placeholder: not implemented yet.

    Accepts an issue and a base directory, does nothing with them and
    always returns ``None``.
    """
    return None

In [9]:
ll {issues[0].path}

total 4
drwxr-xr-x 2 romanell 4096 Apr 20 06:56 [0m[01;34ma[0m/


In [8]:
def s3_get_articles(issue, bucket):
    """
    Read an issue from S3 and return the list of articles it contains.

    NB: only content items whose type (``item["m"]["tp"]``) is ``"article"``
    are kept; every other type (e.g. advertisements) is filtered out.

    :param issue: object exposing a ``path`` attribute, used as the S3 key
        prefix under which the issue JSON is looked up.
    :param bucket: S3 bucket handed over to ``s3_iter_bucket``.
    :return: list of content-item dicts of type ``"article"``.
    :raises IndexError: if no object is found under ``issue.path``.
    """
    # Only the first match is needed, so stop iterating as soon as we have it
    # instead of materialising every object stored under the prefix.
    matches = iter(s3_iter_bucket(bucket, prefix=issue.path))
    try:
        first_match = next(matches)
    except StopIteration:
        # IndexError is what the former `list(...)[0]` lookup raised.
        raise IndexError(
            "no S3 object found with prefix {!r}".format(issue.path)
        ) from None

    issue_data = first_match[1]  # entries are (key, content) pairs
    issue_json = json.loads(issue_data.decode('utf-8'))
    return [
        item
        for item in issue_json["i"]
        if item["m"]["tp"] == "article"
    ]

In [9]:
def serialize_article(article_text, article_metadata, out_dir=None, s3_bucket_id=None):
    """
    Placeholder: not implemented yet.

    Accepts an article text, its metadata and two optional destinations
    (``out_dir``, ``s3_bucket_id``); currently ignores all of them and always
    returns ``None``.
    """
    return None

In [10]:
%%time
issues = s3_detect_issues(bucket, prefix="IMP/1990")

CPU times: user 144 ms, sys: 88.2 ms, total: 233 ms
Wall time: 3.77 s


In [11]:
len(issues)

23

In [12]:
issues[0]

IssueDirectory(journal='IMP', date=datetime.date(1990, 2, 7), edition='a', path='IMP/1990/02/07/a/IMP-1990-02-07-a-issue.json')

In [13]:
# One list of articles per issue. Built eagerly as a list (rather than a lazy,
# single-use iterator) so that downstream cells can be re-run without having to
# re-execute this one first.
articles_by_issue = [
    s3_get_articles(issue, bucket)
    for issue in issues
]

In [14]:
# Flatten the per-issue lists into a single list of articles. Unlike a
# `reduce` over `+`, this does not copy the accumulated list at every step and
# yields an empty list (instead of raising) when there are no issues.
all_articles = [
    article
    for issue_articles in articles_by_issue
    for article in issue_articles
]

In [15]:
len(all_articles)

3867

In [28]:
# all_articles[:100]

In [34]:
# Pass the bucket object itself (as the single-article call further down does):
# the page files are read through `s3_iter_bucket`, which is given a bucket
# object everywhere else in this notebook, not a bucket name.
rebuilt_articles = [
    rebuild_article(article, bucket_id=bucket)
    for article in all_articles
]

In [None]:
# `serialize_article` takes the text first and the metadata second; pass both
# by keyword so the pairing cannot be silently swapped.
serialization_result = [
    serialize_article(article_text=fulltext, article_metadata=metadata, out_dir="")
    for metadata, fulltext in zip(all_articles, rebuilt_articles)
]

## DEV

In [16]:
from impresso_commons.path import get_issueshortpath

In [17]:
def s3_get_pages(issue_id, page_names, bucket):
    """
    Fetch from S3 the JSON page files of an issue.

    The S3 prefix is derived from the issue ID by replacing dashes with
    slashes (e.g. ``IMP-1990-02-07-a`` -> ``IMP/1990/02/07/a``); among the
    objects found under that prefix, only those whose file name (last segment
    of the key name) is one of the requested page file names are kept.

    :param issue_id: dash-separated issue identifier.
    :param page_names: dict whose values are the page file names to fetch
        (the keys are not used here).
    :param bucket: S3 bucket handed over to ``s3_iter_bucket``.
    :return: dict mapping each matching file name to its parsed JSON content.
    """
    # Computed once instead of once per key; a set also gives O(1) lookups.
    wanted_names = frozenset(page_names.values())
    prefix = issue_id.replace('-', '/')

    pages = {}
    for key, content in s3_iter_bucket(bucket, prefix=prefix):
        file_name = key.name.split('/')[-1]
        if file_name in wanted_names:
            pages[file_name] = json.loads(content.decode('utf-8'))
    return pages

In [100]:
def rebuild_text(tokens, string=None):
    """
    Append the given tokens to a text and record where each one lands.

    Every token's surface form (``token["tx"]``) is appended to ``string``
    followed by one space, including after the last token. For each token a
    region dict is produced with:

    - ``"coords"``: the token's ``"c"`` value, copied as is;
    - ``"start"``: offset of the token's first character in the resulting text;
    - ``"length"``: number of characters of the token (separator excluded).

    TODO: handle better the space insertion

    :param tokens: iterable of dicts with keys ``"c"`` and ``"tx"``.
    :param string: text to append to; ``None`` (default) starts from an empty
        text. Offsets in the returned regions are relative to the start of
        this text.
    :return: tuple ``(text, regions)``.
    """
    prefix = "" if string is None else string

    # Collect the pieces and join once at the end rather than growing the
    # string token by token; `offset` tracks the length of the text so far.
    pieces = [prefix]
    offset = len(prefix)
    regions = []

    for token in tokens:
        coords = token["c"]
        surface = token["tx"]
        regions.append({
            "coords": coords,
            "start": offset,
            "length": len(surface),
        })
        pieces.append("{} ".format(surface))
        offset += len(surface) + 1  # +1 for the separating space

    return ("".join(pieces), regions)
        
    
    

In [99]:
# TODO: handle better hyphenated words (tokens are currently joined as is)
def rebuild_article(article_metadata, bucket_id):
    """
    Rebuild the full text of an article from the page files of its issue.

    Logic:
    - derive the issue ID from the article ID (everything before the last dash)
    - for each page number in ['m']['pp'], fetch the JSON page from S3
    - for each page, keep the regions whose "pOf" equals the article ID
    - pass the tokens of those regions to `rebuild_text()`

    :param article_metadata: content-item dict; the keys read are
        ['m']['id'], ['m']['pp'], ['m']['t'], ['m']['l'] and ['m']['pub'].
    :param bucket_id: the S3 bucket to read the pages from. Either a bucket
        object (preferred) or a bucket name; a name is resolved by opening a
        connection on every call, which is slow when rebuilding many articles.
    :return: a dict with keys "id", "pages", "title", "lang", "journal" and
        "text". "text" holds the article's fulltext; each entry of "pages" has
        the page file name ("id"), the page number ("n") and the token
        "regions" with their offsets into "text".
    :raises ValueError: if `bucket_id` is a name and no such bucket exists.
    :raises KeyError: if one of the article's pages is not found in S3.
    """
    if isinstance(bucket_id, str):
        # Same lookup as the one used to obtain the notebook-level bucket.
        s3_bucket = next(
            (
                candidate
                for candidate in get_s3_connection().get_all_buckets()
                if candidate.name == bucket_id
            ),
            None
        )
        if s3_bucket is None:
            raise ValueError("no S3 bucket named {!r}".format(bucket_id))
    else:
        s3_bucket = bucket_id

    article_id = article_metadata["m"]["id"]
    issue_id = "-".join(article_id.split('-')[:-1])
    page_file_names = {
        p: "{}-p{}.json".format(issue_id, str(p).zfill(4))
        for p in article_metadata["m"]["pp"]
    }
    # Use the bucket received as argument, not a notebook-level global.
    pages = s3_get_pages(issue_id, page_file_names, s3_bucket)

    fulltext = ""
    article = {
        "id": article_id,
        # "series": None,
        "pages": [],
        "title": article_metadata["m"]["t"],
        "lang": article_metadata["m"]["l"],
        "journal": article_metadata["m"]["pub"],
    }

    for page_no, page_file_name in page_file_names.items():
        page = pages[page_file_name]
        # Regions without a "pOf" key are skipped rather than raising KeyError.
        article_regions = [
            region
            for region in page["r"]
            if region.get("pOf") == article_id
        ]
        tokens = [
            token
            for region in article_regions
            for para in region["p"]
            for line in para["l"]
            for token in line["t"]
        ]

        # The text accumulates across pages so that the token offsets of
        # every page index into the same final string.
        fulltext, token_regions = rebuild_text(tokens, fulltext)

        article["pages"].append({
            "id": page_file_name,
            "n": page_no,
            "regions": token_regions
        })

    article["text"] = fulltext

    return article

In [87]:
all_articles[0]

{'l': {'id': 'Ar00109', 'source': '113-IMP-1990-02-07-0001.pdf'},
 'm': {'id': 'IMP-1990-02-07-a-i0001',
  'l': 'french',
  'pp': [1],
  'pub': 'IMP',
  't': 'Untitled Article',
  'tp': 'article'}}

In [94]:
article = all_articles[0]
rebuilt_article = rebuild_article(article, bucket)

In [98]:
rebuilt_article["text"]

"Aujourd'hui : en partie ensoleu- Demain : encore en partie enso- lé avec des bancs de brouillard leillé et doux . Le soir quelques le matin sur le Plateau et des précipitations à partir de passages nuageux sur l ' ouest , l ' ouest . Lac des Brenels ' _vVV /> _Bj _^ _^ il // Lever Coucher _™ _SJL _, MEJ /| H 7 h 49 17 h 43 428 . 97 m | 10 ° ~ 1 ° | 2000 m _[ r _^ 15 h 02 6 h 36 Fête à souhaiter mercredi 7 février : Hélène "

In [97]:
rebuilt_article["text"][122:130]

'quelques'

In [81]:
rebuilt_article["pages"]

[{'id': 'IMP-1990-02-07-a-p0001.json',
  'n': 1,
  'regions': [{'coords': [57, 470, 138, 494], 'length': 11, 'start': 0},
   {'coords': [142, 470, 149, 494], 'length': 1, 'start': 12},
   {'coords': [150, 470, 166, 494], 'length': 2, 'start': 14},
   {'coords': [171, 470, 212, 494], 'length': 6, 'start': 17},
   {'coords': [217, 470, 276, 494], 'length': 8, 'start': 24},
   {'coords': [292, 470, 348, 494], 'length': 6, 'start': 33},
   {'coords': [349, 470, 357, 494], 'length': 1, 'start': 40},
   {'coords': [357, 470, 404, 494], 'length': 6, 'start': 42},
   {'coords': [408, 470, 425, 494], 'length': 2, 'start': 49},
   {'coords': [428, 470, 470, 494], 'length': 6, 'start': 52},
   {'coords': [474, 470, 512, 494], 'length': 5, 'start': 59},
   {'coords': [57, 488, 69, 512], 'length': 2, 'start': 65},
   {'coords': [73, 488, 105, 512], 'length': 4, 'start': 68},
   {'coords': [110, 488, 133, 512], 'length': 3, 'start': 73},
   {'coords': [137, 488, 178, 512], 'length': 5, 'start': 77},