## Notes

- The output JSON file needs to have a canonical file name.
- In the passim JSON, use the `year` as the `date` field.
- Pass an `IssueDir` object to `XML_to_passim_JSON`.


In [1]:
import os
import sys
sys.path.append("../impresso-text-acquisition/")
import codecs
import shutil
import jsonlines
from bs4 import BeautifulSoup
from dask import compute, delayed
from dask.distributed import Client, progress
from dask.diagnostics import ProgressBar
from dask.multiprocessing import get as mp_get
from random import shuffle
from olive_importer import *

In [36]:
# Root directory that is scanned for newspaper issues in the next cell.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
base_dir = "/scratch/matteo/letemps_data/data4/"

In [37]:
%%time
# Collect the issues found under base_dir (detect_journal_issues comes from
# olive_importer). The returned items are used below through their .journal,
# .date and .path attributes; directories it rejects are reported in the output.
issues = detect_journal_issues(base_dir)

Path /scratch/matteo/letemps_data/data4/GDL/1996/09/@eaDir is not a valid issue directory
Path /scratch/matteo/letemps_data/data4/JDG/1914/05/@eaDir is not a valid issue directory


CPU times: user 3.28 s, sys: 3.76 s, total: 7.04 s
Wall time: 4 s


In [38]:
len(issues)

100251

In [39]:
def XML_to_passim_JSON(xml_files, issue_dir, out_dir):
    """Convert the XML files of one newspaper issue into passim documents.

    Each XML file that contains at least one ``<entity>`` element yields one
    document (a plain dict); files without entities are skipped.

    Parameters
    ----------
    xml_files : iterable of str
        Paths of the UTF-8 encoded XML files belonging to the issue.
    issue_dir : IssueDir
        Issue descriptor; its ``journal`` and ``date`` attributes are read
        here and it is passed to ``canonical_path`` to build the id prefix.
    out_dir : str
        Directory that is created if missing. Nothing is written into it by
        this function: the documents are returned and written by the caller.

    Returns
    -------
    list of dict
        One dict per XML file with the keys ``id``, ``series``, ``date``
        (the year of the issue), ``name``, ``page_no`` and ``text``.
    """
    # Many issues of the same journal are processed concurrently, so a
    # separate "exists?" check followed by makedirs() can race and raise
    # FileExistsError; exist_ok=True makes the creation idempotent.
    os.makedirs(out_dir, exist_ok=True)

    # Prefix shared by every document id of this issue.
    doc_id = canonical_path(issue_dir, path_type="dir").replace("/", "-")

    documents = []

    for path in xml_files:

        with codecs.open(path, 'r', 'utf-8') as inp_file:
            xml = inp_file.read()

        soup = BeautifulSoup(xml, 'lxml')
        entities = soup.find_all('entity')

        if not entities:
            continue

        # Keep the <full_text> markup (serialised with XML entity escaping)
        # rather than the bare text, one entity per line.
        # NOTE(review): an entity without <full_text>, <id>, <name> or
        # <page_no> still raises AttributeError here — confirm that the input
        # always provides them.
        fulltext = "\n".join(
            e.find('full_text').encode(formatter="xml").decode('utf-8')
            for e in entities
        )

        documents.append(
            {
                "id": "{}_{}".format(doc_id, "-".join([e.find("id").text for e in entities])),
                "series": issue_dir.journal,
                "date": issue_dir.date.year,
                "name": entities[0].find("name").text,
                "page_no": [e.find("page_no").text for e in entities],
                "text": fulltext
            }
        )

    return documents

In [40]:
@delayed
def process_issue(issue_dir, out_dir):
    """Lazily convert every file of one issue directory into passim documents.

    Parameters
    ----------
    issue_dir : IssueDir
        Issue descriptor; ``issue_dir.path`` is the directory that is listed
        and ``issue_dir.journal`` names the output sub-directory.
    out_dir : str
        Base output directory; ``<out_dir>/<journal>`` is handed on to
        ``XML_to_passim_JSON``.

    Returns
    -------
    list of dict
        The documents produced by ``XML_to_passim_JSON`` (once the delayed
        call is computed).
    """
    # os.listdir() also returns sub-directories (the data set contains
    # Synology "@eaDir" folders) which cannot be opened as files, and its
    # order is arbitrary: keep regular files only, in a stable sorted order.
    xml_files = []
    for entry in sorted(os.listdir(issue_dir.path)):
        candidate = os.path.join(issue_dir.path, entry)
        if os.path.isfile(candidate):
            xml_files.append(candidate)

    docs = XML_to_passim_JSON(
        xml_files,
        issue_dir,
        os.path.join(out_dir, issue_dir.journal)
    )
    return docs

In [41]:
@delayed
def process_month(issues_in_month, out_dir):
    """Write all newspaper issues of a given month into a JSON lines file.

    The issues are expected to share the same journal, year and month; the
    first one is used to name the output file
    ``<out_dir>/<journal>/<journal>-<year>-<month>.jsonl``.

    Parameters
    ----------
    issues_in_month : list of IssueDir
        Issues to convert; an empty list is a no-op.
    out_dir : str
        Base output directory.

    Returns
    -------
    None
    """
    from itertools import chain

    if not issues_in_month:
        # Nothing to convert, and no issue to derive the file name from.
        return

    first_issue = issues_in_month[0]

    docs = [
        process_issue(i, out_dir)
        for i in issues_in_month
    ]

    # compute() returns one list of documents per issue. Flatten them in a
    # single pass instead of folding with "+", which copies the accumulated
    # list at every step and relied on a `reduce` name that is not a builtin.
    result = list(chain.from_iterable(compute(*docs)))

    # NOTE(review): the month is not zero-padded (e.g. "GDL-1900-1.jsonl");
    # kept as is so that existing outputs keep their names.
    out_filename = "{}-{}-{}.jsonl".format(
        first_issue.journal,
        first_issue.date.year,
        first_issue.date.month
    )

    # Do not depend on another task having created the journal directory.
    journal_dir = os.path.join(out_dir, first_issue.journal)
    os.makedirs(journal_dir, exist_ok=True)

    with jsonlines.open(
        os.path.join(journal_dir, out_filename),
        mode='w'
    ) as writer:
        writer.write_all(result)

    return

In [42]:
def group_issues(issues):
    """Bucket issues by journal and calendar month.

    Returns a dict that maps each ``(journal, year, month)`` tuple to the
    list of issues carrying that journal and date, in input order.
    """
    by_month = {}
    for current in issues:
        year, month = current.date.year, current.date.month
        key = (current.journal, year, month)
        # setdefault creates the bucket the first time a key is seen.
        by_month.setdefault(key, []).append(current)
    return by_month
            

In [43]:
# No filter is active at the moment: every detected issue is kept.
# To restrict the run, build the list with a condition instead, e.g.
#   [issue for issue in issues if issue.date.year >= 1945]
filtered_issues = list(issues)

In [44]:
len(filtered_issues)

100251

In [45]:
# Builds a dict that maps each (journal, year, month) key, e.g. ("GDL", 1900, 10),
# to the list of issues of that journal for that month.
issues_by_month = group_issues(filtered_issues)

In [12]:
# issues_by_month.keys()

In [13]:
# [idx for idx in list(issues_by_month.keys())[:2]]

In [46]:
# Work on the (journal, year, month) keys rather than on the issues themselves.
issues_indexes = list(issues_by_month)
# Randomise the processing order in place (no seed is set, so it differs per run).
shuffle(issues_indexes)
# Every month is selected; slice here (e.g. issues_indexes[:100]) for a trial run.
selected_issues = issues_indexes

In [47]:
len(selected_issues)

4335

In [48]:
# Create the dask.distributed client used below to submit and cancel work.
# NOTE(review): no scheduler address is given, so this presumably starts a
# local cluster with default settings — confirm against the Client docs.
c = Client()

In [49]:
# One process_month call per selected (journal, year, month) key. process_month
# is wrapped in @delayed, so this only builds the task objects; they are
# submitted in a later cell.
# NOTE(review): the output directory is a hard-coded, machine-specific path.
tasks = [
    process_month(issues_by_month[idx], "/scratch/matteo/passim_input_monthly/")
    for idx in selected_issues
]

In [1]:
"""
idx = selected_issues[0]
task = process_month(issues_by_month[idx], "/scratch/matteo/passim_input_monthly/")
r = c.compute(task)
"""

'\nidx = selected_issues[0]\ntask = process_month(issues_by_month[idx], "/scratch/matteo/passim_input_monthly/")\nr = c.compute(task)\n'

In [50]:
# Submit all monthly tasks through the client; `result` is a list of futures
# (see its repr in the output further down), one per task.
result = c.compute(tasks)

In [52]:
progress(result)

In [54]:
result

[<Future: status: pending, key: process_month-9079359c-b72a-45ca-ae35-e3005a5e15d6>,
 <Future: status: pending, key: process_month-d7879aec-bb97-47ce-a6eb-393048bd899a>,
 <Future: status: pending, key: process_month-c1e8e409-b9d6-435f-a650-657ff64e9aa2>,
 <Future: status: pending, key: process_month-be4e4093-28e3-4570-ba04-7f3a56c156a7>,
 <Future: status: pending, key: process_month-bbf1bbd9-1d50-40f8-9e83-23230653a819>,
 <Future: status: pending, key: process_month-0a78e412-bb51-4bf7-9b3a-bb7dfd7b7aeb>,
 <Future: status: pending, key: process_month-60a92e4c-fd83-4931-860b-84d3d668adfb>,
 <Future: status: pending, key: process_month-9e78a965-da77-471d-8780-5f83816dd78d>,
 <Future: status: pending, key: process_month-1fc96526-76da-4f70-bff6-98f845801bcb>,
 <Future: status: pending, key: process_month-235f43db-8f55-4524-8874-bcbfb4c53a18>,
 <Future: status: pending, key: process_month-565636a3-4a6d-43b2-a2ec-f80eff8ef2fd>,
 <Future: status: pending, key: process_month-07549f99-f87c-45ac-

In [59]:
# List the futures whose status is "error"; an empty list means that none has
# failed so far (futures that are still pending are not included).
[r for r in result if r.status == "error"]

[]

In [21]:
# Cancel the submitted futures.
# NOTE(review): this cell's execution count (21) is out of sequence with the
# cells above, so it looks like a leftover from an earlier session — running
# the notebook top to bottom would abort the work submitted above.
c.cancel(result)