In [1]:
import json
import os
import bz2
import io
from bz2 import BZ2File

### Reading newspaper archive data

Reminder: the data is already 'clean' and the files at hand contain only the following information:
- id
- date
- title
- type (article or advertisement)
- fulltext

In [2]:
# directory that is listed for the bz2-compressed archives to read -- update with your path
input_dir = "data/"

In [3]:
# a helper function to get the lines from an archive
def read_jsonlines(bz2_file):
    """Yield the non-empty lines of an opened archive as ``str``.

    Parameters
    ----------
    bz2_file : binary file-like object
        Anything whose ``read()`` returns UTF-8 encoded ``bytes``
        (e.g. an opened ``BZ2File``).

    Yields
    ------
    str
        One line at a time, without its trailing newline; empty lines
        are skipped.
    """
    # read from the handle passed in, not from a variable of the calling cell
    text = bz2_file.read().decode('utf-8')
    # split on '\n' only: str.splitlines() would also cut on characters such
    # as U+2028, which may appear unescaped inside a JSON string
    for line in text.split('\n'):
        if line != '':
            yield line

### reading data the classical way

In [None]:
# go through the archives one by one and print the date of each article
for archive in os.listdir(input_dir):
    if "reduced" in archive:
        # open the archive; the `with` block closes it as soon as it has been
        # read, so file handles do not pile up over the loop
        with BZ2File(os.path.join(input_dir, archive), 'r') as f:
            # get the list of articles it contains (= a json object on each line)
            articles = list(read_jsonlines(f))
        # load the articles as json and access their attributes
        for a in articles:
            json_article = json.loads(a)
            print(json_article["date"])

### using dask and map
see http://dask.pydata.org/en/latest/docs.html 

In [None]:
# make sure of having these libraries in your environment ('conda install' / or 'pip install')
from dask.diagnostics import ProgressBar
from dask.distributed import Client, progress
import dask.bag as db

In [7]:
def get_archives(path):
    """Return the paths of the archives found directly in ``path``.

    Only entries whose file name contains ``"reduced"`` are kept.

    Parameters
    ----------
    path : str
        Directory to list.

    Returns
    -------
    list of str
        ``path`` joined with each matching file name, sorted by name so the
        result does not depend on the order ``os.listdir`` happens to use.
        Empty when nothing matches.
    """
    # build the list once, outside of any loop, and join with the `path`
    # argument so the function works for directories other than `input_dir`
    return [
        os.path.join(path, name)
        for name in sorted(os.listdir(path))
        if "reduced" in name
    ]

def get_articles(archive_file):
    """Load every article of one bz2-compressed JSON-lines archive.

    Parameters
    ----------
    archive_file : str
        Path of the archive; it holds one JSON document per line.

    Returns
    -------
    list
        The decoded JSON documents, in file order. Empty lines are skipped.

    Raises
    ------
    OSError
        If the archive cannot be opened or is not valid bz2 data.
    json.JSONDecodeError
        If a non-empty line is not valid JSON.
    """
    # open the archive; the `with` block closes it even if reading fails
    with BZ2File(archive_file, 'r') as f:
        text = f.read().decode('utf-8')
    # Split on '\n' only (str.splitlines() would also cut on U+2028 & co.).
    # Parsed articles go into a new list: appending them to the list of raw
    # lines while iterating over it would feed them back into json.loads.
    return [json.loads(line) for line in text.split('\n') if line != '']

In [None]:
# paths of the archives to process, as selected by get_archives in input_dir
archives = get_archives(input_dir)

In [None]:
# one bag element per archive path
bag_archives = db.from_sequence(archives)
# each path is mapped to the list of articles returned by get_articles
bag_article_lists = bag_archives.map(get_articles)
# merge the per-archive lists into a single bag of articles, requested as 100 partitions
bag_articles = bag_article_lists.flatten().repartition(npartitions=100)
# => bag_articles contains the json articles, ready to be processed