In [1]:
import json
import os
import bz2
import io
from bz2 import BZ2File

### Transform the data


You can run bash commands from your notebook: just prefix the command with `!`.

For example, let's check free space on the disk:

In [2]:
! df -h

Filesystem                          Size   Used  Avail Capacity iused      ifree %iused  Mounted on
/dev/disk0s2                       465Gi  460Gi  5.1Gi    99% 6786589 4288180690    0%   /
devfs                              187Ki  187Ki    0Bi   100%     648          0  100%   /dev
map -hosts                           0Bi    0Bi    0Bi   100%       0          0  100%   /net
map auto_home                        0Bi    0Bi    0Bi   100%       0          0  100%   /home
localhost:/qq-ZoaG3hRAhXXOGin2FIA  465Gi  465Gi    0Bi   100%       0          0  100%   /Volumes/MobileBackups


whereas without `!` it won't work:

In [3]:
df -h

NameError: name 'df' is not defined

In [None]:
# Re-compress every archive keeping only the fields we need (requires `jq`).
# "${f%.jsonl.bz2}" strips the extension so each input gets its own "<name>-reduced.jsonl.bz2" next to it.
! for f in data/*[0-9].jsonl.bz2; do bzcat "$f" | jq -c '{id: .id, type: .tp, date: .d, title: .t, fulltext: .ft}' | bzip2 > "${f%.jsonl.bz2}-reduced.jsonl.bz2" ; done

In [None]:
ls -la data/

### Reading newspaper archive data

Reminder: the data is already 'clean' and the files at hand contain only the following information:
- id
- date
- title
- type (article or advertisement)
- fulltext

In [4]:
input_dir = "data/" # directory that is scanned for the "reduced" archives -- update with your path

In [5]:
# a helper function to get the lines from an archive
def read_jsonlines(bz2_file):
    """Yield the non-blank lines of an archive as decoded strings.

    The input is consumed line by line instead of being read and decoded
    into one big string first, so large archives do not have to fit in
    memory as a whole.

    Parameters
    ----------
    bz2_file : binary file-like object or iterable of bytes
        Typically a ``bz2.BZ2File`` opened for reading; every line is
        expected to be UTF-8 encoded.

    Yields
    ------
    str
        One line at a time, without its line terminator. Empty and
        whitespace-only lines are skipped.

    Raises
    ------
    UnicodeDecodeError
        If a line is not valid UTF-8.
    """
    for raw_line in bz2_file:
        # splitting happens on b'\n', which never occurs inside a multi-byte
        # UTF-8 sequence, so decoding each line separately is safe
        line = raw_line.decode('utf-8').rstrip('\r\n')
        if line.strip():
            yield line

### reading data the classical way

In [6]:
# sorted() makes the processing order reproducible (os.listdir order is arbitrary)
for archive in sorted(os.listdir(input_dir)):

    # take only the transformed archives
    if "reduced" not in archive:
        continue

    # open the archive; the context manager closes the file handle
    # even if decoding or parsing raises
    with BZ2File(os.path.join(input_dir, archive), 'r') as f:

        # each line is one article (= a json object);
        # only look at the first 100 articles of every archive
        for n_read, line in enumerate(read_jsonlines(f)):
            if n_read >= 100:
                break

            # decode the json string into an object (dict)
            json_article = json.loads(line)
            print(
                json_article["date"],
                json_article["id"],
                json_article["title"]
            )

1900-07-24 GDL-1900-07-24-a-i0001 None
1900-07-24 GDL-1900-07-24-a-i0002 None
1900-07-24 GDL-1900-07-24-a-i0003 None
1900-07-24 GDL-1900-07-24-a-i0005 None
1900-07-24 GDL-1900-07-24-a-i0006 None
1900-07-24 GDL-1900-07-24-a-i0007 None
1900-07-24 GDL-1900-07-24-a-i0008 En Chine
1900-07-24 GDL-1900-07-24-a-i0010 None
1900-07-24 GDL-1900-07-24-a-i0011 None
1900-07-24 GDL-1900-07-24-a-i0012 None
1900-07-24 GDL-1900-07-24-a-i0013 CONFÉDÉRATION SUISSE
1900-07-24 GDL-1900-07-24-a-i0014 Le chemin de fer électrique Thoune-Berthoud
1900-07-24 GDL-1900-07-24-a-i0015 Chronique alpestie
1900-07-24 GDL-1900-07-24-a-i0016 CHRONIQUE GENEVOISE
1900-07-24 GDL-1900-07-24-a-i0018 None
1900-07-24 GDL-1900-07-24-a-i0019 None
1900-07-24 GDL-1900-07-24-a-i0020 None
1900-07-24 GDL-1900-07-24-a-i0021 CHRONIQUE AGRICOLE
1900-07-24 GDL-1900-07-24-a-i0022 Dernières dépêches
1900-07-24 GDL-1900-07-24-a-i0023 LES LIVRES
1900-07-24 GDL-1900-07-24-a-i0024 None
1900-07-24 GDL-1900-07-24-a-i0025 None
1900-07-24 GDL-1900-

### using dask and map
see http://dask.pydata.org/en/latest/docs.html 

In [7]:
# make sure these libraries are installed in your environment ('conda install' or 'pip install')
from dask.diagnostics import ProgressBar
from dask.distributed import Client, progress
import dask.bag as db

#### Helper functions

In [8]:
def get_archives(path):
    """Return the paths of the transformed ("reduced") archives found in `path`.

    Parameters
    ----------
    path : str
        Directory to scan (not recursive).

    Returns
    -------
    list of str
        `path` joined with every entry whose name contains "reduced",
        sorted so that the result is reproducible across runs.
    """
    # join with the `path` argument (not a global) so the function works for
    # any directory it is given
    return sorted(
        os.path.join(path, name)
        for name in os.listdir(path)
        if "reduced" in name
    )

In [44]:
def get_articles(archive_file):
    """Load every article of one archive.

    Parameters
    ----------
    archive_file : str
        Path of a bz2-compressed file holding one JSON object per line.

    Returns
    -------
    list
        The decoded JSON objects, in file order.
    """
    # the context manager closes the archive once all lines are parsed
    # (or if parsing fails half-way)
    with BZ2File(archive_file, 'r') as f:
        return [json.loads(line) for line in read_jsonlines(f)]

#### Read and filter articles in parallel

In [10]:
archives = get_archives(input_dir)

In [11]:
# one bag element per archive path; each archive is parsed by `get_articles`
bag_archives = db.from_sequence(archives)
bag_articles = (
    bag_archives
    .map(get_articles)
    .flatten()
    # keep only articles that have some text: this drops '' as well as a
    # missing/None fulltext, which would break `.split()` further down
    .filter(lambda ar: bool(ar.get('fulltext')))
    .repartition(npartitions=100)
)

In [12]:
# ProgressBar() prints a progress bar while the work inside the block runs
with ProgressBar():
    # NOTE: rebinds `bag_articles` to the persisted bag, so re-running this cell
    # persists the already persisted bag again.
    # persist() is expected to evaluate the pipeline once and keep the result
    # around for the cells below -- see the dask documentation to confirm
    bag_articles = bag_articles.persist()

[########################################] | 100% Completed |  1min 10.4s


In [45]:
bag_articles.take(4, npartitions=10)

({'id': 'GDL-1900-07-25-a-i0001',
  'type': 'ar',
  'date': '1900-07-25',
  'title': None,
  'fulltext': 'REDACTION .\'\'Eue Pcuinçt, 3. V BUREAU D\'ABONNEMENTS fiucilo St-François, 20. On s\'abonne dans lous les bureaux do poste Les abonnements partent du 1 " ou du l\'ô do chaque mois. PRIX D\'ABONNEMENT Un an 6 mois 8 moi} Suisso Fr. 20 dO 50 5 50 Union postale » 36 18 50 9 50 Prix du numéro : 10 centimes. '},
 {'id': 'GDL-1900-07-25-a-i0002',
  'type': 'ar',
  'date': '1900-07-25',
  'title': 'Lausanne, 25 juillet 1900. Le Péril socialiste',
  'fulltext': "Lausanne, 25 juillet 1900. Le Péril socialiste. Le socialisme prépare une révolution économique semblable à celle qui, éteignant la civilisation romaine, répandit sur l'Europe les ténèbres du moyen âge. Ce bouleversement est inévitable, non par la force des partis qui attaquent la société, mais par l'ignorance, la légèreté et la lâ cheté de ceux que les agresseurs veulent spolier. Telle est la thèse que M. Vilfredo Pareto, l'émine

In [90]:
bag_articles.count().compute()

6727

In [14]:
articles = bag_articles.compute()

In [15]:
len(articles)

6727

#### Compute total number of tokens (parallel)

In [46]:
n_tokens = bag_articles.map(lambda ar: len(ar['fulltext'].split()))

In [47]:
total_tokens = n_tokens.sum().compute()

In [48]:
"{:,}".format(total_tokens)

'4,178,435'

#### Extract Named Entities with SpaCy

In [18]:
import textacy

In [21]:
fr = textacy.load_spacy('fr_core_news_sm')

In [22]:
type(fr)

spacy.lang.fr.French

In [None]:
def extract_entities(article):
    """Run named-entity extraction on the full text of one article.

    Parameters
    ----------
    article : dict
        Must provide the keys "id" and "fulltext".

    Returns
    -------
    dict
        "id": the article id, "entities": the list of named entities
        returned by textacy, "textacy_doc": the textacy Document built
        from the full text.
    """
    # French spaCy language model, obtained through textacy
    nlp_fr = textacy.load_spacy('fr_core_news_sm')

    # wrap the article text into a textacy Document using that model
    textacy_doc = textacy.Doc(article['fulltext'], lang=nlp_fr)

    # spaCy NER applied via textacy; materialise the result as a list
    found = [*textacy.extract.named_entities(textacy_doc)]

    result = dict(id=article["id"], entities=found, textacy_doc=textacy_doc)
    return result

In [49]:
%%time
entities_by_article = [
    extract_entities(art)
    for art in articles[:1000]
]

CPU times: user 9min 3s, sys: 3min 23s, total: 12min 26s
Wall time: 3min 45s


In [43]:
entities_by_article[1]

{'id': 'GDL-1900-07-25-a-i0002',
 'entities': [Lausanne,
  Péril socialiste,
  M. Vilfredo Pareto,
  Faculté de droit de l',
  Université de Lausanne,
  Pareto,
  Voyez,
  France,
  Voyez,
  Pline,
  Trajan,
  prince de Bismark,
  Pline,
  Bebel,
  Allemagne,
  Jules Guesde,
  France,
  Constantin,
  IV siècle,
  Marxistes,
  PÉRIL SOCUMSTK,
  Vilfredo Pareto,
  Extrait du Journal des Economistes,
  Paris,
  Guillauiiiin & Cie,
  Richelieu,
  Pareto,
  Allemagne,
  Sancho Pança,
  Allemagne,
  Heinze,
  Allez,
  Marqueté,
  modeste regard,
  Etat,
  M.,
  Pareto,
  Vérité]}