In [1]:
import json
import os
import bz2
import io
from bz2 import BZ2File

### Transform the data


You can run bash commands from your notebook, just prefix the command with `!`

For example, let's check free space on the disk:

In [2]:
! df -h

Filesystem      Size  Used Avail Use% Mounted on
udev             16G     0   16G   0% /dev
tmpfs           3.2G  2.7M  3.2G   1% /run
/dev/sda2       234G  114G  108G  52% /
tmpfs            16G  387M   16G   3% /dev/shm
tmpfs           5.0M  4.0K  5.0M   1% /run/lock
tmpfs            16G     0   16G   0% /sys/fs/cgroup
/dev/loop0      256K  256K     0 100% /snap/jq/6
/dev/loop2      216M  216M     0 100% /snap/atom/236
/dev/loop1       55M   55M     0 100% /snap/core18/1144
/dev/loop3      4.2M  4.2M     0 100% /snap/gnome-calculator/406
/dev/loop4      220M  220M     0 100% /snap/atom/238
/dev/loop6      5.2M  5.2M     0 100% /snap/htop/1066
/dev/loop5      150M  150M     0 100% /snap/gnome-3-28-1804/67
/dev/loop7      150M  150M     0 100% /snap/slack/16
/dev/loop13     1.0M  1.0M     0 100% /snap/gnome-logs/73
/dev/loop8      141M  141M     0 100% /snap/gnome-3-26-1604/90
/dev/loop11      43M   43M     0 100% /snap/gtk-common-themes/1313
/dev/loop10     4.3M  4.3

whereas without `!` it won't work:

In [3]:
df -h

NameError: name 'df' is not defined

In [4]:
!s3cmd get s3://impresso-public/sample-rebuilt-teaching/* data/

download: 's3://impresso-public/sample-rebuilt-teaching/GDL-1900.jsonl.bz2' -> 'data/GDL-1900.jsonl.bz2'  [1 of 2]
 72413580 of 72413580   100% in    0s   105.64 MB/s  done
download: 's3://impresso-public/sample-rebuilt-teaching/JDG-1900.jsonl.bz2' -> 'data/JDG-1900.jsonl.bz2'  [2 of 2]
 210016648 of 210016648   100% in    1s   105.04 MB/s  done


In [6]:
cd data/

/home/romanell/Documents/impresso/epfl-shs-class/data


In [18]:
ls -la

total 278648
drwxr-xr-x 2 romanell DHLAB-unit      4096 Sep 25 13:15 [0m[01;34m.[0m/
drwxr-xr-x 5 romanell DHLAB-unit      4096 Sep 25 13:14 [01;34m..[0m/
-rw-r--r-- 1 romanell DHLAB-unit  72413580 Sep 25 10:33 [01;31mGDL-1900.jsonl.bz2[0m
-rw-r--r-- 1 romanell DHLAB-unit   2887680 Sep 25 13:15 [01;31mGDL-1900-reduced.jsonl.bz2[0m
-rw-r--r-- 1 romanell DHLAB-unit 210016648 Sep 25 10:33 [01;31mJDG-1900.jsonl.bz2[0m


In [14]:
cd data/

/home/romanell/Documents/impresso/epfl-shs-class/data


In [19]:
!for f in *[0-9].jsonl.bz2; do bzcat $f | jq -c '{id: .id, type: .tp, date: .d, title: .t, fulltext: .ft}' | bzip2 > "${f%.jsonl.bz2}-reduced.jsonl.bz2" ; done

In [20]:
ls -la ./

total 312888
drwxr-xr-x 2 romanell DHLAB-unit      4096 Sep 25 13:16 [0m[01;34m.[0m/
drwxr-xr-x 5 romanell DHLAB-unit      4096 Sep 25 13:18 [01;34m..[0m/
-rw-r--r-- 1 romanell DHLAB-unit  72413580 Sep 25 10:33 [01;31mGDL-1900.jsonl.bz2[0m
-rw-r--r-- 1 romanell DHLAB-unit  11701319 Sep 25 13:16 [01;31mGDL-1900-reduced.jsonl.bz2[0m
-rw-r--r-- 1 romanell DHLAB-unit 210016648 Sep 25 10:33 [01;31mJDG-1900.jsonl.bz2[0m
-rw-r--r-- 1 romanell DHLAB-unit  26243380 Sep 25 13:17 [01;31mJDG-1900-reduced.jsonl.bz2[0m


In [21]:
cd ../

/home/romanell/Documents/impresso/epfl-shs-class


### Reading newspaper archive data

Reminder: the data is already 'clean' and the files at hand contains only the following information:
- id
- date
- title
- type (article or advertisement)
- fulltext

In [22]:
pwd

'/home/romanell/Documents/impresso/epfl-shs-class'

In [23]:
input_dir = "data/" # update with your path 

In [24]:
# a helper function to get the lines from am archive
def read_jsonlines(bz2_file):
    text = bz2_file.read().decode('utf-8')
    for line in text.split('\n'):
        if line != '':
            yield line

### Reading from s3

In [10]:
import dask.bag as db

_storage_options={
    'client_kwargs': {'endpoint_url':'https://os.zhdk.cloud.switch.ch'},
    'anon':True
}

ci_bag = db.read_text(
    's3://impresso-public/sample-rebuilt-teaching/*bz2',
    storage_options=_storage_options
).map(json.loads)

In [11]:
%%time
ci_bag.count().compute()

CPU times: user 85.1 ms, sys: 175 ms, total: 260 ms
Wall time: 1min 28s


45260

In [15]:
ci_bag.take(1)

({'id': 'GDL-1900-12-12-a-i0001',
  'pp': [1],
  'd': '1900-12-12',
  'olr': True,
  'ts': '2019-06-17T10:02:03Z',
  'lg': 'fr',
  'tp': 'ar',
  's3v': None,
  'ppreb': [{'id': 'GDL-1900-12-12-a-p0001',
    'n': 1,
    't': [{'c': [471, 1240, 406, 113], 's': 0, 'l': 12},
     {'c': [113, 1233, 15, 54], 's': 13, 'l': 1},
     {'c': [127, 1233, 44, 54], 's': 15, 'l': 2},
     {'c': [235, 1233, 27, 54], 's': 18, 'l': 1},
     {'c': [262, 1233, 135, 54], 's': 20, 'l': 6},
     {'c': [204, 1304, 67, 40], 's': 27, 'l': 4},
     {'c': [300, 1304, 135, 40], 's': 32, 'l': 7},
     {'c': [106, 1365, 56, 40], 's': 40, 'l': 3},
     {'c': [175, 1365, 42, 40], 's': 44, 'l': 2},
     {'c': [225, 1365, 75, 40], 's': 47, 'l': 4},
     {'c': [315, 1365, 48, 40], 's': 52, 'l': 2},
     {'c': [377, 1365, 48, 40], 's': 55, 'l': 2},
     {'c': [438, 1365, 169, 40], 's': 58, 'l': 8},
     {'c': [606, 1365, 10, 40], 's': 66, 'l': 1},
     {'c': [635, 1365, 63, 40], 's': 68, 'l': 3},
     {'c': [710, 1365, 17

In [None]:
%%time
ci_bag.map(lambda ci: len(ci['ft'])).sum().compute()

### reading data the classical way

In [25]:
for archive in os.listdir(input_dir):
    
    # take only the transformed archives
    if "reduced" in archive:
        
        # open the archive
        f = BZ2File(os.path.join(input_dir, archive), 'r')
        
        # get the list of articles it contains (= a json object on each line)
        articles = list(read_jsonlines(f))
        
        # load the first 100 articles as json and access their attributes
        for a in articles[:100]:
            
            # decode the json string into an object (dict)
            json_article = json.loads(a)
            print(
                json_article["date"],
                json_article["id"],
                json_article["title"]
            )

1900-10-10 JDG-1900-10-10-a-i0001 BULLETIN GENÈVE, 9 octobre 1900
1900-10-10 JDG-1900-10-10-a-i0057 None
1900-10-10 JDG-1900-10-10-a-i0058 None
1900-10-10 JDG-1900-10-10-a-i0059 None
1900-10-10 JDG-1900-10-10-a-i0004 None
1900-10-10 JDG-1900-10-10-a-i0013 Angleterre
1900-10-10 JDG-1900-10-10-a-i0030 RÉUNiONS.-CONVOCATIONS.-CONCERTS
1900-10-10 JDG-1900-10-10-a-i0060 None
1900-10-10 JDG-1900-10-10-a-i0005 NOUVELLES Df.S CANTONS
1900-10-10 JDG-1900-10-10-a-i0031 Comité électoral démocratique
1900-10-10 JDG-1900-10-10-a-i0061 None
1900-10-10 JDG-1900-10-10-a-i0032 DERNIERES DÉPÊCHES
1900-10-10 JDG-1900-10-10-a-i0062 None
1900-10-10 JDG-1900-10-10-a-i0007 None
1900-10-10 JDG-1900-10-10-a-i0016 CHRONIQUE LOCALE
1900-10-10 JDG-1900-10-10-a-i0033 Turquie CONSTANTINOPLE (via Sofia), 9. — Le sultan a ordonné au conseil des ministres de mettre de
1900-10-10 JDG-1900-10-10-a-i0063 None
1900-10-10 JDG-1900-10-10-a-i0017 DOCTEUR GERMAINE
1900-10-10 JDG-1900-10-10-a-i0034 Les affaires de Chine
1900-1

### using dask and map
see http://dask.pydata.org/en/latest/docs.html 

In [27]:
# make sure of having these libraries in your environment ('conda install' / or 'pip install')
from dask.diagnostics import ProgressBar
from dask.distributed import Client, progress
import dask.bag as db

#### Helper functions

In [28]:
def get_archives(path):
    archives = []
    for archive in os.listdir(path):
        if "reduced" in archive:
            archives.append(os.path.join(input_dir, archive))
    return archives

In [29]:
def get_articles(archive_file):
    articles = []
    # open the archive
    f = BZ2File(archive_file, 'r')
    # get the list of articles it contains (= a json object on each line)
    lines = list(read_jsonlines(f))
    # load the articles as json and access their attributes
    for a in lines:
        articles.append(json.loads(a))
    return articles

#### Read and filter articles in parallel

In [30]:
archives = get_archives(input_dir)

In [32]:
archives

['data/JDG-1900-reduced.jsonl.bz2', 'data/GDL-1900-reduced.jsonl.bz2']

In [33]:
bag_archives = db.from_sequence(archives)
bag_articles = bag_archives.map(get_articles)\
                        .flatten()\
                        .filter(lambda ar: ar['fulltext'] != '')\
                        .repartition(npartitions=100)

In [None]:
with ProgressBar():
    bag_articles = bag_articles.persist()

[####################                    ] | 50% Completed |  1min  7.0s

In [45]:
bag_articles.take(4, npartitions=10)

({'id': 'GDL-1900-07-25-a-i0001',
  'type': 'ar',
  'date': '1900-07-25',
  'title': None,
  'fulltext': 'REDACTION .\'\'Eue Pcuinçt, 3. V BUREAU D\'ABONNEMENTS fiucilo St-François, 20. On s\'abonne dans lous les bureaux do poste Les abonnements partent du 1 " ou du l\'ô do chaque mois. PRIX D\'ABONNEMENT Un an 6 mois 8 moi} Suisso Fr. 20 dO 50 5 50 Union postale » 36 18 50 9 50 Prix du numéro : 10 centimes. '},
 {'id': 'GDL-1900-07-25-a-i0002',
  'type': 'ar',
  'date': '1900-07-25',
  'title': 'Lausanne, 25 juillet 1900. Le Péril socialiste',
  'fulltext': "Lausanne, 25 juillet 1900. Le Péril socialiste. Le socialisme prépare une révolution économique semblable à celle qui, éteignant la civilisation romaine, répandit sur l'Europe les ténèbres du moyen âge. Ce bouleversement est inévitable, non par la force des partis qui attaquent la société, mais par l'ignorance, la légèreté et la lâ cheté de ceux que les agresseurs veulent spolier. Telle est la thèse que M. Vilfredo Pareto, l'émine

In [90]:
bag_articles.count().compute()

6727

In [14]:
articles = bag_articles.compute()

In [15]:
len(articles)

6727

#### Compute total number of tokens (parallel)

In [46]:
n_tokens = bag_articles.map(lambda ar: len(ar['fulltext'].split()))

In [47]:
total_tokens = n_tokens.sum().compute()

In [48]:
"{:,}".format(total_tokens)

'4,178,435'

#### Extract Named Entities with SpaCy

In [18]:
import textacy

In [21]:
fr = textacy.load_spacy('fr_core_news_sm')

In [22]:
type(fr)

spacy.lang.fr.French

In [None]:
def extract_entities(article):

    # load spacy language model for French
    fr = textacy.load_spacy('fr_core_news_sm')
    
    # create a textacy Document
    doc = textacy.Doc(article['fulltext'], lang=fr)
    
    # apply SpaCy ner to the Document via textacy
    nes = list(textacy.extract.named_entities(doc))
    
    return {
        "id": article["id"],
        "entities": nes,
        "textacy_doc": doc
    }

In [49]:
%%time
entities_by_article = [
    extract_entities(art)
    for art in articles[:1000]
]

CPU times: user 9min 3s, sys: 3min 23s, total: 12min 26s
Wall time: 3min 45s


In [43]:
entities_by_article[1]

{'id': 'GDL-1900-07-25-a-i0002',
 'entities': [Lausanne,
  Péril socialiste,
  M. Vilfredo Pareto,
  Faculté de droit de l',
  Université de Lausanne,
  Pareto,
  Voyez,
  France,
  Voyez,
  Pline,
  Trajan,
  prince de Bismark,
  Pline,
  Bebel,
  Allemagne,
  Jules Guesde,
  France,
  Constantin,
  IV siècle,
  Marxistes,
  PÉRIL SOCUMSTK,
  Vilfredo Pareto,
  Extrait du Journal des Economistes,
  Paris,
  Guillauiiiin & Cie,
  Richelieu,
  Pareto,
  Allemagne,
  Sancho Pança,
  Allemagne,
  Heinze,
  Allez,
  Marqueté,
  modeste regard,
  Etat,
  M.,
  Pareto,
  Vérité]}