In [1]:
import json
import os
import unicodedata
from bz2 import BZ2File
from collections import Counter
from datetime import date, datetime, timedelta

In [2]:
%%time
# Load the bz2-compressed JSON archive into memory.
# The context manager guarantees the file handle is closed even if reading
# or JSON decoding raises.
with BZ2File(os.path.join("./data", "cleaned.json.bz2"), 'r') as f:
    extracted_articles = json.loads(f.read().decode('utf-8'))

CPU times: user 11.9 s, sys: 292 ms, total: 12.2 s
Wall time: 12.3 s


In [3]:
def remove_accents(input_str):
    """Return `input_str` with its combining marks removed.

    The string is first decomposed with Unicode NFKD normalisation, then
    every character whose canonical combining class is non-zero is dropped.
    """
    decomposed = unicodedata.normalize('NFKD', input_str)
    base_characters = filter(
        lambda char: unicodedata.combining(char) == 0,
        decomposed,
    )
    return u"".join(base_characters)

In [4]:
%%time
# Flatten the column-oriented archive into one dict per article.

# Characters treated as word separators; replaced by a space in a single
# str.translate pass instead of five chained .replace() calls.
_PUNCTUATION_TO_SPACE = str.maketrans({char: ' ' for char in ".';,—"})

# The "date" values are divided by 1000 in the original conversion, i.e. they
# are epoch milliseconds. Adding a timedelta to the epoch (rather than calling
# date.fromtimestamp) keeps the result independent of the machine's local
# timezone and works for negative, pre-1970 values on every platform.
# NOTE(review): this interprets the timestamps as UTC — confirm against the
# tool that produced cleaned.json.bz2.
_EPOCH = datetime(1970, 1, 1)

articles = []
# NOTE(review): `i` is whatever iterating extracted_articles["id"] yields and
# is also used as the lookup key into every other column — confirm it is the
# intended article identifier.
for i in extracted_articles["id"]:
    articles.append({
        "id": i,
        "journal": extracted_articles["journal"][i],
        "date": (_EPOCH + timedelta(milliseconds=extracted_articles["date"][i])).date(),
        "length": extracted_articles["length"][i],
        # Normalised text: accents stripped, lower-cased, punctuation -> space.
        "text": remove_accents(extracted_articles["text"][i]).lower().translate(_PUNCTUATION_TO_SPACE)
    })

CPU times: user 34.4 s, sys: 143 ms, total: 34.5 s
Wall time: 34.6 s


In [5]:
# Sanity check: how many article records are in `articles`.
print(len(articles))

50728


In [6]:
# Number of articles per decade, from the 1900s through the 1990s.
# The decade is derived numerically (year // 10) in a single pass over
# `articles`; a substring test on the year's string form would rescan the
# whole list for every decade and could match unrelated years (e.g. "190"
# is also a substring of "2190").
articles_per_decade = Counter(article["date"].year // 10 for article in articles)
for year in range(190, 200):  # `year` is the decade prefix: 195 -> the 1950s
    print(year * 10, articles_per_decade[year])

1900 0
1910 0
1920 0
1930 13
1940 176
1950 5367
1960 11757
1970 10255
1980 15848
1990 7312


In [7]:
# Stop words and stray tokens removed from the per-decade word counts.
# Entries are lower-case and accent-free, matching the normalised article
# text. Truncated forms such as 'dan', 'plu', 'pa', 'nou' and the empty
# string are listed because the word-count cell drops a trailing 's' from
# every token before counting. Some entries appear more than once; that is
# harmless for the `del counter[word]` loop that consumes this list.
excluded_words = ['de', 'la', 'les', 'des', 'et', 'le', 'que', 'en', 'une', 'un',
                  'qui', 'par', ':', ';', 'dans', 'du', 'est', 'a', 'plus', 'ne', 'se',
                  'au', 'sont', '«', 'on', 'd', 'il', 'ces', 'pas', 'sur', 'pour',
                  'dont', 'ce', 'ou', 'avec', 'cette', 'si', 'nous', 'comme', 'm',
                  'elle', 'l', 'leur', 'mais', 'deux', 'sans', 'ont', 'y', '»', 'ses',
                  'aussi', 'ete', 'donc', 'meme', 'c', '»', 's', 'qu', 'n', 'la', 'les',
                  'm', 'l', 'on', 'il', 'etre', 'ainsi', 'en', 'qu', 's', 'aux', 'ils',
                  '?', 'bien', 'sa', 'peut', 'leurs', 'autre', 'nombre', 'autres', 'son',
                  # The comma after 'apres' was missing, which silently fused it
                  # with 'sous' on the next line into a single 'apressous' entry.
                  'tous', 'encore', 'lui', 'tres', 'fait', 'tout', 'etait', 'apres',
                  'sous', 'faire', 'toutes', 'simples', 'notre', 'entre', 'sera', 'e',
                  'avait', '~', 'notre', '1', '/', '%', 'entre', 'contre', 'h', 'apres',
                  'sous', 'soit', 'fut', 'quelques', 'point', '000', 'deja', 'toute',
                  'moins', '*', 'u', 'non', ')', '(', 't', '"', '2', 'i', '4', 'part',
                  '00', 'selon', 'ans', '20', '30', 'depuis', '10', 'dan', 'plu', '',
                  'pa', 'nou', 'mai', 'san', 'simple', 'tou', 'tre', 'effet', 'quelque',
                  'ans', 'an', 'annee', 'annees', 'apre', 'depui']

In [8]:
%%time
# Print the 20 most frequent (non-excluded) words for each decade,
# from the 1900s through the 1990s.
for year in range(190, 200):  # `year` is the decade prefix: 195 -> the 1950s
    counter = Counter()
    for article in articles:
        # Select the decade numerically; a substring test on the year's
        # string form could also match unrelated years (e.g. "190" in "2190").
        if article["date"].year // 10 != year:
            continue
        # Crude de-pluralisation: drop one trailing 's' so singular and plural
        # forms share a count. str.split() never yields empty tokens, so
        # endswith() is safe; a lone "s" becomes "" and is excluded below.
        counter.update(
            word[:-1] if word.endswith('s') else word
            for word in article["text"].split()
        )
    # Counter tolerates deleting a key it does not contain, so excluded words
    # that never occurred in this decade need no membership check.
    for word in excluded_words:
        del counter[word]
    print(year * 10, counter.most_common(20))

1900 []
1910 []
1920 []
1930 [('atome', 54), ('electron', 53), ('element', 42), ('corp', 39), ('noyau', 39), ('physique', 35), ('cosmique', 35), ('particule', 34), ('proton', 30), ('matiere', 28), ('atomique', 26), ('rayonnement', 24), ('rayon', 23), ('propriete', 22), ('neutron', 21), ('vie', 21), ('nucleaire', 19), ('champ', 19), ('phenomene', 18), ('recherche', 18)]
1940 [('atomique', 740), ('energie', 536), ('guerre', 329), ('bombe', 307), ('nucleaire', 258), ('homme', 243), ('controle', 222), ('recherche', 213), ('etat', 203), ('monde', 200), ('commission', 191), ('probleme', 184), ('grande', 184), ('etats-uni', 183), ('science', 180), ('professeur', 179), ('americain', 174), ('pay', 167), ('premiere', 166), ('experience', 164)]
1950 [('nucleaire', 10611), ('atomique', 10238), ('energie', 6909), ('pay', 6434), ('sovietique', 5522), ('conference', 5411), ('arme', 5059), ('suisse', 4943), ('etat', 4898), ('gouvernement', 4838), ('etats-uni', 4803), ('politique', 4407), ('probleme', 