In [18]:
# Importation des modules

import pandas as pd
import re  # For preprocessing
import pandas as pd  # For data handling
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim

logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from gensim.models.phrases import Phrases, Phraser

import multiprocessing

from gensim.models import Word2Vec
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models.callbacks import CallbackAny2Vec

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

#Affichage de toutes les colonnes

pd.set_option('display.max_columns', 500)

INFO - 21:02:18: adding document #0 to Dictionary(0 unique tokens: [])
INFO - 21:02:18: built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)


In [2]:
df0 = pd.read_csv('data/data_cleaned_NLP.csv', sep = ',', encoding = 'latin-1')

In [3]:
df1 = df0[['Réplique', 'Groupe', 'tokenized_replique']].copy()

df2 = df1[['Groupe', 'tokenized_replique']].copy()

df_novice = df2[df2['Groupe'] == 'Novice'].copy()
df_exp = df2[df2['Groupe'] == 'Exp'].copy()

In [4]:
df_novice = df_novice.dropna()

df_exp = df_exp.dropna()

In [5]:
sent_novice = [row.split() for row in df_novice['tokenized_replique']]

sent_exp = [row.split() for row in df_exp['tokenized_replique']]

In [6]:
phrases_novice = Phrases(sent_novice, min_count = 3, progress_per = 20000)

phrases_exp = Phrases(sent_exp, min_count = 3, progress_per = 20000)


# min_count : Ignore all words and bigrams with total collected count lower than this value.

INFO - 20:52:11: collecting all words and their counts
INFO - 20:52:11: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 20:52:12: PROGRESS: at sentence #20000, processed 544399 words and 367942 word types
INFO - 20:52:14: PROGRESS: at sentence #40000, processed 1061844 words and 645502 word types
INFO - 20:52:14: collected 773338 word types from a corpus of 1326670 words (unigram + bigrams) and 50191 sentences
INFO - 20:52:14: using 773338 counts as vocab in Phrases<0 vocab, min_count=3, threshold=10.0, max_vocab_size=40000000>
INFO - 20:52:14: collecting all words and their counts
INFO - 20:52:14: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 20:52:15: PROGRESS: at sentence #20000, processed 518787 words and 365570 word types
INFO - 20:52:17: PROGRESS: at sentence #40000, processed 1056472 words and 658662 word types
INFO - 20:52:18: PROGRESS: at sentence #60000, processed 1530225 words and 884119 word types
INFO - 20:52:19: PROGRESS: at sente

In [7]:
bigram_novice = Phraser(phrases_novice)

bigram_exp = Phraser(phrases_exp)

INFO - 20:52:39: source_vocab length 773338
INFO - 20:52:49: Phraser built with 16435 phrasegrams
INFO - 20:52:49: source_vocab length 3813404
INFO - 20:52:59: Phraser added 50000 phrasegrams
INFO - 20:53:37: Phraser built with 73757 phrasegrams


In [8]:
sentences_novice = bigram_novice[sent_novice]

sentences_exp = bigram_exp[sent_exp]

In [9]:
word_freq_novice = defaultdict(int)
word_freq_exp = defaultdict(int)

for sent in sentences_novice:
    for i in sent:
        word_freq_novice[i] += 1
        
for sent in sentences_exp:
    for i in sent:
        word_freq_exp[i] += 1


In [10]:
cores = multiprocessing.cpu_count()

### Choix des hyperparamètres

In [68]:
# Choix de la fenêtre, on aimerait que le modèle aprenne vite, on alongera ensuite le nombre d'epochs

liste_modeles = [Word2Vec(
                     window = i,
                     size = 300,
                     sample = 6e-5, 
                     alpha = 0.03, 
                     min_alpha = 0.0007, 
                     negative = 20,
                     workers = cores - 1,
                     compute_loss = True)
                 
                 for i in range(1, 10)
                ]

tests_fenetre = []

for i in range(len(liste_modeles)):
    
    model = liste_modeles[i]
    model.build_vocab(sentences_novice, progress_per = 10000)


    model.train(sentences_novice, total_examples = model.corpus_count, epochs = 5, report_delay = 1)

    tests_fenetre.append([
            (model.wv.most_similar(positive=['droite'])[i][0],
             model.wv.most_similar(positive=['vitesse'])[i][0],
             model.wv.most_similar(positive=['donc'])[i][0],
            )
         for i in range(10)
         ])

INFO - 00:14:18: collecting all words and their counts
INFO - 00:14:18: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 00:14:19: PROGRESS: at sentence #10000, processed 233965 words, keeping 30761 word types
INFO - 00:14:20: PROGRESS: at sentence #20000, processed 450032 words, keeping 39850 word types
INFO - 00:14:21: PROGRESS: at sentence #30000, processed 677772 words, keeping 48410 word types
INFO - 00:14:22: PROGRESS: at sentence #40000, processed 880381 words, keeping 51772 word types
INFO - 00:14:22: PROGRESS: at sentence #50000, processed 1095690 words, keeping 55266 word types
INFO - 00:14:22: collected 55316 word types from a corpus of 1099932 raw words and 50191 sentences
INFO - 00:14:22: Loading a fresh vocabulary
INFO - 00:14:25: effective_min_count=5 retains 26248 unique words (47% of original 55316, drops 29068)
INFO - 00:14:25: effective_min_count=5 leaves 1041812 word corpus (94% of original 1099932, drops 58120)
INFO - 00:14:25: deleting the 

INFO - 00:15:24: EPOCH 1 - PROGRESS: at 80.86% examples, 116687 words/s, in_qsize 0, out_qsize 0
INFO - 00:15:25: EPOCH 1 - PROGRESS: at 97.94% examples, 118390 words/s, in_qsize 2, out_qsize 1
INFO - 00:15:25: worker thread finished; awaiting finish of 2 more threads
INFO - 00:15:26: worker thread finished; awaiting finish of 1 more threads
INFO - 00:15:26: worker thread finished; awaiting finish of 0 more threads
INFO - 00:15:26: EPOCH - 1 : training on 1099932 raw words (749440 effective words) took 6.3s, 119339 effective words/s
INFO - 00:15:27: EPOCH 2 - PROGRESS: at 13.68% examples, 111637 words/s, in_qsize 0, out_qsize 1
INFO - 00:15:28: EPOCH 2 - PROGRESS: at 30.72% examples, 116361 words/s, in_qsize 0, out_qsize 0
INFO - 00:15:29: EPOCH 2 - PROGRESS: at 44.53% examples, 113176 words/s, in_qsize 0, out_qsize 0
INFO - 00:15:30: EPOCH 2 - PROGRESS: at 59.03% examples, 110460 words/s, in_qsize 0, out_qsize 0
INFO - 00:15:31: EPOCH 2 - PROGRESS: at 72.66% examples, 105978 words/s, 

INFO - 00:16:22: EPOCH 3 - PROGRESS: at 89.34% examples, 109034 words/s, in_qsize 0, out_qsize 0
INFO - 00:16:23: worker thread finished; awaiting finish of 2 more threads
INFO - 00:16:23: worker thread finished; awaiting finish of 1 more threads
INFO - 00:16:23: worker thread finished; awaiting finish of 0 more threads
INFO - 00:16:23: EPOCH - 3 : training on 1099932 raw words (749271 effective words) took 6.7s, 111363 effective words/s
INFO - 00:16:24: EPOCH 4 - PROGRESS: at 14.59% examples, 121419 words/s, in_qsize 0, out_qsize 0
INFO - 00:16:25: EPOCH 4 - PROGRESS: at 32.44% examples, 121836 words/s, in_qsize 0, out_qsize 0
INFO - 00:16:26: EPOCH 4 - PROGRESS: at 47.15% examples, 118365 words/s, in_qsize 0, out_qsize 0
INFO - 00:16:27: EPOCH 4 - PROGRESS: at 62.67% examples, 116044 words/s, in_qsize 0, out_qsize 0
INFO - 00:16:28: EPOCH 4 - PROGRESS: at 79.48% examples, 115878 words/s, in_qsize 0, out_qsize 0
INFO - 00:16:29: EPOCH 4 - PROGRESS: at 95.64% examples, 115767 words/s, 

INFO - 00:17:19: worker thread finished; awaiting finish of 1 more threads
INFO - 00:17:19: EPOCH 5 - PROGRESS: at 100.00% examples, 121729 words/s, in_qsize 0, out_qsize 1
INFO - 00:17:19: worker thread finished; awaiting finish of 0 more threads
INFO - 00:17:19: EPOCH - 5 : training on 1099932 raw words (749375 effective words) took 6.2s, 121700 effective words/s
INFO - 00:17:19: training on a 5499660 raw words (3746064 effective words) took 32.1s, 116596 effective words/s
INFO - 00:17:19: precomputing L2-norms of word weight vectors
INFO - 00:17:19: collecting all words and their counts
INFO - 00:17:19: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 00:17:20: PROGRESS: at sentence #10000, processed 233965 words, keeping 30761 word types
INFO - 00:17:21: PROGRESS: at sentence #20000, processed 450032 words, keeping 39850 word types
INFO - 00:17:22: PROGRESS: at sentence #30000, processed 677772 words, keeping 48410 word types
INFO - 00:17:22: PROGRESS: at se

INFO - 00:18:08: estimated required memory for 26248 words and 300 dimensions: 76119200 bytes
INFO - 00:18:08: resetting layer weights
INFO - 00:18:15: training model with 3 workers on 26248 vocabulary and 300 features, using sg=0 hs=0 sample=6e-05 negative=20 window=6
INFO - 00:18:16: EPOCH 1 - PROGRESS: at 11.29% examples, 91755 words/s, in_qsize 0, out_qsize 0
INFO - 00:18:17: EPOCH 1 - PROGRESS: at 28.29% examples, 105464 words/s, in_qsize 1, out_qsize 0
INFO - 00:18:18: EPOCH 1 - PROGRESS: at 42.19% examples, 106024 words/s, in_qsize 0, out_qsize 0
INFO - 00:18:19: EPOCH 1 - PROGRESS: at 57.44% examples, 106489 words/s, in_qsize 0, out_qsize 0
INFO - 00:18:20: EPOCH 1 - PROGRESS: at 70.25% examples, 103471 words/s, in_qsize 0, out_qsize 0
INFO - 00:18:21: EPOCH 1 - PROGRESS: at 87.49% examples, 106164 words/s, in_qsize 0, out_qsize 0
INFO - 00:18:22: worker thread finished; awaiting finish of 2 more threads
INFO - 00:18:22: worker thread finished; awaiting finish of 1 more threads

INFO - 00:19:14: EPOCH 2 - PROGRESS: at 100.00% examples, 101542 words/s, in_qsize 0, out_qsize 1
INFO - 00:19:14: worker thread finished; awaiting finish of 0 more threads
INFO - 00:19:14: EPOCH - 2 : training on 1099932 raw words (748969 effective words) took 7.4s, 101523 effective words/s
INFO - 00:19:15: EPOCH 3 - PROGRESS: at 14.59% examples, 119437 words/s, in_qsize 0, out_qsize 0
INFO - 00:19:16: EPOCH 3 - PROGRESS: at 32.44% examples, 122039 words/s, in_qsize 0, out_qsize 0
INFO - 00:19:17: EPOCH 3 - PROGRESS: at 46.07% examples, 116082 words/s, in_qsize 0, out_qsize 0
INFO - 00:19:18: EPOCH 3 - PROGRESS: at 60.80% examples, 114078 words/s, in_qsize 0, out_qsize 0
INFO - 00:19:19: EPOCH 3 - PROGRESS: at 76.24% examples, 112640 words/s, in_qsize 1, out_qsize 0
INFO - 00:19:20: EPOCH 3 - PROGRESS: at 91.94% examples, 112384 words/s, in_qsize 0, out_qsize 0
INFO - 00:19:21: worker thread finished; awaiting finish of 2 more threads
INFO - 00:19:21: worker thread finished; awaiting 

INFO - 00:20:13: worker thread finished; awaiting finish of 0 more threads
INFO - 00:20:13: EPOCH - 4 : training on 1099932 raw words (749412 effective words) took 6.9s, 108704 effective words/s
INFO - 00:20:14: EPOCH 5 - PROGRESS: at 12.88% examples, 107218 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:15: EPOCH 5 - PROGRESS: at 29.08% examples, 109962 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:17: EPOCH 5 - PROGRESS: at 44.53% examples, 112383 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:18: EPOCH 5 - PROGRESS: at 60.80% examples, 113896 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:19: EPOCH 5 - PROGRESS: at 75.18% examples, 111247 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:20: EPOCH 5 - PROGRESS: at 87.49% examples, 106795 words/s, in_qsize 0, out_qsize 0
INFO - 00:20:20: worker thread finished; awaiting finish of 2 more threads
INFO - 00:20:20: worker thread finished; awaiting finish of 1 more threads
INFO - 00:20:20: worker thread finished; awaiting finish of 0 more thread

In [72]:
tests_fenetre

[[('résolu', 'suivis', 'donne_avis'),
  ('vécu', 'multiplient', 'adopté'),
  ('assemblée_sénat', 'gratuitement', 'voter'),
  ('représentés', 'conflit_intérêts', 'amendements'),
  ('abouti', 'faire_croire', 'déposé'),
  ('voter_texte', 'allégements', 'émets_donc'),
  ('débouché', 'vingt_sept', 'sous_amendements'),
  ('fait_unanimité', 'donnent', 'voilà_pourquoi'),
  ('france_comores', 'instaurée', 'gouvernement'),
  ('accord_trouvé', 'réclament', 'adopter')],
 [('pouvions', 'saurait_être', 'conséquent'),
  ('dépassé', 'reconstruction', 'article'),
  ('gauche', 'multiplient', 'donne_avis'),
  ('avril', 'rendra', 'voter'),
  ('vécu', 'bonne_nouvelle', 'rédaction'),
  ('résolu', 'confort', 'adopté'),
  ('représentés', 'regroupement', 'donc_satisfait'),
  ('faisait', 'cherché', 'texte'),
  ('attend', 'ajoutent', 'déposé'),
  ('référendum', 'continueront', 'adopter')],
 [('gauche', 'plus_élevés', 'sous_amendements'),
  ('pouvions', 'rendra', 'donne_avis'),
  ('résolu', 'massivement', 'rédact

On choisit window = 4 qualitativement (on se restreint d'abord aux modèles qui associent en premier 'gauche' à 'droite', puis on sélectionne sur la pertinence des autres mots sélectionnés)

In [None]:
# Maintenant qu'on a la fenêtre, on fait le choix du nombre d'époques lors de l'apprentissage. 
# On reprend le même procédé pour tester la fiabilité

tests_epochs = []

liste_nb_epochs = [250]

for nb_epochs in liste_nb_epochs:
    
    model = Word2Vec(
                     window = 4,
                     size = 300,
                     sample = 6e-5, 
                     alpha = 0.03, 
                     min_alpha = 0.0007, 
                     negative = 20,
                     workers = cores - 1,
                     compute_loss = True)
    
    model.build_vocab(sentences_novice, progress_per = 10000)

    model.train(sentences_novice, total_examples = model.corpus_count, epochs = nb_epochs, report_delay = 1)

    tests_epochs.append([
            (model.wv.most_similar(positive=['droite'])[i][0],
             model.wv.most_similar(positive=['vitesse'])[i][0],
             model.wv.most_similar(positive=['donc'])[i][0],
            )
         for i in range(10)
         ])

INFO - 01:04:04: collecting all words and their counts
INFO - 01:04:04: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 01:04:05: PROGRESS: at sentence #10000, processed 233965 words, keeping 30761 word types
INFO - 01:04:06: PROGRESS: at sentence #20000, processed 450032 words, keeping 39850 word types
INFO - 01:04:07: PROGRESS: at sentence #30000, processed 677772 words, keeping 48410 word types
INFO - 01:04:08: PROGRESS: at sentence #40000, processed 880381 words, keeping 51772 word types
INFO - 01:04:09: PROGRESS: at sentence #50000, processed 1095690 words, keeping 55266 word types
INFO - 01:04:09: collected 55316 word types from a corpus of 1099932 raw words and 50191 sentences
INFO - 01:04:09: Loading a fresh vocabulary
INFO - 01:04:09: effective_min_count=5 retains 26248 unique words (47% of original 55316, drops 29068)
INFO - 01:04:09: effective_min_count=5 leaves 1041812 word corpus (94% of original 1099932, drops 58120)
INFO - 01:04:09: deleting the 

In [None]:
tests_epochs

In [57]:
w2v_model_novice = Word2Vec(
                     window = 1,
                     size = 300,
                     sample = 6e-5, 
                     alpha = 0.03, 
                     min_alpha = 0.0007, 
                     negative = 20,
                     workers = cores - 1,
                     compute_loss = True)

w2v_model_exp = Word2Vec(
                     window = 4,
                     size = 300,
                     sample = 6e-5, 
                     alpha = 0.03, 
                     min_alpha = 0.0007, 
                     negative = 20,
                     workers = cores - 1,
                     compute_loss = True)

In [58]:
t = time()

w2v_model_novice.build_vocab(sentences_novice, progress_per = 10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()

w2v_model_novice.train(sentences_novice, total_examples = w2v_model_novice.corpus_count, epochs = 5, report_delay = 1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 21:28:19: collecting all words and their counts
INFO - 21:28:19: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 21:28:19: PROGRESS: at sentence #10000, processed 233965 words, keeping 30761 word types
INFO - 21:28:20: PROGRESS: at sentence #20000, processed 450032 words, keeping 39850 word types
INFO - 21:28:21: PROGRESS: at sentence #30000, processed 677772 words, keeping 48410 word types
INFO - 21:28:22: PROGRESS: at sentence #40000, processed 880381 words, keeping 51772 word types
INFO - 21:28:22: PROGRESS: at sentence #50000, processed 1095690 words, keeping 55266 word types
INFO - 21:28:22: collected 55316 word types from a corpus of 1099932 raw words and 50191 sentences
INFO - 21:28:22: Loading a fresh vocabulary
INFO - 21:28:22: effective_min_count=5 retains 26248 unique words (47% of original 55316, drops 29068)
INFO - 21:28:22: effective_min_count=5 leaves 1041812 word corpus (94% of original 1099932, drops 58120)
INFO - 21:28:23: deleting the 

Time to build vocab: 0.18 mins


INFO - 21:28:30: EPOCH 1 - PROGRESS: at 14.59% examples, 119085 words/s, in_qsize 0, out_qsize 1
INFO - 21:28:31: EPOCH 1 - PROGRESS: at 29.88% examples, 113220 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:32: EPOCH 1 - PROGRESS: at 41.41% examples, 104014 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:33: EPOCH 1 - PROGRESS: at 55.41% examples, 104272 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:34: EPOCH 1 - PROGRESS: at 70.25% examples, 104299 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:35: EPOCH 1 - PROGRESS: at 85.67% examples, 104728 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:36: worker thread finished; awaiting finish of 2 more threads
INFO - 21:28:36: worker thread finished; awaiting finish of 1 more threads
INFO - 21:28:36: worker thread finished; awaiting finish of 0 more threads
INFO - 21:28:36: EPOCH - 1 : training on 1099932 raw words (749232 effective words) took 6.9s, 108082 effective words/s


Loss after epoch 0: 0.0


INFO - 21:28:37: EPOCH 2 - PROGRESS: at 14.59% examples, 123502 words/s, in_qsize 1, out_qsize 0
INFO - 21:28:38: EPOCH 2 - PROGRESS: at 32.44% examples, 125605 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:39: EPOCH 2 - PROGRESS: at 50.17% examples, 128152 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:40: EPOCH 2 - PROGRESS: at 66.47% examples, 126086 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:41: EPOCH 2 - PROGRESS: at 82.48% examples, 121550 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:42: EPOCH 2 - PROGRESS: at 97.94% examples, 120822 words/s, in_qsize 2, out_qsize 1
INFO - 21:28:42: worker thread finished; awaiting finish of 2 more threads
INFO - 21:28:42: worker thread finished; awaiting finish of 1 more threads
INFO - 21:28:42: worker thread finished; awaiting finish of 0 more threads
INFO - 21:28:42: EPOCH - 2 : training on 1099932 raw words (749796 effective words) took 6.2s, 121716 effective words/s


Loss after epoch 1: 0.0


INFO - 21:28:43: EPOCH 3 - PROGRESS: at 13.68% examples, 114145 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:44: EPOCH 3 - PROGRESS: at 31.64% examples, 120600 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:45: EPOCH 3 - PROGRESS: at 45.26% examples, 115764 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:46: EPOCH 3 - PROGRESS: at 60.80% examples, 114749 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:47: EPOCH 3 - PROGRESS: at 78.07% examples, 116350 words/s, in_qsize 1, out_qsize 0
INFO - 21:28:48: EPOCH 3 - PROGRESS: at 92.94% examples, 114964 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:49: worker thread finished; awaiting finish of 2 more threads
INFO - 21:28:49: worker thread finished; awaiting finish of 1 more threads
INFO - 21:28:49: worker thread finished; awaiting finish of 0 more threads
INFO - 21:28:49: EPOCH - 3 : training on 1099932 raw words (749338 effective words) took 6.6s, 113274 effective words/s


Loss after epoch 2: 0.0


INFO - 21:28:50: EPOCH 4 - PROGRESS: at 11.29% examples, 88611 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:51: EPOCH 4 - PROGRESS: at 26.72% examples, 98172 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:52: EPOCH 4 - PROGRESS: at 42.19% examples, 105034 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:53: EPOCH 4 - PROGRESS: at 59.89% examples, 110851 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:54: EPOCH 4 - PROGRESS: at 76.24% examples, 111668 words/s, in_qsize 1, out_qsize 0
INFO - 21:28:55: EPOCH 4 - PROGRESS: at 92.94% examples, 112536 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:56: worker thread finished; awaiting finish of 2 more threads
INFO - 21:28:56: worker thread finished; awaiting finish of 1 more threads
INFO - 21:28:56: worker thread finished; awaiting finish of 0 more threads
INFO - 21:28:56: EPOCH - 4 : training on 1099932 raw words (748841 effective words) took 6.6s, 113427 effective words/s


Loss after epoch 3: 0.0


INFO - 21:28:57: EPOCH 5 - PROGRESS: at 13.68% examples, 112279 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:58: EPOCH 5 - PROGRESS: at 31.64% examples, 117498 words/s, in_qsize 0, out_qsize 0
INFO - 21:28:59: EPOCH 5 - PROGRESS: at 48.16% examples, 118506 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:00: EPOCH 5 - PROGRESS: at 64.46% examples, 118211 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:01: EPOCH 5 - PROGRESS: at 80.86% examples, 116209 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:02: EPOCH 5 - PROGRESS: at 97.79% examples, 117621 words/s, in_qsize 3, out_qsize 0
INFO - 21:29:02: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:02: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:02: worker thread finished; awaiting finish of 0 more threads
INFO - 21:29:02: EPOCH - 5 : training on 1099932 raw words (749292 effective words) took 6.3s, 118681 effective words/s
INFO - 21:29:02: training on a 5499660 raw words (3746499 effective words

Loss after epoch 4: 0.0
Time to train the model: 0.54 mins


In [59]:
t = time()

w2v_model_exp.build_vocab(sentences_novice, progress_per = 10000)

print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

t = time()

w2v_model_exp.train(sentences_novice, total_examples = w2v_model_exp.corpus_count, epochs = 10, report_delay = 1)

print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

INFO - 21:29:02: collecting all words and their counts
INFO - 21:29:02: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 21:29:03: PROGRESS: at sentence #10000, processed 233965 words, keeping 30761 word types
INFO - 21:29:04: PROGRESS: at sentence #20000, processed 450032 words, keeping 39850 word types
INFO - 21:29:05: PROGRESS: at sentence #30000, processed 677772 words, keeping 48410 word types
INFO - 21:29:05: PROGRESS: at sentence #40000, processed 880381 words, keeping 51772 word types
INFO - 21:29:06: PROGRESS: at sentence #50000, processed 1095690 words, keeping 55266 word types
INFO - 21:29:06: collected 55316 word types from a corpus of 1099932 raw words and 50191 sentences
INFO - 21:29:06: Loading a fresh vocabulary
INFO - 21:29:06: effective_min_count=5 retains 26248 unique words (47% of original 55316, drops 29068)
INFO - 21:29:06: effective_min_count=5 leaves 1041812 word corpus (94% of original 1099932, drops 58120)
INFO - 21:29:06: deleting the 

Time to build vocab: 0.19 mins


INFO - 21:29:14: EPOCH 1 - PROGRESS: at 8.94% examples, 75749 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:15: EPOCH 1 - PROGRESS: at 18.67% examples, 74386 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:16: EPOCH 1 - PROGRESS: at 29.08% examples, 72742 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:17: EPOCH 1 - PROGRESS: at 42.19% examples, 79422 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:18: EPOCH 1 - PROGRESS: at 55.41% examples, 82884 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:19: EPOCH 1 - PROGRESS: at 69.26% examples, 85337 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:20: EPOCH 1 - PROGRESS: at 83.31% examples, 86118 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:21: EPOCH 1 - PROGRESS: at 96.69% examples, 87870 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:22: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:22: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:22: worker thread finished; awaiting finish of 0 more threads
INFO - 2

Loss after epoch 0: 0.0


INFO - 21:29:23: EPOCH 2 - PROGRESS: at 12.09% examples, 102900 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:24: EPOCH 2 - PROGRESS: at 28.29% examples, 108079 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:25: EPOCH 2 - PROGRESS: at 42.96% examples, 108831 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:26: EPOCH 2 - PROGRESS: at 57.44% examples, 108245 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:27: EPOCH 2 - PROGRESS: at 73.50% examples, 109556 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:28: EPOCH 2 - PROGRESS: at 89.34% examples, 110488 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:28: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:28: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:28: worker thread finished; awaiting finish of 0 more threads
INFO - 21:29:28: EPOCH - 2 : training on 1099932 raw words (749026 effective words) took 6.7s, 111941 effective words/s


Loss after epoch 1: 0.0


INFO - 21:29:29: EPOCH 3 - PROGRESS: at 13.68% examples, 116468 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:30: EPOCH 3 - PROGRESS: at 30.72% examples, 118834 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:31: EPOCH 3 - PROGRESS: at 45.26% examples, 116017 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:32: EPOCH 3 - PROGRESS: at 61.71% examples, 117252 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:33: EPOCH 3 - PROGRESS: at 76.24% examples, 112925 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:34: EPOCH 3 - PROGRESS: at 84.88% examples, 103613 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:35: EPOCH 3 - PROGRESS: at 97.79% examples, 102387 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:36: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:36: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:36: worker thread finished; awaiting finish of 0 more threads
INFO - 21:29:36: EPOCH - 3 : training on 1099932 raw words (749041 effective words) took 7.4s, 1

Loss after epoch 2: 0.0


INFO - 21:29:37: EPOCH 4 - PROGRESS: at 12.09% examples, 96841 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:38: EPOCH 4 - PROGRESS: at 28.29% examples, 104410 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:39: EPOCH 4 - PROGRESS: at 42.96% examples, 107451 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:40: EPOCH 4 - PROGRESS: at 58.21% examples, 109042 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:41: EPOCH 4 - PROGRESS: at 74.28% examples, 110096 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:42: EPOCH 4 - PROGRESS: at 89.34% examples, 109838 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:43: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:43: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:43: worker thread finished; awaiting finish of 0 more threads
INFO - 21:29:43: EPOCH - 4 : training on 1099932 raw words (749485 effective words) took 7.0s, 107413 effective words/s


Loss after epoch 3: 0.0


INFO - 21:29:44: EPOCH 5 - PROGRESS: at 5.60% examples, 45192 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:45: EPOCH 5 - PROGRESS: at 18.67% examples, 72414 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:46: EPOCH 5 - PROGRESS: at 32.44% examples, 80824 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:47: EPOCH 5 - PROGRESS: at 49.26% examples, 91395 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:48: EPOCH 5 - PROGRESS: at 66.47% examples, 97581 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:49: EPOCH 5 - PROGRESS: at 84.93% examples, 102339 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:50: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:50: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:50: worker thread finished; awaiting finish of 0 more threads
INFO - 21:29:50: EPOCH - 5 : training on 1099932 raw words (749059 effective words) took 7.1s, 105619 effective words/s


Loss after epoch 4: 0.0


INFO - 21:29:51: EPOCH 6 - PROGRESS: at 13.68% examples, 107661 words/s, in_qsize 0, out_qsize 1
INFO - 21:29:52: EPOCH 6 - PROGRESS: at 28.29% examples, 104059 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:53: EPOCH 6 - PROGRESS: at 43.78% examples, 109781 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:54: EPOCH 6 - PROGRESS: at 59.89% examples, 112487 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:55: EPOCH 6 - PROGRESS: at 77.09% examples, 114525 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:56: EPOCH 6 - PROGRESS: at 94.60% examples, 116083 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:56: worker thread finished; awaiting finish of 2 more threads
INFO - 21:29:56: worker thread finished; awaiting finish of 1 more threads
INFO - 21:29:56: worker thread finished; awaiting finish of 0 more threads
INFO - 21:29:56: EPOCH - 6 : training on 1099932 raw words (748910 effective words) took 6.4s, 117051 effective words/s


Loss after epoch 5: 0.0


INFO - 21:29:57: EPOCH 7 - PROGRESS: at 14.59% examples, 116907 words/s, in_qsize 1, out_qsize 0
INFO - 21:29:58: EPOCH 7 - PROGRESS: at 32.44% examples, 121593 words/s, in_qsize 0, out_qsize 0
INFO - 21:29:59: EPOCH 7 - PROGRESS: at 46.07% examples, 115467 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:00: EPOCH 7 - PROGRESS: at 61.71% examples, 115066 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:01: EPOCH 7 - PROGRESS: at 80.86% examples, 116855 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:02: EPOCH 7 - PROGRESS: at 97.79% examples, 118155 words/s, in_qsize 1, out_qsize 0
INFO - 21:30:02: worker thread finished; awaiting finish of 2 more threads
INFO - 21:30:02: worker thread finished; awaiting finish of 1 more threads
INFO - 21:30:03: worker thread finished; awaiting finish of 0 more threads
INFO - 21:30:03: EPOCH - 7 : training on 1099932 raw words (749052 effective words) took 6.3s, 118804 effective words/s


Loss after epoch 6: 0.0


INFO - 21:30:04: EPOCH 8 - PROGRESS: at 14.59% examples, 115425 words/s, in_qsize 0, out_qsize 1
INFO - 21:30:05: EPOCH 8 - PROGRESS: at 32.44% examples, 120776 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:06: EPOCH 8 - PROGRESS: at 47.15% examples, 117979 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:07: EPOCH 8 - PROGRESS: at 63.54% examples, 117762 words/s, in_qsize 1, out_qsize 0
INFO - 21:30:08: EPOCH 8 - PROGRESS: at 79.48% examples, 114819 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:09: EPOCH 8 - PROGRESS: at 95.64% examples, 115588 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:09: worker thread finished; awaiting finish of 2 more threads
INFO - 21:30:09: worker thread finished; awaiting finish of 1 more threads
INFO - 21:30:09: worker thread finished; awaiting finish of 0 more threads
INFO - 21:30:09: EPOCH - 8 : training on 1099932 raw words (749594 effective words) took 6.4s, 116618 effective words/s


Loss after epoch 7: 0.0


INFO - 21:30:10: EPOCH 9 - PROGRESS: at 14.59% examples, 117248 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:11: EPOCH 9 - PROGRESS: at 32.44% examples, 120240 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:12: EPOCH 9 - PROGRESS: at 49.26% examples, 122064 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:13: EPOCH 9 - PROGRESS: at 65.34% examples, 121538 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:14: EPOCH 9 - PROGRESS: at 83.31% examples, 121394 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:15: EPOCH 9 - PROGRESS: at 97.79% examples, 118012 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:15: worker thread finished; awaiting finish of 2 more threads
INFO - 21:30:15: worker thread finished; awaiting finish of 1 more threads
INFO - 21:30:15: worker thread finished; awaiting finish of 0 more threads
INFO - 21:30:15: EPOCH - 9 : training on 1099932 raw words (749475 effective words) took 6.4s, 117322 effective words/s


Loss after epoch 8: 0.0


INFO - 21:30:16: EPOCH 10 - PROGRESS: at 13.68% examples, 116357 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:17: EPOCH 10 - PROGRESS: at 30.72% examples, 119219 words/s, in_qsize 1, out_qsize 0
INFO - 21:30:18: EPOCH 10 - PROGRESS: at 47.15% examples, 121133 words/s, in_qsize 1, out_qsize 0
INFO - 21:30:19: EPOCH 10 - PROGRESS: at 64.46% examples, 121811 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:20: EPOCH 10 - PROGRESS: at 83.31% examples, 122310 words/s, in_qsize 0, out_qsize 0
INFO - 21:30:21: worker thread finished; awaiting finish of 2 more threads
INFO - 21:30:21: worker thread finished; awaiting finish of 1 more threads
INFO - 21:30:21: worker thread finished; awaiting finish of 0 more threads
INFO - 21:30:21: EPOCH - 10 : training on 1099932 raw words (749640 effective words) took 6.1s, 123889 effective words/s
INFO - 21:30:21: training on a 10999320 raw words (7492722 effective words) took 68.4s, 109599 effective words/s


Loss after epoch 9: 0.0
Time to train the model: 1.14 mins


In [108]:
w2v_model_novice.init_sims(replace = True)
w2v_model_exp.init_sims(replace = True)

w2v_model_novice.save("results/word2vec_novice_test.model")
w2v_model_exp.save("results/word2vec_exp_test.model")

INFO - 23:07:23: precomputing L2-norms of word weight vectors
INFO - 23:07:23: precomputing L2-norms of word weight vectors
INFO - 23:07:23: saving Word2Vec object under results/word2vec_novice.model, separately None
INFO - 23:07:23: not storing attribute vectors_norm
INFO - 23:07:23: not storing attribute cum_table
INFO - 23:07:23: saved results/word2vec_novice.model
INFO - 23:07:23: saving Word2Vec object under results/word2vec_exp.model, separately None
INFO - 23:07:23: not storing attribute vectors_norm
INFO - 23:07:23: not storing attribute cum_table
INFO - 23:07:23: saved results/word2vec_exp.model


In [109]:
#w2v_model.wv.most_similar(positive=["macron"])
#w2v_model.wv.most_similar(negative=["promesse"])
#w2v_model.wv.similarity("élection", 'présidentielle')
#w2v_model.wv.similarity("sport", 'études')
#print(w2v_model.wv.similarity("macron", 'droite'))
#print(w2v_model.wv.similarity("macron", 'gauche'))
#w2v_model.wv.doesnt_match(['gauche', 'président', 'droite'])
#w2v_model.wv.most_similar(positive=["père", "femme"], negative = ['homme'], topn=3)

In [127]:
w2v_model_novice.wv.most_similar(positive=["droite"])

[('gauche', 0.986516535282135),
 ('républicains', 0.9116498231887817),
 ('bancs', 0.9007534384727478),
 ('cet_hémicycle', 0.8813148736953735),
 ('hémicycle', 0.8689683675765991),
 ('france_insoumise', 0.8554179668426514),
 ('opposition', 0.8358474969863892),
 ('groupes', 0.802750825881958),
 ('voix', 0.800241231918335),
 ('groupe', 0.7991034984588623)]

In [128]:
w2v_model_exp.wv.most_similar(positive=["droite"])

[('droite_gauche', 0.6979137659072876),
 ('extrême_gauche', 0.6682906150817871),
 ('gauche', 0.6396193504333496),
 ('socialistes', 0.6157995462417603),
 ('majorité', 0.561996340751648),
 ('bancs', 0.5594807267189026),
 ('communistes', 0.5519882440567017),
 ('extrême_droite', 0.5514141321182251),
 ('rangs', 0.5431515574455261),
 ('oreille', 0.5347355008125305)]

In [129]:
df_export_novice = pd.DataFrame(w2v_model_novice.wv.vectors)

df_export_exp = pd.DataFrame(w2v_model_exp.wv.vectors)

In [130]:
words_novice = [w2v_model_novice.wv.most_similar(positive=[np.array(df_export_novice.iloc[i])])[0][0] for i in range(df_export_novice.shape[0])]

words_exp = [w2v_model_exp.wv.most_similar(positive=[np.array(df_export_exp.iloc[i])])[0][0] for i in range(df_export_exp.shape[0])]

In [131]:
df_export_novice['word'] = words_novice

df_export_exp['word'] = words_exp

In [132]:
df_export_novice.to_csv('results/embeddings_novice_test.csv')
df_export_exp.to_csv('results/embeddings_exp_test.csv')