In [90]:
# Module imports

import pandas as pd
import re  # For preprocessing
import pandas as pd  # For data handling (NOTE(review): duplicate of the pandas import above)
from time import time  # To time our operations
from collections import defaultdict  # For word frequency

import spacy  # For preprocessing

import logging  # Setting up the loggings to monitor gensim

# Emit INFO-level (and above) records formatted as "LEVEL - HH:MM:SS: message".
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

from gensim.models.phrases import Phrases, Phraser  # Bigram detection

import multiprocessing  # To count the available CPUs

from gensim.models import Word2Vec  # Word-embedding model

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
 
import seaborn as sns
# Use seaborn's "darkgrid" theme for all subsequent plots.
sns.set_style("darkgrid")

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Show up to 500 columns when a DataFrame is displayed

pd.set_option('display.max_columns', 500)

In [91]:
# Load the cleaned corpus: a comma-separated file decoded as Latin-1.
df0 = pd.read_csv(
    'data/data_cleaned_NLP.csv',
    sep=',',
    encoding='latin-1',
)

In [92]:
# Working copy restricted to the three columns used in this notebook.
df1 = df0.loc[:, ['Réplique', 'Groupe', 'tokenized_replique']].copy()

# Only the group label and the tokenised text are needed from here on.
df2 = df1.loc[:, ['Groupe', 'tokenized_replique']].copy()

# One independent sub-frame per value of 'Groupe'.
df_novice = df2.loc[df2['Groupe'] == 'Novice'].copy()
df_exp = df2.loc[df2['Groupe'] == 'Exp'].copy()

In [93]:
# Drop every row that contains a missing value, in both sub-corpora.
df_novice, df_exp = (frame.dropna() for frame in (df_novice, df_exp))

Groupe                0
tokenized_replique    0
dtype: int64

In [94]:
# Turn each whitespace-separated token string into a list of tokens.
sent_novice = list(map(lambda text: text.split(), df_novice['tokenized_replique']))
sent_exp = list(map(lambda text: text.split(), df_exp['tokenized_replique']))

In [95]:
# Collect bigram statistics separately for each sub-corpus (novice first, then exp).
# min_count: ignore all words and bigrams whose total collected count is lower than this value.
# progress_per: log a progress line every 10000 sentences.
phrases_novice, phrases_exp = (
    Phrases(corpus, min_count=30, progress_per=10000)
    for corpus in (sent_novice, sent_exp)
)

INFO - 22:52:25: collecting all words and their counts
INFO - 22:52:25: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 22:52:26: PROGRESS: at sentence #10000, processed 283650 words and 213089 word types
INFO - 22:52:26: PROGRESS: at sentence #20000, processed 544399 words and 367942 word types
INFO - 22:52:27: PROGRESS: at sentence #30000, processed 814087 words and 525857 word types
INFO - 22:52:27: PROGRESS: at sentence #40000, processed 1061844 words and 645502 word types
INFO - 22:52:28: PROGRESS: at sentence #50000, processed 1321510 words and 771106 word types
INFO - 22:52:28: collected 773338 word types from a corpus of 1326670 words (unigram + bigrams) and 50191 sentences
INFO - 22:52:28: using 773338 counts as vocab in Phrases<0 vocab, min_count=30, threshold=10.0, max_vocab_size=40000000>
INFO - 22:52:28: collecting all words and their counts
INFO - 22:52:28: PROGRESS: at sentence #0, processed 0 words and 0 word types
INFO - 22:52:29: PROGRESS: at sente

In [96]:
# Wrap each fitted Phrases model in a Phraser, novice first and exp second.
bigram_novice, bigram_exp = map(Phraser, (phrases_novice, phrases_exp))

INFO - 22:53:24: source_vocab length 773338
INFO - 22:53:33: Phraser built with 1032 phrasegrams
INFO - 22:53:33: source_vocab length 3813404
INFO - 22:54:21: Phraser built with 8628 phrasegrams


In [97]:
# Apply each group's Phraser to that group's own tokenised sentences.
# NOTE(review): presumably this returns a lazy wrapper that re-runs the phrase
# detection on every pass over the corpus (gensim 3.x) — confirm. If so,
# materialising it once with list(...) may speed up the later counting,
# vocabulary and training passes at the cost of memory.
sentences_novice = bigram_novice[sent_novice]

sentences_exp = bigram_exp[sent_exp]

In [98]:
# Occurrence count of every token (bigrams included) in each phrased corpus.
word_freq_novice = defaultdict(int)
word_freq_exp = defaultdict(int)

for counts, corpus in (
    (word_freq_novice, sentences_novice),
    (word_freq_exp, sentences_exp),
):
    for sentence in corpus:
        for token in sentence:
            counts[token] += 1


In [99]:
# Number of CPUs reported by the system; used below to size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [100]:
# Hyper-parameters shared by the two (still untrained) Word2Vec models. They are
# kept in one place so the novice and exp models cannot drift apart.
#
# `cores - 1` leaves one CPU free, but evaluates to 0 on a single-CPU machine.
# gensim starts exactly `workers` training threads, so 0 would leave nothing to
# process the corpus; the count is therefore clamped to at least 1.
#
# NOTE: `size` is the gensim 3.x name of the embedding dimension (gensim 4
# renamed it to `vector_size`).
w2v_params = dict(
    min_count=100,      # ignore words seen fewer than 100 times
    window=2,           # maximum distance between target and context word
    size=150,           # embedding dimension
    sample=6e-5,        # down-sampling threshold for frequent words
    alpha=0.03,         # initial learning rate
    min_alpha=0.0007,   # learning rate reached at the end of training
    negative=20,        # number of negative samples drawn
    workers=max(1, cores - 1),
)

w2v_model_novice = Word2Vec(**w2v_params)
w2v_model_exp = Word2Vec(**w2v_params)



In [101]:
# Build the novice model's vocabulary from its phrased corpus and report the wall time.
t = time()
w2v_model_novice.build_vocab(sentences_novice, progress_per=10000)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_minutes))

INFO - 22:55:48: collecting all words and their counts
INFO - 22:55:48: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:55:49: PROGRESS: at sentence #10000, processed 260406 words, keeping 21122 word types
INFO - 22:55:50: PROGRESS: at sentence #20000, processed 500315 words, keeping 27422 word types
INFO - 22:55:51: PROGRESS: at sentence #30000, processed 750890 words, keeping 34618 word types
INFO - 22:55:52: PROGRESS: at sentence #40000, processed 976296 words, keeping 37465 word types
INFO - 22:55:52: PROGRESS: at sentence #50000, processed 1215322 words, keeping 40663 word types
INFO - 22:55:52: collected 40709 word types from a corpus of 1220042 raw words and 50191 sentences
INFO - 22:55:52: Loading a fresh vocabulary
INFO - 22:55:52: effective_min_count=100 retains 2303 unique words (5% of original 40709, drops 38406)
INFO - 22:55:52: effective_min_count=100 leaves 845991 word corpus (69% of original 1220042, drops 374051)
INFO - 22:55:52: deleting th

Time to build vocab: 0.08 mins


In [102]:
# Build the exp model's vocabulary from its phrased corpus and report the wall time.
t = time()
w2v_model_exp.build_vocab(sentences_exp, progress_per=10000)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to build vocab: {} mins'.format(elapsed_minutes))

INFO - 22:56:20: collecting all words and their counts
INFO - 22:56:20: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 22:56:21: PROGRESS: at sentence #10000, processed 240786 words, keeping 26127 word types
INFO - 22:56:22: PROGRESS: at sentence #20000, processed 455421 words, keeping 34531 word types
INFO - 22:56:23: PROGRESS: at sentence #30000, processed 693431 words, keeping 40583 word types
INFO - 22:56:24: PROGRESS: at sentence #40000, processed 926895 words, keeping 44706 word types
INFO - 22:56:24: PROGRESS: at sentence #50000, processed 1125581 words, keeping 47583 word types
INFO - 22:56:25: PROGRESS: at sentence #60000, processed 1340810 words, keeping 50171 word types
INFO - 22:56:26: PROGRESS: at sentence #70000, processed 1539704 words, keeping 52278 word types
INFO - 22:56:27: PROGRESS: at sentence #80000, processed 1767658 words, keeping 54619 word types
INFO - 22:56:28: PROGRESS: at sentence #90000, processed 2001917 words, keeping 56829 word

Time to build vocab: 0.66 mins


In [103]:
# Train the novice model for 10 epochs over its corpus and report the wall time.
t = time()
w2v_model_novice.train(
    sentences_novice,
    total_examples=w2v_model_novice.corpus_count,
    epochs=10,
    report_delay=1,
)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

INFO - 22:58:32: training model with 3 workers on 2303 vocabulary and 150 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 22:58:33: EPOCH 1 - PROGRESS: at 20.96% examples, 83129 words/s, in_qsize 0, out_qsize 0
INFO - 22:58:34: EPOCH 1 - PROGRESS: at 40.52% examples, 79040 words/s, in_qsize 0, out_qsize 0
INFO - 22:58:35: EPOCH 1 - PROGRESS: at 53.22% examples, 67751 words/s, in_qsize 0, out_qsize 0
INFO - 22:58:36: EPOCH 1 - PROGRESS: at 74.19% examples, 70998 words/s, in_qsize 0, out_qsize 0
INFO - 22:58:37: EPOCH 1 - PROGRESS: at 97.94% examples, 74342 words/s, in_qsize 0, out_qsize 0
INFO - 22:58:37: worker thread finished; awaiting finish of 2 more threads
INFO - 22:58:37: worker thread finished; awaiting finish of 1 more threads
INFO - 22:58:37: worker thread finished; awaiting finish of 0 more threads
INFO - 22:58:37: EPOCH - 1 : training on 1220042 raw words (390992 effective words) took 5.2s, 74741 effective words/s
INFO - 22:58:38: EPOCH 2 - PROGRESS: at 19

Time to train the model: 0.86 mins


In [104]:
# Train the exp model for 10 epochs over its corpus and report the wall time.
t = time()
w2v_model_exp.train(
    sentences_exp,
    total_examples=w2v_model_exp.corpus_count,
    epochs=10,
    report_delay=1,
)
elapsed_minutes = round((time() - t) / 60, 2)
print('Time to train the model: {} mins'.format(elapsed_minutes))

INFO - 22:59:23: training model with 3 workers on 11233 vocabulary and 150 features, using sg=0 hs=0 sample=6e-05 negative=20 window=2
INFO - 22:59:24: EPOCH 1 - PROGRESS: at 1.90% examples, 116686 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:25: EPOCH 1 - PROGRESS: at 4.10% examples, 117039 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:27: EPOCH 1 - PROGRESS: at 5.74% examples, 109154 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:28: EPOCH 1 - PROGRESS: at 7.25% examples, 106135 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:29: EPOCH 1 - PROGRESS: at 9.04% examples, 105658 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:30: EPOCH 1 - PROGRESS: at 11.48% examples, 107960 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:31: EPOCH 1 - PROGRESS: at 13.54% examples, 108238 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:32: EPOCH 1 - PROGRESS: at 15.56% examples, 106952 words/s, in_qsize 0, out_qsize 0
INFO - 22:59:33: EPOCH 1 - PROGRESS: at 17.34% examples, 107147 words/s, in_qsize 0, out_qsize

INFO - 23:00:51: EPOCH 2 - PROGRESS: at 72.92% examples, 106403 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:52: EPOCH 2 - PROGRESS: at 74.29% examples, 105869 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:53: EPOCH 2 - PROGRESS: at 76.46% examples, 106243 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:54: EPOCH 2 - PROGRESS: at 79.51% examples, 107002 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:55: EPOCH 2 - PROGRESS: at 84.52% examples, 107271 words/s, in_qsize 1, out_qsize 0
INFO - 23:00:56: EPOCH 2 - PROGRESS: at 86.83% examples, 107331 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:57: EPOCH 2 - PROGRESS: at 88.94% examples, 107487 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:58: EPOCH 2 - PROGRESS: at 90.65% examples, 107541 words/s, in_qsize 0, out_qsize 0
INFO - 23:00:59: EPOCH 2 - PROGRESS: at 93.22% examples, 107673 words/s, in_qsize 0, out_qsize 0
INFO - 23:01:00: EPOCH 2 - PROGRESS: at 94.89% examples, 107114 words/s, in_qsize 0, out_qsize 0
INFO - 23:01:01: EPOCH 2 - PRO

INFO - 23:02:12: EPOCH 4 - PROGRESS: at 52.11% examples, 116072 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:13: EPOCH 4 - PROGRESS: at 54.23% examples, 115750 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:15: EPOCH 4 - PROGRESS: at 56.40% examples, 115549 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:16: EPOCH 4 - PROGRESS: at 58.61% examples, 115829 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:17: EPOCH 4 - PROGRESS: at 60.41% examples, 115335 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:18: EPOCH 4 - PROGRESS: at 62.29% examples, 114812 words/s, in_qsize 0, out_qsize 1
INFO - 23:02:19: EPOCH 4 - PROGRESS: at 64.37% examples, 114820 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:20: EPOCH 4 - PROGRESS: at 66.07% examples, 114588 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:21: EPOCH 4 - PROGRESS: at 68.08% examples, 115010 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:22: EPOCH 4 - PROGRESS: at 70.11% examples, 115169 words/s, in_qsize 0, out_qsize 0
INFO - 23:02:23: EPOCH 4 - PRO

INFO - 23:03:34: EPOCH 6 - PROGRESS: at 25.02% examples, 108611 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:35: EPOCH 6 - PROGRESS: at 27.16% examples, 110193 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:36: EPOCH 6 - PROGRESS: at 29.79% examples, 111477 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:37: EPOCH 6 - PROGRESS: at 32.97% examples, 112925 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:38: EPOCH 6 - PROGRESS: at 38.67% examples, 113446 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:39: EPOCH 6 - PROGRESS: at 40.82% examples, 114430 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:40: EPOCH 6 - PROGRESS: at 43.24% examples, 115387 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:41: EPOCH 6 - PROGRESS: at 45.07% examples, 115056 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:42: EPOCH 6 - PROGRESS: at 46.69% examples, 114055 words/s, in_qsize 0, out_qsize 1
INFO - 23:03:43: EPOCH 6 - PROGRESS: at 48.73% examples, 113900 words/s, in_qsize 0, out_qsize 0
INFO - 23:03:44: EPOCH 6 - PRO

INFO - 23:04:55: EPOCH 8 - PROGRESS: at 17.80% examples, 109374 words/s, in_qsize 0, out_qsize 0
INFO - 23:04:56: EPOCH 8 - PROGRESS: at 19.42% examples, 108026 words/s, in_qsize 0, out_qsize 0
INFO - 23:04:57: EPOCH 8 - PROGRESS: at 21.25% examples, 106929 words/s, in_qsize 0, out_qsize 0
INFO - 23:04:58: EPOCH 8 - PROGRESS: at 23.00% examples, 106766 words/s, in_qsize 0, out_qsize 0
INFO - 23:04:59: EPOCH 8 - PROGRESS: at 24.47% examples, 104988 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:00: EPOCH 8 - PROGRESS: at 25.89% examples, 104212 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:01: EPOCH 8 - PROGRESS: at 27.51% examples, 103999 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:02: EPOCH 8 - PROGRESS: at 29.62% examples, 103735 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:03: EPOCH 8 - PROGRESS: at 31.89% examples, 102435 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:04: EPOCH 8 - PROGRESS: at 33.45% examples, 101347 words/s, in_qsize 0, out_qsize 0
INFO - 23:05:05: EPOCH 8 - PRO

INFO - 23:06:23: EPOCH 9 - PROGRESS: at 80.04% examples, 110510 words/s, in_qsize 1, out_qsize 0
INFO - 23:06:25: EPOCH 9 - PROGRESS: at 85.40% examples, 110703 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:26: EPOCH 9 - PROGRESS: at 87.99% examples, 111165 words/s, in_qsize 1, out_qsize 0
INFO - 23:06:27: EPOCH 9 - PROGRESS: at 90.05% examples, 111531 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:28: EPOCH 9 - PROGRESS: at 91.96% examples, 111251 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:29: EPOCH 9 - PROGRESS: at 94.48% examples, 111170 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:30: EPOCH 9 - PROGRESS: at 96.94% examples, 111511 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:31: EPOCH 9 - PROGRESS: at 99.48% examples, 111828 words/s, in_qsize 0, out_qsize 0
INFO - 23:06:31: worker thread finished; awaiting finish of 2 more threads
INFO - 23:06:31: worker thread finished; awaiting finish of 1 more threads
INFO - 23:06:31: worker thread finished; awaiting finish of 0 more threads

Time to train the model: 7.9 mins


In [108]:
# Each trained model paired with the file it is saved to.
trained_models = (
    (w2v_model_novice, "results/word2vec_novice.model"),
    (w2v_model_exp, "results/word2vec_exp.model"),
)

# First pre-compute the L2-normalised vectors of both models ...
for model, _ in trained_models:
    model.init_sims(replace = True)

# ... then write both models to disk.
for model, model_path in trained_models:
    model.save(model_path)

INFO - 23:07:23: precomputing L2-norms of word weight vectors
INFO - 23:07:23: precomputing L2-norms of word weight vectors
INFO - 23:07:23: saving Word2Vec object under results/word2vec_novice.model, separately None
INFO - 23:07:23: not storing attribute vectors_norm
INFO - 23:07:23: not storing attribute cum_table
INFO - 23:07:23: saved results/word2vec_novice.model
INFO - 23:07:23: saving Word2Vec object under results/word2vec_exp.model, separately None
INFO - 23:07:23: not storing attribute vectors_norm
INFO - 23:07:23: not storing attribute cum_table
INFO - 23:07:23: saved results/word2vec_exp.model


In [109]:
# Disabled example queries (nearest neighbours, pairwise similarity, odd-one-out, analogy).
# NOTE(review): they reference `w2v_model`, which is not defined in the visible cells;
# substitute `w2v_model_novice` or `w2v_model_exp` before uncommenting any of them.
#w2v_model.wv.most_similar(positive=["macron"])
#w2v_model.wv.most_similar(negative=["promesse"])
#w2v_model.wv.similarity("élection", 'présidentielle')
#w2v_model.wv.similarity("sport", 'études')
#print(w2v_model.wv.similarity("macron", 'droite'))
#print(w2v_model.wv.similarity("macron", 'gauche'))
#w2v_model.wv.doesnt_match(['gauche', 'président', 'droite'])
#w2v_model.wv.most_similar(positive=["père", "femme"], negative = ['homme'], topn=3)

In [127]:
# Vocabulary entries most similar to "droite" in the novice model, as (word, similarity) pairs.
w2v_model_novice.wv.most_similar(positive=["droite"])

[('gauche', 0.986516535282135),
 ('républicains', 0.9116498231887817),
 ('bancs', 0.9007534384727478),
 ('cet_hémicycle', 0.8813148736953735),
 ('hémicycle', 0.8689683675765991),
 ('france_insoumise', 0.8554179668426514),
 ('opposition', 0.8358474969863892),
 ('groupes', 0.802750825881958),
 ('voix', 0.800241231918335),
 ('groupe', 0.7991034984588623)]

In [128]:
# Same query against the exp model, for comparison with the novice result.
w2v_model_exp.wv.most_similar(positive=["droite"])

[('droite_gauche', 0.6979137659072876),
 ('extrême_gauche', 0.6682906150817871),
 ('gauche', 0.6396193504333496),
 ('socialistes', 0.6157995462417603),
 ('majorité', 0.561996340751648),
 ('bancs', 0.5594807267189026),
 ('communistes', 0.5519882440567017),
 ('extrême_droite', 0.5514141321182251),
 ('rangs', 0.5431515574455261),
 ('oreille', 0.5347355008125305)]

In [129]:
# One DataFrame per model, built from that model's embedding matrix `wv.vectors`.
df_export_novice, df_export_exp = (
    pd.DataFrame(model.wv.vectors)
    for model in (w2v_model_novice, w2v_model_exp)
)

In [130]:
def _words_in_vector_order(keyed_vectors):
    """Return the vocabulary as a list whose i-th entry labels row i of ``keyed_vectors.vectors``.

    gensim keeps an index-aligned word list next to the embedding matrix, so the
    label of each row can be read directly. The previous per-row nearest-neighbour
    search was quadratic in the vocabulary size and could attach the wrong label
    to a row whenever two words had (near-)identical vectors.
    """
    # gensim >= 4 exposes the list as `index_to_key`; gensim 3.x names it `index2word`.
    vocab = getattr(keyed_vectors, "index_to_key", None)
    if vocab is None:
        vocab = keyed_vectors.index2word
    return list(vocab)


words_novice = _words_in_vector_order(w2v_model_novice.wv)

words_exp = _words_in_vector_order(w2v_model_exp.wv)

# One label per exported embedding row, otherwise the 'word' column cannot be attached.
assert len(words_novice) == df_export_novice.shape[0], "novice vocabulary/embedding size mismatch"
assert len(words_exp) == df_export_exp.shape[0], "exp vocabulary/embedding size mismatch"

In [131]:
# Attach the word labels to each export frame as a 'word' column.
for frame, labels in (
    (df_export_novice, words_novice),
    (df_export_exp, words_exp),
):
    frame['word'] = labels

In [132]:
# Write each labelled embedding table to its own CSV file.
for frame, csv_path in (
    (df_export_novice, 'results/embeddings_novice_test.csv'),
    (df_export_exp, 'results/embeddings_exp_test.csv'),
):
    frame.to_csv(csv_path)