In [None]:
# Importation des modules

import pandas as pd
from gensim.models import Word2Vec
import random
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.decomposition import PCA, KernelPCA
from sklearn.cluster import DBSCAN
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial import procrustes


#Affichage de toutes les colonnes

pd.set_option('display.max_columns', 500)

In [None]:
# Load the three saved gensim Word2Vec models. Judging by the file names they
# are the novice, expert and combined models — TODO confirm what each corpus holds.
w2v_model_novice = Word2Vec.load("results/word2vec_novice_300.model")

w2v_model_exp = Word2Vec.load("results/word2vec_exp_300.model")

w2v_model_all = Word2Vec.load("results/word2vec_all_300.model")

In [None]:
# Vocabulary of each model as a plain list of tokens (iterating the vocab
# mapping yields its keys, in the same order as `.keys()`).
# NOTE(review): `wv.vocab` is the gensim 3.x attribute; gensim 4 exposes
# `wv.key_to_index` instead — confirm the pinned gensim version.
words_novice, words_exp, words_all = (
    list(model.wv.vocab)
    for model in (w2v_model_novice, w2v_model_exp, w2v_model_all)
)
# To work on a random subset instead:
#words_all = random.sample(list(w2v_model_all.wv.vocab.keys()), 1000)

In [None]:
#v = w2v_model_novice.wv.get_vector('ministre') + w2v_model_novice.wv.get_vector('santé')

#w2v_model_all.similar_by_vector(v, topn=5)

In [None]:
# Build the embedding matrix: one row per token of `words_all` (the token is
# the row label), one column per vector component.
df_vectors = pd.DataFrame(
    {word: list(w2v_model_all.wv.get_vector(word)) for word in words_all}
).T

In [None]:
# Fit a 3-component PCA on the full embedding matrix (one row per token).
# The bare `fit` call is the cell's last expression, so the estimator is echoed.
pca = PCA(n_components=3)
pca.fit(df_vectors)


In [None]:
# Project every token onto the 3 principal components. The token ends up in an
# 'index' column (the default name given by reset_index), which the plotting
# cells use as hover label.
transformed_df = (
    pd.DataFrame(pca.transform(df_vectors), index=words_all)
    .rename(columns={0: 'var0', 1: 'var1', 2: 'var2'})
    .reset_index()
)

# Corpus of origin, read from the explicit '_Novice' suffix rather than from
# the last character alone (any token that merely ends in 'e' used to be
# labelled 'Novice'). Everything else is labelled 'Exp', as before.
transformed_df['provenance'] = np.where(
    transformed_df['index'].str.endswith('_Novice'), 'Novice', 'Exp'
)

In [None]:
# The 20 nearest neighbours of "macron" in each sub-vocabulary (Exp first,
# then Novice). Only the tokens are kept, not the similarity scores.
target = [
    word
    for anchor in ("macron_Exp", "macron_Novice")
    for word, _score in w2v_model_all.wv.most_similar(positive=[anchor], topn=20)
]


In [None]:
# Work on a copy: a later cell relabels `print_df['provenance']` for plotting,
# and with a plain alias that relabelling silently rewrote `transformed_df`.
print_df = transformed_df.copy()

# 3-D view of the three principal components, coloured by corpus of origin.
fig = px.scatter_3d(
    print_df,
    x='var0', y='var1', z='var2',
    color='provenance',
    hover_data=['index'],
    opacity=.05,
)
fig.show()

In [None]:
# 2-D view of the first two principal components, coloured by corpus of origin.
fig = px.scatter(
    print_df,
    x='var0',
    y='var1',
    color='provenance',
    hover_data=['index'],
    opacity=.5,
)
fig.show()

In [None]:
# Sanity check: is this token one of the row labels of the embedding matrix?
'hamon_Exp' in df_vectors.index

### Création d'axes

In [None]:
# Left/right vocabulary to highlight on the PCA scatter plot.
key_words = ['droite_Exp', 'gauche_Exp', 'extrême_Exp_droite_Exp', 'extrême_Exp_gauche_Exp',
            'droite_Novice', 'gauche_Novice', 'extrême_Novice_droite_Novice',
             'extrême_Novice_gauche_Novice',
            ]

# Candidate names; not referenced by the highlighting below.
key_words0 = ['macron_Exp', 'mélenchon_Exp', 'pen_Exp', 'fillon_Exp', 'hamon_Exp',
        'macron_Novice', 'mélenchon_Novice', 'pen_Novice', 'fillon_Novice', 'hamon_Novice',
            ]

# Relabel the highlighted tokens in one vectorised pass. The token is looked up
# through its column name ('index') instead of "first column of the row", and
# the row-by-row apply is gone.
print_df['provenance'] = print_df['provenance'].mask(
    print_df['index'].isin(key_words), 'highlight'
)

In [None]:
# Same 2-D projection, fully opaque so that the 'highlight' points stand out.
fig = px.scatter(
    print_df,
    x='var0',
    y='var1',
    color='provenance',
    hover_data=['index'],
    opacity=1,
)
fig.show()

On remarque un "motif politique" similaire, à une homothétie et une rotation près.  
Peut-être que les mots sélectionnés sont alignés dans l'espace de dimension 300. On vérifie cela en calculant les produits scalaires.

In [None]:
from numpy import linalg as LA

def alignement_mots(mot1, mot2):
    """Return ``(mot1, mot2, cosine)`` for two tokens of ``w2v_model_all``.

    Each embedding is scaled to unit length before the dot product is taken,
    so the third element is the cosine of the angle between the two vectors.
    """
    unit_vectors = []
    for mot in (mot1, mot2):
        vecteur = w2v_model_all.wv.get_vector(mot)
        unit_vectors.append(vecteur / LA.norm(vecteur))

    return (mot1, mot2, unit_vectors[0] @ unit_vectors[1])

In [None]:
# Print the cosine between pairs of political tokens: first within the Exp
# vocabulary, then the same pairs within the Novice vocabulary. The '\n'
# arguments only break the output into lines.
print(alignement_mots('droite_Exp', 'gauche_Exp'), alignement_mots('centre_Exp', 'gauche_Exp'), '\n',
      alignement_mots('droite_Exp', 'centre_Exp'), '\n',
      alignement_mots('droite_Exp', 'extrême_Exp_droite_Exp'), '\n',
      alignement_mots('gauche_Exp', 'extrême_Exp_gauche_Exp'), '\n','\n',
      
      
      alignement_mots('droite_Novice', 'gauche_Novice'), alignement_mots('centre_Novice', 'gauche_Novice'), '\n',
      alignement_mots('droite_Novice', 'centre_Novice'),'\n',
      alignement_mots('droite_Novice', 'extrême_Novice_droite_Novice'), '\n',
      alignement_mots('gauche_Novice', 'extrême_Novice_gauche_Novice'), '\n',
)


Cela n'a pas l'air d'être le cas.

Deux pistes :  
- faire correspondre les deux espaces (« problème de Procuste »)
- déformer les espaces pour aligner les mots clefs politiques

#### Application du problème de Procuste

In [None]:
# Split the embedding matrix by corpus. Tokens are told apart by their last
# character only: 'p' (as in '_Exp') versus 'e' (as in '_Novice').
df_vectors_Exp = df_vectors.loc[df_vectors.index.str.endswith('p')].copy()
df_vectors_Novice = df_vectors.loc[df_vectors.index.str.endswith('e')].copy()

In [None]:
# Pad the Novice matrix with all-zero rows until it has as many rows as the Exp
# matrix (scipy's procrustes needs two arrays of identical shape).
# The padding size is derived from the frames instead of a hard-coded count,
# which also makes the cell safe to re-run (a second run adds nothing), and
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
n_padding_rows = max(df_vectors_Exp.shape[0] - df_vectors_Novice.shape[0], 0)
if n_padding_rows:
    zero_rows = pd.DataFrame(
        np.zeros((n_padding_rows, df_vectors_Novice.shape[1])),
        columns=df_vectors_Novice.columns,
    )
    df_vectors_Novice = pd.concat([df_vectors_Novice, zero_rows])

In [None]:
# Procrustes superimposition of the two full matrices: returns a standardised
# version of the first input, the second input fitted onto it, and the
# residual disparity.
# NOTE(review): procrustes pairs row i of the first matrix with row i of the
# second. Nothing above orders the two frames so that paired rows hold the
# same word, and the Novice frame ends with zero padding — confirm this is
# the comparison that was intended.
mtx1, mtx2, disparity = procrustes(df_vectors_Exp, df_vectors_Novice)

In [None]:
# Wrap the Procrustes outputs back into DataFrames carrying the row labels of
# the frames they were computed from.
df_vectors_Exp_procrustes, df_vectors_Novice_procrustes = (
    pd.DataFrame(matrix, index=source.index)
    for matrix, source in ((mtx1, df_vectors_Exp), (mtx2, df_vectors_Novice))
)

In [None]:
#c_norm_Exp = max([u@u for _,u in df_vectors_Exp_procrustes.iterrows()])
#c_norm_Novice = max([u@u for _,u in df_vectors_Novice_procrustes.iterrows()])

#df_vectors_Exp_procrustes = df_vectors_Exp_procrustes.div(c_norm_Exp)
#df_vectors_Novice_procrustes = df_vectors_Novice_procrustes.div(c_norm_Novice)

In [None]:
# Stack the aligned matrices, Novice rows first and Exp rows after (the
# 'provenance' labels built further down rely on this order).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df_vectors_procrustes = pd.concat(
    [df_vectors_Novice_procrustes, df_vectors_Exp_procrustes]
)

In [None]:
def closest_words(df, mot, k):
    """Return the ``k`` rows of ``df`` with the largest dot product with row ``mot``.

    Parameters
    ----------
    df : pandas.DataFrame
        One vector per row, indexed by label.
    mot : label
        Index label of the reference row.
    k : int
        Number of neighbours to return. ``k <= 0`` yields an empty list
        (the previous slice ``L[-k:]`` returned every row for ``k == 0``).

    Returns
    -------
    list of (label, score)
        Best matches first. The score is the raw dot product (nothing is
        normalised here); the reference row itself is among the candidates.

    Raises
    ------
    KeyError
        If ``mot`` is not a label of ``df``.
    """
    if k <= 0:
        return []

    v = df.loc[mot]
    # One matrix-vector product instead of a Python-level loop over iterrows(),
    # then a top-k selection instead of sorting every score.
    scores = df.dot(v)
    return list(scores.nlargest(k).items())

In [None]:
a = closest_words(df_vectors_procrustes, 'droite_Exp', 10)

In [None]:
# Refit a 3-component PCA, this time on the stacked Procrustes-aligned vectors.
# This rebinds `pca`, replacing the estimator fitted on the raw embeddings.
pca = PCA(n_components = 3)
pca.fit(df_vectors_procrustes)

In [None]:
# Coordinates of every aligned vector on the 3 principal components, with the
# row label stored in a 'word' column.
transformed_df_procustes = pd.DataFrame(
    pca.transform(df_vectors_procrustes),
    columns=['var0', 'var1', 'var2'],
)
transformed_df_procustes['word'] = df_vectors_procrustes.index

In [None]:
# Rows were stacked Novice first, then Exp, so the labels follow the same layout.
transformed_df_procustes['provenance'] = (
    ['Novice'] * df_vectors_Novice_procrustes.shape[0]
    + ['Exp'] * df_vectors_Exp_procrustes.shape[0]
)

In [None]:
# Copy before relabelling so that `transformed_df_procustes` keeps its original
# Novice/Exp labels (a plain alias made the relabelling below rewrite it too).
print_df = transformed_df_procustes.copy()

key_words = ['droite_Exp', 'gauche_Exp', 'centre_Exp', 'extrême_Exp_droite_Exp', 'extrême_Exp_gauche_Exp',
            'droite_Novice', 'gauche_Novice', 'centre_Novice', 'extrême_Novice_droite_Novice',
             'extrême_Novice_gauche_Novice',
            ]

# Vectorised relabelling of the key words (no row-by-row apply).
print_df['provenance'] = print_df['provenance'].mask(
    print_df['word'].isin(key_words), 'highlight'
)

In [None]:
# The frame is shuffled so that no group is systematically drawn on top of the
# other. Hover labels must therefore come from the frame's own 'word' column:
# the unshuffled list of labels that used to be passed here no longer matched
# the shuffled rows, so points were shown with the wrong word.
fig = px.scatter(
    print_df.sample(frac=1),
    x='var0',
    y='var1',
    color='provenance',
    hover_data=['word'],
    opacity=.5,
)
fig.show()

In [None]:
print_df

### Restriction à l'intersection des vocabulaires

In [None]:
# Tokens of the combined model, split by the corpus tag they contain.
vocab_Novice = list(filter(lambda token: '_Novice' in token, w2v_model_all.wv.vocab))
vocab_Exp = list(filter(lambda token: '_Exp' in token, w2v_model_all.wv.vocab))

In [None]:
'donne_Novice_occasion_Novice'.replace('_Exp', '_Novice')

In [None]:
# Exp tokens whose Novice counterpart exists. Membership is tested against a
# set: scanning the `vocab_Novice` list once per token was quadratic in the
# vocabulary size.
novice_lookup = set(vocab_Novice)
vocab_restriction_Exp = [
    i for i in vocab_Exp if i.replace('_Exp', '_Novice') in novice_lookup
]

In [None]:
# Exp vectors of the shared vocabulary, in the row order of `df_vectors_Exp`.
restriction_Exp = df_vectors_Exp.loc[df_vectors_Exp.index.isin(vocab_restriction_Exp)]

In [None]:
# Novice counterparts of the rows of `restriction_Exp`, in the same order as
# those rows. A set makes each membership test constant-time instead of a scan
# of the whole `vocab_Novice` list.
novice_lookup = set(vocab_Novice)
vocab_restriction_Novice = [
    novice_word
    for novice_word in (i.replace('_Exp', '_Novice') for i in restriction_Exp.index)
    if novice_word in novice_lookup
]

In [None]:
# Select the Novice rows by label, in the order of `vocab_restriction_Novice`
# (which follows the rows of `restriction_Exp`). Procrustes pairs row i of one
# matrix with row i of the other; the previous `isin` mask kept the rows in
# `df_vectors_Novice`'s own order, so paired rows did not hold the same word.
# A label missing from `df_vectors_Novice` now raises KeyError instead of being
# dropped silently.
restriction_Novice = df_vectors_Novice.loc[vocab_restriction_Novice]

In [None]:
restriction_Novice.shape

In [None]:
restriction_Exp.shape

In [None]:
# Rescale each matrix so that its longest row has Euclidean norm 1.
restriction_Novice = (
    restriction_Novice / np.sqrt(np.square(restriction_Novice).sum(axis=1)).max()
)
restriction_Exp = (
    restriction_Exp / np.sqrt(np.square(restriction_Exp).sum(axis=1)).max()
)

In [None]:
# Procrustes on the shared vocabulary only. Note that this rebinds `disparity`,
# overwriting the value obtained on the full matrices.
# NOTE(review): meaningful only if row i of both frames is the same word.
mtx1_restriction, mtx2_restriction, disparity = procrustes(restriction_Exp, restriction_Novice)

In [None]:
# Put the row labels back on the two Procrustes outputs.
df_vectors_Exp_restriction_procrustes, df_vectors_Novice_restriction_procrustes = (
    pd.DataFrame(matrix, index=source.index)
    for matrix, source in (
        (mtx1_restriction, restriction_Exp),
        (mtx2_restriction, restriction_Novice),
    )
)

In [None]:
# Rescale each aligned matrix so that its longest row has Euclidean norm 1.
df_vectors_Exp_restriction_procrustes = (
    df_vectors_Exp_restriction_procrustes
    / np.sqrt(np.square(df_vectors_Exp_restriction_procrustes).sum(axis=1)).max()
)

df_vectors_Novice_restriction_procrustes = (
    df_vectors_Novice_restriction_procrustes
    / np.sqrt(np.square(df_vectors_Novice_restriction_procrustes).sum(axis=1)).max()
)




In [None]:
# Stack Novice rows first, then Exp rows (the 'provenance' labels built further
# down rely on this order). pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
df_vectors_restriction_procrustes = pd.concat(
    [df_vectors_Novice_restriction_procrustes, df_vectors_Exp_restriction_procrustes]
)

In [None]:
# 3-component PCA fitted on the stacked, aligned vectors of the shared vocabulary.
pca_restriction = PCA(n_components = 3)
pca_restriction.fit(df_vectors_restriction_procrustes)

In [None]:
# PCA coordinates of the shared-vocabulary vectors, with the row label stored
# in a 'word' column.
transformed_df_procustes_restriction = pd.DataFrame(
    pca_restriction.transform(df_vectors_restriction_procrustes),
    columns=['var0', 'var1', 'var2'],
)
transformed_df_procustes_restriction['word'] = df_vectors_restriction_procrustes.index

In [None]:
# Rows were stacked Novice first, then Exp, so the labels follow the same layout.
transformed_df_procustes_restriction['provenance'] = (
    ['Novice'] * df_vectors_Novice_restriction_procrustes.shape[0]
    + ['Exp'] * df_vectors_Exp_restriction_procrustes.shape[0]
)

In [None]:
# Plotting copy in which the political key words get their own colour group.
print_df_restriction = transformed_df_procustes_restriction.copy()

key_words = [
    'droite_Exp', 'gauche_Exp', 'centre_Exp',
    'extrême_Exp_droite_Exp', 'extrême_Exp_gauche_Exp',
    'droite_Novice', 'gauche_Novice', 'centre_Novice',
    'extrême_Novice_droite_Novice', 'extrême_Novice_gauche_Novice',
]

is_key_word = print_df_restriction['word'].isin(key_words)
print_df_restriction['provenance'] = print_df_restriction['provenance'].mask(
    is_key_word, 'key word'
)

In [None]:
# First two principal components of the shared vocabulary; the word is passed
# as an extra hover field.
fig = px.scatter(
    print_df_restriction,
    x='var0',
    y='var1',
    color='provenance',
    hover_data=[print_df_restriction['word'].tolist()],
    opacity=.5,
)
fig.show()

In [None]:
df_vectors_restriction_procrustes.sample(10)

In [None]:
closest_words(df_vectors_restriction_procrustes, 'droite_Exp', 10)

Ça ne marche pas...

On va essayer de se restreindre à un ensemble encore plus petit de mots, le but étant que les mots clefs se superposent...

### Restriction aux mots les plus cités

In [None]:
# Tokens of the combined model and, in the same order, their corpus counts.
L = list(w2v_model_all.wv.vocab)
count = [entry.count for entry in w2v_model_all.wv.vocab.values()]

In [None]:
# (count, token) pairs, most frequent first.
word_count = sorted(zip(count, L), reverse=True)

In [None]:
word_count[:10]

In [None]:
# Political vocabulary compared across the two corpora. The Exp list is derived
# from the Novice one, so both lists hold the same words in the same order.
mots_politiques_Novice = [
    'gauche_Novice',
    'droite_Novice',
    'extrême_Novice_gauche_Novice',
    'extrême_Novice_droite_Novice',
    'populiste_Novice',
    'extrême_Novice',
    'socialiste_Novice',
    'communiste_Novice',
    'libéral_Novice',
]

mots_politiques_Exp = [mot.replace('_Novice', '_Exp') for mot in mots_politiques_Novice]

In [None]:
#vocab_restriction_Novice0 = [i for i in vocab_Novice if i.replace('_Novice', '_Exp') in list(zip(*word_count[300:310]))[1]]

In [None]:
vocab_restriction_Novice0 = mots_politiques_Novice

In [None]:
#vocab_restriction_Exp0 = [i.replace('_Novice', '_Exp') for i in vocab_restriction_Novice0 if i.replace('_Novice', '_Exp') in vocab_Exp]

In [None]:
vocab_restriction_Exp0 = mots_politiques_Exp

In [None]:
restriction_Novice0 = df_vectors_Novice[df_vectors_Novice.index.isin(vocab_restriction_Novice0)]

In [None]:
restriction_Novice0

In [None]:
restriction_Exp0 = df_vectors_Exp[df_vectors_Exp.index.isin(vocab_restriction_Exp0)]

In [None]:
restriction_Exp0

In [None]:
# Rescale each matrix so that its longest row has Euclidean norm 1.
restriction_Novice0 = (
    restriction_Novice0 / np.sqrt(np.square(restriction_Novice0).sum(axis=1)).max()
)
restriction_Exp0 = (
    restriction_Exp0 / np.sqrt(np.square(restriction_Exp0).sum(axis=1)).max()
)

In [None]:
# Procrustes restricted to the political vocabulary.
# NOTE(review): meaningful only if row i of both frames is the same word.
mtx1_restriction0, mtx2_restriction0, disparity0 = procrustes(restriction_Exp0, restriction_Novice0)

In [None]:
# Put the row labels back on the two Procrustes outputs.
df_vectors_Exp_restriction_procrustes0, df_vectors_Novice_restriction_procrustes0 = (
    pd.DataFrame(matrix, index=source.index)
    for matrix, source in (
        (mtx1_restriction0, restriction_Exp0),
        (mtx2_restriction0, restriction_Novice0),
    )
)

In [None]:
# Rescale each aligned matrix so that its longest row has Euclidean norm 1.
df_vectors_Exp_restriction_procrustes0 = (
    df_vectors_Exp_restriction_procrustes0
    / np.sqrt(np.square(df_vectors_Exp_restriction_procrustes0).sum(axis=1)).max()
)

df_vectors_Novice_restriction_procrustes0 = (
    df_vectors_Novice_restriction_procrustes0
    / np.sqrt(np.square(df_vectors_Novice_restriction_procrustes0).sum(axis=1)).max()
)




In [None]:
# Stack Novice rows first, then Exp rows (the 'provenance' labels built further
# down rely on this order). pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
df_vectors_restriction_procrustes0 = pd.concat(
    [df_vectors_Novice_restriction_procrustes0, df_vectors_Exp_restriction_procrustes0]
)

In [None]:
# 3-component PCA fitted on the stacked, aligned political-vocabulary vectors.
pca_restriction0 = PCA(n_components = 3)
pca_restriction0.fit(df_vectors_restriction_procrustes0)

In [None]:
# PCA coordinates of the political-vocabulary vectors.
transformed_df_procustes_restriction0 = pd.DataFrame(
    pca_restriction0.transform(df_vectors_restriction_procrustes0),
    columns=['var0', 'var1', 'var2'],
)

In [None]:
# Rescale the PCA coordinates so that the farthest point lies at distance 1
# from the origin. Only the coordinate columns are touched, so the cell still
# runs when it is re-executed after the 'word' / 'provenance' columns have been
# added (squaring those string columns used to raise).
colonnes_coordonnees = ['var0', 'var1', 'var2']
norme_max = np.sqrt(
    np.square(transformed_df_procustes_restriction0[colonnes_coordonnees]).sum(axis=1)
).max()
transformed_df_procustes_restriction0[colonnes_coordonnees] = (
    transformed_df_procustes_restriction0[colonnes_coordonnees] / norme_max
)

In [None]:
transformed_df_procustes_restriction0['word'] = df_vectors_restriction_procrustes0.index

In [None]:
# Rows were stacked Novice first, then Exp, so the labels follow the same layout.
transformed_df_procustes_restriction0['provenance'] = (
    ['Novice'] * df_vectors_Novice_restriction_procrustes0.shape[0]
    + ['Exp'] * df_vectors_Exp_restriction_procrustes0.shape[0]
)

In [None]:
# First two principal components of the political vocabulary; the word is
# passed as an extra hover field.
fig = px.scatter(
    transformed_df_procustes_restriction0,
    x='var0',
    y='var1',
    color='provenance',
    hover_data=[transformed_df_procustes_restriction0['word'].tolist()],
    opacity=.5,
)
fig.show()

## Clustering

Paramétrisation de eps

In [None]:
from sklearn.neighbors import NearestNeighbors

# k-distance curve used to pick DBSCAN's eps: for each of the first 10000 rows
# of the Novice matrix, take the distance to the last of its 11 neighbours
# (column 10), then sort those distances in increasing order.
nearest_neighbors = NearestNeighbors(n_neighbors=11)
neighbors = nearest_neighbors.fit(df_vectors_Novice.head(10000))
distances, indices = neighbors.kneighbors(df_vectors_Novice.head(10000))
distances = np.sort(distances[:, -1])

fig = plt.figure(figsize=(5, 5))
plt.plot(distances)
plt.ylabel("Distance")
plt.xlabel("Points")
plt.savefig("Distance_curve.png", dpi=300)

In [None]:
from kneed import KneeLocator

# Locate the knee of the sorted k-distance curve and print the distance at
# that point (the candidate value for eps).
i = np.arange(distances.shape[0])
knee = KneeLocator(
    i,
    distances,
    S=1,
    curve='concave',
    direction='increasing',
    interp_method='polynomial',
)
fig = plt.figure(figsize=(5, 5))
knee.plot_knee()
plt.ylabel("Distance")
plt.xlabel("Points")

print(distances[knee.knee])

In [None]:
#L = [DBSCAN(eps = 1, min_samples=2).fit(df_vectors_Novice) for i in range(1)]

In [None]:
#for i in range(len(L)):
 #   print(i, max(L[i].labels_[:]))
    
#df_test = df_vectors_Novice
#df_test['cluster'] = L[0].labels_
#df_test[['cluster']].to_csv('results/cluster_Novice.csv')

In [None]:
# Reload cluster labels saved earlier as CSV. The commented-out cell above shows
# the Novice file being written from DBSCAN labels (a 'cluster' column next to
# the row labels); the Exp file is presumably produced the same way — TODO confirm.
cluster_Novice = pd.read_csv('results/cluster_Novice.csv')
cluster_Exp = pd.read_csv('results/cluster_Exp.csv')

In [None]:
# Attach the cluster labels as integers. The assignment is positional: the CSV
# rows are assumed to be in the same order as the frame's rows.
df_vectors_Novice['cluster'] = cluster_Novice['cluster'].astype(int).tolist()
df_vectors_Exp['cluster'] = cluster_Exp['cluster'].astype(int).tolist()

In [None]:
# Sizes of the Novice clusters holding between 4 and 299 points, in label order.
# A single value_counts() pass replaces one full-frame filter per label. It
# also covers label 0 and the highest label, which `range(1, max)` skipped;
# negative labels (DBSCAN marks noise with -1) stay excluded.
tailles_Novice = cluster_Novice['cluster'].value_counts().sort_index()
h_Novice = [
    int(taille)
    for label, taille in tailles_Novice.items()
    if label >= 0 and 3 < taille < 300
]
px.histogram(h_Novice)

In [None]:
# Sizes of the Exp clusters holding between 4 and 299 points, in label order.
# A single value_counts() pass replaces one full-frame filter per label. It
# also covers the highest label, which `range(max)` skipped; negative labels
# (DBSCAN marks noise with -1) stay excluded.
tailles_Exp = cluster_Exp['cluster'].value_counts().sort_index()
h_Exp = [
    int(taille)
    for label, taille in tailles_Exp.items()
    if label >= 0 and 3 < taille < 300
]
px.histogram(h_Exp)

On se penche sur les clusters au cardinal intéressant

In [None]:
# Show every Novice cluster holding between 6 and 299 rows. Grouping on the
# label visits each label that is present exactly once, including the highest
# one, which `range(-1, max)` never reached.
for i, clust in df_vectors_Novice.groupby('cluster'):
    if 5 < clust.shape[0] < 300:
        print(i)
        display(clust)

In [None]:
# Show every Exp cluster holding between 6 and 299 rows. Grouping on the label
# visits each label that is present exactly once, including the highest one,
# which `range(-1, max)` never reached.
for i, clust in df_vectors_Exp.groupby('cluster'):
    if 5 < clust.shape[0] < 300:
        print(i)
        display(clust)

In [None]:
# Rows of the Novice matrix assigned to cluster 19.
df_restriction_cluster_Novice = df_vectors_Novice.loc[df_vectors_Novice['cluster'].eq(19)]

In [None]:
# Words of the cluster that exist in both vocabularies: map each Novice word to
# its Exp spelling, keep those found in the Exp matrix, then map the survivors
# back to their Novice spelling.
mots_Novice = df_restriction_cluster_Novice.index.tolist()
mots_Exp = [
    mot_exp
    for mot_exp in (mot.replace('_Novice', '_Exp') for mot in mots_Novice)
    if mot_exp in df_vectors_Exp.index
]
mots_Novice = [
    mot_novice
    for mot_novice in (mot.replace('_Exp', '_Novice') for mot in mots_Exp)
    if mot_novice in df_vectors_Novice.index
]

In [None]:
# Select the rows by label, in the order of the paired word lists, so that row
# i of both frames is the same word. The previous `isin` masks kept each
# frame's own row order, which breaks the row pairing Procrustes relies on.
df_restriction_cluster_Exp = df_vectors_Exp.loc[mots_Exp].drop(columns = ['cluster'])
df_restriction_cluster_Novice = df_vectors_Novice.loc[mots_Novice].drop(columns = ['cluster'])

In [None]:
assert df_restriction_cluster_Exp.shape[0] == df_restriction_cluster_Novice.shape[0]

In [None]:
def procruste_analyse(mat_Exp, mat_Novice):
    """Align two embedding matrices with Procrustes and plot them in a shared PCA plane.

    Parameters
    ----------
    mat_Exp, mat_Novice : pandas.DataFrame
        Matrices of identical shape, one vector per row. Procrustes pairs row
        ``i`` of one matrix with row ``i`` of the other, so the caller must
        pass the rows in matching order.

    Returns
    -------
    None
        The disparity is printed and the scatter plot is shown.
    """
    mtx_Exp, mtx_Novice, disparity = procrustes(mat_Exp, mat_Novice)
    print('disparité : ', disparity)

    df_vectors_Exp_procruste = pd.DataFrame(mtx_Exp, index = mat_Exp.index)
    df_vectors_Novice_procrustes = pd.DataFrame(mtx_Novice, index = mat_Novice.index)

    # Exp rows first, then Novice rows; the 'provenance' labels below follow
    # that order. pd.concat replaces DataFrame.append (removed in pandas 2.0).
    df_vectors_procruste = pd.concat(
        [df_vectors_Exp_procruste, df_vectors_Novice_procrustes]
    )

    pca = PCA(n_components = 3)
    pca.fit(df_vectors_procruste)

    transformed_df_procuste = pd.DataFrame(
        pca.transform(df_vectors_procruste),
        columns = ['var0', 'var1', 'var2'],
    )
    transformed_df_procuste['word'] = df_vectors_procruste.index
    transformed_df_procuste['provenance'] = (
        ['Exp'] * df_vectors_Exp_procruste.shape[0]
        + ['Novice'] * df_vectors_Novice_procrustes.shape[0]
    )

    # Hover on the frame's own 'word' column, which keeps the label tied to its
    # row and shows it under a readable name.
    fig = px.scatter(transformed_df_procuste, x='var0', y='var1', opacity = .5,
                    hover_data = ['word'],
                    color = 'provenance')
    fig.show()
    

In [None]:
def procuste_cluster_novice(n_cluster):
    """Run ``procruste_analyse`` on the words of one Novice cluster.

    The words of Novice cluster ``n_cluster`` that also exist in the Exp matrix
    are collected as (Novice, Exp) pairs, and both matrices are then selected
    by label in pair order. Row ``i`` of each matrix is therefore the same
    word, which Procrustes requires; the previous ``isin`` masks kept each
    frame's own row order and lost that pairing.

    Parameters
    ----------
    n_cluster : int
        Label in the 'cluster' column of ``df_vectors_Novice``.
    """
    mots_cluster = df_vectors_Novice.index[df_vectors_Novice['cluster'] == n_cluster]

    paires = []
    for mot_novice in mots_cluster:
        # Skip non-string labels (e.g. the integer labels of zero-padding rows).
        if not isinstance(mot_novice, str):
            continue
        mot_exp = mot_novice.replace('_Novice', '_Exp')
        if mot_exp in df_vectors_Exp.index:
            paires.append((mot_novice, mot_exp))

    mots_Novice = [mot_novice for mot_novice, _ in paires]
    mots_Exp = [mot_exp for _, mot_exp in paires]

    df_restriction_cluster_Exp = df_vectors_Exp.loc[mots_Exp].drop(columns = ['cluster'])
    df_restriction_cluster_Novice = df_vectors_Novice.loc[mots_Novice].drop(columns = ['cluster'])

    assert df_restriction_cluster_Exp.shape[0] == df_restriction_cluster_Novice.shape[0]

    procruste_analyse(df_restriction_cluster_Exp, df_restriction_cluster_Novice)

In [None]:
def procuste_cluster_exp(n_cluster):
    """Run ``procruste_analyse`` on the words of one Exp cluster.

    The words of Exp cluster ``n_cluster`` that also exist in the Novice matrix
    are collected as (Exp, Novice) pairs, and both matrices are then selected
    by label in pair order. Row ``i`` of each matrix is therefore the same
    word, which Procrustes requires; the previous ``isin`` masks kept each
    frame's own row order and lost that pairing.

    Parameters
    ----------
    n_cluster : int
        Label in the 'cluster' column of ``df_vectors_Exp``.
    """
    mots_cluster = df_vectors_Exp.index[df_vectors_Exp['cluster'] == n_cluster]

    paires = []
    for mot_exp in mots_cluster:
        # Skip non-string labels, which cannot carry a corpus suffix.
        if not isinstance(mot_exp, str):
            continue
        mot_novice = mot_exp.replace('_Exp', '_Novice')
        if mot_novice in df_vectors_Novice.index:
            paires.append((mot_exp, mot_novice))

    mots_Exp = [mot_exp for mot_exp, _ in paires]
    mots_Novice = [mot_novice for _, mot_novice in paires]

    df_restriction_cluster_Exp = df_vectors_Exp.loc[mots_Exp].drop(columns = ['cluster'])
    df_restriction_cluster_Novice = df_vectors_Novice.loc[mots_Novice].drop(columns = ['cluster'])

    assert df_restriction_cluster_Exp.shape[0] == df_restriction_cluster_Novice.shape[0]

    procruste_analyse(df_restriction_cluster_Exp, df_restriction_cluster_Novice)

In [None]:
procuste_cluster_novice(47)

In [None]:
# Intéressant : 47

In [None]:
procuste_cluster_exp(138)

In [None]:
# Intéressant : 19, 61, 138