In [2]:
import pandas as pd
import gensim
import numpy as np
import time
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

In [3]:
# Load the training inputs and their labels.
# Both CSV files are ';'-separated and use their first column as the index.
X = pd.read_csv(
    'DATA/clean_data/cleaned_n_stemming_input_train.csv',
    sep=";",
    index_col=0,
)
y = pd.read_csv(
    'DATA/output_train.csv',
    sep=";",
    index_col=0,
)

# Column labels of the input frame, and the distinct values found in the
# 'intention' column of the label frame.
features = X.columns
targets = y['intention'].unique()

In [6]:
# Tokenise every question on whitespace to build the Word2Vec corpus.
# A question that is empty in the CSV is read back by pandas as NaN (a float),
# which has no .split(); normalise missing values to '' so such rows simply
# become empty token lists instead of raising AttributeError.
sentences = [question.split() for question in X['question'].fillna('').astype(str)]

# Train word2vec in French

In [13]:
# Word2Vec hyper-parameters (gensim 3.x keyword names are used below).
EPOCHS = 400     # number of passes over the corpus (`iter`)
MIN_COUNT = 2    # words occurring fewer than MIN_COUNT times are ignored
SIZE = 100       # dimensionality of the word vectors (`size`)
WINDOW = 4       # maximum distance between the current and predicted word
WORKERS = 4      # number of training threads

# Not referenced in the visible cells; kept in case a later cell relies on it.
training_it = 1

# Train a skip-gram (sg=1) model from scratch on the tokenised questions.
# WORKERS was previously defined but never handed to the constructor, so the
# library default thread count was silently used; pass it explicitly.
scratch_model = gensim.models.Word2Vec(sentences,
                                       sg=1,
                                       sample=0.01,
                                       min_count=MIN_COUNT,
                                       size=SIZE,
                                       window=WINDOW,
                                       workers=WORKERS,
                                       compute_loss=True,
                                       iter=EPOCHS)


In [15]:
# Write the trained word vectors to disk in binary word2vec format, with the
# vocabulary stored in a separate text file next to them.
scratch_model.wv.save_word2vec_format(
    'DATA/trained_vectors/custom_word2vec/scratch_fr_stemming.bin',
    fvocab='DATA/trained_vectors/custom_word2vec/scratch_fr_vocab_stemming.txt',
    binary=True,
)

## Test the custom French word2vec model built from scratch

In [16]:
# Read the saved vectors (and their vocabulary file) back from disk.
# Note: this rebinds `scratch_model` to the KeyedVectors object returned by the
# loader, replacing whatever the name referred to before.
scratch_model = gensim.models.KeyedVectors.load_word2vec_format(
    'DATA/trained_vectors/custom_word2vec/scratch_fr_stemming.bin',
    fvocab='DATA/trained_vectors/custom_word2vec/scratch_fr_vocab_stemming.txt',
    binary=True,
)

# Plot t-SNE
Taken from https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

In [17]:
def tsne_plot(model, perplexity=40, n_iter=2500, random_state=23, figsize=(16, 16)):
    """Project every word vector of ``model`` to 2-D with t-SNE and plot it.

    Each vocabulary entry is drawn as one scatter point annotated with the
    word itself, then the figure is shown.

    Parameters
    ----------
    model
        Keyed-vectors-like object: it must expose its vocabulary as either
        ``key_to_index`` (gensim >= 4) or ``vocab`` (gensim 3.x) and return
        the vector of a word through ``model[word]``.
    perplexity, n_iter, random_state
        Forwarded unchanged to ``sklearn.manifold.TSNE``.
    figsize
        Size of the matplotlib figure, in inches.

    Raises
    ------
    ValueError
        If the vocabulary is empty (there is nothing to embed or plot).
    """
    # gensim 4 renamed `vocab` to `key_to_index`; accept whichever is present.
    vocab = getattr(model, 'key_to_index', None)
    if vocab is None:
        vocab = model.vocab

    labels = list(vocab)
    if not labels:
        # Fail early with a clear message instead of an error raised from
        # inside the t-SNE fit.
        raise ValueError("tsne_plot: the model vocabulary is empty, nothing to plot")
    tokens = [model[word] for word in labels]

    tsne_model = TSNE(perplexity=perplexity, n_components=2, init='pca',
                      n_iter=n_iter, random_state=random_state)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=figsize)
    # One scatter call per word, as before, so every point keeps its own
    # colour from the matplotlib colour cycle.
    for label, (x, y) in zip(labels, new_values):
        plt.scatter(x, y)
        plt.annotate(label,
                     xy=(x, y),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()


In [None]:
# Draw the annotated t-SNE scatter plot for the vectors held in `scratch_model`.
tsne_plot(scratch_model)

In [12]:
# Ask the model for the 50 entries most similar to the token "dimanch"
# (presumably the stem of "dimanche" — confirm against the stemming step).
scratch_model.most_similar(["dimanch"], topn=50)

[('lund', 0.5880324840545654),
 ('mercred', 0.5336698293685913),
 ('vendred', 0.5199204683303833),
 ('zapp', 0.5114241242408752),
 ('10e', 0.510537326335907),
 ('ixiem', 0.5062447190284729),
 ('prolamin', 0.5011074542999268),
 ('jet', 0.49038609862327576),
 ('bb2', 0.48954135179519653),
 ('inadvert', 0.48302024602890015),
 ('neanmoin', 0.4828013777732849),
 ('juillet', 0.47704246640205383),
 ('aurr', 0.476423978805542),
 ('sens', 0.4757467806339264),
 ('samed', 0.472331702709198),
 ('compt', 0.47000765800476074),
 ('2007', 0.4696352183818817),
 ('echang', 0.46853888034820557),
 ('rendu', 0.46745431423187256),
 ('jesper', 0.46114546060562134),
 ('condom', 0.4605788290500641),
 ('detest', 0.45993632078170776),
 ('norlevo', 0.4587255120277405),
 ('censi', 0.45803311467170715),
 ('secour', 0.45797401666641235),
 ('crac', 0.451654851436615),
 ('reveil', 0.4512515664100647),
 ('jan', 0.44979771971702576),
 ('matin', 0.4476741850376129),
 ('cke', 0.443220317363739),
 ('janvi', 0.4414350390434