In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import gensim 
import os
from gensim.models import FastText
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import get_tmpfile

from sklearn.manifold import TSNE

In [2]:
# Root directory of the project data, given as a relative path. The trailing
# slash matters: later cells build file paths by plain string concatenation.
data_source_path = "../../DATA/"

In [3]:
# Load the question table (X) and the training labels (y).
# Both files are ';'-separated and keep their row index in the first column.
csv_options = dict(sep=";", index_col=0)
X = pd.read_csv(data_source_path + 'clean_data/smart_soft_cleaning_with_nltk_stemming.csv', **csv_options)
y = pd.read_csv(data_source_path + 'output_train.csv', **csv_options)

# Column names of the input frame and the distinct values of the 'intention' label.
features = X.columns
targets = y['intention'].unique()

In [4]:
# Keep only the rows whose 'question' field is present.
X = X.dropna(subset=['question'])

In [5]:
# Tokenise every question on whitespace: one list of tokens per question.
sentences = list(map(str.split, X['question']))

# Train FastText in French

In [6]:
# Report how many CPUs os.cpu_count() sees; the same call later sets the
# number of training workers.
print('number of cpus :', os.cpu_count()) 
# NOTE(review): this prints the repr of the os.setuid function object (see the
# recorded output) rather than calling it — looks like leftover debugging.
# os.setuid is documented as Unix-only, so this line presumably raises
# AttributeError on other platforms; consider removing it.
print(os.setuid)

number of cpus : 4
<built-in function setuid>


In [18]:
class EpochSaver(CallbackAny2Vec):
    """Callback that checkpoints the model after every epoch and prints an ETA.

    Parameters
    ----------
    path_prefix : str
        Prefix of the checkpoint files; epoch ``n`` (0-based) is written to
        ``'<path_prefix>_epoch<n>.model'``.
    total_epochs : int, optional
        Total number of epochs of the training run, used only for the
        remaining-time estimate. When omitted, it is read from the ``epochs``
        attribute of the model being trained (if the model exposes one), so
        the callback no longer depends on a notebook-level ``EPOCHS`` global.

    Attributes
    ----------
    epoch : int
        Index of the epoch currently running (0-based).
    epoch_duration : float
        Average duration of one epoch in seconds, measured from the moment
        the callback was constructed.
    """

    def __init__(self, path_prefix, total_epochs=None):
        self.path_prefix = path_prefix
        self.total_epochs = total_epochs
        self.epoch = 0
        self.epoch_duration = 0.0
        # The clock starts at construction time, so build the callback right
        # before launching the training.
        self.epoch_start_time = time.time()

    def on_epoch_end(self, model):
        """Save `model`, refresh the average epoch duration and print the ETA."""
        output_path = '{}_epoch{}.model'.format(self.path_prefix, self.epoch)
        model.save(output_path)

        completed_epochs = self.epoch + 1
        self.epoch_duration = (time.time() - self.epoch_start_time) / completed_epochs

        # Resolve the planned number of epochs: explicit argument first, then
        # the model's own setting; with neither, report no remaining time.
        total_epochs = self.total_epochs
        if total_epochs is None:
            total_epochs = getattr(model, 'epochs', None)
        if total_epochs is None:
            total_epochs = completed_epochs

        # Multiply in seconds *before* truncating, otherwise epochs shorter
        # than a minute always yield an estimate of zero.
        remaining_seconds = max(total_epochs - completed_epochs, 0) * self.epoch_duration
        hours, minutes = divmod(int(remaining_seconds // 60), 60)

        # '\r' keeps the progress report on a single, continuously updated line.
        print("Epoch {}, model saved to {}, {} hour(s) {} minutes ".format(self.epoch,
                                                                           output_path,
                                                                           hours,
                                                                           minutes),
              end='\r')
        self.epoch += 1

In [19]:
# Start of the wall-clock measurement for the whole training cell.
debut = time.time()

# Hyper-parameters of the from-scratch FastText run.
EPOCHS = 10000
MIN_COUNT = 5
SIZE = 100
WINDOW = 4
ALPHA = 0.075
NGRAMS = 1

# Checkpoint callback; its files share the "temporary_model" prefix returned by get_tmpfile.
epoch_saver = EpochSaver(get_tmpfile("temporary_model"))

# Keyword options handed to the FastText constructor.
fasttext_options = dict(
    size=SIZE,
    alpha=ALPHA,
    window=WINDOW,
    word_ngrams=NGRAMS,
    min_count=MIN_COUNT,
    workers=os.cpu_count(),
    iter=EPOCHS,
    callbacks=[epoch_saver],
)
scratch_model = FastText(sentences, **fasttext_options)

# Elapsed time of the cell, in seconds.
fin = time.time()
print(fin-debut)

7060.743767261505 saved to /tmp/temporary_model_epoch9999.model, 0.011767824745575587 minute left or 0.0001961304124262598 hour(s))


In [None]:
"""To test : 
sg=1,
sample=0.01, 
iter=EPOCHS"""

In [20]:
# Export the trained word vectors in binary word2vec format, with the
# vocabulary written to a separate text file next to it.
vectors_bin_path = data_source_path + 'trained_vectors/custom_word2vec/fr_stemming_no_accent.bin'
vocab_txt_path = data_source_path + 'trained_vectors/custom_word2vec/fr_vocab_stemming_no_accent.txt'
scratch_model.wv.save_word2vec_format(fname=vectors_bin_path, fvocab=vocab_txt_path, binary=True)

## Test the custom French word2vec vectors built from scratch

In [None]:
# Reload a checkpoint written by EpochSaver while training was running. The path
# is rebuilt from the same get_tmpfile prefix and '<prefix>_epoch<n>.model'
# pattern the saver uses, instead of a hard-coded '/tmp/...' absolute path.
checkpoint_path = '{}_epoch{}.model'.format(get_tmpfile("temporary_model"), 10)
# The checkpoint was written with model.save on the FastText model, so load it
# back through the FastText class.
model_in_training = FastText.load(checkpoint_path)

In [None]:
# 50 nearest neighbours of "jeudi" according to the reloaded checkpoint.
model_in_training.wv.most_similar(positive=["jeudi"], topn=50)

In [21]:
# Reload the exported vectors (binary word2vec format plus vocabulary file)
# as plain keyed vectors for inspection and plotting.
exported_bin_path = data_source_path + 'trained_vectors/custom_word2vec/fr_stemming_no_accent.bin'
exported_vocab_path = data_source_path + 'trained_vectors/custom_word2vec/fr_vocab_stemming_no_accent.txt'
scratch_model_to_plot = gensim.models.KeyedVectors.load_word2vec_format(fname=exported_bin_path,
                                                                        fvocab=exported_vocab_path,
                                                                        binary=True)

# Plot t-SNE
Adapted from https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

In [None]:
def tsne_plot(model):
    """Fit a 2-D t-SNE projection of every word vector in `model` and plot it.

    `model` must expose a `vocab` mapping to iterate over and support
    `model[word]` lookups. Each word is drawn as its own scatter point and
    annotated with the word itself; the figure is shown with `plt.show()`.
    """
    # Words and their vectors, in vocabulary iteration order.
    labels = list(model.vocab)
    tokens = [model[word] for word in labels]

    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    plt.figure(figsize=(16, 16))
    # One (x, y) row per word, aligned with `labels`.
    for label, (x_coord, y_coord) in zip(labels, new_values):
        plt.scatter(x_coord, y_coord)
        plt.annotate(
            label,
            xy=(x_coord, y_coord),
            xytext=(5, 2),
            textcoords='offset points',
            ha='right',
            va='bottom',
        )
    plt.show()


In [None]:
# Draw the t-SNE map of the word vectors reloaded from the exported file.
tsne_plot(scratch_model_to_plot)

In [29]:
# 50 nearest neighbours of the token "jeud" in the exported vectors.
scratch_model_to_plot.most_similar(positive=["jeud"], topn=50)

[('dimanch', 0.4155712425708771),
 ('mard', 0.3164424002170563),
 ('lund', 0.310949444770813),
 ('demain', 0.3075084090232849),
 ('reprend', 0.2965714931488037),
 ('immediat', 0.2953723073005676),
 ('retabl', 0.276810884475708),
 ('journ', 0.26698005199432373),
 ('attendu', 0.26606640219688416),
 ('ensuit', 0.26546260714530945),
 ('repr', 0.2648485004901886),
 ('avoir', 0.26474517583847046),
 ('vendred', 0.2632172107696533),
 ('ouvertur', 0.254181444644928),
 ('couch', 0.2494654357433319),
 ('jaun', 0.24817900359630585),
 ('ordinal', 0.24674704670906067),
 ('natur', 0.23784078657627106),
 ('oubli', 0.23683807253837585),
 ('jour', 0.23027537763118744),
 ('seroquel', 0.22959519922733307),
 ('pu', 0.22801902890205383),
 ('existent', 0.22755958139896393),
 ('mois-c', 0.22677111625671387),
 ('plaquet', 0.22105248272418976),
 ('refair', 0.21967895328998566),
 ('prevu', 0.21929509937763214),
 ('lendemain', 0.21818400919437408),
 ('ovarien', 0.21743229031562805),
 ('cerazet', 0.217258915305137