In [1]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import gensim 
from gensim.models import FastText
from gensim.models.callbacks import CallbackAny2Vec
from gensim.test.utils import os

from sklearn.manifold import TSNE

In [2]:
# Data source path 
data_source_path = "../../DATA/"

In [3]:
# Import data
strategy = 'soft'
X_train = pd.read_csv(data_source_path+'input_train/'+strategy+'/clean_input_to_train.csv', sep=";", index_col=0)
X_test = pd.read_csv(data_source_path+'input_test/'+strategy+'/clean_input_to_test.csv', sep=";", index_col=0)

In [10]:
sentences = [sentence.split() for sentence in X_train['question']]
for sentence in X_test['question']:
    sentences.append(sentence.split())

In [5]:
number_of_too_short_sentences = 0
for i in range(len(sentences)): 
    if len(sentences[i])<=3: 
        #print('indice', i, 'sentences', sentences[i])
        number_of_too_short_sentences +=1
print(number_of_too_short_sentences)

1006


# Train fasttext in french

In [6]:
print('number of cpus :', os.cpu_count()) 
print('directory : ', os.getcwd())

number of cpus : 4
directory :  /home/laure/Documents/posos_data_challenge/src/embedding


In [11]:
class EpochSaver(CallbackAny2Vec):
    "Callback to save model after every epoch"
    def __init__(self, path_prefix):
        self.path_prefix = path_prefix
        self.epoch = 0
        self.epoch_duration = 0.0
        self.epoch_start_time = time.time()
        
        
    def on_epoch_end(self, model):
        output_path = '{}_epoch_{} '.format(self.path_prefix, self.epoch)
        if(self.epoch % 3000 == 0):
            model.wv.save_word2vec_format(fname = data_source_path+'input_train/'+strategy+'/'+output_path+'fr_posos.bin', 
                                          fvocab=data_source_path+'input_train/'+strategy+'/'+output_path+'fr_vocab_posos.txt', 
                                          binary=True)
        
        self.epoch_duration = (time.time() - self.epoch_start_time)/(1+self.epoch)
        REMAINING_HOURS = int((EPOCHS-self.epoch)* self.epoch_duration / 3600)
        REMAINING_MINUTES = int(((EPOCHS-self.epoch)* self.epoch_duration - REMAINING_HOURS*3600)/60)
        print("Epoch {}, model saved to {}, {} hour(s) {} minutes ".format(self.epoch,
                                                                           output_path, 
                                                                           REMAINING_HOURS,
                                                                           REMAINING_MINUTES),
              end='\r')
        self.epoch += 1


In [12]:
debut = time.time()
EPOCHS = 10
MIN_COUNT = 5
SIZE = 100
WINDOW = 4
ALPHA = 0.075
NGRAMS = 1

epoch_saver = EpochSaver('temporary_model')
scratch_model  = FastText(sentences, 
                          size = SIZE,
                          alpha=ALPHA, 
                          window=WINDOW, 
                          word_ngrams=NGRAMS,
                          min_count=MIN_COUNT, 
                          workers=os.cpu_count(),
                          iter=EPOCHS,
                          callbacks=[epoch_saver])
fin = time.time()
print(fin-debut)

10.791255712509155ed to temporary_model_epoch_9 , 0 hour(s) 0 minutes 


In [None]:
scratch_model.wv.save_word2vec_format(fname = data_source_path+'input_train/'+strategy+'/fr_posos.bin', 
                                      fvocab=data_source_path+'input_train/'+strategy+'/fr_vocab_posos.txt', 
                                      binary=True)

# Plot TSNE 
taken from https://www.kaggle.com/jeffd23/visualizing-word-vectors-with-t-sne

In [None]:
def tsne_plot(model):
    "Creates and TSNE model and plots it"
    labels = []
    tokens = []

    for word in model.vocab:
        tokens.append(model[word])
        labels.append(word)
    
    tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    new_values = tsne_model.fit_transform(tokens)

    x = []
    y = []
    for value in new_values:
        x.append(value[0])
        y.append(value[1])
        
    plt.figure(figsize=(16, 16)) 
    for i in range(len(x)):
        plt.scatter(x[i],y[i])
        plt.annotate(labels[i],
                     xy=(x[i], y[i]),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()


In [None]:
scratch_model_to_plot =gensim.models.KeyedVectors.load_word2vec_format(fname =data_source_path+'input_train/'+strategy+'/fr_stemming_no_accent.bin', 
                                                                       fvocab=data_source_path+'trained_vectors/custom_word2vec/fr_vocab_stemming_no_accent.txt', 
                                                                       binary=True)

In [None]:
tsne_plot(scratch_model_to_plot)

In [None]:
scratch_model_to_plot.most_similar(["jeud"], topn=50)