In [40]:
import pandas as pd
import numpy as np
import gensim
import os 

from sklearn.model_selection import train_test_split

from keras.layers.recurrent import LSTM, GRU
from keras.layers.embeddings import Embedding
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Dropout
from keras.losses import sparse_categorical_crossentropy, categorical_hinge
from keras import optimizers
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

In [2]:
# Load the training inputs and the training outputs.
# Both CSV files are ';'-separated and use their first column as the index.
csv_kwargs = dict(sep=";", index_col=0)
X = pd.read_csv('DATA/clean_data/cleaned_n_stemming_input_train.csv', **csv_kwargs)
y = pd.read_csv('DATA/output_train.csv', **csv_kwargs)

# Distinct values of the 'intention' column, and the column names of the input frame.
targets = y['intention'].unique()
features = X.columns

In [3]:
# Contents of the 'question' column as an array, one entry per row of X.
sentences = X['question'].values

In [4]:
# Load the custom word2vec vectors (binary word2vec format) together with
# their vocabulary file, then report how many words the embedding covers.
vectors_path = 'DATA/trained_vectors/custom_word2vec/scratch_fr_stemming.bin'
vocab_path = 'DATA/trained_vectors/custom_word2vec/scratch_fr_vocab_stemming.txt'
word_model = gensim.models.KeyedVectors.load_word2vec_format(
    fname=vectors_path,
    fvocab=vocab_path,
    binary=True,
)
vocab_size = len(word_model.vocab)
print('embedding taking into account ', vocab_size, 'words')

embedding taking into account  2714 words


In [32]:
def word2idx(word):
    """Return the index of ``word`` in the global ``word_model`` vocabulary.

    Words that are not part of the vocabulary map to index 0.
    """
    # NOTE(review): 0 is the fallback for unknown words, so they cannot be told
    # apart from whichever vocabulary entry has index 0 - confirm this is intended.
    vocabulary = word_model.vocab
    return vocabulary[word].index if word in vocabulary else 0
def idx2word(idx):
    """Return the word stored at position ``idx`` of the global ``word_model`` vocabulary."""
    index_to_word = word_model.index2word
    return index_to_word[idx]

In [13]:
# Length of the longest question, counted in whitespace-separated tokens
# (0 when there are no questions at all).
token_counts = [len(question.split()) for question in X['question']]
MAX_SEQUENCE_LENGTH = max(token_counts, default=0)
MAX_SEQUENCE_LENGTH

382

In [9]:
# Spot-check of the embedding: similarity score between the tokens 'malad' and 'jeud'
# (presumably an unrelated pair, so a low score is expected - confirm).
word_model.similarity('malad', 'jeud')

0.028489382519683783

In [10]:
# Spot-check of the embedding: similarity score between the tokens 'vendred' and 'jeud'
# (presumably two related tokens, so a higher score is expected - confirm).
word_model.similarity('vendred', 'jeud')

0.47308977662157237

In [11]:
# Spot-check of the embedding: similarity score between the tokens 'malad' and 'enceint'.
word_model.similarity('malad', 'enceint')

0.19189552314385092

In [33]:
# Integer-encode every question: row r holds the vocabulary indices of the
# tokens of question r, left-aligned. Positions past the end of a question
# keep the initial value 0.
n_sentences = len(sentences)
X_embedded = np.zeros((n_sentences, MAX_SEQUENCE_LENGTH), dtype=np.int32)
for row, text in enumerate(sentences):
    for col, token in enumerate(text.split()):
        X_embedded[row, col] = word2idx(token)
print(X_embedded.shape)

(8028, 382)


In [45]:
# Split the encoded questions and their labels into a training part and a
# held-out part (scikit-learn's default test_size, i.e. 25% held out).
# A fixed random_state makes the split - and therefore every score computed
# from it - reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X_embedded, y, random_state=42)

In [41]:
# Train a recurrent network (GRU) on top of the pretrained word2vec embedding

In [42]:
def generate_unique_logpath(logdir, raw_run_name):
    """Return the first path ``<logdir>/<raw_run_name>-<k>`` that is not an existing directory.

    ``k`` starts at 0 and is incremented until the candidate path is not a
    directory. The path is only computed; nothing is created on disk.
    """
    suffix = 0
    candidate = os.path.join(logdir, raw_run_name + "-" + str(suffix))
    while os.path.isdir(candidate):
        suffix += 1
        candidate = os.path.join(logdir, raw_run_name + "-" + str(suffix))
    return candidate

In [43]:
# Hyper-parameters encoded in the run name. They are defined in this cell
# because the name is built before the model cell runs: on a fresh kernel
# (Restart & Run All) they would otherwise not exist yet and this cell would
# fail with a NameError. Keep the values in sync with the model cell.
NB_LSTM = 50
PERC_DROPOUT = 0.5
LEARNING_RATE = 0.06
# Dimension of the pretrained vectors, read from the loaded model rather than
# from a hand-maintained constant.
EMBEDDING_DIM = word_model.vector_size

# Descriptive name for this training run, used as the TensorBoard sub-directory.
run_name = "gru_" + str(NB_LSTM) + "_rmsprop_" + str(LEARNING_RATE)
run_name += "_decay_embedding_" + str(EMBEDDING_DIM)
run_name += "_dropout_" + str(PERC_DROPOUT) + "_early_stop_shuffle"

# Pick a log directory that does not exist yet so runs never overwrite each other.
logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

In [47]:
# --- Hyper-parameters --------------------------------------------------------
NB_CATEGORIES = len(targets)  # one output unit per distinct 'intention' value
NB_LSTM = 50                  # number of units of the recurrent (GRU) layer
PERC_DROPOUT = 0.5
EPOCHS = 10
LEARNING_RATE = 0.06
RATE_DECAY = LEARNING_RATE / EPOCHS

# --- Optimizers --------------------------------------------------------------
# RMSprop is the optimizer used for training below; ``sgd`` is kept available
# as an alternative.
optz = optimizers.RMSprop(lr=LEARNING_RATE, decay=RATE_DECAY)
sgd = optimizers.SGD(lr=LEARNING_RATE, decay=RATE_DECAY, momentum=0.9, nesterov=True)

# --- Model -------------------------------------------------------------------
# Pretrained embedding -> GRU -> dropout -> softmax over the intention classes.
model = Sequential()
model.add(word_model.get_keras_embedding())
model.add(GRU(NB_LSTM))
model.add(Dropout(PERC_DROPOUT))
model.add(Dense(NB_CATEGORIES))
model.add(Activation('softmax'))  # reminder: sigmoid is for binary classification
# NOTE(review): sparse_categorical_crossentropy assumes the labels are integer
# class ids in [0, NB_CATEGORIES) - confirm against the 'intention' column.
model.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
# summary() prints by itself and returns None, so it must not be wrapped in print().
model.summary()

# --- Callbacks ---------------------------------------------------------------
# Stop when the validation loss has not improved for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1, mode='auto')
# Make sure the run directory exists before a checkpoint is written into it.
os.makedirs(logpath, exist_ok=True)
checkpoint_filepath = os.path.join(logpath, "model.h5")
# Keep only the best model seen so far (by the callback's default monitored metric).
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)

# --- Training ----------------------------------------------------------------
# checkpoint_cb is registered here; without it no checkpoint is ever saved.
model.fit(X_train, y_train.values,
          validation_data=(X_test, y_test.values),
          epochs=EPOCHS,
          shuffle=True,
          batch_size=64,
          verbose=1,
          callbacks=[tbcb, early_stop, checkpoint_cb])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 100)         271400    
_________________________________________________________________
gru_5 (GRU)                  (None, 50)                22650     
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 51)                2601      
_________________________________________________________________
activation_4 (Activation)    (None, 51)                0         
Total params: 296,651
Trainable params: 25,251
Non-trainable params: 271,400
_________________________________________________________________
None
Train on 6021 samples, validate on 2007 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
E

<keras.callbacks.History at 0x7fa45b296630>