In [31]:
import os
import numpy as np
import pandas as pd 
from utils import train_test_validation_split

from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, LSTM, GRU
from keras.losses import sparse_categorical_crossentropy, categorical_hinge
from keras import optimizers
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

## Import dataset

In [13]:
# Load the cleaned + stemmed questions (X) and their intention labels (y).
# Both files are ';'-separated and use their first column as the index.
X = pd.read_csv(
    'DATA/clean_data/cleaning_n_stemming_input_train.csv',
    sep=";",
    index_col=0,
)
y = pd.read_csv(
    'DATA/output_train.csv',
    sep=";",
    index_col=0,
)

# Column labels of the input frame and the distinct intention labels.
features = X.columns
targets = y['intention'].unique()

In [14]:
# Preview the first few rows of the 'question' column loaded from the cleaned/stemmed CSV.
X['question'].head()

0    bonjour tromp forum question alor repos ici pr...
1                  <MEDICAMENT> soulag contr <MALADIE>
2    medecin <MEDICAMENT> prescr <MEDICAMENT> <ORDI...
3       est exist form adapt enfant <AGE> <MEDICAMENT>
4    medecin soign <MEDICAMENT> pharyngit <MEDICAME...
Name: question, dtype: object

## Prepare the data for the neural network and determine its parameters

In [15]:
# Estimate the vocabulary size; it is used further down as the Tokenizer's
# `num_words`. Only the fitted vocabulary is needed, so `fit` is enough:
# `fit_transform` would additionally build a document-term matrix that was
# immediately discarded.
vectorizer = CountVectorizer()
vectorizer.fit(X['question'])
MAX_NB_WORDS = len(vectorizer.vocabulary_)

In [16]:
# Longest question, measured in whitespace-separated tokens.
# The `or [0]` fallback keeps the result at 0 when there are no questions.
token_counts = [len(question.split()) for question in X['question']]
MAX_SEQUENCE_LENGTH = max(token_counts or [0])
MAX_SEQUENCE_LENGTH

382

In [17]:
# Preprocess the text to feed the network: fit a word -> integer-id mapping
# on the questions, then encode every question with it.
texts = X['question']
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Encode the questions and bring every sequence to MAX_SEQUENCE_LENGTH ids.
sequences = tokenizer.texts_to_sequences(texts)
X_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 4746 unique tokens.


In [18]:
# Split the padded sequences and the labels into train / test / validation
# parts. The split itself (ratios, shuffling) is decided inside the
# `train_test_validation_split` helper from utils.
(X_train, X_test, X_validation,
 y_train, y_test, y_validation) = train_test_validation_split(X_sequences, y.values)

In [36]:
# The Embedding layer uses each value as an *integer* row index into its
# weight matrix, so the padded token-id sequences must be fed unchanged.
# Dividing them by the vocabulary size (what this cell used to do) squashes
# every id into [0, ~1]; Keras casts Embedding inputs back to integers, so
# nearly every token would collapse to id 0, i.e. the padding id.
# The previous in-place division was also not idempotent (re-running the cell
# rescaled the data again) and left X_validation on a different scale.
# Casting to int32 is idempotent and keeps all three splits consistent.
X_train = np.asarray(X_train, dtype="int32")
X_test = np.asarray(X_test, dtype="int32")
X_validation = np.asarray(X_validation, dtype="int32")

## Prepare TensorBoard

In [37]:
def generate_unique_logpath(logdir, raw_run_name):
    """Return the first ``<logdir>/<raw_run_name>-<i>`` that is not an existing directory.

    ``i`` starts at 0 and is incremented by one until a free name is found.
    The directory is only probed, never created, by this function.

    Args:
        logdir: parent directory holding one sub-directory per run.
        raw_run_name: run name to which the ``-<i>`` suffix is appended.

    Returns:
        The candidate path as a string.
    """
    suffix = 0
    candidate = os.path.join(logdir, raw_run_name + "-" + str(suffix))
    while os.path.isdir(candidate):
        suffix += 1
        candidate = os.path.join(logdir, raw_run_name + "-" + str(suffix))
    return candidate

## Create and train the model

In [38]:
# --- Network hyper-parameters ---
EMBEDDING_DIM = 200             # size of each learned word vector
NB_CATEGORIES = len(targets)    # one output unit per distinct intention label
NB_LSTM = 50                    # number of units in the recurrent layer
PERC_DROPOUT = 0.2              # to test between 0.2 and 0.5
EPOCHS = 10000                  # upper bound on the number of training epochs

# --- Optimizers: both are built from the same learning rate and decay ---
LEARNING_RATE = 0.006
RATE_DECAY = 6e-4
optz = optimizers.RMSprop(
    lr=LEARNING_RATE,
    decay=RATE_DECAY,
)
sgd = optimizers.SGD(
    lr=LEARNING_RATE,
    decay=RATE_DECAY,
    momentum=0.9,
    nesterov=True,
)


In [39]:
# Encode the main hyper-parameters in the run name so that runs can be told
# apart in TensorBoard.
run_name = (
    "gru_norm_" + str(NB_LSTM)
    + "_rmsprop_" + str(LEARNING_RATE)
    + "_decay_embedding_" + str(EMBEDDING_DIM)
    + "_dropout_" + str(PERC_DROPOUT)
    + "_early_stop_shuffle"
)

# Every run logs into its own, not yet existing, sub-directory.
logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

In [None]:
# Model: trainable embedding -> GRU -> dropout -> softmax over the intentions.
model = Sequential()
model.add(Embedding(len(word_index) + 1,  # +1 because id 0 is never assigned to a word (used for padding)
                    EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=True))
model.add(GRU(NB_LSTM))
model.add(Dropout(PERC_DROPOUT))
model.add(Dense(NB_CATEGORIES))
model.add(Activation('softmax'))  # reminder: sigmoid is for binary classification
model.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
model.summary()  # prints the table itself; wrapping it in print() only adds a stray "None"

# Stop as soon as val_loss has not improved for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1, mode='auto')

# Keep the best model seen so far next to the TensorBoard logs of this run.
# Make sure the run directory exists before the first checkpoint is written.
if not os.path.isdir(logpath):
    os.makedirs(logpath)
checkpoint_filepath = os.path.join(logpath, "model.h1")
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)

# early_stop and checkpoint_cb used to be created but never handed to fit(),
# so training always ran for the full EPOCHS and nothing was checkpointed.
# NOTE(review): validation is done on the X_test split while X_validation is
# left unused here — presumably kept for the final evaluation; confirm.
model.fit(X_train, y_train,
          validation_data=(X_test, y_test),
          epochs=EPOCHS,
          shuffle=True,
          batch_size=64,
          verbose=1,
          callbacks=[tbcb, early_stop, checkpoint_cb])


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 382, 200)          949400    
_________________________________________________________________
gru_1 (GRU)                  (None, 50)                37650     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 51)                2601      
_________________________________________________________________
activation_1 (Activation)    (None, 51)                0         
Total params: 989,651
Trainable params: 989,651
Non-tra

In [None]:
# TEST 2
# Set the dropout rate *before* building the run name, so the name always
# reflects the value that is really used by the model below.
PERC_DROPOUT = 0.2 # to test between 0.2 and 0.5

# NOTE(review): the run is labelled "no_stemming", but this cell trains on the
# same X_train / X_test that were built from the stemmed CSV — confirm that
# the intended input data was loaded before running it.
run_name = "no_stemming_gru_"+str(NB_LSTM)+"_rmsprop_"+str(LEARNING_RATE)
run_name += "_decay_embedding_"+str(EMBEDDING_DIM)
run_name +="_dropout_"+str(PERC_DROPOUT)+"_early_stop_shuffle"

logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

# Use a fresh optimizer for this model: reusing the `optz` instance that has
# already trained another model would carry over its internal state (e.g. the
# iteration counter that drives the learning-rate decay).
optz2 = optimizers.RMSprop(lr=LEARNING_RATE, decay=RATE_DECAY)

# Model: trainable embedding -> GRU -> dropout -> softmax over the intentions.
model2 = Sequential()
model2.add(Embedding(len(word_index) + 1,  # +1 because id 0 is never assigned to a word (used for padding)
                     EMBEDDING_DIM,
                     input_length=MAX_SEQUENCE_LENGTH,
                     trainable=True))
model2.add(GRU(NB_LSTM))
model2.add(Dropout(PERC_DROPOUT))
model2.add(Dense(NB_CATEGORIES))
model2.add(Activation('softmax'))  # reminder: sigmoid is for binary classification
model2.compile(loss=sparse_categorical_crossentropy, optimizer=optz2, metrics=['accuracy'])
model2.summary()  # prints the table itself; wrapping it in print() only adds a stray "None"

# Stop as soon as val_loss has not improved for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1, mode='auto')

# Keep the best model seen so far next to the TensorBoard logs of this run.
# Make sure the run directory exists before the first checkpoint is written.
if not os.path.isdir(logpath):
    os.makedirs(logpath)
checkpoint_filepath = os.path.join(logpath, "model.h1")
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)

# early_stop and checkpoint_cb used to be created but never handed to fit(),
# so training always ran for the full EPOCHS and nothing was checkpointed.
model2.fit(X_train, y_train,
           validation_data=(X_test, y_test),
           epochs=EPOCHS,
           shuffle=True,
           batch_size=64,
           verbose=1,
           callbacks=[tbcb, early_stop, checkpoint_cb])
