In [None]:
import os
import pandas as pd 
from utils import train_test_validation_split

from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, LSTM, GRU
from keras.losses import sparse_categorical_crossentropy, categorical_hinge
from keras import optimizers
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

## Import dataset

In [None]:
# Load the training inputs and their labels.
# Both CSV files are ';'-separated and use their first column as the index.
csv_options = dict(sep=";", index_col=0)
X = pd.read_csv('DATA/clean_input_train.csv', **csv_options)
y = pd.read_csv('DATA/output_train.csv', **csv_options)

# Column names of the input frame, and the distinct values of the label column.
targets = y['intention'].unique()
features = X.columns

In [None]:
# Quick look at the first rows of the text column.
X.loc[:, 'question'].head()

## Prepare data for the neural network and estimate its size parameters

In [None]:
# Estimate the size of the vocabulary: fit a bag-of-words vectorizer on the
# questions and count the distinct tokens it learned.
# Only the fitted vocabulary is needed here, so call fit() instead of
# fit_transform(), whose document-term matrix was computed and then discarded.
vectorizer = CountVectorizer()
vectorizer.fit(X['question'])
MAX_NB_WORDS = len(vectorizer.vocabulary_)

In [None]:
# Length (in whitespace-separated words) of the longest question.
# The list is seeded with 0 so an empty column still yields 0.
MAX_SEQUENCE_LENGTH = max(
    [0] + [len(question.split()) for question in X['question']]
)
MAX_SEQUENCE_LENGTH

In [None]:
# Preprocess the text to feed the network: map every question to a sequence of
# integer word indices, then pad all sequences to a common length.
texts = X['question']

# num_words is expected to be an integer. On Python 3 `MAX_NB_WORDS / 2` is a
# float, so use floor division to cap the tokenizer at half of the estimated
# vocabulary size.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS // 2)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word_index holds every token seen during fitting (it is not truncated by
# num_words), which is why its length is reported separately here.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# Pad every sequence to MAX_SEQUENCE_LENGTH so they form a rectangular array.
X_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [7]:
# Split the padded sequences and their labels into train / test / validation
# subsets using the project helper from utils.
(X_train, X_test, X_validation,
 y_train, y_test, y_validation) = train_test_validation_split(X_sequences, y)

In [8]:
# Display the training sequences returned by the split (integer word indices
# after padding) as a sanity check.
X_train

array([[   0,    0,    0, ..., 3888,  643, 1209],
       [   0,    0,    0, ...,  132,  250,   13],
       [   0,    0,    0, ..., 1393,   96,   41],
       ...,
       [   0,    0,    0, ...,  240,  463,  368],
       [   0,    0,    0, ...,    3,    2,    1],
       [   0,    0,    0, ...,   24,   10,   29]], dtype=int32)

## Prepare Tensorboard

In [9]:
def generate_unique_logpath(logdir, raw_run_name):
    """Return a log directory path under ``logdir`` that does not exist yet.

    Candidates are ``<logdir>/<raw_run_name>-0``, ``<raw_run_name>-1``, ...;
    the first one that is not an existing directory is returned. The
    directory itself is not created by this function.

    Args:
        logdir: Parent directory that holds the runs.
        raw_run_name: Base name of the run (a string); a ``-<index>`` suffix
            is appended to it.

    Returns:
        The path of the first candidate that is not an existing directory.
    """
    attempt = 0
    candidate = os.path.join(logdir, raw_run_name + "-" + str(attempt))
    # Keep bumping the numeric suffix until the candidate is free.
    while os.path.isdir(candidate):
        attempt += 1
        candidate = os.path.join(logdir, raw_run_name + "-" + str(attempt))
    return candidate

## Create and train the model

In [10]:
# --- Model and training hyper-parameters ---
EMBEDDING_DIM = 200             # size of each word embedding vector
NB_CATEGORIES = len(targets)    # one output unit per distinct 'intention' value
NB_LSTM = 50                    # number of units of the recurrent layer
PERC_DROPOUT = 0.5              # dropout rate
EPOCHS = 10                     # maximum number of training epochs

# --- Optimizers ---
# Both optimizers share the same learning rate and decay.
LEARNING_RATE = 0.006
RATE_DECAY = LEARNING_RATE / EPOCHS

# RMSprop optimizer
optz = optimizers.RMSprop(
    lr=LEARNING_RATE,
    decay=RATE_DECAY,
)

# SGD with Nesterov momentum, defined as an alternative optimizer
sgd = optimizers.SGD(
    lr=LEARNING_RATE,
    decay=RATE_DECAY,
    momentum=0.9,
    nesterov=True,
)


In [14]:
# Encode the main hyper-parameters in the run name so that runs can be told
# apart in TensorBoard.
run_name = "".join([
    "gru_", str(NB_LSTM),
    "_rmsprop_", str(LEARNING_RATE),
    "_decay_embedding_", str(EMBEDDING_DIM),
    "_dropout_", str(PERC_DROPOUT),
    "_early_stop_shuffle",
])

# Pick a fresh log directory for this run and point the TensorBoard callback at it.
logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

In [15]:
# Model: embedding -> GRU -> dropout -> dense softmax classifier.
model = Sequential()
# The embedding covers every token index in word_index; +1 because index 0 is
# used for padding and is not a word index.
model.add(Embedding(len(word_index) + 1,
                    EMBEDDING_DIM,
                    input_length=MAX_SEQUENCE_LENGTH,
                    trainable=True))
model.add(GRU(NB_LSTM))
model.add(Dropout(PERC_DROPOUT))
model.add(Dense(NB_CATEGORIES))
model.add(Activation('softmax'))  # reminder: use sigmoid for binary classification
model.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# Stop training once val_loss has not improved for 2 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1, mode='auto')

# Save the best model (by the callback's default monitored quantity) next to
# the TensorBoard logs. The directory is created up front so the checkpoint
# write does not depend on another callback having created it.
if not os.path.isdir(logpath):
    os.makedirs(logpath)
checkpoint_filepath = os.path.join(logpath, "model.h5")
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)

# NOTE(review): the *test* split drives validation and early stopping here
# while X_validation / y_validation are not used in this cell. Confirm which
# split is meant to be the untouched hold-out set.
history = model.fit(X_train, y_train.values,
                    validation_data=(X_test, y_test.values),
                    epochs=EPOCHS,
                    shuffle=True,
                    batch_size=64,
                    verbose=1,
                    # checkpoint_cb was previously created but never passed to
                    # fit(), so no checkpoint was ever written.
                    callbacks=[tbcb, early_stop, checkpoint_cb])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 412, 200)          1766200   
_________________________________________________________________
gru_4 (GRU)                  (None, 50)                37650     
_________________________________________________________________
dropout_3 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 51)                2601      
_________________________________________________________________
activation_3 (Activation)    (None, 51)                0         
Total params: 1,806,451
Trainable params: 1,806,451
Non-trainable params: 0
_________________________________________________________________
None
Train on 5137 samples, validate on 1285 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Ep

<keras.callbacks.History at 0x7feb53818b00>