In [1]:
import os
import pandas as pd 
from utils import train_test_validation_split

from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, LSTM, GRU
from keras.losses import sparse_categorical_crossentropy, categorical_hinge
from keras import optimizers
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## Import dataset

In [2]:
# Import data
X = pd.read_csv('DATA/clean_data/cleaning_n_stemming_input_train.csv', sep=";", index_col=0)
y = pd.read_csv('DATA/output_train.csv', sep=";", index_col=0)

features = X.columns
targets = y['intention'].unique()

In [3]:
X['question'].head()

0    bonjour tromp forum question alor repos ici pr...
1                  <MEDICAMENT> soulag contr <MALADIE>
2    medecin <MEDICAMENT> prescr <MEDICAMENT> <ORDI...
3       est exist form adapt enfant <AGE> <MEDICAMENT>
4    medecin soign <MEDICAMENT> pharyngit <MEDICAME...
Name: question, dtype: object

## Prepare data for nn and find correct parameters

In [4]:
# Estimation of the size of the vocabulary 
vectorizer = CountVectorizer()
vectorizer.fit_transform(X['question'])
MAX_NB_WORDS = len(vectorizer.vocabulary_)

In [5]:
# Find max size of the sequences 
MAX_SEQUENCE_LENGTH = 0 
for sentence in X['question']:
    if MAX_SEQUENCE_LENGTH<len(sentence.split()):
        MAX_SEQUENCE_LENGTH = len(sentence.split())
MAX_SEQUENCE_LENGTH

382

In [6]:
# Preprocess text fo feed the net 
texts = X['question']
tokenizer = Tokenizer(num_words=MAX_NB_WORDS/2)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 4746 unique tokens.


In [7]:
X_train, X_test, X_validation, y_train, y_test, y_validation = train_test_validation_split(X_sequences,y.values)

In [8]:
X_train

array([[  0,   0,   0, ...,   0, 545, 744],
       [  0,   0,   0, ..., 108, 244,  15],
       [  0,   0,   0, ..., 102, 156,  40],
       ...,
       [  0,   0,   0, ..., 232,  72, 294],
       [  0,   0,   0, ..., 110,  26,   1],
       [  0,   0,   0, ...,   2,  12,   4]], dtype=int32)

## Prepare Tensorboard

In [9]:
def generate_unique_logpath(logdir, raw_run_name):
        i = 0
        while(True):
                run_name = raw_run_name + "-" + str(i)
                log_path = os.path.join(logdir, run_name)
                if not os.path.isdir(log_path):
                        return log_path
                i = i + 1

## Create and train the model

In [10]:
#Define parameters 
EMBEDDING_DIM = 200
NB_CATEGORIES = len(targets)
NB_LSTM = 50
PERC_DROPOUT = 0.2 # to test between 0.2 and 0.5
EPOCHS = 10000
#Define RMSProp optimizer
LEARNING_RATE = 0.006
RATE_DECAY = 6e-4
optz = optimizers.RMSprop(lr=LEARNING_RATE, decay=RATE_DECAY)


sgd = optimizers.SGD(lr=LEARNING_RATE, decay=RATE_DECAY, momentum=0.9, nesterov=True)


In [11]:
run_name = "stemming_gru_"+str(NB_LSTM)+"_rmsprop_"+str(LEARNING_RATE)
run_name += "_decay_embedding_"+str(EMBEDDING_DIM)
run_name +="_dropout_"+str(PERC_DROPOUT)+"_early_stop_shuffle"

logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

In [12]:
#Model 
model= Sequential()
model.add(Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True))
model.add(GRU(NB_LSTM))
model.add(Dropout(PERC_DROPOUT))
model.add((Dense(NB_CATEGORIES)))
model.add(Activation('softmax')) # reminder sigmoid if is for binary classification
model.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
print(model.summary())
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1,  mode='auto')
checkpoint_filepath = os.path.join(logpath,  "model.h1")
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)
model.fit(X_train, y_train,
            validation_data=(X_test, y_test), 
            epochs=EPOCHS,
            shuffle=True,
            batch_size=64,
            verbose=1,
            callbacks=[tbcb])


Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
Instructions for updating:
keep_dims is deprecated, use keepdims instead
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 382, 200)          949400    
_________________________________________________________________
gru_1 (GRU)                  (None, 50)                37650     
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 51)                2601      
_________________________________________________________________
activation_1 (Activation)    (None, 51)                0         
Total params: 989,651
Trainable params: 989,651
Non-tra

KeyboardInterrupt: 

In [16]:

# TEST 2 
PERC_DROPOUT = 0.2 # to test between 0.2 and 0.5
EPOCHS = 100
EMBEDDING_DIM = 200
NB_CATEGORIES = len(targets)
NB_LSTM = 10
run_name = "stemming_gru_"+str(NB_LSTM)+"_rmsprop_"+str(LEARNING_RATE)
run_name += "_decay_embedding_"+str(EMBEDDING_DIM)
run_name +="_dropout_"+str(PERC_DROPOUT)+"_early_stop_shuffle"
logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

#Model 
model2= Sequential()
model2.add(Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True))
model2.add(GRU(NB_LSTM))
model2.add(Dropout(PERC_DROPOUT))
model2.add((Dense(NB_CATEGORIES)))
model2.add(Activation('softmax')) # reminder sigmoid if is for binary classification
model2.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
print(model2.summary())
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1,  mode='auto')
checkpoint_filepath = os.path.join(logpath,  "model.h1")
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)
model2.fit(X_train, y_train,
            validation_data=(X_test, y_test), 
            epochs=EPOCHS,
            shuffle=True,
            batch_size=64,
            verbose=1,
            callbacks=[tbcb])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 382, 200)          949400    
_________________________________________________________________
gru_5 (GRU)                  (None, 10)                6330      
_________________________________________________________________
dropout_5 (Dropout)          (None, 10)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 51)                561       
_________________________________________________________________
activation_5 (Activation)    (None, 51)                0         
Total params: 956,291
Trainable params: 956,291
Non-trainable params: 0
_________________________________________________________________
None
Train on 5137 samples, validate on 1285 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100

Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x7fd01db37c50>

In [17]:

# TEST 2 
PERC_DROPOUT = 0.2 # to test between 0.2 and 0.5
EPOCHS = 100
EMBEDDING_DIM = 200
NB_CATEGORIES = len(targets)
NB_LSTM = 10
run_name = "stemming_gru_"+str(NB_LSTM)+"_rmsprop_"+str(LEARNING_RATE)
run_name += "_decay_embedding_"+str(EMBEDDING_DIM)
run_name +="_dropout_"+str(PERC_DROPOUT)+"_early_stop_shuffle"
logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

#Model 
model3= Sequential()
model3.add(Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True))
model3.add(GRU(NB_LSTM))
model3.add(Dropout(PERC_DROPOUT))
model3.add(GRU(NB_LSTM))
model3.add(Dropout(PERC_DROPOUT))
model3.add((Dense(NB_CATEGORIES)))
model3.add(Activation('softmax')) # reminder sigmoid if is for binary classification
model3.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
print(model3.summary())
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1,  mode='auto')
checkpoint_filepath = os.path.join(logpath,  "model.h1")
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)
model3.fit(X_train, y_train,
            validation_data=(X_test, y_test), 
            epochs=EPOCHS,
            shuffle=True,
            batch_size=64,
            verbose=1,
            callbacks=[tbcb])


ValueError: Input 0 is incompatible with layer gru_7: expected ndim=3, found ndim=2