In [116]:
import os
import math
import pandas as pd 
from utils import train_test_validation_split

from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, LSTM, GRU, Concatenate
from keras.losses import sparse_categorical_crossentropy, categorical_hinge
from keras import optimizers
from keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping, LearningRateScheduler, History


import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess

from gensim.models.keyedvectors import KeyedVectors


## Import dataset

In [3]:
# Import data
X = pd.read_csv('DATA/clean_data/cleaned_n_stemming_input_train.csv', sep=";", index_col=0)
y = pd.read_csv('DATA/output_train.csv', sep=";", index_col=0)

features = X.columns
targets = y['intention'].unique()

In [4]:
X['question'].head()

0    bonjour tromp forum question alor repos ici pr...
1                  <MEDICAMENT> soulag contr <MALADIE>
2    medecin <MEDICAMENT> prescr <MEDICAMENT> <ORDI...
3       est exist form adapt enfant <AGE> <MEDICAMENT>
4    medecin soign <MEDICAMENT> pharyngit <MEDICAME...
Name: question, dtype: object

## Prepare data for nn and find correct parameters

In [5]:
# Estimation of the size of the vocabulary 
vectorizer = CountVectorizer()
vectorizer.fit_transform(X['question'])
MAX_NB_WORDS = len(vectorizer.vocabulary_)

In [6]:
# Find max size of the sequences 
MAX_SEQUENCE_LENGTH = 0 
for sentence in X['question']:
    if MAX_SEQUENCE_LENGTH<len(sentence.split()):
        MAX_SEQUENCE_LENGTH = len(sentence.split())
MAX_SEQUENCE_LENGTH

382

In [7]:
# Preprocess text fo feed the net 
texts = X['question']
tokenizer = Tokenizer(num_words=MAX_NB_WORDS/2)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

X_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 4746 unique tokens.


In [8]:
X_train, X_test, X_validation, y_train, y_test, y_validation = train_test_validation_split(X_sequences,y, test_size=0.1)

In [9]:
X_train

array([[  0,   0,   0, ...,   0, 545, 744],
       [  0,   0,   0, ..., 108, 244,  15],
       [  0,   0,   0, ..., 102, 156,  40],
       ...,
       [  0,   0,   0, ..., 232,  72, 294],
       [  0,   0,   0, ..., 110,  26,   1],
       [  0,   0,   0, ...,   2,  12,   4]], dtype=int32)

## Prepare Tensorboard

In [12]:
def generate_unique_logpath(logdir, raw_run_name):
        i = 0
        while(True):
                run_name = raw_run_name + "-" + str(i)
                log_path = os.path.join(logdir, run_name)
                if not os.path.isdir(log_path):
                        return log_path
                i = i + 1

## Create and train the model

In [10]:
def step_decay(epoch):
    initial_lrate = 0.006
    drop = 0.5
    epochs_drop = 2.0
    lrate = initial_lrate * math.pow(drop,  
           math.floor((1+epoch)/epochs_drop))
    return lrate

def exp_decay(epoch):
    initial_lrate = 0.06
    k = 0.1
    lrate = initial_lrate * np.exp(-k*t)
    return lrate

lrate = LearningRateScheduler(step_decay)

In [10]:
#Define parameters 
EMBEDDING_DIM = 200
NB_CATEGORIES = len(targets)
NB_LSTM = 50
PERC_DROPOUT = 0.8
EPOCHS = 30
#Define RMSProp optimizer
LEARNING_RATE = 0.006
RATE_DECAY = LEARNING_RATE / EPOCHS
optz = optimizers.RMSprop(lr=LEARNING_RATE, decay=RATE_DECAY)


sgd = optimizers.SGD(lr=LEARNING_RATE, momentum=0.9, nesterov=True)


In [13]:
run_name = "gru_run_stemming"

logpath = generate_unique_logpath("./logs_tensorboard", run_name)
tbcb = TensorBoard(log_dir=logpath)

In [15]:
#Model 
model= Sequential()
model.add(Embedding(len(word_index)+1,
                            EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True))
model.add(GRU(NB_LSTM))
model.add(Dropout(PERC_DROPOUT))
model.add((Dense(NB_CATEGORIES)))
model.add(Activation('softmax')) # reminder sigmoid if is for binary classification
model.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
print(model.summary())
early_stop = EarlyStopping(monitor='val_loss', patience=2, verbose=1,  mode='auto')
checkpoint_filepath = os.path.join(logpath,  "model.h1")
checkpoint_cb = ModelCheckpoint(checkpoint_filepath, save_best_only=True)
model.fit(X_train, y_train.values,
            validation_data=(X_test, y_test.values), 
            epochs=EPOCHS,
            shuffle=True,
            batch_size=64,
            verbose=1,
            callbacks=[tbcb , History()])


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 382, 200)          949400    
_________________________________________________________________
gru_2 (GRU)                  (None, 50)                37650     
_________________________________________________________________
dropout_2 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 51)                2601      
_________________________________________________________________
activation_2 (Activation)    (None, 51)                0         
Total params: 989,651
Trainable params: 989,651
Non-trainable params: 0
_________________________________________________________________
None
Train on 5137 samples, validate on 1285 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 

<keras.callbacks.History at 0x7f20a369f9b0>

## CNN try

In [None]:
word_vectors = KeyedVectors.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)
vocabulary_size=min(len(word_index)+1,NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    if i>=NUM_WORDS:
        continue
    try:
        embedding_vector = word_vectors[word]
        embedding_matrix[i] = embedding_vector
    except KeyError:
        embedding_matrix[i]=np.random.normal(0,np.sqrt(0.25),EMBEDDING_DIM)

del(word_vectors)

from keras.layers import Embedding
embedding_layer = Embedding(vocabulary_size,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            trainable=True)


In [77]:


from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Conv3D, MaxPooling3D
from keras.models import Model
from keras.layers import Input, Dense, Embedding, Conv2D, MaxPooling2D, Dropout,concatenate
from keras.layers.core import Reshape, Flatten
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam
from keras.models import Model
from keras import regularizers

In [129]:
embedding_dim = 50          
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(len(word_index),
                            embedding_dim,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
embedding = embedding_layer(inputs)
z = Dropout(dropout_prob[0])(embedding)
# Convolutional block
conv_blocks = []
for sz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                         kernel_size=sz,
                         padding="valid",
                         activation="relu",
                         strides=1)(z)
    conv = MaxPooling1D(pool_size=2)(conv)
    conv = Flatten()(conv)
    conv_blocks.append(conv)
z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(51, activation="relu")(z)
output = Dense(51, activation="sigmoid")(z)
# this creates a model that includes
model_cnn = Model(inputs, output)
print(model_cnn.summary())


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_51 (InputLayer)           (None, 382)          0                                            
__________________________________________________________________________________________________
embedding_53 (Embedding)        (None, 382, 50)      237300      input_51[0][0]                   
__________________________________________________________________________________________________
dropout_34 (Dropout)            (None, 382, 50)      0           embedding_53[0][0]               
__________________________________________________________________________________________________
conv1d_63 (Conv1D)              (None, 380, 10)      1510        dropout_34[0][0]                 
__________________________________________________________________________________________________
conv1d_64 

In [132]:
adam = Adam(lr=1e-3)

model_cnn.compile(loss=sparse_categorical_crossentropy,
              optimizer=adam,
              metrics=['acc'])

In [133]:
model_cnn.fit(X_train, y_train.values,
            validation_data=(X_test, y_test.values), 
            epochs=10,
            shuffle=True,
            batch_size=1000,
            verbose=1,
            callbacks=[tbcb , History()])

Train on 5137 samples, validate on 1285 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f205788a908>