In [65]:
import pandas as pd 
from utils import train_test_validation_split

from sklearn.feature_extraction.text import CountVectorizer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, Flatten, LSTM
from keras.losses import sparse_categorical_crossentropy, categorical_hinge
from keras import optimizers
from keras.layers import Masking, TimeDistributed

In [45]:
# Load the cleaned questions (X) and their intention labels (y).
# Both files are ';'-separated and use their first column as the index.
X, y = (
    pd.read_csv(csv_path, sep=";", index_col=0)
    for csv_path in ('DATA/clean_input_train.csv', 'DATA/output_train.csv')
)

# Column names of the input frame and the distinct intention labels.
features = X.columns
targets = y['intention'].unique()

array([28, 31, 44, 48, 22, 23, 42, 32, 26,  0, 34, 14,  7, 37,  4, 11, 24,
       40, 46, 30,  8, 38, 13, 21, 15, 27,  5, 33,  6, 25,  1, 50, 43, 45,
       39, 29, 19, 12, 47, 20,  9, 10, 41, 49, 18, 17,  2, 36, 16, 35,  3])

In [3]:
# Estimate the vocabulary size: fit a bag-of-words vectorizer on the questions
# and count the distinct tokens it learned. Only the learned vocabulary is
# needed here, so the document-term matrix is not requested.
vectorizer = CountVectorizer().fit(X['question'])
MAX_NB_WORDS = len(vectorizer.vocabulary_)

In [4]:
# Longest question, measured in whitespace-separated tokens.
# The leading 0 keeps the result at 0 when there are no questions at all.
MAX_SEQUENCE_LENGTH = max(
    [0] + [len(question.split()) for question in X['question']]
)
MAX_SEQUENCE_LENGTH

412

In [51]:
# Preprocess text to feed the net:
#   1. fit a tokenizer on the questions,
#   2. turn each question into a list of integer word ids,
#   3. pad every list to MAX_SEQUENCE_LENGTH so the result is a rectangular array.
texts = X['question']
# Keep only the most frequent half of the estimated vocabulary.
# Floor division keeps num_words an integer (plain '/' yields a float on Python 3).
tokenizer = Tokenizer(num_words=MAX_NB_WORDS // 2)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# word_index maps every token seen during fitting to its integer id.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

# pad_sequences with default settings; the displayed X_train shows the zeros
# being added at the front of each row.
X_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 8830 unique tokens.


In [53]:
# Split the padded sequences and their labels into train / test / validation subsets.
# The split is done by the project helper imported from utils; its proportions and
# any shuffling/seeding are defined there, not in this notebook.
X_train, X_test, X_validation, y_train, y_test, y_validation = train_test_validation_split(X_sequences,y)

In [54]:
# Sanity check: display the padded training sequences (integer word ids, zero-padded).
X_train

array([[   0,    0,    0, ..., 3888,  643, 1209],
       [   0,    0,    0, ...,  132,  250,   13],
       [   0,    0,    0, ..., 1393,   96,   41],
       ...,
       [   0,    0,    0, ...,  240,  463,  368],
       [   0,    0,    0, ...,    3,    2,    1],
       [   0,    0,    0, ...,   24,   10,   29]], dtype=int32)

In [60]:
# Hyper-parameters
EMBEDDING_DIM = 100            # size of each learned word vector
NB_CATEGORIES = len(targets)   # one output unit per distinct intention label
LEARNING_RATE = 0.006          # step size handed to the optimizer

# Optimizer
optz = optimizers.RMSprop(lr=LEARNING_RATE)

In [76]:
# Model: trainable word embedding -> single LSTM layer -> softmax over the intentions.
model_2 = Sequential()
# Vocabulary size is len(word_index) + 1: one row per known token id plus
# one for id 0, which the padded sequences use as filler.
model_2.add(Embedding(len(word_index)+1,
                      EMBEDDING_DIM,
                      input_length=MAX_SEQUENCE_LENGTH,
                      trainable=True))
model_2.add(LSTM(100))
model_2.add(Dense(NB_CATEGORIES))
model_2.add(Activation('softmax'))  # multi-class output; sigmoid would be the choice for binary classification
# Sparse categorical cross-entropy: labels are passed as integer class ids, not one-hot vectors.
model_2.compile(loss=sparse_categorical_crossentropy, optimizer=optz, metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is called bare;
# wrapping it in print() added a stray "None" line to the output.
model_2.summary()
# NOTE(review): the test split is used as validation data here, while
# X_validation / y_validation are not used in this cell. Confirm which held-out
# set is meant for monitoring and which for the final evaluation.
# Keep the History object so per-epoch loss/accuracy can be inspected afterwards.
history_2 = model_2.fit(X_train, y_train.values,
                        validation_data=(X_test, y_test.values),
                        epochs=3, batch_size=64)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_32 (Embedding)     (None, 412, 100)          883100    
_________________________________________________________________
lstm_34 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_22 (Dense)             (None, 51)                5151      
_________________________________________________________________
activation_8 (Activation)    (None, 51)                0         
Total params: 968,651
Trainable params: 968,651
Non-trainable params: 0
_________________________________________________________________
None
Train on 5137 samples, validate on 1285 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7efce8e2ce10>