## Deep learning experiment with a custom embedding
Even though the training set is too small to produce a representative word embedding, this notebook runs an experiment to get a rough idea of the score that can be reached with such an embedding.

In [11]:
import pandas as pd
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras.layers.embeddings import Embedding

In [5]:
# Number of distinct intent classes; sizes the softmax output layer further down.
num_classes = 51

# Labels: semicolon-separated file; the 'intention' column is used as the target later on.
YTrain = pd.read_csv(
    '../../data/POSOS/label.csv',
    sep=';',
)

# Inputs: comma-separated staging file; the 'question' column holds the text to classify.
XTrain = pd.read_csv(
    '../../data/staging_data/mispelling_fixed_clean_input_train.csv',
    sep=',',
)

In [6]:
# Hyper-parameters shared by the tokenizer and the network defined below.
vocabulary_size = 10000  # passed to Tokenizer(num_words=...) and used as the Embedding input size
padding = 25             # fixed length every encoded question is padded / truncated to
embed_out_size = 300     # embedding dimension (also reused as the LSTM width)

# Build the word index from the training questions.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(XTrain['question'])

# Turn each question into a list of word indices ...
sequences = tokenizer.texts_to_sequences(XTrain['question'])

# ... then bring every list to exactly `padding` entries.
# 'pre' padding/truncating are the Keras defaults, written out here for clarity.
XEncodedTrain = pad_sequences(
    sequences,
    maxlen=padding,
    padding='pre',
    truncating='pre',
)

### Build the neural network
The NN is composed of 3 layers:
* custom embedding layer
* LSTM layer to learn on word sequence
* fully connected layer to learn classification

In [9]:
import tensorflow as tf

# Use a TensorFlow session that logs the device each op is placed on.
# (An alternative configuration tried previously: tf.ConfigProto(device_count={"CPU": 32}).)
config = tf.ConfigProto(log_device_placement=True)
session = tf.Session(config=config)
keras.backend.tensorflow_backend.set_session(session)

# Embedding -> LSTM -> softmax classifier, declared in one go.
model_lstm = Sequential([
    # Maps each word index to a trainable vector of size embed_out_size.
    Embedding(input_dim=vocabulary_size, output_dim=embed_out_size, input_length=padding),
    # Reads the sequence of word vectors and returns one vector per question.
    LSTM(units=embed_out_size, dropout=0.2, recurrent_dropout=0.2),
    # One probability per intent class.
    Dense(units=num_classes, activation='softmax'),
])

model_lstm.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 300)           3000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_3 (Dense)              (None, 51)                15351     
Total params: 3,736,551
Trainable params: 3,736,551
Non-trainable params: 0
_________________________________________________________________


### Train the network

In [12]:
# One-hot encode the intent labels to match the softmax / categorical_crossentropy head.
# num_classes is passed explicitly: left to its default, to_categorical sizes the
# output as max(label) + 1, which would be narrower than the Dense(num_classes)
# layer whenever the highest class id is absent from the label file.
YOneHotEncodedTrain = keras.utils.to_categorical(YTrain['intention'], num_classes=num_classes)

In [None]:
import keras.utils

# Fit for 10 epochs; validation_split=0.2 sets a fifth of the samples aside for validation.
model_lstm.fit(
    x=XEncodedTrain,
    y=np.array(YOneHotEncodedTrain),
    epochs=10,
    validation_split=0.2,
)

Train on 6422 samples, validate on 1606 samples
Epoch 1/10
Epoch 2/10
 800/6422 [==>...........................] - ETA: 9s - loss: 2.0658 - acc: 0.4988