## Experiment DL learning with custom embedding
Even though the training set is too small to produce a representative word embedding, this notebook runs an experiment to get a rough idea of the score that can be reached with such a custom embedding.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Training inputs: comma-separated CSV (its 'question' column is tokenized further down).
XTrain = pd.read_csv(
    '../../data/staging_data/mispelling_fixed_clean_input_train.csv',
    sep=',',
)
# Training labels: semicolon-separated CSV (its 'intention' column is one-hot encoded further down).
YTrain = pd.read_csv(
    '../../data/POSOS/label.csv',
    sep=';',
)

In [14]:
# Hyper-parameters shared by the tokenizer, the network and the training loop.
num_classes = 51          # width of the softmax output layer
patience = 10             # early-stopping patience, in epochs
vocabulary_size = 10000   # tokenizer `num_words` and embedding input dimension
sequence_length = 30      # length every encoded question is padded/truncated to
embedding_out_dims = 300  # embedding vector size, also reused as the LSTM unit count

In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Build the word index from the training questions; `num_words` caps the
# vocabulary the tokenizer uses when encoding.
tokenizer = Tokenizer(num_words=vocabulary_size)
tokenizer.fit_on_texts(XTrain['question'])

# Turn every question into a list of word indices, then pad/truncate each
# list to exactly `sequence_length` entries so the network gets a fixed-size input.
sequences = tokenizer.texts_to_sequences(XTrain['question'])
XEncodedTrain = pad_sequences(sequences=sequences, maxlen=sequence_length)

### Build the neural network
The NN is composed of 3 layers:
* custom embedding layer
* LSTM layer to learn on word sequence
* fully connected layer to learn classification

In [16]:
import keras
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Activation
from keras.layers.embeddings import Embedding

import tensorflow as tf
# Register an explicit TensorFlow session, built from a default ConfigProto,
# as the Keras backend session.
# Alternative kept for reference: tf.ConfigProto(device_count={"CPU": 32}).
config = tf.ConfigProto()
keras.backend.tensorflow_backend.set_session(
    tf.Session(config=config)
)

# Embedding -> LSTM -> softmax classifier, declared as a single layer list.
model_lstm = Sequential([
    # Learns a `embedding_out_dims`-sized vector per vocabulary index.
    Embedding(vocabulary_size, embedding_out_dims, input_length=sequence_length),
    # Reads the embedded word sequence; dropout applied on inputs and recurrent state.
    LSTM(embedding_out_dims, dropout=0.25, recurrent_dropout=0.25),
    # One softmax output per class.
    Dense(num_classes, activation='softmax'),
])
model_lstm.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model_lstm.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 300)           3000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 300)               721200    
_________________________________________________________________
dense_3 (Dense)              (None, 51)                15351     
Total params: 3,736,551
Trainable params: 3,736,551
Non-trainable params: 0
_________________________________________________________________


### Train the network

In [13]:
# One-hot encode the class-id labels. The width is pinned to `num_classes`
# so the targets always match the model's softmax output layer, even if the
# highest class id happens to be absent from the label file (otherwise
# to_categorical infers the width from the largest label it sees).
YOneHotEncodedTrain = keras.utils.to_categorical(YTrain['intention'], num_classes=num_classes)

In [None]:
import keras.utils

# Early stopping: end training once the validation loss has gone `patience`
# epochs without improving, to limit overfitting.
call_back_early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', mode='auto', baseline=None,
    min_delta=0, patience=patience, verbose=0,
)

# Train for at most 40 epochs, holding out 20% of the samples for validation
# (6422 train / 1606 validation samples in the recorded run).
model_lstm.fit(
    x=XEncodedTrain,
    y=np.array(YOneHotEncodedTrain),
    epochs=40,
    validation_split=0.2,
    callbacks=[call_back_early_stopping],
)

Train on 6422 samples, validate on 1606 samples
Epoch 1/40
Epoch 2/40
1408/6422 [=====>........................] - ETA: 1:55 - loss: 2.1691 - acc: 0.4418

In [None]:
# %load "../utils/vizu.py"
import matplotlib.pyplot as plt
import itertools

def plot_keras_model_learning_curve(history):
    ''' Plot the accuracy of a Keras model on the training and validation sets.

    Args:
        history: object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (e.g. the ``History`` produced by ``fit``).

    Raises:
        KeyError: if the training or validation accuracy series is missing.
    '''
    def _series(*candidate_keys):
        # Keras reports accuracy as 'acc' in older releases and as 'accuracy'
        # in newer ones; use whichever key this history actually contains.
        for key in candidate_keys:
            if key in history.history:
                return history.history[key]
        raise KeyError(candidate_keys[0])

    # Resolve both series first so a missing metric does not leave an empty figure behind.
    train_accuracy = _series('acc', 'accuracy')
    validation_accuracy = _series('val_acc', 'val_accuracy')

    plt.figure(figsize=(12,8))
    plt.plot(train_accuracy)
    plt.plot(validation_accuracy)
    plt.title('model accuracy')
    plt.ylabel('accuracy')
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Args:
        cm: 2-D array-like confusion matrix; rows are drawn as true labels
            and columns as predicted labels.
        classes: sequence of tick labels, one per class.
        normalize: when True, each row is divided by its sum before display.
        title: title of the plot.
        cmap: matplotlib colormap used for the cell colours.
    """
    # Accept nested lists as well as arrays.
    cm = np.asarray(cm)

    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows whose sum is zero (a class with no true samples) are left at 0
        # instead of becoming NaN, which would also break the colour threshold below.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write each cell's value on top of it, switching the text colour so it
    # stays readable on both the dark and the light half of the colormap.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # Run the layout last so the axis labels are included in the computed margins.
    plt.tight_layout()

In [None]:
%matplotlib inline

# Draw the train/validation accuracy curves from the history stored on the fitted model.
plot_keras_model_learning_curve(history=model_lstm.history)

The model overfits rapidly beyond 5 epochs, with no further accuracy improvement on the validation set (the validation accuracy quickly plateaus around 62%).