# Message Classifier, Multi class with a Neural Network

Gilbert François Duivesteijn (gilbert@deep-impact.ch)



<img src="https://github.com/learning-stack/Colab-ML-Playbook/blob/master/NLP/Automatic%20tagging%20of%20short%20texts/images/dt140704.gif?raw=1" width=800>

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

import itertools

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Input
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping

from keras.utils import to_categorical

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

np.set_printoptions(precision=3, linewidth=100)

In [0]:
# Import custom library from my github

import urllib.request

def download(url):
    """Download *url* into the current working directory.

    The local file name is the last path segment of the URL (everything
    after the final ``/``), used verbatim.

    Args:
        url: Address of the resource to fetch.
    """
    filename = url.split('/')[-1]
    print ('Downloading', filename)
    # The context manager closes the connection even when read() raises;
    # previously it was only closed on the success path.
    with urllib.request.urlopen(url) as response:
        data = response.read()
    with open(filename, 'wb') as myfile:
        myfile.write(data)

# Fetch the helper module utils.py from the repository into the working
# directory so that the import below can find it.
# NOTE(review): this imports code downloaded at run time from a moving branch
# ('master') -- consider pinning the URL to a specific commit.
download('https://raw.githubusercontent.com/learning-stack/Colab-ML-Playbook/master/NLP/Automatic%20tagging%20of%20short%20texts/lib/utils.py')

from utils import plot_confusion_matrix

In [0]:
import matplotlib.pyplot as plt
%matplotlib notebook

In [0]:
!wget https://github.com/gilbertfrancois/pydata2018-tagging/blob/master/data/messages-cls.pkl?raw=true

In [0]:
# Load the messages from the pickle file; the local file name carries the
# '?raw=true' suffix of the URL it was downloaded from.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle
# files from a source you trust.
df_cls_messages = pd.read_pickle('messages-cls.pkl?raw=true')

In [0]:
# Human-readable tag for each class index (0..7), in index order.
dict_classes = dict(enumerate([
    'late/early',
    'holidays',
    'home office',
    'med app',
    'ill',
    'business',
    'in office',
    'miscellaneous',
]))

In [0]:
# Drop incomplete rows. Work on an explicit copy so that the column
# assignments below modify this frame only (and cannot raise pandas'
# SettingWithCopyWarning).
samples = df_cls_messages.dropna().copy()
samples['class'] = samples['class'].astype(np.uint8)

# Normalise the text: lower-case it and expand the abbreviations 'ho' and
# 'homeoffice' to 'home office'.
# Only whole words are replaced. A plain substring replacement of 'ho' also
# rewrites words such as 'home', 'holiday' or 'how', and it turns
# 'homeoffice' into something the second replacement can no longer match.
samples['text'] = samples['text'].str.lower()
samples['text'] = samples['text'].str.replace(r'\b(?:ho|homeoffice)\b', 'home office', regex=True)

len(samples)

In [0]:
# Model input is the message text, the target is the numeric class label.
X, y = samples['text'], samples['class']

# Preview the first ten messages.
X.head(10)

In [0]:
# Embedding: vocabulary size, padded sequence length, embedding vector size
max_features, maxlen, embedding_size = 3000, 50, 100

# Convolution: kernel width, number of filters, max-pooling width
kernel_size, filters, pool_size = 5, 64, 4

# LSTM: number of output units
lstm_output_size = 70

# Training: mini-batch size and maximum number of epochs
batch_size, epochs = 20, 12

## Tokenizing

In [0]:
# Split every message into words and map each word to an integer index.
# - The filter characters are written as a raw string: '\]' in a normal string
#   literal is an invalid escape sequence (a warning on current Python
#   versions). The characters themselves are unchanged.
# - The vocabulary is capped at max_features so that no emitted index can
#   exceed the input dimension of the Embedding(max_features, ...) layer.
tokenizer = Tokenizer(num_words=max_features, filters=r'!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True, split=' ', char_level=False)
tokenizer.fit_on_texts(X)
Xt = tokenizer.texts_to_sequences(X)

# Show one message next to its index sequence.
print(X.iloc[2])
print(Xt[2])

In [0]:
# Word -> integer index mapping built by the tokenizer, and its size.
word_index = tokenizer.word_index
num_words = len(word_index)
print('Number of words in the corpus: {}'.format(num_words))

In [0]:
# Number of tokens per message, with mean and standard deviation
# (useful for judging the padding length maxlen).
len_list = np.array([len(tokens) for tokens in Xt])
mean_len, std_len = len_list.mean(), len_list.std()
print(mean_len, std_len)

## One-hot encoding of the class labels

In [0]:
# One-hot encode the labels after shifting them down by one.
# NOTE(review): this presumes the raw labels start at 1. y is uint8, so a
# label of 0 would wrap around to 255 -- confirm against the data.
yc = to_categorical(y-1)
# Preview the first ten encoded labels.
yc[:10]

## Sequence padding

Make all input vectors the same size by truncating or adding zeros.

In [0]:
# Bring every index sequence to exactly maxlen entries; shorter ones are
# padded at the end ('post').
Xts = sequence.pad_sequences(Xt, maxlen=maxlen, padding='post')
# Show the first sample at each stage. X is a pandas Series that keeps the
# original row labels after dropna(), so its first element has to be taken by
# position: X[0] looks up the *label* 0 and fails if that row was dropped.
print(X.iloc[0], '\n')
print(Xt[0], '\n')
print(Xts[0], '\n')

## Split dataset

In [0]:
# Hold out 10% of the samples for testing; the seed is fixed so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Xts, yc, test_size=0.10, random_state=1)

for caption, part in (('X_train shape: {}', X_train), ('X_test shape : {}', X_test)):
    print(caption.format(part.shape))

## Build and train the model

In [0]:
def build_and_train_model(X_train, X_test, y_train, y_test, verbose=1):
    """Build and train an Embedding -> Conv1D -> MaxPooling1D -> LSTM -> softmax classifier.

    The layers are sized by the module-level hyper-parameters (max_features,
    embedding_size, maxlen, filters, kernel_size, pool_size,
    lstm_output_size); training uses batch_size and epochs.

    Args:
        X_train: Padded index sequences used for fitting.
        X_test: Padded index sequences used as validation data during
            fitting and for the final evaluation.
        y_train: One-hot encoded labels belonging to X_train.
        y_test: One-hot encoded labels belonging to X_test.
        verbose: Verbosity level passed on to ``model.fit``.

    Returns:
        Tuple ``(model, H, acc)``: the fitted model, the object returned by
        ``model.fit`` and the accuracy reported by ``model.evaluate`` on the
        test data.
    """
    # One output unit per label column instead of a hard-coded 8.
    n_classes = y_train.shape[1]

    print('Build model...')
    model = Sequential()
    model.add(Embedding(max_features, embedding_size, input_length=maxlen))
    model.add(Dropout(0.25))
    model.add(Conv1D(filters,
                     kernel_size,
                     padding='valid',
                     activation='relu',
                     strides=1))
    model.add(MaxPooling1D(pool_size=pool_size))
    model.add(LSTM(lstm_output_size))
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))

    # NOTE(review): the test data doubles as validation data for early
    # stopping, so the reported test accuracy is optimistic.
    callback = EarlyStopping(monitor='val_acc', patience=2)

    # The metric is requested as 'acc' so that it is logged as 'acc'/'val_acc'
    # on every Keras version. Since Keras 2.3 metrics are reported under the
    # exact name passed here, and with 'accuracy' the 'val_acc' monitor above
    # would never find its metric.
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

    print('Train...')
    H = model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_test, y_test), callbacks=[callback], verbose=verbose)
    score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
    print('Test accuracy:', acc)
    return model, H, acc

In [0]:
# Train one model; keep the fitted model, its training history and its test accuracy.
model, H, acc = build_and_train_model(X_train, X_test, y_train, y_test)

In [0]:
# Training curves: accuracy on the left, loss on the right.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
# Depending on the Keras version the accuracy is logged as 'acc' or as
# 'accuracy'; accept both.
acc_key = 'acc' if 'acc' in H.history else 'accuracy'
axs[0].plot(H.history[acc_key], label='acc')
axs[0].plot(H.history['val_' + acc_key], label='val_acc')
axs[1].plot(H.history['loss'], label='loss')
axs[1].plot(H.history['val_loss'], label='val_loss')
# The curves carry labels, but these only become visible once a legend is drawn.
for ax in axs:
    ax.set_xlabel('epoch')
    ax.legend()

In [0]:
# Softmax output of the model for every test sequence.
y_pred = model.predict(X_test)

In [0]:
# Reduce the one-hot targets and the predicted score rows to class indices
# once, then reuse them for the report and the confusion matrix.
true_idx = np.argmax(y_test, axis=1)
pred_idx = np.argmax(y_pred, axis=1)

print(classification_report(y_true=true_idx, y_pred=pred_idx))
cm = confusion_matrix(y_true=true_idx, y_pred=pred_idx)
plt.figure()
plot_confusion_matrix(cm, classes=dict_classes.values(), normalize=True);

## Training an ensemble

Since we use early stopping to minimise overfitting, the model is not fully trained yet, and the effect of the random initialisation is noticeable. Hence, when training e.g. 10 times from scratch, you end up with 10 models with different performance. An idea to smooth out this effect is to train multiple times and use all models for the prediction: add up the output vectors of all models and take the $\operatorname{argmax}$ of the sum. This will be your final prediction.

In [0]:
n_models = 10

# Train n_models networks from scratch on the same split. For each run keep
# the fitted model, its training history and its test accuracy.
model_list, H_list, acc_list = [], [], []
for i in range(n_models):
    print('Training model {}'.format(i+1))
    _model, _H, _acc = build_and_train_model(X_train, X_test, y_train, y_test, verbose=0)
    for store, item in ((model_list, _model), (H_list, _H), (acc_list, _acc)):
        store.append(item)

In [0]:
# Average of the test accuracies of the individually trained models.
print('Mean test accuracy: {:.3f}'.format(np.mean(np.array(acc_list))))

In [0]:
# Overlay the training curves of all ensemble members:
# accuracy on the left axis, loss on the right axis.
fig, axs = plt.subplots(1, 2, figsize=(10, 5))
for H in H_list:
    for ax, metric in zip(axs, ('acc', 'loss')):
        for key in (metric, 'val_' + metric):
            ax.plot(H.history[key], label=key)

In [0]:
# Ensemble prediction: add up the output vectors of all models.
# Each model scores the whole test set in one predict() call instead of one
# call per (sample, model) pair. The sums are the same but far fewer calls
# are needed. The accumulator takes its shape from y_test, so the number of
# classes is no longer hard-coded.
y_pred = np.zeros(y_test.shape)
for member in model_list:
    y_pred += member.predict(X_test)

In [0]:
# Accuracy of the summed ensemble prediction on the test split.
accuracy_score(y_true=np.argmax(y_test, axis=1), y_pred=np.argmax(y_pred, axis=1))

In [0]:
# Reduce the one-hot targets and the summed ensemble scores to class indices
# once, then reuse them for all three metrics.
true_idx = np.argmax(y_test, axis=1)
pred_idx = np.argmax(y_pred, axis=1)

print(classification_report(y_true=true_idx, y_pred=pred_idx))
print('Accuracy: {:.3f}'.format(accuracy_score(y_true=true_idx, y_pred=pred_idx)))
cm = confusion_matrix(y_true=true_idx, y_pred=pred_idx)
plt.figure()
plot_confusion_matrix(cm, classes=dict_classes.values(), normalize=True);

In [0]:
def tag_message(message):
    """Classify *message* with the model ensemble and print the resulting tag.

    The message is tokenized and padded like the training data, scored by
    every model in ``model_list``, and the class with the highest average
    score is looked up in ``dict_classes``.

    Args:
        message: Raw message text.
    """
    # NOTE(review): the training text was additionally normalised before
    # tokenizing (abbreviation expansion on the 'text' column); that step is
    # not repeated here -- confirm whether it should be.
    X_new = tokenizer.texts_to_sequences([message])
    x1_new = sequence.pad_sequences(X_new, maxlen=maxlen, padding='post')
    # Average over however many models the ensemble holds; the divisor used
    # to be a hard-coded 10.0.
    scores = np.mean([model.predict(x1_new)[0] for model in model_list], axis=0)
    y_pred = np.argmax(scores)
    print('{:>20} | {}'.format(dict_classes[y_pred], message))

In [0]:
# Try the ensemble on a handful of hand-written messages.
for example in (
    'My alarm clock was not set properly. I come to the office asap.',
    'It is my scheduled day off.',
    'See you on Wednesday.',
    'Not feeling well today, I had to vomit.',
    'I work at home on Tuesday.',
    'This morning I have a meeting at ACME.',
    'The roads are super slippery, will miss the daily',
    'get well soon!',
    'I\'m away for a long lunch between 12:00 and 15:30',
    'I have an appointment at the physio.',
):
    tag_message(example)

## Transfer learning with pretrained embeddings

In [0]:
# Parse the pretrained GloVe vectors into a word -> vector dictionary.
# Each line holds a word followed by the components of its vector.
embeddings_index = {}
# The encoding is given explicitly: without it open() falls back to the
# platform default, which can fail on non-ASCII tokens in the file.
with open('glove/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [0]:
# Sanity check: shape of a single pretrained word vector.
embeddings_index['home'].shape

In [0]:
# Weight matrix for the embedding layer: row i holds the pretrained vector of
# the word with tokenizer index i. One extra row is allocated so that the
# largest index is valid.
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # no pretrained vector for this word: its row stays all-zeros
        continue
    embedding_matrix[i] = embedding_vector

print(embedding_matrix.shape)

In [0]:
print('Build model...')

# Same architecture as before, but the embedding layer is initialised with
# the pretrained vectors in embedding_matrix and frozen (trainable=False).
model = Sequential()
model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix], input_length=maxlen, trainable=False))
model.add(Dropout(0.25))
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
model.add(MaxPooling1D(pool_size=pool_size))
model.add(LSTM(lstm_output_size))
# One output unit per label column instead of a hard-coded 8.
model.add(Dense(y_train.shape[1]))
model.add(Activation('softmax'))

# NOTE(review): the test data doubles as validation data for early stopping,
# so the reported test accuracy is optimistic.
callback = EarlyStopping(monitor='val_acc', patience=1)

# The metric is requested as 'acc' so that it is logged as 'acc'/'val_acc' on
# every Keras version. Since Keras 2.3 metrics are reported under the exact
# name passed here, and with 'accuracy' the 'val_acc' monitor above would
# never find its metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['acc'])

print('Train...')
H = model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(X_test, y_test), callbacks=[callback])
score, acc = model.evaluate(X_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

In [0]:
# Softmax output of the pretrained-embedding model for every test sequence.
y_pred = model.predict(X_test)

In [0]:
# Reduce the one-hot targets and the predicted score rows to class indices
# once, then reuse them for the report and the confusion matrix.
true_idx = np.argmax(y_test, axis=1)
pred_idx = np.argmax(y_pred, axis=1)

print(classification_report(y_true=true_idx, y_pred=pred_idx))
cm = confusion_matrix(y_true=true_idx, y_pred=pred_idx)
plt.figure()
plot_confusion_matrix(cm, classes=dict_classes.values(), normalize=True);