# Message Classifier, Multi Class, Single Label
Gilbert François Duivesteijn (gilbert@deep-impact.ch)


<img src="images/dt010612.gif" width=800>

In [None]:
import os
import pickle as pkl
import re
import string

import matplotlib.pyplot as plt
import nltk
from nltk import tokenize
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_curve

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from lib.utils import plot_confusion_matrix

import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap, BoundaryNorm
from sklearn import neighbors
import matplotlib.patches as mpatches

In [None]:
# Print NumPy arrays with 3 decimals and allow lines up to 100 characters wide.
np.set_printoptions(precision=3, linewidth=100)
# IPython magic: render matplotlib figures with the interactive "notebook" backend.
%matplotlib notebook

In [None]:
# Fetch the NLTK resources used further down: the stop word lists
# (nltk.corpus.stopwords) and 'punkt'
# (presumably required by nltk.tokenize.word_tokenize — TODO confirm).
nltk.download('stopwords')
nltk.download('punkt')

## Library functions

In [None]:
# Shared helpers for the analyzer functions below: a Porter stemmer and the
# analyzer callable of a default-configured CountVectorizer.
stemmer = nltk.PorterStemmer()
analyzer = CountVectorizer().build_analyzer()


def stemmed(doc):
    """
    Splits a document into tokens with the shared analyzer and stems every token.
    :param    doc     Input text
    :return           List of stemmed tokens
    """
    return list(map(stemmer.stem, analyzer(doc)))


def no_stemmed(doc):
    """
    Splits a document into tokens with the shared analyzer, without stemming.
    :param    doc     Input text
    :return           List of tokens
    """
    return list(analyzer(doc))


def mask_integers(s):
    """
    Replaces every run of digits with the _INT token.
    :param    s       Input text
    :return           Output text with replaced integers.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('_INT', s)


# Compiled once instead of on every call. Raw strings keep `\d` a regex escape
# rather than an (invalid) string escape. The alternatives are ordered longest
# first so that "4:15am" is consumed as a whole, not as "4:15" followed by "am".
_TIME_PATTERN = re.compile(
    r'\d{1,2}[:.]\d{2}(?:am|pm|AM|PM)'  # 4:15am, 12.30PM
    r'|\d{1,2}[:.]\d{2}'                # 12:50, 9.05
    r'|\d{1,2}(?:am|pm|AM|PM)'          # 1PM, 11am
)


def mask_times(text):
    """
    Replaces times written like 12:50, 1PM, 4:15am, etc. with the _TIME token.
    :param    text    Input text
    :return           Output text with replaced times.
    """
    return _TIME_PATTERN.sub('_TIME', text)


# Compiled once instead of on every call. Raw strings avoid the invalid `\<` and
# `\>` string escapes, and the hyphen is moved to the end of the character class
# so it is unambiguously a literal (same set of characters as before).
_EMOJI_PATTERN = re.compile(
    r':[a-z_-]*:'  # colon-delimited names such as :smile: or :thumbs-up:
    r'|<.*?>'      # anything enclosed in angle brackets (shortest match)
)


def mask_emojis(text):
    """
    Replaces colon-delimited emoji names and anything between angle brackets
    with the _EMOJI token.
    :param    text    Input text
    :return           Output text with replaced emojis.
    """
    return _EMOJI_PATTERN.sub('_EMOJI', text)


def mask_all(text):
    """
    Applies all masks in a fixed order: times, then emojis, then integers.
    Times go first because the integer mask would otherwise consume their digits.
    :param    text    Input text
    :return           Output text with all masks applied.
    """
    for apply_mask in (mask_times, mask_emojis, mask_integers):
        text = apply_mask(text)
    return text


def train_and_test(steps, X_train, X_test, y_train, y_test):
    """
    Cross-validates, fits and scores a pipeline built from the given steps.

    The cross validation runs on the training split only. Afterwards the pipeline is
    fitted on the complete training split and scored once on the test split. Both
    results are printed.
    :param steps:       List of operations inside the pipeline.
    :param X_train:     Training data
    :param X_test:      Testing data
    :param y_train:     Training labels
    :param y_test:      Testing labels
    :return:            Trained model
    """
    pipeline = Pipeline(steps)
    folds = 10
    xval_score = cross_val_score(pipeline, X_train, y_train, cv=folds, n_jobs=-1)

    # Summarise the per-fold scores as <min, max> and mean.
    xv_min = np.min(xval_score)
    xv_max = np.max(xval_score)
    xv_mean = np.mean(xval_score)
    print('{} fold Cross Validation Score: <{:.2f}, {:.2f}>; µ={:.2f}'.format(folds, xv_min, xv_max, xv_mean))

    # cross_val_score works on clones, so the pipeline itself still has to be fitted.
    pipeline = pipeline.fit(X_train, y_train)
    print('Score on test set: {:.2f}'.format(pipeline.score(X_test, y_test)))
    return pipeline


def tag_message(pipeline, message):
    """
    Predicts the class of a single message and prints "<class name> | <message>".
    :param pipeline:    Fitted model exposing predict()
    :param message:     Text of the message to classify
    """
    predictions = pipeline.predict([message])
    class_name = dict_classes[predictions[0]]
    print('{:>20} | {}'.format(class_name, message))

Challenges:
- Real data that is not easily separable into different classes.
- Multi-class classification, which is more difficult than binary classification.
- Small dataset: only a few samples to train and test on.
- The samples are not evenly distributed over the classes.

| Class | Description                                       |
| ----- | ------------------------------------------------- |
| 1     | Too late, away during office hours or early leave |
| 2     | Holidays or scheduled free days                   |
| 3     | Home Office                                       |
| 4     | Medical appointment                               |
| 5     | Ill, without consulting a doctor                  |
| 6     | Work related absence (at client, conference)      |
| 7     | In office announcement                            |
| 8     | Miscellaneous                                     |

In [None]:
# Maps the numeric class label (1-8, see the table above) to a short display name.
# Used by tag_message() and as tick labels for the confusion matrix plots.
dict_classes = {
    1: 'late/early',
    2: 'holidays',
    3: 'home office',
    4: 'med app',
    5: 'ill',
    6: 'business',
    7: 'in office',
    8: 'miscellanious'
}

## Vector Space Model (VSM)

### Vectorizer

Before we can use the text messages to train a classifier, we have to transform text into numbers. 

In [None]:
# Small toy corpus to demonstrate the vectorizer.
documents = [
    "But I don’t want to go among mad people, Alice remarked.",
    "Oh, you can’t help that, said the Cat: we’re all mad here. I’m mad. You’re mad.",
    "How do you know I’m mad? said Alice.",
    "You must be, said the Cat, or you wouldn’t have come here."
]

# Count word occurrences per document, skipping the built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')
# vectorizer = CountVectorizer(tokenizer=nltk.tokenize.word_tokenize, stop_words='english', strip_accents='unicode')
X = vectorizer.fit_transform(documents)
# Show the learned vocabulary.
vectorizer.vocabulary_

It creates a vector for every document:

In [None]:
# Display the vectorized documents as a dense array.
X.toarray()

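When transforming new text, only words that are already in the fitted vocabulary (such as `cat` in this example) are counted. Unknown words are ignored, so a document that consists solely of unknown words yields an empty (all-zero) vector.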

In [None]:
# Vectorize two new documents with the already fitted vectorizer (no re-fitting).
documents2 = ["Cat: Where are you going?",
            "Alice: Which way should I go?"]
X12 = vectorizer.transform(documents2).toarray()
X12

### Intermezzo: Search engine 

In [None]:
# Document collection as a dense array of the vectorized documents
D = X.toarray()

# The query, vectorized with the same vocabulary as the documents
q = vectorizer.transform(["is alice mad?"]).toarray()

# Do the search by computing the dot product of every document with the query:
# element-wise product here, summed per document row below
res = D * q

# Sort highest ranked documents and show only documents with a score > 0
res_ranked = np.sum(res, axis=1)
# argsort is ascending, so reverse it to get the best match first
res_index_sorted = np.argsort(res_ranked)[::-1]
res_index_sorted_filtered = res_index_sorted[res_ranked[res_index_sorted] > 0]

# Print the search results
for index in res_index_sorted_filtered:
    print('[ score: {} ] {}'.format(res_ranked[index], documents[index]))

### Transformer

With a count vectorizer, every word is counted with the same weight, which makes frequently occurring words too important. There are several ways to penalize frequently occurring words and reward rarely occurring words. The best-known method is called term frequency - inverse document frequency (TF-IDF). TF-IDF is implemented in scikit-learn as `TfidfTransformer`. Another algorithm is Okapi-BM25. 

In [None]:
# Re-weight the raw counts in X with TF-IDF and show the result as a dense array.
transformer = TfidfTransformer()
Xt = transformer.fit_transform(X)
Xt.toarray()

### Stopwords

Removing common words from the documents can improve the performance of the classifier. Both NLTK and CountVectorizer provide lists of stop words in different languages. Beware that the lists are not the same and might give different results.

In [None]:
# Print NLTK's English stop words, sorted, on a single comma-separated line.
for word in sorted(nltk.corpus.stopwords.words('english')):
    print('{}, '.format(word), end='')

In [None]:
# Print the stop words used by the current `vectorizer`, sorted, for comparison
# with the NLTK list.
for word in sorted(vectorizer.get_stop_words()):
    print('{}, '.format(word), end='')

### Stemming

Stemming truncates variations of a word to the same form, which helps the classifier to recognise these variations as the same token.

In [None]:
# Compare the tokens of the same text without and with stemming.
doc11 = 'computer computers computing computed'
print(no_stemmed(doc11))
print(stemmed(doc11))

## Load and prepare the data

In [None]:
# Load the annotated messages from a pickled DataFrame and preview the first 10 rows.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# from a trusted source.
df_cls_messages = pd.read_pickle('data/messages-cls.pkl')
df_cls_messages.head(10)

In [None]:
# Remove all rows which have no annotation. Take an explicit copy so that the
# column assignment below modifies an independent frame; assigning into the bare
# result of dropna() triggers pandas' SettingWithCopyWarning.
samples = df_cls_messages.dropna().copy()

# Convert the classification column to unsigned int, in case it is stored as string
samples['class'] = samples['class'].astype(np.uint8)

# Features are the raw message texts, targets the numeric class labels.
X = samples['text']
y = samples['class']

print('[.] Number of training samples: {}'.format(len(X)))

In [None]:
# Hold out 20% of the samples as test set; the fixed random_state keeps the
# split the same between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=1)

## Building a pipeline and training the first model

Scikit Learn offers a great way to combine the preprocessing (vectorization, stemming, stopword removal, etc) and training/predicting by building a pipeline. Let's see how that works...

For more information on pipelines, have a look at this nice blog post:

https://buhrmann.github.io/sklearn-pipelines.html

In [None]:
# Baseline: plain word counts fed into a linear SVM.
steps = [('vectorizer', CountVectorizer()),
         ('classifier', LinearSVC(random_state=1))]
pipeline1 = train_and_test(steps, X_train, X_test, y_train, y_test)

In [None]:
# Predict the test set with the baseline model and plot the normalized confusion matrix.
y_pred = pipeline1.predict(X_test)
# Not named `cm`: that would overwrite the `matplotlib.cm` module imported as `cm`.
conf_matrix = confusion_matrix(y_pred=y_pred, y_true=y_test)
plt.figure()
plot_confusion_matrix(conf_matrix, classes=dict_classes.values(), normalize=True);

Let's see what parameters are available and have been set by default:

In [None]:
# List every parameter of the pipeline and its steps, including defaults.
pipeline1.get_params()

All objects in the pipeline are stored in a dictionary. You can easily access them like in any ordinary Python dictionary. For example, you may want to transform just a document into a vector. This can be useful if you want to have the output of the preprocessing step for plotting or further analysis. 

In [None]:
# Fetch the fitted vectorizer step from the pipeline and use it on its own.
# Note that this rebinds the notebook-level name `vectorizer`.
vectorizer = pipeline1.get_params()['vectorizer']
vectorizer.transform(['A new document']).toarray()

Let's add a TF-IDF transformer that suppresses the weight of common words and makes special words more important.

In [None]:
# Word counts re-weighted with TF-IDF, then a linear SVM.
steps = [('vectorizer', CountVectorizer()),
         ('tfidf', TfidfTransformer()),
         ('classifier', LinearSVC(random_state=1))]
pipeline2 = train_and_test(steps, X_train, X_test, y_train, y_test)

In [None]:
# Same as before, but counting stemmed tokens (custom analyzer `stemmed`).
steps = [('vectorizer', CountVectorizer(analyzer=stemmed)),
         ('tfidf', TfidfTransformer()),
         ('classifier', LinearSVC(random_state=1))]
pipeline3 = train_and_test(steps, X_train, X_test, y_train, y_test)

In [None]:
# CountVectorizer only applies `stop_words` when it does the analysis itself; with a
# callable `analyzer` the option is ignored, so combining `analyzer=stemmed` with
# `stop_words='english'` kept every stop word. Instead, let the built-in word analyzer
# remove the stop words first and stem what is left.
stopword_analyzer = CountVectorizer(stop_words='english').build_analyzer()


def stemmed_without_stopwords(doc):
    """Tokenizes `doc`, drops English stop words and stems the remaining tokens."""
    return [stemmer.stem(w) for w in stopword_analyzer(doc)]


steps = [('vectorizer', CountVectorizer(analyzer=stemmed_without_stopwords)),
         ('tfidf', TfidfTransformer()),
         ('classifier', LinearSVC(random_state=1))]
pipeline4 = train_and_test(steps, X_train, X_test, y_train, y_test)

In [None]:
# Unstemmed tokens again, now counting unigrams and bigrams.
steps = [('vectorizer', CountVectorizer(ngram_range=(1, 2))),
         ('tfidf', TfidfTransformer()),
         ('classifier', LinearSVC(random_state=1))]
pipeline5 = train_and_test(steps, X_train, X_test, y_train, y_test)

In [None]:
# Extend the n-gram range to unigrams, bigrams and trigrams.
steps = [('vectorizer', CountVectorizer(ngram_range=(1, 3))),
         ('tfidf', TfidfTransformer()),
         ('classifier', LinearSVC(random_state=1))]
pipeline6 = train_and_test(steps, X_train, X_test, y_train, y_test)

In [None]:
# With a callable `analyzer`, CountVectorizer ignores `preprocessor`, `stop_words` and
# `ngram_range`, so the previous combination with `analyzer=stemmed` behaved exactly like
# plain stemming. Build the fully configured word analyzer first and stem its output.


def lower_and_mask(text):
    """Lower-cases the text (a custom preprocessor replaces the default lower-casing
    stage) and then masks times, emojis and integers."""
    return mask_all(text.lower())


masked_ngram_analyzer = CountVectorizer(preprocessor=lower_and_mask,
                                        stop_words='english',
                                        ngram_range=(1, 3)).build_analyzer()


def stemmed_masked_ngrams(doc):
    """Masks, tokenizes, removes stop words, builds 1-3 grams and stems every word
    inside each n-gram."""
    return [' '.join(stemmer.stem(w) for w in gram.split())
            for gram in masked_ngram_analyzer(doc)]


steps = [('vectorizer', CountVectorizer(analyzer=stemmed_masked_ngrams)),
         ('tfidf', TfidfTransformer()),
         ('classifier', LinearSVC(random_state=1))]
pipeline7 = train_and_test(steps, X_train, X_test, y_train, y_test)

## One vs Rest Classifiers

Let's train 8 classifiers: One vs Rest

In [None]:
# One-hot encode the class labels so that every class gets its own binary target column.
enc = OneHotEncoder()
ym_train = enc.fit_transform(np.array(y_train.tolist()).reshape(-1,1))
ym_test = enc.transform(np.array(y_test.tolist()).reshape(-1,1))

# With a callable `analyzer`, CountVectorizer ignores `preprocessor` and `stop_words`, so
# passing them next to `analyzer=stemmed` had no effect. Build the configured word
# analyzer first and stem its output.


def lower_and_mask_ovr(text):
    """Lower-cases the text (a custom preprocessor replaces the default lower-casing
    stage) and then masks times, emojis and integers."""
    return mask_all(text.lower())


masked_analyzer_ovr = CountVectorizer(preprocessor=lower_and_mask_ovr,
                                      stop_words='english').build_analyzer()


def stemmed_masked_ovr(doc):
    """Masks, tokenizes, removes stop words and stems the remaining tokens."""
    return [stemmer.stem(w) for w in masked_analyzer_ovr(doc)]


steps = [('vectorizer', CountVectorizer(analyzer=stemmed_masked_ovr)),
         ('transformer', TfidfTransformer()),
         ('classifier', OneVsRestClassifier(LinearSVC(random_state=1, multi_class='ovr')))]
pipeline8 = train_and_test(steps, X_train, X_test, ym_train, ym_test)

In [None]:
# With a callable `analyzer`, CountVectorizer ignores `ngram_range`, `stop_words` and
# `preprocessor`, so passing them next to `analyzer=stemmed` had no effect. Build the
# configured word analyzer first and stem its output.


def lower_and_mask_ovo(text):
    """Lower-cases the text (a custom preprocessor replaces the default lower-casing
    stage) and then masks times, emojis and integers."""
    return mask_all(text.lower())


masked_ngram_analyzer_ovo = CountVectorizer(ngram_range=(1, 3),
                                            stop_words='english',
                                            preprocessor=lower_and_mask_ovo).build_analyzer()


def stemmed_masked_ngrams_ovo(doc):
    """Masks, tokenizes, removes stop words, builds 1-3 grams and stems every word
    inside each n-gram."""
    return [' '.join(stemmer.stem(w) for w in gram.split())
            for gram in masked_ngram_analyzer_ovo(doc)]


# One binary classifier per pair of classes, trained on the original integer labels.
steps = [('vectorizer', CountVectorizer(analyzer=stemmed_masked_ngrams_ovo)),
         ('transformer', TfidfTransformer()),
         ('classifier', OneVsOneClassifier(LinearSVC(random_state=1, multi_class='ovr')))]
pipeline9 = train_and_test(steps, X_train, X_test, y_train, y_test)

In [None]:
# Predict the test set with pipeline9 and print the per-class classification report.
y_pred = pipeline9.predict(X_test)

print(classification_report(y_pred=y_pred, y_true=y_test))

In [None]:
# Plot the normalized confusion matrix of the current predictions in `y_pred`.
# Not named `cm`: that would overwrite the `matplotlib.cm` module imported as `cm`.
conf_matrix = confusion_matrix(y_pred=y_pred, y_true=y_test)
plt.figure()
plot_confusion_matrix(conf_matrix, classes=dict_classes.values(), normalize=True);

In [None]:
%%time

steps = [('vectorizer', CountVectorizer()),
         ('transformer', TfidfTransformer()),
         ('classifier', SVC(kernel='linear', random_state=1))]
pipeline10 = Pipeline(steps)

params = {
    'vectorizer__tokenizer': [None, nltk.tokenize.word_tokenize],
    'vectorizer__analyzer': ['word', stemmed],
    'vectorizer__stop_words': [None, nltk.corpus.stopwords.words('english'), 'english'],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3)],
    'vectorizer__preprocessor': [None, mask_all, mask_integers, mask_times, mask_emojis],
    'classifier__C': np.logspace(-2, 2, 5),
    'classifier__gamma': np.logspace(-5, 3, 9)
}

gs = GridSearchCV(pipeline10, params, n_jobs=-1)
gs.fit(X_train, y_train)

print(gs.best_params_)
y_pred = gs.predict(X_test)
print(classification_report(y_pred=y_pred, y_true=y_test))
print('Score on the test set: {:.2f}'.format(gs.score(X_test, y_test)))

You can try a gridsearch with another classifier. But it is hard to beat Linear SVM.

In [None]:
%%time

steps = [('vectorizer', CountVectorizer()),
         ('transformer', TfidfTransformer()),
         ('classifier', RandomForestClassifier())]
pipeline11 = Pipeline(steps)

params = {
    'vectorizer__tokenizer': [None, nltk.tokenize.word_tokenize],
    'vectorizer__analyzer': ['word', stemmed],
    'vectorizer__stop_words': [None, nltk.corpus.stopwords.words('english'), 'english'],
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2), (1, 3)],
    'vectorizer__preprocessor': [None, mask_all, mask_integers, mask_times, mask_emojis]
    'classifier__n_estimators': [128, 512, 2048],
    'classifier__bootstrap': [True, False],
    'classifier__criterion': ['gini', 'entropy'], 
    'classifier__max_depth': [2, 4, 8, None],
    'classifier__min_samples_leaf': [1, 3, 10], 
    'classifier__min_samples_split': [2, 8, 16]
}

gs = GridSearchCV(pipeline11, params, n_jobs=-1)
gs.fit(X_train, y_train)

print(gs.best_params_)
y_pred = gs.predict(X_test)
print(classification_report(y_pred=y_pred, y_true=y_test))
print('Score on the test set: {:.2f}'.format(gs.score(X_test, y_test)))

In [None]:
# Use the result of the most recent grid search as the model for the examples.
pipeline = gs

# A few hand-written messages to see how the model tags unseen text.
example_messages = [
    'My alarm clock was not set properly. I come to the office asap.',
    'It is my scheduled day off, see you on Tuesday.',
    'Not feeling well today, I stay home and work from here.',
    'I work at home on Tuesday.',
    'This morning I have a meeting at SPS.',
    'I\'m off, see you tomorrow.',
    'get well soon!',
    'I\'m away for a long lunch between 12:00 and 15:30',
    'I\'ve an appointment at 12:00 at the physiotherapy.',
]

for example_message in example_messages:
    tag_message(pipeline, example_message)