### CNN training
This is an updated and slightly cleaned version of the notebook I used to train my CNN as proposed in my thesis (see the pdf). 


In [None]:
#name used for saving models between epochs, etc. Should be a unique identifier for this notebook. 
experiment_name = "example_cnn_showcase"

In [None]:
# we set the seed first to not introduce any randomness during imports
# this should in theory mean our results are reproducible
# however, there seems to be some randomness when using a GPU due to cuDNN
# see https://github.com/keras-team/keras/issues/2479
import numpy as np
np.random.seed(42)

# imports
import os
import pandas as pd
import seaborn as sns
import sys
import re

# Keras - todo, refactor these imports
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten
from keras.models import Model, load_model
from keras.callbacks import ModelCheckpoint
from keras.initializers import Constant
from keras.layers import concatenate, BatchNormalization

# Gensim NLP library (word embeddings and such)
import gensim
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
from gensim.models.keyedvectors import KeyedVectors


import matplotlib.pyplot as plt

# for storing and loading objects
from joblib import dump, load

#labelencoding
from sklearn import preprocessing 

#confusion matrix
from sklearn.metrics import confusion_matrix

# F1-score and friends
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [None]:
# configs

# Number of tokens per sentence: longer sentences are cut off, shorter ones are padded.
MAX_SEQUENCE_LENGTH = 16

# Dictionary size (handed to the tokenizer as its word limit).
MAX_NUM_WORDS = 20000

# Dimensionality of the word embeddings.
# Higher dimensions generally mean longer training, but increased accuracy.
EMBEDDING_DIM = 300

# Averaging scheme used for the F1 / recall / precision scores.
avg = 'weighted'

# Maximum number of rows to load. 0 means no limit; a small value is handy for debugging.
DATA_LIMIT = 0

# Locations of the training and validation files.
data_folder = '../data/'
train_data = data_folder + "train.csv"
val_data = data_folder + "val.csv"

# Models, the tokenizer and the label encoder are stored here.
model_folder = '../models_and_encoders/'

# Output csv files go to this folder.
results_folder = '../results/'

In [None]:
def cleanSentence(text):
    """
    Strips a document down to letters, digits and hyphens.

    Every other character is replaced by a space, after which runs of
    whitespace are collapsed into a single space and leading/trailing
    whitespace is removed. This is only useful when the training data
    has not been cleaned yet; it is too aggressive for real-world use,
    but allows for a quick trial of the network.

    Parameters:
        text (str) : a document to be cleaned

    Returns:
        text (str) : the cleaned document
    """
    # anything that is not a letter, a digit or a hyphen becomes a space
    without_symbols = re.sub("[^a-zA-Z0-9-]", " ", text)

    # collapse the whitespace runs created above, then trim both ends
    collapsed = re.sub(r"\s{2,}", " ", without_symbols)
    return collapsed.strip()

In [None]:
def tokenizetext(texts):
    """
    Turns a list of input texts into fixed-length sequences of integers.

    The notebook-level `tokenizer` maps every text to a sequence of word
    indices. Each sequence is then brought to exactly <MAX_SEQUENCE_LENGTH>
    entries: shorter sequences are padded at the end, longer ones are
    shortened to <MAX_SEQUENCE_LENGTH>. This number can be modified at the
    start of the notebook.

    Parameters:
        texts: a list of texts

    Returns:
        sequences: the padded sequences, <MAX_SEQUENCE_LENGTH> integers long each
    """
    # NOTE(review): only `padding` is set explicitly; which end of an over-long
    # text gets dropped follows the default `truncating` mode of pad_sequences
    # - confirm that this is the intended end.
    return pad_sequences(
        tokenizer.texts_to_sequences(texts),
        maxlen=MAX_SEQUENCE_LENGTH,
        padding='post',
    )

In [None]:
# prepare data: we just need this to look up the feature vector of every word we can encounter. 
# unsure whether to keep this in a function
def preparedataset(data_path, data_limit=DATA_LIMIT, shuffle=False):
    """
    Loads a "|"-separated data file and splits it into X (data) and y (labels).

    The first line of the file is treated as a header and skipped, blank
    lines are ignored. On every other line the first field is the label and
    the second field is the text. Labels with an odd number of characters
    get a leading "0" (works around a string/int conversion issue) and are
    cut to their first 6 characters. Texts are cleaned with cleanSentence
    and split on whitespace.

    Parameters:
        data_path: path to the data file
        data_limit: maximum number of rows to return; 0 (or less) means no limit
        shuffle: boolean whether to shuffle the rows. Not really needed if you
                 shuffle the data during training.
    Returns:
        X: 1-d object array with one list of tokens (strings) per row
        y: array of labels (strings)

    Raises:
        IndexError: if a non-blank data line does not contain a "|" separator
    """

    token_lists, labels = [], []

    # read the file one line at a time
    with open(data_path, "r") as infile:
        for line_number, line in enumerate(infile):
            # the first line is the header
            if line_number == 0:
                continue

            # tolerate blank lines, e.g. an empty last line
            if not line.strip():
                continue

            fields = line.split("|")
            label, text = fields[0], fields[1]

            # restore the leading zero of labels with an odd number of characters
            if len(label) % 2 != 0:
                label = "0" + label

            # cleanup in case this is needed - (not for our data)
            token_lists.append(cleanSentence(text).split())
            labels.append(label[:6])

    # Store the token lists in a 1-d object array, one list per row. Handing
    # the list of lists to np.array() directly fails on recent NumPy versions
    # when the rows differ in length, and yields a 2-d array of strings when
    # all rows happen to have the same length.
    X = np.empty(len(token_lists), dtype=object)
    for row, tokens in enumerate(token_lists):
        X[row] = tokens
    y = np.array(labels)

    # shuffle the data
    if shuffle:
        idx = np.random.permutation(len(y))
        X, y = X[idx], y[idx]

    # apply limit
    if data_limit > 0:
        X, y = X[:data_limit], y[:data_limit]

    return X, y

In [None]:
def _conv_channel(embedded_sequences, kernel_size):
    """
    Builds one convolutional 'channel' on top of the embedded input:
    Conv1D (256 filters) -> BatchNorm -> Flatten -> Dense (1024) -> BatchNorm.

    Parameters:
        embedded_sequences: output tensor of the embedding layer
        kernel_size: width of the convolution, i.e. the n-gram size (in words) it responds to
    Returns:
        the output tensor of the channel
    """
    channel = Conv1D(256, kernel_size, activation='relu')(embedded_sequences)
    channel = BatchNormalization()(channel)
    # flattening spreads the tensor from (e.g.) 15 x 256 -> ,3840 so the
    # data can be used by the fully connected layer
    channel = Flatten()(channel)
    channel = Dense(1024, activation='relu')(channel)
    channel = BatchNormalization()(channel)
    return channel


def getCNN():
    """
    Defines the CNN architecture as proposed in my thesis. For full details, please refer to the pdf found in this repository.

    The CNN consists of 4 'channels' with kernel sizes 2, 3, 4 and 5, which all respond to different n-grams (word level).
    Batch Normalization is applied after every convolution and every dense layer.

    Eventually the channels are merged together, and another dense layer is used for learning more complex representations.
    The softmax output layer gets its size from the number of classes in the label encoder.

    Relies on the notebook-level `embedding_layer` and the fitted label encoder `le`, so those
    cells have to be run first.

    Parameters:
        -
    Returns:
        model: a CNN architecture compliant with Keras' API (not compiled yet)
    """

    s = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(s)

    # one channel per kernel size, created in the order 2, 3, 4, 5
    channels = [_conv_channel(embedded_sequences, kernel_size)
                for kernel_size in (2, 3, 4, 5)]

    #tie them all together
    merged = concatenate(channels)

    #merged gets another dense + BN stack
    merged = Dense(1024, activation='relu')(merged)
    merged = BatchNormalization()(merged)

    # define our softmax output, automatically gets the number of classes from the labelencoder
    preds = Dense(len(le.classes_), activation='softmax')(merged)

    # the keyword is `outputs` (plural); `output=` is not a valid argument of the functional Model API
    model = Model(inputs=s, outputs=preds)
    return model

In [None]:
#get the data (the path to the training file is `train_data` from the config cell)
texts, y = preparedataset(train_data)

In [None]:
#show a couple of texts
texts

In [None]:
# label encoder: fitted on the label strings of the training data
le = preprocessing.LabelEncoder().fit(y)

# tokenizer: fitted on the training texts, limited to MAX_NUM_WORDS words
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)

In [None]:
# now transform the labels
labels = le.transform(y)

In [None]:
#show the list of classes in our labelencoder
le.classes_

In [None]:
# convert the integer labels to one-hot encoded vectors
labels = to_categorical(np.asarray(labels), num_classes=len(le.classes_))

In [None]:
np.shape(y)

In [None]:
word_index = tokenizer.word_index

# tokenize our text now
x_train_a = tokenizetext(texts)

In [None]:
#sanity check, whats the shape of our training data now?
np.shape(x_train_a)

In [None]:
# Load the pre-trained word embeddings; this consumes the keyedvectors file
# and the npy file associated with it.
word_vectors = KeyedVectors.load(model_folder + 'skipg_all_embeddings_march_01_full.kv')

# Build the embedding matrix: row i holds the vector for the word with tokenizer index i.
vocabulary_size = min(len(word_index) + 1, MAX_NUM_WORDS)
embedding_matrix = np.zeros((vocabulary_size, EMBEDDING_DIM))
for word, i in word_index.items():
    # only indices below MAX_NUM_WORDS have a row in the matrix
    if i < MAX_NUM_WORDS:
        try:
            embedding_matrix[i] = word_vectors[word]
        except KeyError:
            # no pre-trained vector for this word: draw a random one instead
            embedding_matrix[i] = np.random.normal(0, np.sqrt(0.25), EMBEDDING_DIM)

# the raw word vectors are not needed anymore
del word_vectors


# The embedding layer starts from the matrix built above.
# trainable=True allows fine-tuning of the embeddings during training.
embedding_layer = Embedding(
    vocabulary_size,
    EMBEDDING_DIM,
    weights=[embedding_matrix],
    trainable=True,
)

In [None]:
#save the labelencoder so we could hypothethically re-run this experiment
dump(le, model_folder+experiment_name+'_labelencoder.joblib') 

#save the tokenizer as well
dump(tokenizer, model_folder+experiment_name+'_tokenizer.joblib') 

In [None]:
# get the model
model = getCNN()

#print a model summary (text)
model.summary()

In [None]:
# define the training callbacks: a checkpoint that saves the model whenever the validation accuracy improves (no early stopping is configured)
callbacks = [ModelCheckpoint(filepath=model_folder+experiment_name+'.hdf5', monitor='val_acc', verbose=1, save_best_only=True)]

In [None]:
#define our optimizer
opt = keras.optimizers.RMSprop(lr=0.0001)

#compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=opt,
              metrics=['acc'])

In [None]:
# sanity check - what is our original text? (the training texts are stored in `texts`)
texts[0]

In [None]:
# sanity check - what is our tokenized text?
x_train_a[0]

In [None]:
# Maps a tokenized sentence back to the original words
def sequence_to_text(list_of_indices):
    """
    Translates a tokenized text (integers) back into words, using the word
    index of the notebook-level tokenizer.

    Parameters:
        - list of indices (usually a line from the training data that was tokenized)

    Returns:
        - a list with the word for every index; indices that do not occur in
          the tokenizer's word index are returned as None
    """
    # invert the word -> index mapping of the tokenizer
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    return [index_to_word.get(index) for index in list_of_indices]

# Creating texts 
detokenizedtext = list(map(sequence_to_text, [x_train_a[0]]))

#sanity check - what is our de-tokenized (back and forth) text?
print(detokenizedtext)

In [None]:
# sanity check - show the label again (the one-hot encoded training labels are stored in `labels`)
le.inverse_transform(np.argmax([labels[0]], axis=1))

In [None]:
# fit the model. You can get coffee now, it'll be a while.
# `labels` holds the one-hot encoded training labels.
# The batch size is set here because it is not defined anywhere else in the
# notebook; 32 is the value Keras uses by default - tune as needed.
batch_size = 32
history = model.fit(x_train_a, labels,
    batch_size=batch_size,
    epochs=5,
    verbose=1,
    shuffle=True,
    validation_split=0.1,
    callbacks = callbacks)

In [None]:
#load the best model as saved during training
model = load_model(filepath=model_folder+experiment_name+'.hdf5')

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['Training Loss', 'Validation Loss'], loc='upper right')
plt.show()

In [None]:
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['Training Data', 'Validation'], loc='upper left')
plt.show()

In [None]:
#show the actual class labels (again, a bit of a sanity check)
le.classes_

In [None]:
#load these again (start here if you want to predict with an older model)
tokenizer = load(model_folder+experiment_name+'_tokenizer.joblib')

le = load(model_folder+experiment_name+'_labelencoder.joblib')

In [None]:
# load the best model saved during training (on the first data set- so not including fine tuning)
# use model_folder so the path matches the location the checkpoint was written to
model = load_model(filepath=model_folder+experiment_name+'.hdf5')

In [None]:
# get the validation data (the path to the validation file is `val_data` from the config cell)
X_v, y_val = preparedataset(val_data, data_limit=0)

In [None]:
#check the shape of the validation data
np.shape(y_val)

In [None]:
#get predictions for the tokenized validation texts
print('Started gathering predictions..')
predictions = model.predict(tokenizetext(X_v))

#convert the model output back to label strings: take the highest scoring class per row
predictions = le.inverse_transform(np.argmax(predictions, axis=1))

#print the accuracy score
print(accuracy_score(y_val, predictions))

#print the F1, recall and precision scores, averaged according to `avg`
print(f1_score(y_val, predictions, average=avg))
print(recall_score(y_val, predictions, average=avg))
print(precision_score(y_val, predictions, average=avg))

In [None]:
#manually inspect a sample
X_v[2]

In [None]:
# and see what the label should be
y_val[2]

In [None]:
#and check what was predicted by our system
predictions[2]

Ideally, we draw up a confusion matrix to see which classes are predicted well and which ones go badly wrong.

In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Prints a confusion matrix and draws it on the current matplotlib figure.

    Parameters:
        cm: the confusion matrix as a 2-d array
        classes: the class names, used as tick labels on both axes
        normalize: if True, every row is divided by its row sum before printing/plotting
        title: title of the plot
        cmap: matplotlib colormap used for the cells

    Returns:
        None. The caller is responsible for creating the figure and calling plt.show().
    """
    # itertools is not imported at the top of the notebook, so import it here
    import itertools

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # write the value into every cell; cells above half of the maximum value
    # get white text, the others black text
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()


# Compute confusion matrix 
#cnf_matrix = confusion_matrix(y_val, predictions)
#np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
#plt.figure(figsize=(72, 64))
#plot_confusion_matrix(cnf_matrix, classes=le.classes_,
#                      title='Confusion matrix, without normalization')

In [None]:
# Plot normalized confusion matrix
#plt.figure(figsize=(72, 64))
#plot_confusion_matrix(cnf_matrix, classes=le.classes_, normalize=True,
#                      title='Normalized confusion matrix')
#
#plt.show()

In [None]:
#plotdf = pd.DataFrame(data={'accuracy': accs, 'num_samples': counts})

#plt.figure(figsize=(18, 9))
#plt.yscale('log')
#plt.xscale('log')
#plt.scatter(counts, accs, s=156)
#plt.title('Accuracy versus number of training samples (log)')

In [None]:
sampleindex = 222

#manually inspect a sample
print(X_v[sampleindex])

# and see what the label should be
print(y_val[sampleindex])

#and check what was predicted by our system
print(predictions[sampleindex])

In [None]:
# Now the good part: the same predictions can also be scored at the HS-2 and HS-4
# level by comparing only the first 2 or 4 characters of every label, which gives
# a rough guess of the performance at these levels.
hs2preds = [code[:2] for code in predictions]
hs2v = [code[:2] for code in y_val]
print(f"HS-2 performance of this model: {accuracy_score(hs2v, hs2preds)}")

hs4preds = [code[:4] for code in predictions]
hs4v = [code[:4] for code in y_val]
print(f"HS-4 performance of this model: {accuracy_score(hs4v, hs4preds)}")

hs6preds = [code[:6] for code in predictions]
hs6v = [code[:6] for code in y_val]
print(f"HS-6 performance of this model: {accuracy_score(hs6v, hs6preds)}")

# accuracy on the full, untruncated labels for comparison
print(f"\n(HS-6 performance was {accuracy_score(y_val, predictions)})")