# Convolutional Long Short-Term Memory Network Experiments with Resampling

## ML Classification for Records Management

Jason Franks

Master of Data Science Minor Thesis

Supervisors: Dr Greg Rolan, Dr Lan Du

## Select TensorFlow 1.x and import libraries

In [None]:
%tensorflow_version 1.x

In [None]:
import numpy as np
from google.colab import files
from google.colab import drive
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense,Conv1D,MaxPooling1D, GlobalMaxPooling1D
from keras.layers import LSTM,Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

# fix random seed for reproducibility
np.random.seed(7)
from prettytable import PrettyTable

import nltk as nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer, balanced_accuracy_score

from imblearn.over_sampling import SMOTE, SVMSMOTE, RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTEENN,SMOTETomek
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive at /content/drive so the notebook can read the data file
# and write its outputs there.
drive.mount('/content/drive')

## **Set the following variables to load the data**

**mount_path**: path into a google drive to your working folder

**data_file**: name of the file containing your data. This must be a tab-separated .tsv file with two columns: 'label', containing the category name, and 'text', containing the record's raw text.

Every category in the data file should have *at least* 10 records.

**resample**: set a resampling strategy. Choose from [None, 'random', 'smote', 'smoteenn']


In [None]:
# Working folder on the mounted Google Drive; the input file is read from
# here and the assessment outputs are written below it.
mount_path = '/content/drive/My Drive/'
# Tab-separated input file with a 'label' column and a 'text' column.
data_file = "all_docs_trimmed.tsv"
# The separator is passed by keyword: current pandas releases accept only the
# file path as a positional argument of read_csv.
all_docs = pd.read_csv(mount_path + data_file, sep="\t")
# Resampling strategy for the training split:
# None, 'random', 'smote' or 'smoteenn'.
resample = None

## Import and prepare the data

In [None]:
def isNumber(s):
    """Report whether ``s`` can be converted with ``float()``.

    Returns True when the conversion succeeds and False when it raises
    ``ValueError``. Any other exception raised by ``float()`` propagates to
    the caller.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit.

    Characters are tested with ``str.isdigit``; an empty string yields False.
    """
    for char in inputString:
        if char.isdigit():
            return True
    return False

# Clean text: tokenize, then drop stopwords, tokens containing digits and very short tokens
def clean_and_drop_stopwords(df, lowercase=False):
    """Clean the 'text' column of ``df`` in place and return ``df``.

    Each document is tokenized with a regular expression that keeps word
    characters, optionally joined once by '-' or '.'. A token is kept only if
    it is not an NLTK English stopword, contains no digit, and is longer than
    two characters. The surviving tokens are joined back with single spaces
    and stored in 'text'.

    The earlier version built helper columns 'pretext' and 'posttext' and
    then called ``df.drop`` without keeping its result, so both columns
    leaked into the returned frame. The cleaned text is now built directly,
    so no helper columns are created.

    Args:
        df: DataFrame with a string 'text' column. It is modified in place.
        lowercase: when true, lower-case the text before tokenizing.

    Returns:
        The same DataFrame object, with 'text' replaced by the cleaned text.
    """
    tokenizer = RegexpTokenizer(r"\w+(?:[-.]\w+)?")
    if lowercase:
        df['text'] = df['text'].str.lower()

    # Make sure the stopword corpus is present before reading it.
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

    def keep(word):
        # One combined test replaces three separate passes over the tokens.
        return (
            word not in stop_words
            and not hasNumbers(word)
            and len(word) > 2
        )

    df['text'] = df['text'].apply(
        lambda text: ' '.join(
            word for word in tokenizer.tokenize(text) if keep(word)
        )
    )
    return df

In [None]:
# Clean the corpus with lower-casing enabled (second argument).
all_docs = clean_and_drop_stopwords(all_docs, True)

In [None]:
# Encode each label as an integer code. pandas numbers category codes in the
# order of the categorical's categories, so the display names are read from
# that same categorical: label_names[i] is the name of code i.
# Series.unique() returns labels in order of first appearance instead, which
# does not line up with the codes.
label_categories = all_docs['label'].astype('category')
label_names = label_categories.cat.categories.to_numpy()
all_docs['label_i'] = label_categories.cat.codes

y_label = all_docs['label_i']
labels = all_docs['label_i'].unique()
# Number of distinct classes; used as the width of the softmax output layer.
num_labels = len(labels)


In [None]:
# Utility functions to help assess the output

def get_within_category_accuracies(cat_list, cm):
    """Compute the per-category accuracy from a confusion matrix.

    For category ``i`` the accuracy is ``cm[i][i] / sum(cm[i])``: the share of
    row ``i`` that lies on the diagonal. Categories whose row sums to zero
    are left out of the result.

    Each accuracy is stored together with its own label. The earlier version
    appended only the accuracy and zipped the list against ``cat_list``
    afterwards, so one skipped row shifted every later accuracy onto the
    wrong label.

    Args:
        cat_list: category labels, in the row order of ``cm``.
        cm: square confusion matrix, indexable as ``cm[row][col]``.

    Returns:
        DataFrame with columns 'label' and 'accuracy', one row per category
        that has at least one entry in its confusion-matrix row.
    """
    rows = []
    for row, label in enumerate(cat_list):
        cm_row = cm[row]
        total = sum(cm_row)
        if total == 0:
            # No entries in this row: accuracy is undefined, skip it.
            continue
        rows.append((label, cm_row[row] / total))

    return pd.DataFrame(rows, columns=['label', 'accuracy'])

def assess_model(test, preds, title, labels, draw_plot=True):
    """Score predictions, print a report and save the results to disk.

    Computes accuracy plus macro-averaged, weighted and per-category F1,
    precision and recall, and the confusion matrix. The summary metrics, the
    per-category accuracies and the confusion matrix are written as CSV
    files under the module-level ``mount_path``; an optional bar chart of
    the per-category accuracy is saved as a PNG.

    Changes from the earlier version: the ``output`` folder is created if it
    is missing, and the chart's tick labels are taken from the rows actually
    plotted rather than from the full ``labels`` list, so they still match
    when a category is left out of ``acc_by_cat``.

    Args:
        test: true class labels.
        preds: predicted class labels, aligned with ``test``.
        title: prefix for the output file names and the chart title.
        labels: category names, in the row/column order of the confusion
            matrix.
        draw_plot: when true, also save the accuracy-by-category bar chart.

    Returns:
        The DataFrame produced by ``get_within_category_accuracies``.
    """
    import os  # local import: only needed to create the output folder

    output_dir = f'{mount_path}/output'
    os.makedirs(output_dir, exist_ok=True)

    # Summary rows keep the original order: accuracy, then macro and
    # weighted variants of f1, precision and recall.
    metrics = [["accuracy", accuracy_score(test, preds)]]
    per_category = {}
    scorers = (("f1", f1_score), ("precision", precision_score), ("recall", recall_score))
    for name, scorer in scorers:
        metrics.append([name, scorer(test, preds, average='macro')])
        metrics.append([f"{name} weighted", scorer(test, preds, average='weighted')])
        per_category[name] = scorer(test, preds, average=None)
    cm = confusion_matrix(test, preds)

    print( "------------Model assessment-----")

    print( "test f1 / category, {}\n".format( per_category["f1"]))
    print( "test precision / category, {}\n".format( per_category["precision"]))
    print( "test recall / category, {}\n".format( per_category["recall"]))

    model_assessment = pd.DataFrame(metrics, columns=["metric", "value"])
    print(model_assessment)
    # NOTE(review): this file goes to mount_path itself while every other
    # artifact goes to mount_path/output -- confirm that this is intended.
    model_assessment.to_csv(f'{mount_path}/{title}_assess.csv', index=False )

    acc_by_cat = get_within_category_accuracies( labels, cm)

    acc_by_cat.to_csv(f'{output_dir}/{title}_acc_by_cat.csv', index=False )

    if draw_plot:
        ax = acc_by_cat.plot.bar( x='label', y='accuracy', title=f'{title} Accuracy by Category', legend=None, figsize=(20,20), fontsize=14)
        ax.set_ylabel("Accuracy", fontsize=12)
        # One tick label per plotted bar.
        ax.set_xticklabels(acc_by_cat['label'], rotation=90, fontsize=12)
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(f'{output_dir}/{title}_Accuracy_by_Category.png', dpi=300)

    print("-------Confusion Matrix---------")
    print(cm)

    cmDF = pd.DataFrame.from_records(cm)
    cmDF.columns=labels
    cmDF.index=labels
    cmDF.to_csv(f'{output_dir}/{title}_cm.csv', index=True)

    return acc_by_cat


In [None]:
# One hot encode the labels
# The integer codes in 'label_i' are passed through a LabelEncoder and then
# expanded to one column per class.
encoder = LabelEncoder()
encoder.fit(all_docs['label_i'])
encoded_Y = encoder.transform(all_docs['label_i'])

dummy_y = np_utils.to_categorical(encoded_Y)
# Trailing expression: displays the one-hot matrix as the cell output.
dummy_y


In [None]:
# Fit a word index on the cleaned text. No num_words cap is given here.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_docs['text'])
# Number of distinct words plus one (presumably to leave room for index 0,
# since Keras word indices start at 1 -- confirm against the Tokenizer docs).
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)
# Replace every document by its sequence of integer word indices.
encoded_X = tokenizer.texts_to_sequences(all_docs['text'])

In [None]:
# Vocabulary size budget for the embedding layer of the model built below.
top_words = 10000 

In [None]:
# Hold out 20% of the documents as the test set, then 20% of the remainder as
# the validation set. Both splits are stratified on the one-hot labels and
# use the same fixed random_state.
X_train_all, X_test, y_train_all, y_test = train_test_split(encoded_X, dummy_y, test_size=0.2, random_state=94606619, stratify=dummy_y)
X_train, X_cv, y_train, y_cv = train_test_split(X_train_all, y_train_all, test_size = 0.2, random_state=94606619, stratify=y_train_all)

In [None]:

# Sequence length used for every split; it is also the input_length of the
# embedding layer in the model below.
max_document_length = 15000
X_train = sequence.pad_sequences(X_train, maxlen=max_document_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_document_length)
X_cv = sequence.pad_sequences(X_cv,maxlen=max_document_length)

In [None]:
# Convert the one-hot test labels back to integer class indices for scoring.
y_test_max = np.argmax(y_test, axis=1)

In [None]:
# Optionally rebalance the training split. Only X_train / y_train are
# resampled; the validation and test splits are left untouched.
if resample == 'random':
    sampler = RandomOverSampler(random_state=42)
elif resample == 'smote':
    sampler = SMOTE(random_state=777, k_neighbors=3)
elif resample == 'smoteenn':
    sampler = SMOTEENN(
        random_state=777,
        smote=SMOTE(random_state=777, k_neighbors=3),
    )
elif resample is None:
    sampler = None
else:
    # Fail loudly on a misspelled strategy instead of silently training on
    # the unbalanced data.
    raise ValueError(
        f"Unknown resample strategy {resample!r}; "
        "choose from None, 'random', 'smote', 'smoteenn'"
    )

if sampler is not None:
    # fit_resample is the current imbalanced-learn entry point for every
    # sampler; the older fit_sample alias is no longer available.
    X_train, y_train = sampler.fit_resample(X_train, y_train)

In [None]:
# Output dimension of the embedding layer (length of each word vector).
embedding_vector_length = 64
# Number of epochs passed to model.fit.
num_epochs = 32

In [None]:
# Record the time just before the model is built and trained.
starttime = datetime.now()
# Trailing expression: displays the timestamp as the cell output.
starttime

In [None]:

# LSTM + CNN
# create the model
# The embedding table needs a row for every index the tokenizer can emit.
# The tokenizer was fitted without a num_words cap, so indices run up to
# vocab_size - 1; sizing the table from top_words alone leaves larger indices
# out of range whenever the corpus has more than top_words distinct words.
embedding_input_dim = max(top_words, vocab_size)
model = Sequential()
model.add(Embedding(embedding_input_dim, embedding_vector_length, input_length=max_document_length))
model.add(Conv1D(filters=100, kernel_size=3, padding='valid', activation='relu', strides=1)) # filters=32
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(250))
model.add(Dropout(0.2))
# One softmax output unit per class.
model.add(Dense(num_labels, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself; wrapping it in print() adds a stray "None".
model.summary()
filepath="weights_best_cnn.hdf5"
# Save the weights whenever the monitored validation accuracy improves.
# NOTE(review): the saved weights are not reloaded in this cell, so the cells
# that follow score the model as it stands after the final epoch.
checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max',save_weights_only=True)
callbacks_list = [checkpoint]
model.fit(X_train, y_train, epochs=num_epochs, batch_size=256,verbose = 1,callbacks = callbacks_list,validation_data=(X_cv,y_cv))


In [None]:
# Record the time once training has finished.
endtime = datetime.now()
# Trailing expression: displays the timestamp as the cell output.
endtime

In [None]:
# Final evaluation of the model
# Scores the held-out test split. scores[1] is reported as the accuracy
# (presumably scores[0] is the loss and scores[1] the 'accuracy' metric set
# at compile time -- confirm against the Keras evaluate docs).
scores = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
# Output of the final softmax layer for every test document: one row per
# document, one column per class.
y_pred = model.predict(X_test)

In [None]:
# Prefix for the output file names; the resampling strategy is appended when
# one is in use so runs with different strategies do not overwrite each other.
title = "C-LSTM"
if resample is not None:
    title = f"C-LSTM-{resample}"

In [None]:
# Pick the highest-scoring class for each test document, then report and
# save the metrics for those predictions.
y_pred_max = np.argmax(y_pred, axis=1)
acc_by_cat = assess_model(y_test_max, y_pred_max, title, label_names.tolist())