# Transformer Language Model Experiments

## ML Classification for Records Management

Jason Franks

Master of Data Science Minor Thesis

Supervisors: Dr Greg Rolan, Dr Lan Du


## Configure CUDA, then install NVIDIA Apex and SimpleTransformers

In [None]:
%%writefile setup.sh
# CUDA toolkit location exported for the commands below.
export CUDA_HOME=/usr/local/cuda-10.1
# Fetch NVIDIA Apex, then pip-install it from the local checkout with the
# --cpp_ext and --cuda_ext build options.
git clone https://github.com/NVIDIA/apex
pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./apex

In [None]:
# Run the script written to setup.sh by the previous cell.
!sh setup.sh

In [None]:
# Install simpletransformers, which provides the ClassificationModel imported below.
# NOTE(review): no version is pinned here - consider pinning one for reproducibility.
!pip install simpletransformers

In [None]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer, balanced_accuracy_score
from simpletransformers.classification import ClassificationModel

from functools import partial
import io

import nltk as nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.mwe import MWETokenizer

from google.colab import files
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; mount_path (set below) points inside it.
drive.mount('/content/drive')

## **Set the following variables to load the data**

**mount_path**: path into a google drive to your working folder

**data_file**: name of the file containing your data. This must be a tab-separated .tsv file with two columns: 'label', containing the category name, and 'text', containing the record's raw text.

**model_type**: Select the language model to train from ['xlnet', 'bert', 'roberta']

Every category in the data file should have *at least* 10 records.



In [None]:
# Working folder inside the mounted Google Drive (note the trailing slash).
mount_path = "/content/drive/My Drive/"
# Tab-separated input file with 'label' and 'text' columns, read from mount_path.
data_file = "all_docs_trimmed.tsv"
# Language model family to fine-tune: 'xlnet', 'bert' or 'roberta'.
model_type = "xlnet"

## Import and prepare the data

In [None]:
# Load the tab-separated records. The separator is passed by keyword:
# positional use is deprecated in pandas 1.x and rejected by pandas 2.x.
all_docs = pd.read_csv(mount_path + data_file, sep="\t")

In [None]:
# Category names in categorical (sorted) order, so that labels[i] is the name
# behind integer code i once the 'label' column is replaced by its categorical
# codes in a later cell. unique() would return first-appearance order instead,
# which mislabels the per-category reports and the confusion matrix.
labels = all_docs['label'].astype('category').cat.categories.to_numpy()
num_labels = len(labels)

In [None]:
# Utility functions to help assess the output

def get_within_category_accuracies( cat_list, cm ):
    """Compute the per-category accuracy (diagonal / row total) of a confusion matrix.

    Parameters
    ----------
    cat_list : sequence
        Category names; entry ``i`` names row ``i`` of ``cm``.
    cm : array-like
        Square confusion matrix indexed as ``cm[row][col]``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``cat_list`` with columns ``'label'`` and
        ``'accuracy'``. A category whose row sums to zero gets ``NaN`` so that
        every accuracy stays paired with its own label (skipping the row would
        shift all later accuracies onto the wrong labels).
    """
    cm = np.asarray(cm)
    records = []
    for idx, name in enumerate(cat_list):
        row_total = cm[idx].sum()
        if row_total == 0:
            # No samples in this row: accuracy is undefined, not zero.
            records.append((name, float('nan')))
        else:
            records.append((name, cm[idx, idx] / row_total))

    return pd.DataFrame(records, columns=['label', 'accuracy'])

def assess_model(test, preds, title, labels, draw_plot=True):
    """Score predictions, print a report and save the results under ``mount_path``.

    Parameters
    ----------
    test : array-like
        True labels, passed to the scikit-learn scorers as the first argument.
    preds : array-like
        Predicted labels, passed to the scorers as the second argument.
    title : str
        Prefix used in every output file name and in the plot title.
    labels : sequence
        Category names; used for the per-category accuracy table and as the
        row/column names of the saved confusion matrix.
    draw_plot : bool, default True
        When true, also draw and save a bar chart of accuracy by category.

    Returns
    -------
    pandas.DataFrame
        The per-category accuracy table from ``get_within_category_accuracies``.

    Notes
    -----
    Reads the module-level ``mount_path``. Files written:
    ``{title}_assess.csv`` directly under ``mount_path`` and
    ``{title}_acc_by_cat.csv``, ``{title}_cm.csv`` and (optionally)
    ``{title}_Accuracy_by_Category.png`` under ``mount_path/output``.
    """
    output_dir = f'{mount_path}/output'
    # to_csv / savefig do not create missing directories, so make sure it exists.
    os.makedirs(output_dir, exist_ok=True)

    # Rows are appended in a fixed order: accuracy, then for each scorer the
    # macro average followed by the weighted average.
    scorers = (("f1", f1_score), ("precision", precision_score), ("recall", recall_score))
    metrics = [["accuracy", accuracy_score(test, preds)]]
    per_category = {}
    for name, scorer in scorers:
        metrics.append([name, scorer(test, preds, average='macro')])
        metrics.append([f"{name} weighted", scorer(test, preds, average='weighted')])
        per_category[name] = scorer(test, preds, average=None)

    cm = confusion_matrix(test, preds)

    print( "------------Model assessment-----")

    print( "test f1 / category, {}\n".format( per_category["f1"]))
    print( "test precision / category, {}\n".format( per_category["precision"]))
    print( "test recall / category, {}\n".format( per_category["recall"]))

    model_assessment = pd.DataFrame(metrics, columns=["metric", "value"])
    print(model_assessment)
    # NOTE(review): this summary goes to mount_path itself, unlike the other
    # outputs which go to mount_path/output - confirm that is intended.
    model_assessment.to_csv(f'{mount_path}/{title}_assess.csv', index=False )

    acc_by_cat = get_within_category_accuracies( labels, cm)

    acc_by_cat.to_csv(f'{output_dir}/{title}_acc_by_cat.csv', index=False )

    if draw_plot:
        ax = acc_by_cat.plot.bar( x='label', y='accuracy', title=f'{title} Accuracy by Category', legend=None, figsize=(20,20), fontsize=14)
        ax.set_ylabel("Accuracy", fontsize=12)
        # Take tick labels from the plotted rows so they always match the bars.
        ax.set_xticklabels(acc_by_cat['label'], rotation=90, fontsize=12)
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(f'{output_dir}/{title}_Accuracy_by_Category.png', dpi=300)

    print("-------Confusion Matrix---------")
    print(cm)

    cmDF = pd.DataFrame.from_records(cm)
    cmDF.columns=labels
    cmDF.index=labels
    cmDF.to_csv(f'{output_dir}/{title}_cm.csv', index=True)

    return acc_by_cat


In [None]:
# Replace each category name with its integer categorical code.
all_docs['label'] = pd.Categorical(all_docs['label']).codes

In [None]:
def isNumber(s):
    """Return True when ``float(s)`` succeeds, False when it raises ValueError."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

# Clean text: tokenise, then drop stopwords, tokens containing digits and short tokens
def clean_and_drop_stopwords( df, lowercase = False ):
  """Clean the 'text' column of ``df`` in place and return the same frame.

  Steps: optionally lower-case the text, tokenise it with a regular expression
  (word characters, optionally joined once by '-' or '.'), then keep only
  tokens that are not NLTK English stopwords, contain no digit and are longer
  than two characters. The surviving tokens are re-joined with single spaces
  into 'text'.

  The intermediate token lists stay on the frame as 'pretext' (all tokens) and
  'posttext' (kept tokens); cells further down the notebook read and then drop
  these columns.

  NOTE(review): the stopword test is an exact, case-sensitive match, so with
  lowercase=False a capitalised token is only removed if that exact spelling
  is in the stopword list - confirm this is intended.
  """
  if lowercase:
    df['text'] = df['text'].str.lower()

  nltk.download('stopwords')
  stop_words = set(stopwords.words('english'))
  word_tokenizer = RegexpTokenizer(r"\w+(?:[-.]\w+)?")

  def keep(tok):
    # Same three filters as before, applied in a single pass per document.
    return tok not in stop_words and not hasNumbers(tok) and len(tok) > 2

  df['pretext'] = df['text'].apply(word_tokenizer.tokenize)
  df['posttext'] = df['pretext'].apply(lambda toks: [tok for tok in toks if keep(tok)])
  df['text'] = df['posttext'].apply(' '.join)
  return df

In [None]:

# Lower-case the text only for RoBERTa. The check uses model_type (set in the
# configuration cell): model_name is not assigned until the training section,
# so referencing it here fails on a fresh kernel, and its value is a
# checkpoint name such as 'roberta-base' that never equals 'roberta'.
lowercase = (model_type == 'roberta')

df = clean_and_drop_stopwords(all_docs, lowercase)

In [None]:
# Renumber the rows; the old index is kept as an 'index' column, which a later cell drops.
df = df.reset_index(drop=False)

In [None]:
# Join each token list back into a single string. The isinstance guard keeps
# the cell safe to re-run: joining an already-joined string would insert a
# space between every character.
df['posttext'] = df['posttext'].apply(lambda x: x if isinstance(x, str) else ' '.join(x))

In [None]:
# Promote the cleaned text to 'text', then discard the working columns.
df = df.assign(text=df['posttext']).drop(columns=['pretext', 'index', 'posttext'])



## Split test and train sets

In [None]:
# Hold out 20% of the rows for testing, stratified on the label column;
# the fixed random_state makes the split repeatable.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=94606619,
    stratify=df[['label']],
)

In [None]:
# Remove the all_docs name; the cells below work with df, train and test.
del all_docs

## Train model

In [None]:
# Pretrained checkpoint to fine-tune for each supported model family.
pretrained_checkpoints = {
    'xlnet': 'xlnet-base-cased',
    'bert': 'bert-base-cased',
    'roberta': 'roberta-base',
}

# Fail with a clear message on an unsupported model_type instead of leaving
# model_name undefined (or stale from an earlier run of this cell).
if model_type not in pretrained_checkpoints:
    raise ValueError(
        f"Unsupported model_type {model_type!r}; expected one of {sorted(pretrained_checkpoints)}"
    )
model_name = pretrained_checkpoints[model_type]

# Training options passed through to simpletransformers.
model_args = {
    "num_train_epochs": 18,
    "save_eval_checkpoints": False,
    "save_model_every_epoch": False,
    "sliding_window": False,
    'overwrite_output_dir': True,
    "max_seq_length": 256,
}

model = ClassificationModel(model_type, model_name, num_labels=num_labels, args=model_args, use_cuda=True)


In [None]:
# Train the model.
# simpletransformers looks up the 'text' and 'labels' columns by name and
# otherwise falls back to treating column 0 as text and column 1 as labels.
# This frame names its label column 'label', so rename it and select the two
# columns explicitly to make training independent of the TSV column order.
model.train_model(train.rename(columns={'label': 'labels'})[['text', 'labels']])

## Assess Results

In [None]:
def wrap_f1( preds, y):
  """Metric callback: print the macro-averaged F1 of the two label arrays and return it.

  ``preds`` is handed to ``f1_score`` as its first positional argument and
  ``y`` as its second.
  NOTE(review): scikit-learn's first slot is y_true - confirm the order in
  which eval_model passes labels and predictions to extra metrics.
  """
  macro_f1 = f1_score(preds, y, average='macro')
  print(macro_f1)
  return np.mean(macro_f1)

In [None]:
# Evaluate on the held-out rows. As with training, name the columns 'text' and
# 'labels' explicitly so simpletransformers does not fall back to column order.
result, model_outputs, wrong_predictions = model.eval_model(
    test.rename(columns={'label': 'labels'})[['text', 'labels']],
    acc=accuracy_score,
    balanced_accuracy=balanced_accuracy_score,
    f1=wrap_f1,
)

In [None]:
# Display the metrics returned by eval_model.
result

In [None]:
# Overall accuracy, derived from the share of test rows reported as wrong.
error_rate = len(wrong_predictions) / len(test)
accuracy = 1 - error_rate
accuracy


In [None]:
# Predict on the raw test texts; the first element of the result is used as
# the predicted labels in the assessment cell below.
test_texts = list(test['text'])
preds = model.predict(test_texts)

In [None]:
# Score the predictions and write the report files, using the model family as the file-name prefix.
acc_by_cat = assess_model(
    test=test['label'],
    preds=preds[0],
    title=model_type,
    labels=labels,
)