# TF-IDF Experiments with resampling

## ML Classification for Records Management

Jason Franks

Master of Data Science Minor Thesis

Supervisors: Dr Greg Rolan, Dr Lan Du

In [None]:
import os
import sys
from datetime import datetime

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, matthews_corrcoef, make_scorer, balanced_accuracy_score
from sklearn.metrics import average_precision_score, auc
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve, roc_curve
from sklearn.preprocessing import label_binarize
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE, SVMSMOTE, RandomOverSampler
from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.combine import SMOTEENN,SMOTETomek


import math

import pandas as pd
import numpy as np
from functools import partial
import io
import nltk as nltk
from nltk.collocations import BigramCollocationFinder, BigramAssocMeasures, TrigramCollocationFinder, TrigramAssocMeasures
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize.mwe import MWETokenizer

from google.colab import files
from google.colab import drive


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive
drive.mount('/content/drive')

## **Set the following variables to load the data**

**mount_path**: path to your working folder inside the mounted Google Drive

**data_file**: name of the file containing your data. This must be a tab-separated .tsv file with two columns: 'label', containing the category name, and 'text', containing the record's raw text.

Every category in the data file should have *at least* 10 records.



In [None]:
# Working folder on the mounted drive; the data file is read from here
mount_path = '/content/drive/My Drive/'
# Tab-separated input file with a 'label' column and a 'text' column
data_file = 'all_docs_trimmed.tsv'


## Import and prepare the data

In [None]:
# Load the tab-separated data file. The delimiter is passed by keyword because
# current pandas releases only accept the file path as a positional argument.
all_docs = pd.read_csv(mount_path + data_file, sep="\t")

In [None]:
# Distinct category names found in the data, and how many there are
label_names = all_docs['label'].unique()
num_labels = len(label_names)

In [None]:
# convert the labels into numbers: each distinct category name gets one integer code
all_docs['label_i'] = all_docs['label'].astype('category').cat.codes

In [1]:
# Utility functions to help assess the output

def get_within_category_accuracies( cat_list, cm ):
    """Compute the fraction of correctly classified records for each category.

    Args:
        cat_list: category names, ordered like the rows of ``cm``.
        cm: square confusion matrix indexable as ``cm[row][col]``, where
            ``cm[i][i]`` is the number of correct predictions for category i.

    Returns:
        DataFrame with columns 'label' and 'accuracy', one row per entry of
        ``cat_list``. A category whose confusion-matrix row sums to zero gets
        NaN, so every label stays paired with its own row.
    """
    rows = []
    for idx, label in enumerate(cat_list):
        cm_row = cm[idx]
        total = sum(cm_row)
        # An empty row has no defined accuracy; record NaN instead of skipping
        # it so the remaining labels do not shift onto the wrong values.
        accuracy = cm_row[idx] / total if total != 0 else float('nan')
        rows.append((label, accuracy))

    return pd.DataFrame(rows, columns=['label', 'accuracy'])

def assess_model(test, preds, title, labels, draw_plot=True):
    """Report test-set metrics for one model and save them to the output folder.

    Prints the per-category F1/precision/recall, a table of aggregate metrics
    and the confusion matrix. Files named after ``title`` are written to
    ``<mount_path>/output``: ``_assess.csv`` (aggregate metrics),
    ``_acc_by_cat.csv`` (within-category accuracy), ``_cm.csv`` (confusion
    matrix) and, when ``draw_plot`` is true, ``_Accuracy_by_Category.png``.

    Args:
        test: true labels.
        preds: predicted labels, aligned with ``test``.
        title: name used in the plot title and as the file-name prefix.
        labels: category names, ordered like the confusion-matrix rows.
        draw_plot: whether to draw and save the accuracy bar chart.

    Returns:
        The DataFrame produced by ``get_within_category_accuracies``.
    """
    # Every artefact goes to the same folder; create it up front so the first
    # write does not fail on a fresh working directory.
    output_dir = os.path.join(mount_path, 'output')
    os.makedirs(output_dir, exist_ok=True)

    final_cat_f1s = f1_score(test, preds, average=None)
    final_cat_precision = precision_score(test, preds, average=None)
    final_cat_recall = recall_score(test, preds, average=None)
    cm = confusion_matrix(test, preds)

    metrics = [
        ["accuracy", accuracy_score(test, preds)],
        ["f1", f1_score(test, preds, average='macro')],
        ["f1 weighted", f1_score(test, preds, average='weighted')],
        ["precision", precision_score(test, preds, average='macro')],
        ["precision weighted", precision_score(test, preds, average='weighted')],
        ["recall", recall_score(test, preds, average='macro')],
        ["recall weighted", recall_score(test, preds, average='weighted')],
    ]

    print( "------------Model assessment-----")

    print( "test f1 / category, {}\n".format( final_cat_f1s))
    print( "test precision / category, {}\n".format( final_cat_precision))
    print( "test recall / category, {}\n".format( final_cat_recall))

    model_assessment = pd.DataFrame(metrics, columns=["metric", "value"])
    print(model_assessment)
    model_assessment.to_csv(os.path.join(output_dir, f'{title}_assess.csv'), index=False)

    acc_by_cat = get_within_category_accuracies( labels, cm)
    acc_by_cat.to_csv(os.path.join(output_dir, f'{title}_acc_by_cat.csv'), index=False)

    if draw_plot:
        ax = acc_by_cat.plot.bar( x='label', y='accuracy', title=f'{title} Accuracy by Category', legend=None, figsize=(20,20), fontsize=14)
        ax.set_ylabel("Accuracy", fontsize=12)
        ax.set_xticklabels(labels, rotation=90, fontsize=12)
        plt.tight_layout()
        fig = ax.get_figure()
        fig.savefig(os.path.join(output_dir, f'{title}_Accuracy_by_Category.png'), dpi=300)

    print("-------Confusion Matrix---------")
    print(cm)

    # Label both axes of the confusion matrix with the category names before saving
    cmDF = pd.DataFrame.from_records(cm)
    cmDF.columns=labels
    cmDF.index=labels
    cmDF.to_csv(os.path.join(output_dir, f'{title}_cm.csv'), index=True)

    return acc_by_cat



In [None]:
# Category names in the same order as the integer codes stored in 'label_i'.
# The codes come from the categorical encoding of 'label', so the names are read
# from that same encoding; unique() would return them in order of appearance,
# which need not match the code order used by the confusion-matrix rows.
label_names = all_docs['label'].astype('category').cat.categories
labels = all_docs['label_i'].unique()

## Tokenize text and find bigrams and trigrams

In [None]:
## Tokenize text, find ngrams, reduce to TF-IDF Vectors
# Keep only records that have text. An explicit copy is taken so the column
# assignments below modify df itself rather than a slice of all_docs.
df = all_docs[pd.notnull(all_docs['text'])].copy()

# Tokens are runs of word characters, optionally joined once by '-' or '.'
tokenizer = RegexpTokenizer(r"\w+(?:[-.]\w+)?")
df['text'] = df['text'].str.lower()
df['pretext'] = df['text'].apply(tokenizer.tokenize)

In [None]:
# Discard any record left without text or without a token list
df = df.dropna(subset=['text', 'pretext'])

In [None]:
def isNumber(s):
    """Return True when ``s`` can be parsed by ``float``, otherwise False."""
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True

In [None]:
def hasNumbers(inputString):
    """Return True when at least one character of ``inputString`` is a digit."""
    for ch in inputString:
        if ch.isdigit():
            return True
    return False

In [None]:
# Fetch the NLTK stop-word corpus and keep the English list as a set
nltk.download('stopwords')
stopWords = set(stopwords.words('english'))

In [None]:
# Flatten every record's token list into one corpus-wide token sequence
allTokens = [token for tokens in df['pretext'] for token in tokens]

# Count three-word sequences across the corpus (the finder is built once)
trigram_measures = TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(allTokens)

# The flat token list is only needed to build the finder; free it
del allTokens

# 'pretext' stays in df for now: it is re-tokenized with the trigrams below
mostFreqTrigrams = finder.nbest(trigram_measures.raw_freq, 500)

# Pull out trigrams: every word must be longer than two characters and free of
# digits, and neither the first nor the last word may be a stop word
# (a stop word in the middle position is allowed)
trigrams = {
    trigram for trigram in mostFreqTrigrams
    if all(
        len(word) > 2 and not isNumber(word) and not hasNumbers(word)
        for word in trigram
    )
    and trigram[0] not in stopWords
    and trigram[2] not in stopWords
}

print(trigrams)

print('Selected [{}] Trigrams.'.format(len(trigrams)))

# Merge each selected trigram into a single multi-word token
tri_mwe_tokenizer = MWETokenizer(trigrams)

df['trigrammed'] = df['pretext'].apply(tri_mwe_tokenizer.tokenize)

In [None]:
# Flatten the trigram-merged token lists into one corpus-wide token sequence
allTokens = [token for tokens in df['trigrammed'] for token in tokens]

# Count two-word sequences across the corpus (the finder is built once)
bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(allTokens)

# The flat token list is only needed to build the finder; free it
del allTokens

mostFreqBigrams = finder.nbest(bigram_measures.raw_freq, 1000)

# Pull out bigrams: both words must be longer than two characters, free of
# digits, and not stop words
bigrams = {
    bigram for bigram in mostFreqBigrams
    if all(
        len(word) > 2
        and not isNumber(word)
        and not hasNumbers(word)
        and word not in stopWords
        for word in bigram
    )
}

print(bigrams)

print('Selected [{}] Bigrams.'.format(len(bigrams)))

# Merge each selected bigram into a single multi-word token
bi_mwe_tokenizer = MWETokenizer(bigrams)

df['bitrigrammed'] = df['trigrammed'].apply(bi_mwe_tokenizer.tokenize)

In [None]:
# Build the cleaned token list for each record in a single pass: drop stop words,
# tokens containing digits, tokens of one or two characters, and tokens that
# start with an underscore
df['posttext'] = df['bitrigrammed'].apply(
    lambda toks: [
        word for word in toks
        if word not in stopWords
        and not hasNumbers(word)
        and len(word) > 2
        and not word.startswith('_')
    ]
)

In [None]:
# Turn each cleaned token list back into one space-separated string
df['posttext'] = df['posttext'].apply(' '.join)

In [None]:

# Remove the raw text and the intermediate token columns in one call
df.drop(columns=['text', 'pretext', 'trigrammed', 'bitrigrammed'], inplace=True)

## Split test and train sets

In [None]:
# Target vector: the integer category code of each record
y_label = df['label_i']

In [None]:
# Hold out 20% of the records for testing, stratified by category, with a fixed seed
x_train_docs, x_test_docs, y_train, y_test = train_test_split(df['posttext'], y_label, test_size=0.2, random_state=94606619, stratify=y_label)

In [None]:
# Word-level TF-IDF with sublinear term frequency; max_df=0.1 and min_df=5 bound
# the document frequency of the terms that are kept
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.1, min_df=5, analyzer='word')
# Fit the vocabulary and weights on the training docs only
x_train = vectorizer.fit_transform(x_train_docs)
# Now use it to process the test docs so they don't influence the training set
x_test = vectorizer.transform(x_test_docs)

In [None]:
# Number of distinct terms kept by the fitted vectorizer
vocab_size = len(vectorizer.vocabulary_)
print(f"Vocabulary size = {vocab_size}")

## Resampling
We'll try SMOTE, random oversampling, and combined over- and undersampling with SMOTE-ENN (SMOTE followed by Edited Nearest Neighbours) on the training set.

In [None]:
# Oversample the minority categories of the training set with SMOTE.
# fit_resample is the resampling method current imbalanced-learn provides,
# and the one the random oversampler in this notebook already uses.
sm = SMOTE(random_state=777, k_neighbors=3)
sm_xtrain_tfidf, sm_train_y = sm.fit_resample(x_train, y_train)

In [None]:
# Category counts after SMOTE, one bin per category
plt.hist(sm_train_y, density=False, bins=num_labels)  

In [None]:
# Combined resampling: SMOTEENN configured with the SMOTE instance defined earlier.
# fit_resample is the resampling method current imbalanced-learn provides,
# and the one the random oversampler in this notebook already uses.
sm_nn = SMOTEENN(random_state=777, smote=sm)
sm_nn_xtrain_tfidf, sm_nn_train_y = sm_nn.fit_resample(x_train, y_train)

In [None]:
# Category counts after SMOTEENN, one bin per category
plt.hist(sm_nn_train_y, density=False, bins=num_labels)  

In [None]:
# Random oversampling of the training set (note: 'sme' holds a RandomOverSampler)
sme = RandomOverSampler(random_state=42)
X_res, y_res = sme.fit_resample(x_train, y_train)

In [None]:
# Category counts after random oversampling, one bin per category
plt.hist(y_res, density=False, bins=num_labels)  

## Model Selection

In [None]:
# utils func to x-validate a model and record its metrics
def score_model(model, x_train, y_train, k_fold):
    """Cross-validate ``model`` and return its mean metrics over the folds.

    Args:
        model: estimator passed to ``cross_validate``.
        x_train: training feature matrix.
        y_train: training labels.
        k_fold: cross-validation splitter (or fold count) passed as ``cv``.

    Returns:
        Dict with one ``xval_test_<metric>`` and one ``xval_train_<metric>``
        entry per metric (accuracy, balanced_accuracy, precision, recall, f1,
        mcc), each the mean over the folds, plus the ``model`` itself under
        the key "model". Precision, recall and F1 are macro-averaged.
    """
    # Macro averaging is the unweighted mean of the per-category scores
    scoring = {
        'accuracy': make_scorer(accuracy_score),
        'balanced_accuracy': make_scorer(balanced_accuracy_score),
        'precision': make_scorer(precision_score, average='macro'),
        'recall': make_scorer(recall_score, average='macro'),
        'f1': make_scorer(f1_score, average='macro'),
        'mcc': make_scorer(matthews_corrcoef)}

    scores = cross_validate(model, x_train, y_train, cv=k_fold, n_jobs=1, scoring = scoring, return_train_score=True )

    # Report every metric for both splits so train and test can be compared
    result = {
        f"xval_{split}_{metric}": np.mean(scores[f"{split}_{metric}"])
        for split in ("test", "train")
        for metric in scoring
    }
    result["model"] = model

    return result

In [None]:
# Fit a model on one training set and report its train/test accuracy and macro F1
def fit_assess_model(model, train_x, train_y, test_x, test_y, title):
    """Fit ``model`` and print accuracy and macro F1 on the train and test sets.

    Args:
        model: estimator to fit; it is fitted in place.
        train_x: training feature matrix.
        train_y: training labels.
        test_x: test feature matrix.
        test_y: test labels.
        title: name shown in the printed header.

    Returns:
        The model's predictions for ``test_x``.
    """
    # Fit the estimator that was passed in, not a global, so the function
    # evaluates whatever model the caller supplies.
    test_model = model.fit(train_x, train_y)

    train_preds = test_model.predict(train_x)
    test_preds = test_model.predict(test_x)

    final_train_accuracy = accuracy_score(train_y, train_preds)
    final_test_accuracy = accuracy_score(test_y, test_preds)
    final_test_f1 = f1_score(test_y, test_preds, average='macro')
    final_train_f1 = f1_score(train_y, train_preds, average='macro')

    print(f"-------{title} accuracy and f1---------")
    print( "test accuracy: {}".format(final_test_accuracy))
    print( "train accuracy: {}".format(final_train_accuracy))
    print( "test f1: {}".format( final_test_f1))
    print( "train f1: {}".format( final_train_f1))

    return test_preds

In [None]:
# Candidate SVMs: RBF kernels over a range of C values, plus one linear kernel
models = [
    SVC(C=c, gamma='auto', kernel='rbf', probability=True)
    for c in (1000, 100, 10, 1, .001)
]
# linear is not very sensitive to different C values, so one setting is tried
models.append(SVC(C=1, gamma='auto', kernel='linear', probability=True))

scoredModels = []

# Seeded like the other random steps in this notebook, so every candidate is
# scored on the same folds and the model selection can be reproduced
k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=777)


In [None]:
# Choose best model using original data without resampling
start = datetime.now()

for m in models:
    print( "Training model [{}]".format(m))
    scoredModel = score_model(m, x_train, y_train, k_fold)
    print('Model metrics: {}.'.format(scoredModel))
    scoredModels.append(scoredModel)

# The winner is the candidate with the highest mean cross-validated test accuracy
best_model_stats = max(scoredModels, key=lambda x: x["xval_test_accuracy"])

print("------------------------------------------------------------------------------------------")
print("Scored models: {0}".format(scoredModels))
print("------------------------------------------------------------------------------------------")
print("Chosen model xval stats: {0}".format(best_model_stats))
# Report how long scoring all candidates took
print("Model selection took: {0}".format(datetime.now() - start))

In [None]:
# Fit the chosen model on the full training set without resampling and predict the test set
print( "---------No resampling:-----------")

test_preds = fit_assess_model(best_model_stats["model"], x_train, y_train, x_test, y_test, "Raw") 

In [None]:
# Detailed metrics and saved outputs for the model trained without resampling
acc_by_cat = assess_model(y_test, test_preds, "SVM", label_names.tolist())

In [None]:
# Now test out the model on the dataset with random resampling:
# refit on the randomly oversampled training set, evaluate on the untouched test set
print( "---------Random resampling:-----------")
test_preds = fit_assess_model(best_model_stats["model"], X_res, y_res, x_test, y_test, "SVM-Random") 

ress_acc_by_cat = assess_model(y_test, test_preds, "SVM-Random", label_names.tolist())


In [None]:
# SMOTE resampling: refit on the SMOTE-resampled training set, evaluate on the untouched test set
print( "---------SMOTE resampling:-----------")

test_preds = fit_assess_model(best_model_stats["model"],  sm_xtrain_tfidf, sm_train_y, x_test, y_test, "TF-IDF - SVM SMOTE") 

sm_acc_by_cat = assess_model(y_test, test_preds, "SVM-SMOTE", label_names.tolist())

In [None]:
# SMOTEENN resampling: refit on the SMOTEENN-resampled training set, evaluate on the untouched test set
print( "---------SMOTE ENN resampling:-----------")

test_preds = fit_assess_model(best_model_stats["model"],  sm_nn_xtrain_tfidf, sm_nn_train_y, x_test, y_test, "SVM-SMOTEENN") 

smnn_acc_by_cat = assess_model(y_test, test_preds, "SVM-SMOTEENN", label_names.tolist())