# Final Project

## TRAC-2 Baseline Models

In this notebook we build the baseline models for the TRAC-2 dataset. 

Characteristics of the models:
- Neural Bag of Words architecture
- A single dense layer with dropout
- Use pre-trained GloVe embeddings (dim=300) without fine-tuning them
- Maximum sequence length is 150
- Use the Keras `TextVectorization` layer to tokenize the text




## Package imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, Input, Dense, Lambda, Dropout
from keras.models import Sequential
from tensorflow.keras.layers import TextVectorization
import tensorflow.keras.backend as K
# for hyperparameter tunning
# from keras_tuner import HyperModel
# import keras_tuner as kt
# import sklearn to calculate the metrics
from sklearn import metrics

from sklearn.preprocessing import label_binarize

import statistics

In [2]:
# Show the TensorFlow version in use, so the environment of this run is recorded with the notebook.
print(tf.__version__)

2.6.0


## Load data
Load training, development and test datasets.

In [3]:
# Load the aggressiveness dataset.
# The train and development files are each read into a single frame; for the test split the
# data and the gold labels of each sub-task are stored in separate files.
csv_files = {
    'train': '../../../data/release-files/eng/trac2_eng_train.csv',
    'dev': '../../../data/release-files/eng/trac2_eng_dev.csv',
    'test': '../../../data/release-files/test/trac2_eng_test.csv',
    'gold_a': '../../../data/release-files/gold/trac2_eng_gold_a.csv',
    'gold_b': '../../../data/release-files/gold/trac2_eng_gold_b.csv',
}
loaded_frames = {split: pd.read_csv(path) for split, path in csv_files.items()}

train_data = loaded_frames['train']
dev_data = loaded_frames['dev']
test_data = loaded_frames['test']
test_labels_a = loaded_frames['gold_a']
test_labels_b = loaded_frames['gold_b']

## Helper functions

In [4]:
def from_prob_to_labels(model, x, task):
    '''
    Converts the probabilities predicted by a model into string labels.

    Task A is multi-class: each example gets the class with the highest predicted probability,
    where the columns of the prediction are read in the order [CAG, NAG, OAG].
    Task B is binary and 'GEN' represents the positive class: a probability below 0.5 maps to
    'NGEN', anything else maps to 'GEN'.

    Parameters:
    model: trained model; only its predict(x) method is used
    x: input data, passed as-is to model.predict
    task: either 'A' or 'B'

    Returns:
    1-D numpy array of string labels, one per prediction. For any other value of task an empty
    list is returned.
    '''
    pred = np.asarray(model.predict(x))

    if task == 'A':
        class_names = np.array(['CAG', 'NAG', 'OAG'])
        # index lookup instead of np.vectorize: np.vectorize raises when there are no predictions
        labels = class_names[np.argmax(pred, axis=1)]

    elif task == 'B':
        # a single-unit output comes back with one column per example; flatten it so the labels
        # are 1-D, the same as for task A
        labels = np.where(pred.ravel() < 0.5, 'NGEN', 'GEN')
    else:
        labels = []

    return labels

In [5]:
def to_binary_labels(string_labels, classes_list):
    '''
    Encodes the string labels of a binary classification problem as a flat array of 0s and 1s.
    Parameters:
    string_labels: array with 2 categories defined as strings e.g. ['cat', 'dog', 'dog', ...]
    classes_list: array with the two classes. Their order decides the encoding: the first class
                  is encoded as 0 and the second one as 1.
    '''
    # label_binarize gives one column for a two-class problem; flatten it to a 1-D vector
    binarized = label_binarize(string_labels, classes=classes_list)
    return binarized.flatten()

In [6]:
def to_one_hot_labels(string_labels, classes=None):
    '''
    Returns one-hot encoded labels from a multi-class label vector e.g. ['cat', 'dog', 'dog', 'lion', 'cat', ...]

    Parameters:
    string_labels: array-like with one string label per example
    classes: optional list with all the classes, in the column order wanted for the output.
             When given, the output always has one column per listed class, even if a class does
             not appear in string_labels, and a label that is not in the list is encoded as a row
             of zeros. When omitted (default) the columns are the classes found in string_labels,
             in sorted order.

    Returns:
    numpy array of dtype uint8 with one row per example and one column per class.
    '''
    if classes is not None:
        # a categorical with fixed categories keeps a column for every class, seen or not, so
        # train / development / test encodings always have the same width and column order
        string_labels = pd.Series(pd.Categorical(string_labels, categories=classes))

    # the dtype is set explicitly because the pandas default changed from uint8 to bool in
    # pandas 2.0; this keeps the encoding numeric regardless of the installed version
    labels = pd.get_dummies(string_labels, dtype=np.uint8)

    return labels.to_numpy()

In [7]:
def confusion_matrix_plot(model, input_data, true_labels, task, normalize=None):
    '''
    Plots the confusion matrix of the model predictions with a nice format. Nothing is returned,
    the figure is shown with plt.show().

    Parameters:
    model: trained model
    input_data: data we want to use to evaluate the model
    true_labels: true labels (strings)
    task: 'A' or 'B'
    normalize: if want to normalize the confusion matrix normalize='true'

    Raises:
    ValueError: if task is neither 'A' nor 'B'
    '''
    # class order, used for the rows/columns of the matrix and for the tick labels
    if task == 'A':
        axis_labels = ['CAG', 'NAG', 'OAG']
    elif task == 'B':
        axis_labels = ['GEN', 'NGEN']
    else:
        raise ValueError(f"task must be 'A' or 'B', got {task!r}")

    # get predicted labels (flattened, in case they come back with one column per example)
    pred_labels = np.ravel(from_prob_to_labels(model, input_data, task))

    # Create a confusion matrix.
    # Passing labels= fixes the size and order of the matrix: without it a class that is missing
    # from both the true and the predicted labels would make the matrix smaller than the axes,
    # and the annotation loop below would index out of range.
    cm = metrics.confusion_matrix(true_labels, pred_labels, labels=axis_labels, normalize=normalize)
    cm = np.around(cm, 2)

    # Plot the confusion matrix
    fig, ax = plt.subplots(figsize=(4, 4))
    ax.imshow(cm, cmap="Blues")

    # Create the ticks and labels
    tick_positions = np.arange(len(axis_labels))
    ax.set_xticks(tick_positions)
    ax.set_yticks(tick_positions)
    ax.set_xticklabels(axis_labels)
    ax.set_yticklabels(axis_labels)

    # Axis titles
    ax.set_ylabel('True label', size=12)
    ax.set_xlabel('Predicted label', size=12)

    # Write the value of every cell on top of it (row = true label, column = predicted label)
    for i in range(len(axis_labels)):
        for j in range(len(axis_labels)):
            ax.text(j, i, cm[i, j], ha="center", va="center", color="dimgrey", size=12)

    ax.set_title("Confusion Matrix", size=16, weight="bold")
    fig.tight_layout()
    plt.show()


In [8]:
def loss_accuracy_plots(training_history, xrange, task):
    '''
    Shows two plots side by side with the loss (left) and the accuracy (right) during the
    training process of a NN, for the training and the development data.

    Parameters:
    training_history: object that stores the training history of the NN (from model.fit(...)).
                      Its history dict must contain 'loss' and 'val_loss', plus
                      'categorical_accuracy' / 'val_categorical_accuracy' for task 'A' or
                      'binary_accuracy' / 'val_binary_accuracy' for task 'B'.
    xrange: range in x axis (ticks are drawn every 2 epochs from 0 up to this value)
    task: 'A' or 'B'; also used for the title in the plots. For any other value only the loss
          plot is drawn and the accuracy panel is left empty.
    '''
    history = training_history.history
    # name of the accuracy metric recorded for each task
    accuracy_key = {'A': 'categorical_accuracy', 'B': 'binary_accuracy'}.get(task)

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))

    # (axis, key in the history, title) for every panel to draw
    panels = [(ax1, 'loss', 'Training and validation loss Sub-Task ' + task)]
    if accuracy_key is not None:
        panels.append((ax2, accuracy_key, 'Training and validation accuracy Sub_Task ' + task))

    for ax, key, title in panels:
        # Keras stores the development (validation) curve under the same key with a 'val_' prefix
        ax.plot(history[key], color='black')
        ax.plot(history['val_' + key], color='blue')
        ax.set_title(title)
        ax.legend(['training', 'development'])
        ax.grid(which='both')
        ax.set_xticks(np.arange(0, xrange, 2))

    plt.show()
    

## Prepare the data for modeling


In [9]:
# pull the 'Text' column of every split (train, development, test) out as a numpy array
train_text, dev_text, test_text = (
    np.array(split_frame['Text']) for split_frame in (train_data, dev_data, test_data)
)

In [10]:
# create arrays of labels for train, development and test data

def _labels_as_array(source, column):
    '''
    Returns the labels stored in `column` of the DataFrame `source` as a numpy array.
    If `source` is not a DataFrame it is returned as an array unchanged. This happens when the
    cell is run a second time: test_labels_a / test_labels_b are loaded as DataFrames and are
    replaced below by the extracted arrays, so without this check a re-run would fail.
    '''
    if isinstance(source, pd.DataFrame):
        return np.array(source[column])
    return np.asarray(source)

# Sub-Task A
train_labels_a = _labels_as_array(train_data, 'Sub-task A')
dev_labels_a = _labels_as_array(dev_data, 'Sub-task A')
test_labels_a = _labels_as_array(test_labels_a, 'Sub-task A')

# Sub-Task B
train_labels_b = _labels_as_array(train_data, 'Sub-task B')
dev_labels_b = _labels_as_array(dev_data, 'Sub-task B')
test_labels_b = _labels_as_array(test_labels_b, 'Sub-task B')

In [11]:
# encode labels

# Sub-Task A - one-hot vectors over [CAG,NAG,OAG]
train_labels_a_enc, dev_labels_a_enc, test_labels_a_enc = (
    to_one_hot_labels(split_labels)
    for split_labels in (train_labels_a, dev_labels_a, test_labels_a)
)

# Sub-Task B
# This is a binary classification, so the labels become 0:NGEN, 1:GEN
classes_b = ['NGEN', 'GEN']
train_labels_b_enc, dev_labels_b_enc, test_labels_b_enc = (
    to_binary_labels(split_labels, classes_list=classes_b)
    for split_labels in (train_labels_b, dev_labels_b, test_labels_b)
)


In [12]:
# load pre-trained word embeddings. In this case Glove
# This is commented out to avoid downloading it again
# !wget http://nlp.stanford.edu/data/glove.6B.zip -P ~/data/

In [13]:
# unzip the file
# commented out for the same reason above
# !unzip ~/data/glove.6B.zip -d ~/data/

In [14]:
# Location of the pre-trained GloVe vectors. The file with 300-dimensional embeddings is used.
glove_file = "../../../data/glove6B/glove.6B.300d.txt"

In [15]:
# Settings of the layer that builds the vocabulary index and turns text into token ids

# cap on the vocabulary size - larger vocabularies were tried, but 10,000 is enough
max_tokens = 10000
# every sequence is truncated or padded to this many tokens
max_sequence_lenght = 150

vectorizer = TextVectorization(
    output_sequence_length=max_sequence_lenght,
    max_tokens=max_tokens,
)

Metal device set to: Apple M1


2021-10-20 07:12:31.221072: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-10-20 07:12:31.221156: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [16]:
# train vocabulary
# only the training text is used here; development and test text are not seen by adapt()
vectorizer.adapt(train_text)

# save vocabulary in a variable (must run after adapt(), which is what fills the vocabulary)
vocab = vectorizer.get_vocabulary()

2021-10-20 07:12:31.248825: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-10-20 07:12:31.249023: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
2021-10-20 07:12:31.276697: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


In [17]:
# sanity check: number of entries in the learned vocabulary
print(f'Vocabulary has {len(vocab)} words.')

Vocabulary has 10000 words.


In [18]:
# lookup table {word: index}, where the index is the position of the word in the vocabulary
word_idx = {token: position for position, token in enumerate(vocab)}

In [19]:
# Map words with their vector representation (embeddings)
# Every line of the GloVe file is a word followed by its coefficients, separated by spaces.
embeddings_glove = {}
# The encoding is given explicitly: the file contains non-ASCII tokens, and without it open()
# falls back to the platform default encoding, which is not UTF-8 everywhere.
with open(glove_file, encoding="utf-8") as f:
    for line in f:
        # split only once: first field is the word, the rest of the line are the coefficients
        word, coefs = line.split(maxsplit=1)
        # parse the space-separated coefficients as a float32 vector
        embeddings_glove[word] = np.fromstring(coefs, "f", sep=" ")

print(f'Number of embeddings available: {len(embeddings_glove)}')

Number of embeddings available: 400000


In [20]:
# just to check let's get the dimensions of one of the embeddings
# (the length of the vector should match the embedding dimension of the file that was loaded)
embeddings_glove['home'].shape

(300,)

In [21]:
# build embedding matrix to use it in the model
dimensions_emb = 300
# two rows more than the vocabulary size, meant for the padding and unknown tokens
# NOTE(review): the vocabulary size printed above equals max_tokens, so get_vocabulary() may
# already include the padding / unknown entries - confirm whether the two extra rows are needed
total_tokens = len(vocab) + 2

# start from zeroes: the row of a word without a pre-trained vector stays at zero
embedding_matrix = np.zeros((total_tokens, dimensions_emb))

# split the vocabulary in words that have a GloVe vector and words that do not
with_embedding = [word for word in word_idx if embeddings_glove.get(word) is not None]
without_embedding = [word for word in word_idx if embeddings_glove.get(word) is None]

# copy every available vector into the row given by the index of its word
for word in with_embedding:
    embedding_matrix[word_idx[word]] = embeddings_glove[word]

print(f'Number of words with embeddings found: {len(with_embedding)}')
print(f'Number of words with embeddings NOT found: {len(without_embedding)}')

Number of words with embeddings found: 6928
Number of words with embeddings NOT found: 3072


In [22]:
# prepare input data for the model
# convert the train sentences to sequences of ids
# (each sequence is truncated or padded to max_sequence_lenght, as configured in the vectorizer)
train_input = vectorizer(train_text)

In [23]:
# check the dimensions
# expected: (number of training examples, max_sequence_lenght), i.e. 150 columns
train_input.shape

TensorShape([4263, 150])

In [24]:
# do the same for development and test data
# (the vectorizer adapted on the training text is reused for both)
dev_input = vectorizer(dev_text)
test_input = vectorizer(test_text)

## Model Task A

In [25]:
# Sub-Task A: the same architecture is trained several times from scratch and evaluated on the
# test data; the mean and standard deviation of the test metrics over the runs are reported.

# number of independent training runs
n_runs_a = 5

# initialize lists to keep statistics of all runs
f1_NAG = []
f1_CAG = []
f1_OAG = []
f1_macro = []
f1_weighted = []
accuracy = []

for i in range(n_runs_a):
    # release the Keras state of the previous run so that every run starts from a clean slate
    # (this replaces a `del model` wrapped in a bare except, which hid any kind of error)
    K.clear_session()
    # a fixed but different seed per run: results can be reproduced while the runs still differ.
    # Best effort only - some GPU operations may remain non-deterministic.
    tf.random.set_seed(i)

    # sequential model
    model = tf.keras.Sequential()

    # embedding layer: initialized with the pre-trained vectors and frozen (no fine-tuning)
    model.add(Embedding(embedding_matrix.shape[0],
                        embedding_matrix.shape[1],
                        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                        input_length=max_sequence_lenght,
                        trainable=False))

    # average embedding vectors over the sequence axis (neural bag of words)
    model.add(tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1)))

    # hidden layer
    model.add(Dense(units=10, activation='relu'))

    # dropout layer
    model.add(Dropout(0.3))

    # output layer: one probability per class
    model.add(Dense(3, activation='softmax'))

    # compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam',
                  metrics=[tf.keras.metrics.categorical_accuracy, 'categorical_crossentropy'])

    # train model; the development data is only used to track the validation metrics
    training_history = model.fit(train_input, train_labels_a_enc,
                                 validation_data=(dev_input, dev_labels_a_enc),
                                 epochs=50, verbose=0)

    # evaluate model on the test data
    pred_labels_test_a = from_prob_to_labels(model, test_input, 'A')

    report_a = metrics.classification_report(test_labels_a, pred_labels_test_a,
                                             digits=3, output_dict=True)

    # append values to keep scores
    f1_NAG.append(report_a['NAG']['f1-score'])
    f1_CAG.append(report_a['CAG']['f1-score'])
    f1_OAG.append(report_a['OAG']['f1-score'])
    f1_macro.append(report_a['macro avg']['f1-score'])
    f1_weighted.append(report_a['weighted avg']['f1-score'])
    accuracy.append(report_a['accuracy'])

# calculate mean
f1_NAG_mean = round(statistics.mean(f1_NAG), 3)
f1_CAG_mean = round(statistics.mean(f1_CAG), 3)
f1_OAG_mean = round(statistics.mean(f1_OAG), 3)
f1_macro_mean = round(statistics.mean(f1_macro), 3)
f1_weighted_mean = round(statistics.mean(f1_weighted), 3)
accuracy_mean = round(statistics.mean(accuracy), 3)

# calculate standard deviation (sample standard deviation over the runs)
f1_NAG_std = round(statistics.stdev(f1_NAG), 3)
f1_CAG_std = round(statistics.stdev(f1_CAG), 3)
f1_OAG_std = round(statistics.stdev(f1_OAG), 3)
f1_macro_std = round(statistics.stdev(f1_macro), 3)
f1_weighted_std = round(statistics.stdev(f1_weighted), 3)
accuracy_std = round(statistics.stdev(accuracy), 3)

# report the f1-scores, one section per class / average
for heading, mean_value, std_value in [
    ('Class NAG', f1_NAG_mean, f1_NAG_std),
    ('Class CAG', f1_CAG_mean, f1_CAG_std),
    ('Class OAG', f1_OAG_mean, f1_OAG_std),
    ('Class Macro', f1_macro_mean, f1_macro_std),
    ('Class Weighted', f1_weighted_mean, f1_weighted_std),
]:
    print(heading)
    print(f'Mean f1-score = {mean_value}')
    print(f'Standard deviation f1-score = {std_value}\n')

print('Accuracy')
print(f'Mean = {accuracy_mean}')
print(f'Standard deviation = {accuracy_std}\n')

2021-10-20 07:12:44.848006: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:12:46.062266: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:13:30.271749: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:13:30.520189: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:13:31.341624: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:14:16.906523: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:14:17.172589: I tensorflow/core/grappler/optimizers/cust

Class NAG
Mean f1-score = 0.806
Standard deviation f1-score = 0.017

Class CAG
Mean f1-score = 0.128
Standard deviation f1-score = 0.106

Class OAG
Mean f1-score = 0.58
Standard deviation f1-score = 0.049

Class Macro
Mean f1-score = 0.505
Standard deviation f1-score = 0.056

Class Weighted
Mean f1-score = 0.626
Standard deviation f1-score = 0.04

Accuracy
Mean = 0.693
Standard deviation = 0.024



2021-10-20 07:16:34.384884: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


## Model Task B


In [26]:
# Sub-Task B: the same architecture is trained several times from scratch and evaluated on the
# test data; the mean and standard deviation of the test metrics over the runs are reported.

# number of independent training runs
n_runs_b = 5

# initialize lists to keep statistics of all runs
f1_NGEN = []
f1_GEN = []
f1_macro_b = []
f1_weighted_b = []
accuracy_b = []

for i in range(n_runs_b):
    # release the Keras state of the previous run so that every run starts from a clean slate
    # (this replaces a `del model` wrapped in a bare except, which hid any kind of error)
    K.clear_session()
    # a fixed but different seed per run: results can be reproduced while the runs still differ.
    # Best effort only - some GPU operations may remain non-deterministic.
    tf.random.set_seed(i)

    # sequential model
    model = tf.keras.Sequential()

    # embedding layer: initialized with the pre-trained vectors and frozen (no fine-tuning)
    model.add(Embedding(embedding_matrix.shape[0],
                        embedding_matrix.shape[1],
                        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                        input_length=max_sequence_lenght,
                        trainable=False))

    # average embedding vectors over the sequence axis (neural bag of words)
    model.add(tf.keras.layers.Lambda(lambda x: K.mean(x, axis=1)))

    # hidden layer
    model.add(Dense(units=9, activation='relu'))

    # dropout layer
    model.add(Dropout(0.2))

    # output layer: a single probability, for the positive class
    model.add(Dense(1, activation='sigmoid'))

    # compile model
    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['binary_accuracy', 'binary_crossentropy'])

    # train the model and store the training history
    training_history = model.fit(train_input, train_labels_b_enc,
                                 validation_data=(dev_input, dev_labels_b_enc),
                                 epochs=50, verbose=0)

    # evaluate model on the test data
    pred_labels_test_b = from_prob_to_labels(model, test_input, 'B')

    # zero_division=0: a run that never predicts one of the classes has an undefined precision
    # for it; it is scored as 0 (the same value as the default) without raising a warning
    report_b = metrics.classification_report(test_labels_b, pred_labels_test_b,
                                             digits=3, output_dict=True, zero_division=0)

    # append values to keep scores
    f1_NGEN.append(report_b['NGEN']['f1-score'])
    f1_GEN.append(report_b['GEN']['f1-score'])
    f1_macro_b.append(report_b['macro avg']['f1-score'])
    f1_weighted_b.append(report_b['weighted avg']['f1-score'])
    accuracy_b.append(report_b['accuracy'])

# calculate mean
f1_NGEN_mean = round(statistics.mean(f1_NGEN), 3)
f1_GEN_mean = round(statistics.mean(f1_GEN), 3)
f1_macro_b_mean = round(statistics.mean(f1_macro_b), 3)
f1_weighted_b_mean = round(statistics.mean(f1_weighted_b), 3)
accuracy_b_mean = round(statistics.mean(accuracy_b), 3)

# calculate standard deviation (sample standard deviation over the runs)
f1_NGEN_std = round(statistics.stdev(f1_NGEN), 3)
f1_GEN_std = round(statistics.stdev(f1_GEN), 3)
f1_macro_b_std = round(statistics.stdev(f1_macro_b), 3)
f1_weighted_b_std = round(statistics.stdev(f1_weighted_b), 3)
accuracy_b_std = round(statistics.stdev(accuracy_b), 3)

# report the f1-scores, one section per class / average
for heading, mean_value, std_value in [
    ('Class NGEN', f1_NGEN_mean, f1_NGEN_std),
    ('Class GEN', f1_GEN_mean, f1_GEN_std),
    ('Macro', f1_macro_b_mean, f1_macro_b_std),
    ('Weighted', f1_weighted_b_mean, f1_weighted_b_std),
]:
    print(heading)
    print(f'Mean f1-score = {mean_value}')
    print(f'Standard deviation f1-score = {std_value}\n')

print('Accuracy')
print(f'Mean = {accuracy_b_mean}')
print(f'Standard deviation = {accuracy_b_std}\n')


2021-10-20 07:16:34.732539: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:16:36.493436: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:17:14.391872: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:17:14.664874: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:17:15.448488: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:17:53.561038: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-10-20 07:17:53.819636: I tensorflow/core/grappler/optimizers/cust

Class NGEN
Mean f1-score = 0.924
Standard deviation f1-score = 0.004

Class GEN
Mean f1-score = 0.279
Standard deviation f1-score = 0.256

Macro
Mean f1-score = 0.602
Standard deviation f1-score = 0.129

Weighted
Mean f1-score = 0.83
Standard deviation f1-score = 0.04

Accuracy
Mean = 0.864
Standard deviation = 0.01



2021-10-20 07:19:50.628329: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
