# Procedural

Mount my drive and create a folder for the data if it doesn't already exist

In [1]:
3# Mount my drive
from google.colab import drive
drive.mount('/content/drive')

# Create a folder for the data if it does not already exist
import os
if not os.path.exists('/content/drive/MyDrive/MastersProject/data/'):
    os.makedirs('/content/drive/MyDrive/MastersProject/data/')
    print("Created the folder!")
else:
    print("Folder already existed!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder already existed!


In [2]:
!pip install psutil



In [3]:
!pip install transformers



In [4]:
import time
import pandas as pd
import torch
import numpy as np
from torch import nn, optim, utils
import psutil
from sklearn.metrics import confusion_matrix
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import random
import transformers



In [5]:
RANDOM_SEED = 42

# Create the classifer

Set the hyperparameters needed for creating the classifier

In [6]:
NUMBER_NEURONS_LAYER_1 = 0
NUMBER_NEURONS_LAYER_2 = 0
NUMBER_NEURONS_LAYER_3 = 0
NUMBER_NEURONS_LAYER_4 = 0
DROPOUT_PROPORTION = 0.8
POOLED_BERT_OUTPUT = False

Use GPU if available

In [7]:
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.cuda.empty_cache()
print(device)

cuda:0


Specify the architecture of the classifier

In [8]:
class SentimentClassifier(nn.Module):
    '''
    The sentiment classifier class that handles BERT's output
    '''
    def __init__(self, n_neurons_1, n_neurons_2, n_neurons_3, n_neurons_4, dropout_proportion, pooled_bert_output):
        super(SentimentClassifier, self).__init__()

        # Instantiate the straight forward attributes
        self.n_neurons_1 = n_neurons_1
        self.n_neurons_2 = n_neurons_2
        self.n_neurons_3 = n_neurons_3
        self.n_neurons_4 = n_neurons_4
        self.dropout_proportion = dropout_proportion
        self.pooled_bert_output = pooled_bert_output

        # Determine the features of each sample input depending on whether it is the pooled output or full last hidden state of BERT.
        if self.pooled_bert_output:
          self.feats_in = 768
        else:
          self.feats_in = 768 * 512

        # Determine the number of layers based on the given neurons for each layer prior to the last which by default has 1 neuron.
        if self.n_neurons_1 == 0:
          self.num_layers = 1
        elif self.n_neurons_2 == 0:
          self.num_layers = 2
        elif self.n_neurons_3 == 0:
          self.num_layers = 3
        elif self.n_neurons_4 == 0:
          self.num_layers = 4
        else:
          self.num_layers = 5

        # Structure the architecture of the network depending on the number of layers and their number of neurons
        if self.num_layers == 1:

          self.classifier = nn.Sequential(
              nn.Linear(self.feats_in, 1),
              nn.Sigmoid())

        elif self.num_layers == 2:

          self.classifier = nn.Sequential(
              nn.Linear(self.feats_in, self.n_neurons_1),
              nn.ReLU(),
              nn.BatchNorm1d(self.n_neurons_1),
              nn.Dropout(p=self.dropout_proportion),

              nn.Linear(self.n_neurons_1, 1),
              nn.Sigmoid())
          
        elif self.num_layers == 3:

          self.classifier = nn.Sequential(
              nn.Linear(self.feats_in, self.n_neurons_1),
              nn.ReLU(),
              # nn.BatchNorm1d(self.n_neurons_1),
              # nn.Dropout(p=0.9),

              nn.Linear(self.n_neurons_1, self.n_neurons_2),
              nn.ReLU(),
              # nn.BatchNorm1d(self.n_neurons_2),
              # nn.Dropout(p=self.dropout_proportion),
              
              nn.Linear(self.n_neurons_2, 1),
              nn.Sigmoid())
          
        elif self.num_layers == 4:

          self.classifier = nn.Sequential(
              nn.Linear(self.feats_in, self.n_neurons_1),
              nn.ReLU(),
              # nn.BatchNorm1d(self.n_neurons_1),
              # nn.Dropout(p=0.9),

              nn.Linear(self.n_neurons_1, self.n_neurons_2),
              nn.ReLU(),
              # nn.BatchNorm1d(self.n_neurons_2),
              # nn.Dropout(p=self.dropout_proportion),

              nn.Linear(self.n_neurons_2, self.n_neurons_3),
              nn.ReLU(),
              # nn.BatchNorm1d(self.n_neurons_3),
              # nn.Dropout(p=self.dropout_proportion),
              
              nn.Linear(self.n_neurons_3, 1),
              nn.Sigmoid())
          
        elif self.num_layers == 5:

          self.classifier = nn.Sequential(
              nn.Linear(self.feats_in, self.n_neurons_1),
              nn.ReLU(),
              nn.BatchNorm1d(self.n_neurons_1),
              nn.Dropout(p=0.9),

              nn.Linear(self.n_neurons_1, self.n_neurons_2),
              nn.ReLU(),
              nn.BatchNorm1d(self.n_neurons_2),
              nn.Dropout(p=self.dropout_proportion),

              nn.Linear(self.n_neurons_2, self.n_neurons_3),
              nn.ReLU(),
              nn.BatchNorm1d(self.n_neurons_3),
              nn.Dropout(p=self.dropout_proportion),

              nn.Linear(self.n_neurons_3, self.n_neurons_4),
              nn.ReLU(),
              nn.BatchNorm1d(self.n_neurons_4),
              nn.Dropout(p=self.dropout_proportion),
              
              nn.Linear(self.n_neurons_4, 1),
              nn.Sigmoid())
            
    def forward(self, X):
        if not self.pooled_bert_output:
            batch_size = X.shape[0]
            X = X.view(batch_size, -1)
        
        out = self.classifier(X)
        out = out.view(-1)

        return out

Instantiate the classifier

In [9]:
model = SentimentClassifier(NUMBER_NEURONS_LAYER_1, NUMBER_NEURONS_LAYER_2, NUMBER_NEURONS_LAYER_3, NUMBER_NEURONS_LAYER_4,
                            DROPOUT_PROPORTION, POOLED_BERT_OUTPUT)
model = model.to(device)

# Print info on the model
print(model)
print()
model_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("The total number of trainable parameters in the classifier is: {}".format(model_trainable_params))

SentimentClassifier(
  (classifier): Sequential(
    (0): Linear(in_features=393216, out_features=1, bias=True)
    (1): Sigmoid()
  )
)

The total number of trainable parameters in the classifier is: 393217


# Training

Set the hyperparameters needed for training

In [10]:
LR = 0.000005
EPOCHS = 2
OPTIMISER = "AdamW"   # in {SGD, AdamW}
BATCH_SIZE = 32

Create a function that yields evaluation metrics from a give confusion matrix

In [11]:
def get_metrics_from_conf_matrix(confusion_matrix: np.ndarray, print_on=False):
    '''
    Takes in a confusion matrix and deduces evaluation metrics based on it.

    params: confusion_matrix: numpy array giving the confusion matrix of binary classification
            print_on: Boolean determining whether or not to print input confusion matrix and deduced matrices
    return: Various self-explanatory evaluation metrics
    '''

    # Get TP, FP, FN, TN (with different names).
    true_asshole = confusion_matrix[1][1]
    false_asshole = confusion_matrix[0][1]
    true_sweetheart = confusion_matrix[0][0]
    false_sweetheart = confusion_matrix[1][0]

    # Calculate
    accuracy = (true_asshole + true_sweetheart) / (true_sweetheart + true_asshole + false_asshole + false_sweetheart)
    asshole_detection_precision = true_asshole / (true_asshole + false_asshole)
    sweetheart_detection_precision = true_sweetheart / (true_sweetheart + false_sweetheart)
    asshole_detection_accuracy = true_asshole / (true_asshole + false_sweetheart)
    sweetheart_detection_accuracy = true_sweetheart / (true_sweetheart + false_asshole)
    f1_ass = 2 * ((asshole_detection_precision * asshole_detection_accuracy) / (asshole_detection_precision + asshole_detection_accuracy))
    f1_sweet = 2 * ((sweetheart_detection_precision * sweetheart_detection_accuracy) / (sweetheart_detection_precision + sweetheart_detection_accuracy))

    # If print is on, print the input confusion matrix and its evaluation metrics
    if print_on:
        print("Here is the confusion matrix:")
        print(confusion_matrix)
        print()
        print("Here are the metrics derived from the confusion matrix:")
        print("recall wrt assholes =", asshole_detection_accuracy)
        print("recall wrt sweethearts =", sweetheart_detection_accuracy)
        print("precision wrt assholes =", asshole_detection_precision)
        print("precision wrt sweethearts) =", sweetheart_detection_precision)
        print()
        print("accuracy =", accuracy)
        print("f1 wrt assholes =", f1_ass)
        print("f1 wrt sweethearts =", f1_sweet)

    # Return various self explanatory evaluation metrics
    return(asshole_detection_accuracy, sweetheart_detection_accuracy,
           asshole_detection_precision, sweetheart_detection_precision,
           accuracy, f1_ass, f1_sweet)

Create a function that performs one epoch of training

In [12]:
def train_epoch(model, dataloader, optimiser, device, scheduler=None, print_on=False):
    '''
    Function that performs one epoch (one pass through each sample in the given loader) of training of the given model.
    '''
    model = model.train()

    losses = []
    correct_predictions = 0
    total_predictions = 0

    many_batches_losses = []
    many_batches_accs = []
    for i, batch in enumerate(dataloader):
  
      X = batch[0].float().to(device)
      y = batch[1].float().to(device)
      w = batch[2].float().to(device)

      # Pass the batch through the classifier (output layers)
      y_out = model(X)

      # Binarise output probs to predictions in {0, 1}
      y_preds = y_out.detach()
      y_preds = torch.where(y_preds > 0.5, 1, 0)
      correct_predictions += int(torch.sum(y_preds == y))
      total_predictions += len(y_preds)
      many_batches_accs.append((torch.sum(y_preds == y) / len(y_preds)).cpu())

      # Get the mean loss for the batch
      loss_fn = nn.BCELoss(weight=w, reduction="mean").to(device)
      loss = loss_fn(y_out, y)
      loss.backward()
      losses.append(loss.item())
      many_batches_losses.append(loss.item())
      
      # Not sure why I am clipping the grad here. Apparently it helps prevent exploding gradients.
      nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

      # Take an optimisation step
      optimiser.step()
      if scheduler:
        scheduler.step()

      if print_on:
      # Every so often print the current training accuracy
            if (i + 1) % 10 == 0:
                print("...Batch #{} : Training Loss={}, Training Accuracy={}".format(i + 1, sum(many_batches_losses) / len(many_batches_losses), sum(many_batches_accs) / len(many_batches_accs)))
                many_batches_losses = []
                many_batches_accs = []

    # Return the training accuracy and the mean training loss for the given epoch
    return correct_predictions / total_predictions, np.mean(losses)

Create a function that performs one epoch of validation

In [13]:
def eval_model(model, dataloader, device, print_conf_matr_on=True):
  '''
  Function that performs evaluation of the given model for one full pass of the samples in the give dataloader
  '''

  model = model.eval()

  '''
  losses = []
  '''
  total_conf_matr = np.array([[0, 0], [0, 0]])

  with torch.no_grad():
        for batch in dataloader:

            X = batch[0].float().to(device)
            y = batch[1].float().to(device)

            # Send the current batch through the model to get output probabilities
            y_out = model(X)

            '''
            # Get the mean loss for the batch
            loss_fn = nn.BCELoss(reduction="mean").to(device)
            loss = loss_fn(outputs, targets)
            losses.append(loss.item())
            '''

            # Turn the probabilities into binary predictions
            y_preds = y_out.detach()
            y_preds = torch.where(y_preds > 0.5, 1, 0)

            # Get the confusion matrix for the current batch and add it to the total confusion matrix
            small_conf_matr = confusion_matrix(y.cpu(), y_preds.cpu())
            total_conf_matr += small_conf_matr

  # Get various metrics from the total confusion matrix
  asshole_recall, sweetheart_recall, asshole_precision, sweetheart_precision, accuracy, f1_ass, f1_sweet = get_metrics_from_conf_matrix(total_conf_matr, print_on=print_conf_matr_on)

  # Return the appropriate evaluation metrics
  return accuracy, f1_ass, f1_sweet

Perform many epochs of validation and training

In [14]:
start_time = time.time()

# Creating the optimiser with its initial learning rate
if OPTIMISER == "AdamW":
    optimiser = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.08)
elif OPTIMISER == "SGD":
    optimiser = optim.SGD(model.parameters(), lr=LR)

train_losses = []
train_accs = []
valid_accs = []
valid_reb_accs = []
valid_f1s_ass = []
valid_f1s_sweet = []

for epoch in range(EPOCHS):
    print("Epoch {} / {}".format(epoch+1, EPOCHS))

    for fragment_idx in range(1, 2):

        # Load the data (outputs from BERT, labels and weights) for the particular fragment onto the cpu RAM
        X_train = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_10/X_train_{}_unpooled.pt'.format(fragment_idx), map_location=torch.device('cpu'))
        y_train = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_10/y_train_{}.pt'.format(fragment_idx), map_location=torch.device('cpu'))
        w_train = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_10/w_train_{}.pt'.format(fragment_idx), map_location=torch.device('cpu'))
        train_dataset = utils.data.TensorDataset(X_train, y_train, w_train)
        train_loader = utils.data.DataLoader(train_dataset, BATCH_SIZE, shuffle=True, num_workers=2)

        # Use the loaded data for training. The fragment data will be split in batches and sent to the GPU RAM one batch at a time for training.
        train_acc, train_loss = train_epoch(model, train_loader, optimiser, device, print_on=False)
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        print("...Fragment {} / 10: Train: loss {}, accuracy {}".format(fragment_idx, train_loss, train_acc))

    # When all fragment have gone through, i.e. one epoch of training has been completed, delete all data from the CPU RAM
    del X_train, y_train, w_train, train_dataset, train_loader
    # Every epoch also clear the cache of the GPU memory
    torch.cuda.empty_cache()

    # Load the validation data onto the CPU RAM
    X_valid = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_10/X_valid_unpooled.pt', map_location=torch.device('cpu'))
    y_valid = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_10/y_valid.pt', map_location=torch.device('cpu'))
    valid_dataset = utils.data.TensorDataset(X_valid, y_valid)
    valid_loader = utils.data.DataLoader(valid_dataset, BATCH_SIZE, shuffle=True, num_workers=2)

    # Send the validation data for validation. One batch at a time, X_valid will be being sent to the GPU RAM to go through the model.
    val_acc, val_f1_ass, val_f1_sweet = eval_model(model, valid_loader, device, print_conf_matr_on=False)

    valid_accs.append(val_acc)
    valid_f1s_ass.append(val_f1_ass)
    valid_f1s_sweet.append(val_f1_sweet)
    print("Valid: accuracy {}, f1_ass {}, f1_sweet {}".format(val_acc, val_f1_ass, val_f1_sweet))

    # Delete validation data from the CPU RAM
    del X_valid, y_valid, valid_dataset, valid_loader
    torch.cuda.empty_cache()

    # Load the rebalanced validation data onto the CPU RAM
    X_valid_reb = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_10/X_valid_reb_unpooled.pt', map_location=torch.device('cpu'))
    y_valid_reb = torch.load('/content/drive/MyDrive/MastersProject/BERT_outputs/split_in_10/y_valid_reb.pt', map_location=torch.device('cpu'))
    valid_reb_dataset = utils.data.TensorDataset(X_valid_reb, y_valid_reb)
    valid_reb_loader = utils.data.DataLoader(valid_reb_dataset, BATCH_SIZE, shuffle=True, num_workers=2)
    
    # Send the rebalanced validation data for validation. One batch at a time, X_valid_reb will be being sent to the GPU RAM to go through the model.
    val_reb_acc, val_reb_f1_ass, val_reb_f1_sweet = eval_model(model, valid_reb_loader, device, print_conf_matr_on=False)
    valid_reb_accs.append(val_reb_acc)

    # Delete validation data from the CPU RAM
    del X_valid_reb, y_valid_reb, valid_reb_dataset, valid_reb_loader
    torch.cuda.empty_cache()

    print(100*"#")


print("--- %s seconds ---" % (time.time() - start_time))

Epoch 1 / 2
...Fragment 1 / 10: Train: loss 0.6917475497236057, accuracy 0.5413572343149808
Valid: accuracy 0.7155587421899007, f1_ass 0.12314493211240922, f1_sweet 0.8302463475762578


AssertionError: ignored

In [None]:
torch.cuda.empty_cache()
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

In [None]:
plt.plot(train_losses)
plt.xlabel("Epochs")
plt.ylabel("Training Loss")
plt.grid()

In [None]:
plt.plot(train_accs)
plt.xlabel("Epochs")
plt.ylabel("Training Accuracy")
plt.grid()

In [None]:
plt.plot(valid_accs)
plt.xlabel("Epochs")
plt.ylabel("Validation Accuracy")
plt.grid()

In [None]:
plt.plot(valid_f1s_ass)
plt.xlabel("Epochs")
plt.ylabel("F1 w.r.t. Assholes")
plt.grid()

In [None]:
plt.plot(valid_f1s_sweet)
plt.xlabel("Epochs")
plt.ylabel("F1 w.r.t. Sweethearts")
plt.grid()

In [None]:
plt.plot(valid_reb_accs)
plt.xlabel("Epochs")
plt.ylabel("Rebalanced Validation Accuracy")
plt.grid()