# Procedural

Mount my Google Drive and create a folder for the data if it doesn't already exist

In [344]:
# Make Google Drive visible under /content/drive
from google.colab import drive
drive.mount('/content/drive')

# Ensure the project's data folder exists on the mounted drive
import os
data_dir = '/content/drive/MyDrive/MastersProject/data/'
if os.path.exists(data_dir):
    print("Folder already existed!")
else:
    os.makedirs(data_dir)
    print("Created the folder!")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Folder already existed!


In [345]:
!pip install transformers



In [346]:
import random

import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.model_selection import train_test_split
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

In [347]:
# Single seed shared by every source of randomness in the notebook
RANDOM_SEED = 42

# Actually apply the seed: without this, weight initialisation and dropout in the
# classifier differ from run to run, so the training logs below are not reproducible.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Prepare the data for the classifier

## Prepare the input to BERT

Set the hyperparameters needed for data preparation

In [353]:
# Name of the pretrained checkpoint used for both the tokeniser and the BERT model
MODEL_NAME = "bert-base-cased"

# Every text is truncated / padded to this many tokens
MAX_LEN = 512

# Fraction of the full dataframe that is split off as the "small" dataset
SMALL_DATASET_PROPORTION = 0.001

# Make it large so that the whole small dataset fits in 1 batch
BATCH_SIZE = 100

Read the dataset as a pandas dataframe

In [354]:
# Read the cleaned dataset from the mounted drive. The absolute path matches the
# folder checked in the first cell, so the load does not depend on the working directory.
df = pd.read_csv('/content/drive/MyDrive/MastersProject/data/aita_clean.csv')

# Build the model input as "<title> <body>". Both columns are NaN-filled: string
# concatenation with NaN yields NaN, which would later be tokenised as the text "nan".
df['text'] = df["title"].fillna("") + " " + df["body"].fillna("")

Specify the dataset class

In [355]:
class AITADataset(Dataset):
    '''
    Map-style dataset that tokenises one text sample per index.

    texts:            indexable collection of text samples
    targets:          indexable collection of integer class labels, aligned with texts
    tokeniser:        callable that encodes a text (called with truncation / padding options)
    max_len:          number of tokens each sample is truncated / padded to
    weight_per_class: indexable collection giving the training weight of each class label
    '''
    # Upon object instance creation, you feed the text samples, their targets, the tokeniser,
    # the max length and the per-class sample weights.
    def __init__(self, texts, targets, tokeniser, max_len, weight_per_class):
        self.texts = texts
        self.targets = targets
        self.tokeniser = tokeniser
        self.max_len = max_len
        self.weight_per_class = weight_per_class

    def __len__(self):
        return len(self.texts)

    # This method is called when a batch is created. "item" is the index of each sample to be in batch.
    def __getitem__(self, item):
        # Normally it is already a string
        text = str(self.texts[item])
        target = self.targets[item]

        # Encode the current text with the tokeniser handed to THIS dataset
        # (not whatever happens to be bound to a global called `tokeniser`).
        encoding = self.tokeniser(
            text,
            truncation=True,
            max_length=self.max_len,
            add_special_tokens=True,
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors='pt')

        # The encoded tensors are flattened to 1-D so that every sample contributes
        # a single row when samples are stacked into a batch.
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()

        # Find the weight that the current sample should have during training,
        # using the class weights stored on the instance.
        sample_weight = self.weight_per_class[target]

        # Bundle the encoding with the target, the raw text and the sample weight
        return {'input_ids': input_ids,
                'attention_mask': attention_mask,
                'targets': torch.tensor(target, dtype=torch.long),
                'sample_text': text,
                'weights': sample_weight}

Make a function that creates a dataloader

In [356]:
def create_data_loader(df, tokeniser, max_len, batch_size, weight_per_class=None):
    '''
    Creates a dataset from the given dataframe and a dataloader spitting batches of the dataset.

    df:               dataframe with a `text` column and an `is_asshole` label column
    tokeniser:        tokeniser passed on to AITADataset
    max_len:          token length passed on to AITADataset
    batch_size:       number of samples per batch
    weight_per_class: per-class sample weights passed on to AITADataset. When omitted,
                      the notebook-level variable `weight_per_class` is used, which keeps
                      existing four-argument calls working.
    '''
    if weight_per_class is None:
        # Backward-compatible fallback to the notebook-level variable, with an
        # explicit error instead of an obscure failure deep inside a worker process.
        try:
            weight_per_class = globals()["weight_per_class"]
        except KeyError:
            raise NameError(
                "weight_per_class was not passed and is not defined at notebook level; "
                "run the class-rebalancing cell first or pass it explicitly") from None

    ds = AITADataset(
        texts=df.text.to_numpy(),
        targets=df.is_asshole.to_numpy(),
        tokeniser=tokeniser,
        max_len=max_len,
        weight_per_class=weight_per_class)

    dataloader = DataLoader(ds, batch_size=batch_size, num_workers=2)

    return dataloader

Split the dataframe into a large part and a small part. Here, we will try to overfit the small one.

In [357]:
# Hold out a small, seeded random fraction of the dataframe; the remainder is df_large
df_large, df_small = train_test_split(
    df,
    random_state=RANDOM_SEED,
    test_size=SMALL_DATASET_PROPORTION,
)
print("Shape of the small dataframe:", df_small.shape)

Shape of the small dataframe: (98, 10)


Correct class imbalance

In [358]:
##########################INPUT##########################
rebalance = "reweighing"   # in {"undersampling", "reweighing"}; any other value disables rebalancing
#########################################################

# Number of samples per class label in the small dataset (index = class label)
counts = df_small['is_asshole'].value_counts()
num_classes = len(counts)

# Rebalance by undersampling the majority class in the training data
if rebalance == "undersampling":
  # Class 0 is treated as the majority class. (counts[0] - counts[1]) / counts[0] is the
  # fraction of class-0 rows to drop to equalise the classes in expectation; the extra
  # 0.13 is kept from the original code (presumably an empirical offset - TODO confirm).
  # Clamped so that it is always a valid probability.
  prob_drop = (counts.loc[0] - counts.loc[1]) / counts.loc[0] + 0.13
  prob_drop = min(1.0, max(0.0, prob_drop))

  # Dedicated seeded generator so that the same rows are dropped on every run
  rng = random.Random(RANDOM_SEED)
  majority_index = df_small.index[df_small['is_asshole'] == 0]
  indices_to_drop = [idx for idx in majority_index if rng.random() < prob_drop]

  # Rebind instead of dropping in place: df_small comes out of train_test_split,
  # and mutating such a slice in place triggers pandas' SettingWithCopyWarning.
  df_small = df_small.drop(indices_to_drop, axis=0)

  # Inspecting class imbalance of the undersampled data
  ax = df_small['is_asshole'].value_counts().sort_index().plot.bar()
  ax.set_xlabel('is asshole')
  ax.set_ylabel('count')

  # After undersampling every class gets the same weight
  weight_per_class = [1] * num_classes
  print()
  print("These are the weights per class:", weight_per_class)

# Rebalance by weighing samples of each class by their inverse class occurrence rate during training
elif rebalance == "reweighing":
  total_samples = counts.sum()
  weight_per_class = []
  # Class labels are assumed to be the integers 0 .. num_classes - 1, because the
  # resulting list is later indexed by the label itself.
  for i in range(num_classes):
    class_weight = float(total_samples / (num_classes * counts.loc[i]))
    weight_per_class.append(class_weight)
    print("class {}: occurrences * weight =".format(i), class_weight * counts.loc[i])
  print()
  print("These are the weights per class:", weight_per_class)
else:
  print("We are continuing without rebalancing then!")
  weight_per_class = [1] * num_classes

class 0: occurrences * weight = 49.0
class 1: occurrences * weight = 49.0

These are the weights per class: [0.7538461538461538, 1.4848484848484849]


Initialise the tokeniser based on the chosen model name

In [359]:
# Load the pretrained tokeniser that belongs to the checkpoint named by MODEL_NAME
tokeniser = transformers.BertTokenizer.from_pretrained(MODEL_NAME)

Create a dataloader from the small dataframe to be overfit

In [360]:
# Dataloader over the small dataframe; BATCH_SIZE is chosen so that it yields one single batch
small_loader = create_data_loader(
    df=df_small,
    tokeniser=tokeniser,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE)

Inspect a batch from the dataloader

In [361]:
# Pull the first batch out of the loader
batch = next(iter(small_loader))
loader_keys = batch.keys()
print("Each dataloader batch is like a dictionary with keys:", loader_keys)
print()

# Shape of every tensor-valued entry of the batch
for label, key in (("input_ids batch shape:", 'input_ids'),
                   ("attention_mask batch shape:", 'attention_mask'),
                   ("targets batch shape:", 'targets'),
                   ("weights batch shape:", 'weights')):
    print(label, batch[key].shape)

# Actual labels and sample weights, framed by separator lines
print(100*"-")
print("targets batch:", batch['targets'])
print("weights batch:", batch['weights'])
print(100*"-")
print()

Each dataloader batch is like a dictionary with keys: dict_keys(['input_ids', 'attention_mask', 'targets', 'sample_text', 'weights'])

input_ids batch shape: torch.Size([98, 512])
attention_mask batch shape: torch.Size([98, 512])
targets batch shape: torch.Size([98])
weights batch shape: torch.Size([98])
----------------------------------------------------------------------------------------------------
targets batch: tensor([0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0,
        1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1,
        0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
        0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1,
        0, 0])
weights batch: tensor([0.7538, 1.4848, 1.4848, 0.7538, 0.7538, 0.7538, 0.7538, 0.7538, 1.4848,
        1.4848, 0.7538, 1.4848, 1.4848, 1.4848, 1.4848, 0.7538, 1.4848, 1.4848,
        0.7538, 1.4848, 0.7538, 0.7538, 0.7538, 0.7538, 1.4848, 0.75

## Create BERT and send the primary data through it

Use GPU if available

In [362]:
# Prefer the first CUDA device and fall back to the CPU when no GPU is visible
use_cuda = torch.cuda.is_available()
if use_cuda:
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Release cached GPU memory left over from earlier runs of the notebook
torch.cuda.empty_cache()
print(device)

cuda:0


Instantiate BERT using a custom configuration and freeze it

In [363]:
# Explicit configuration handed to from_pretrained; the maximum sequence length
# follows MAX_LEN so that it matches the padded inputs.
bert_config = transformers.BertConfig(
    vocab_size=28996,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    max_position_embeddings=MAX_LEN,
)

bert_model = transformers.BertModel.from_pretrained(MODEL_NAME, config=bert_config)

# Freeze BERT so that its weights are not further fine-tuned from their pretrained values
# and when samples are passed into it, grads are not stored in the RAM
bert_model.requires_grad_(False)

bert_model = bert_model.to(device)

Send the current batch (whole small dataset) through BERT and save the output tensors

In [364]:
# Move the inputs to the model's device and run the whole batch through BERT in one go
bert_output = bert_model(
    input_ids=batch['input_ids'].to(device),
    attention_mask=batch['attention_mask'].to(device))

# Keep both the per-token states and the pooled per-sample representation
pooled_bert_output = bert_output['pooler_output']
full_bert_output = bert_output['last_hidden_state']

print("Shape of BERT's last hidden state:   ", full_bert_output.shape)
print("Shape of BERT's pooled output:       ", pooled_bert_output.shape)

Shape of BERT's last hidden state:    torch.Size([98, 512, 768])
Shape of BERT's pooled output:        torch.Size([98, 768])


Define the data to be fed to the classifier from BERT's output

In [365]:
# Labels and per-sample weights of the batch, moved to the same device as the features
y, w = (batch[name].to(device) for name in ('targets', 'weights'))

# Features for the classifier: BERT's pooled output, one row per sample
X = pooled_bert_output

# Classifier

Set the hyperparameters relevant to the classifier

In [366]:
# Width of the hidden layer that sits directly before the output neuron
NUMBER_NEURONS_PENULTIMATE = 150

# Number of target classes of the classification task
NUMBER_OF_CLASSES = 2

Specify the architecture of the classifier that will handle BERT's output

In [367]:
class SentimentClassifier(nn.Module):
    '''
    Binary classification head that is applied to precomputed feature vectors.

    BERT itself is NOT part of this module: forward() expects a feature tensor
    whose last dimension is n_input_features and returns one probability per sample.

    n_neurons_penultimate: width of the single hidden layer
    n_input_features:      size of each input feature vector (default 768, the
                           hidden size configured for BERT in this notebook)
    dropout_p:             dropout probability applied after the hidden layer
    '''
    def __init__(self, n_neurons_penultimate, n_input_features=768, dropout_p=0.05):
        super(SentimentClassifier, self).__init__()

        self.n_neur_pen = n_neurons_penultimate

        # Hidden block: linear -> ReLU -> dropout
        self.ll = nn.Linear(n_input_features, self.n_neur_pen)
        self.activ = nn.ReLU()
        self.drop = nn.Dropout(p=dropout_p)

        # Output block: a single neuron squashed to a probability in [0, 1]
        self.layer_out = nn.Linear(self.n_neur_pen, 1)
        self.activ_out = nn.Sigmoid()

    def forward(self, X):
        '''
        Returns a 1-D tensor with one probability per row of X.
        '''
        output = self.ll(X)
        output = self.activ(output)
        output = self.drop(output)

        output = self.layer_out(output)
        output = self.activ_out(output)

        # Collapse the trailing singleton dimension: (batch, 1) -> (batch,)
        output = output.view(-1)
        return output

Instantiate the classifier

In [399]:
# Build the classifier and place it on the selected device
model = SentimentClassifier(NUMBER_NEURONS_PENULTIMATE).to(device)

# Count only the parameters that will receive gradients
model_trainable_params = sum(
    param.numel() for param in model.parameters() if param.requires_grad)
print("The total number of trainable parameters in the classifier is: {}".format(model_trainable_params))

The total number of trainable parameters in the classifier is: 115501


# Training

Set the hyperparameters needed for training

In [400]:
# Number of passes over the (single-batch) small dataset
EPOCHS = 1000

# Initial learning rate handed to the optimiser
LR = 0.0012

Create function that performs one epoch of training

In [401]:
def train_epoch(
    model,
    X,
    y,
    weights,
    optimiser,
    device,
    scheduler=None):
    '''
    Performs one epoch of training of the given model on a single batch,
    i.e. exactly one forward pass, one backward pass and one optimiser step.

    model:     module mapping X to a 1-D tensor of probabilities in [0, 1]
    X:         feature tensor, one row per sample
    y:         binary targets, one per sample (any numeric dtype; cast internally)
    weights:   per-sample loss weights, or None for an unweighted loss
    optimiser: optimiser holding the model's parameters
    device:    device on which the batch is processed
    scheduler: optional learning-rate scheduler, stepped once after the optimiser

    Returns (training accuracy, mean training loss) for this epoch.
    Raises ValueError if the batch is empty.
    '''
    if len(X) == 0:
        raise ValueError("train_epoch received an empty batch")

    model = model.train()

    # Start from clean gradients so that nothing accumulated before this call
    # leaks into the update.
    optimiser.zero_grad()

    # Pass the batch through the classifier (output layers)
    X = X.to(device)
    outputs = model(X)

    # The BCE loss needs targets (and weights) with the same dtype and device as the outputs
    y = y.to(device=device, dtype=outputs.dtype)
    if weights is not None:
        weights = weights.to(device=device, dtype=outputs.dtype)

    # Binarise output probs to predictions in {0, 1}
    preds = torch.where(outputs.detach() > 0.5, 1, 0)
    correct_predictions = int(torch.sum(preds == y))
    total_predictions = len(preds)

    # Get the (weighted) mean loss for the batch
    loss = nn.functional.binary_cross_entropy(outputs, y, weight=weights, reduction="mean")
    loss.backward()

    # Cap the global gradient norm at 1.0 to guard against exploding gradients
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # Take an optimisation step
    optimiser.step()
    if scheduler is not None:
        scheduler.step()

    # Return the training accuracy and the mean training loss for the given epoch
    return correct_predictions / total_predictions, loss.item()

Set up the training loop and train

In [402]:
# Optimiser over the classifier's parameters, starting from the configured learning rate
optimiser = transformers.AdamW(model.parameters(), lr=LR, correct_bias=False)

for epoch in range(EPOCHS):
  train_acc, train_loss = train_epoch(
      model=model, X=X, y=y.float(), weights=w, optimiser=optimiser, device=device)

  # Only report every tenth epoch
  if (epoch+1) % 10 != 0:
      continue
  print("Epoch {} / {}: Training Loss={}, Training Accuracy={}".format(epoch+1, EPOCHS, round(float(train_loss), 2), round(float(train_acc), 2)))


Epoch 10 / 1000: Training Loss=0.72, Training Accuracy=0.34
Epoch 20 / 1000: Training Loss=0.69, Training Accuracy=0.47
Epoch 30 / 1000: Training Loss=0.68, Training Accuracy=0.52
Epoch 40 / 1000: Training Loss=0.67, Training Accuracy=0.62
Epoch 50 / 1000: Training Loss=0.65, Training Accuracy=0.63
Epoch 60 / 1000: Training Loss=0.64, Training Accuracy=0.69
Epoch 70 / 1000: Training Loss=0.61, Training Accuracy=0.67
Epoch 80 / 1000: Training Loss=0.6, Training Accuracy=0.74
Epoch 90 / 1000: Training Loss=0.56, Training Accuracy=0.79
Epoch 100 / 1000: Training Loss=0.55, Training Accuracy=0.77
Epoch 110 / 1000: Training Loss=0.5, Training Accuracy=0.84
Epoch 120 / 1000: Training Loss=0.51, Training Accuracy=0.78
Epoch 130 / 1000: Training Loss=0.5, Training Accuracy=0.82
Epoch 140 / 1000: Training Loss=0.46, Training Accuracy=0.79
Epoch 150 / 1000: Training Loss=0.45, Training Accuracy=0.85
Epoch 160 / 1000: Training Loss=0.46, Training Accuracy=0.84
Epoch 170 / 1000: Training Loss=0.4,

## Hyperparameter Search

In [None]:
# Grid: 50 learning rates spaced evenly on a log scale between 1e-6 and 1e-1,
# crossed with several widths of the hidden layer.
lr_powers = np.linspace(-6, -1, 50)
lrs = [10 ** power for power in lr_powers]
nums_neurons = [1, 5, 10, 50, 100, 200, 500]

for num_neurons_penultimate in nums_neurons:
  print("# Neurons =", num_neurons_penultimate)
  print()
  for lr in lrs:
    print("LR =", lr)

    # Fresh classifier for every grid point so that runs do not share weights
    model = SentimentClassifier(num_neurons_penultimate)
    model = model.to(device)

    # Use the learning rate of the CURRENT grid point; passing the global LR here
    # would train every configuration with the same learning rate.
    optimiser = transformers.AdamW(model.parameters(), lr=lr, correct_bias=False)

    for epoch in range(EPOCHS):

      train_acc, train_loss = train_epoch(model, X, y.float(), w, optimiser, device)
      if (epoch+1) % 10 == 0:
          print("Epoch {} / {}: Training Loss={}, Training Accuracy={}".format(epoch+1, EPOCHS, round(float(train_loss), 2), round(float(train_acc), 2)))
    print(100*"-")
  print(100*"#")


# Neurons = 1

LR = 1e-06
Epoch 10 / 50: Training Loss=0.71, Training Accuracy=0.4
Epoch 20 / 50: Training Loss=0.69, Training Accuracy=0.5
Epoch 30 / 50: Training Loss=0.7, Training Accuracy=0.4
Epoch 40 / 50: Training Loss=0.74, Training Accuracy=0.5
Epoch 50 / 50: Training Loss=0.69, Training Accuracy=0.5
----------------------------------------------------------------------------------------------------
LR = 1.2648552168552959e-06
Epoch 10 / 50: Training Loss=0.7, Training Accuracy=0.4
Epoch 20 / 50: Training Loss=0.69, Training Accuracy=0.3
Epoch 30 / 50: Training Loss=0.73, Training Accuracy=0.3
Epoch 40 / 50: Training Loss=0.68, Training Accuracy=0.6
Epoch 50 / 50: Training Loss=0.67, Training Accuracy=0.6
----------------------------------------------------------------------------------------------------
LR = 1.5998587196060574e-06
Epoch 10 / 50: Training Loss=0.7, Training Accuracy=0.6
Epoch 20 / 50: Training Loss=0.75, Training Accuracy=0.4
Epoch 30 / 50: Training Loss=0.72, 