# Question 1: Classifier

In [1]:
import os
from itertools import product

import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
from torch.nn.functional import cross_entropy
from transformers import AdamW, AutoTokenizer, BertModel
from utils.data_loader import MultipleChoiceDataloader, read_file, read_json_data
from utils.train_classifier import train_loop
from utils.valid_classifier import valid_loop

In [2]:
# Run configuration: the knobs a reader is most likely to tune.
NUM_EPOCHS = 5  # training epochs per hyperparameter combination
PRINT_EVERY = 100  # handed to train_loop / valid_loop; the training log reports every 100 batches
BATCH_SIZE = 150  # batch size shared by the train, dev and test dataloaders
SEED = 42  # fixed seed so that head initialisation and dropout masks are repeatable

# Seeds torch's CPU and CUDA generators.
# NOTE(review): the dataloader's shuffle/permutation RNG is not visible here; if it
# relies on `random` or numpy, seed those as well for a fully reproducible run.
torch.manual_seed(SEED);

## Dataloader

In [3]:
# Locations of the three JSONL splits, relative to the notebook's working directory.
train_file_name, dev_file_name, test_file_name = (
    "data/train_complete.jsonl",
    "data/dev_complete.jsonl",
    "data/test_complete.jsonl",
)

In [4]:
# Load the raw records of each split, in train / dev / test order.
train_json, dev_json, test_json = (
    read_file(split_path)
    for split_path in (train_file_name, dev_file_name, test_file_name)
)

In [5]:
# The training split uses read_json_data's default `permute` setting; the
# evaluation splits explicitly switch permutation off.
train_dataset = read_json_data(train_json)
dev_dataset, test_dataset = (
    read_json_data(split_json, permute=False) for split_json in (dev_json, test_json)
)

In [6]:
# Wrap each split in a batching dataloader; all three share BATCH_SIZE.
train_dataloader, dev_dataloader, test_dataloader = (
    MultipleChoiceDataloader(split_dataset, batch_size=BATCH_SIZE)
    for split_dataset in (train_dataset, dev_dataset, test_dataset)
)

Sample sequence:

In [7]:
# Peek at the first example of the first training batch, then stop iterating.
for text, label in train_dataloader:
    print(text[0], label[0], sep="\n")
    break

[CLS] the sun is the source of energy for physical cycles on Earth [SEP] The sun is responsible for puppies learning new tricks [SEP] children growing up and getting old [SEP] flowers wilting in a vase [SEP] plants sprouting, blooming and wilting [END]
tensor([0, 0, 0, 1])


The stem and the answer options are encoded as a single sequence, with `[SEP]` between consecutive parts, so our format is `[CLS] stem [SEP] option [SEP] option [SEP] option [SEP] option [END]`. For example:

```
[CLS] the sun is the source of energy for physical cycles on Earth [SEP] The sun is responsible for puppies learning new tricks [SEP] children growing up and getting old [SEP] flowers wilting in a vase [SEP] plants sprouting, blooming and wilting [END]
```

During training, we permute the options. This gives us an artificially larger dataset (a form of data augmentation), while also ensuring the transformer doesn't memorize the position of the correct option.

## Hyperparameter Tuning

In [8]:
# A single checkpoint name shared by tokenizer and encoder so the two cannot drift apart.
PRETRAINED_NAME = "bert-base-uncased"
NUM_CHOICES = 4  # one logit per answer option (labels are 4-way one-hot vectors)

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_NAME)
model = BertModel.from_pretrained(PRETRAINED_NAME)
# Classification head. Its input width is read from the encoder config
# (768 for bert-base) instead of being hard-coded.
linear = nn.Linear(model.config.hidden_size, NUM_CHOICES)

* We're using `AdamW` instead of the standard `Adam` for both the BERT model and the linear layer together.
* AdamW is better suited to BERT because it decouples weight decay from the gradient-based update: the decay is applied directly to the weights in the update step, rather than being added to the gradients as an L2 penalty (as happens with standard `Adam`).
* We're also varying the dropout probability, so that our model is less likely to memorize the training data.

In [9]:
# Search grid: every combination of these values is tried (3 * 2 * 2 = 12 runs).
hyperparams = dict(
    learning_rate=[1e-5, 3e-5, 5e-5],
    weight_decay=[0.01, 0.05],
    dropout_prob=[0.1, 0.3],
)
# Names and value tuples are kept in the same order so they can be zipped back together.
param_names = list(hyperparams)
param_combinations = list(product(*(hyperparams[name] for name in param_names)))

In [10]:
# Cross-entropy criterion handed to both train_loop and valid_loop.
loss_fn = torch.nn.CrossEntropyLoss()

In [11]:
# Use the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Do not run the cell below unless you want to run the full hyperparameter search: it trains the model once for every hyperparameter combination.

In [None]:
def _save_epoch_plots(history, params, plot_path):
    """Save side-by-side loss and accuracy curves for one hyperparameter run.

    Args:
        history: dict with the keys "train_loss", "valid_loss",
            "train_accuracy" and "valid_accuracy"; each maps to a list
            holding one value per completed epoch.
        params: hyperparameter dict of the run, rendered as a caption.
        plot_path: destination path of the image file.
    """
    # 1-based epoch numbers, matching the "epoch" field stored in `results`.
    epochs = range(1, len(history["train_loss"]) + 1)
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(12, 6))

    loss_ax.plot(epochs, history["train_loss"], label="Train Loss")
    loss_ax.plot(epochs, history["valid_loss"], label="Valid Loss")
    loss_ax.set_title("Training and Validation Loss")
    loss_ax.set_xlabel("Epoch")
    loss_ax.set_ylabel("Loss")
    loss_ax.legend()

    acc_ax.plot(epochs, history["train_accuracy"], label="Train Accuracy")
    acc_ax.plot(epochs, history["valid_accuracy"], label="Valid Accuracy")
    acc_ax.set_title("Training and Validation Accuracy")
    acc_ax.set_xlabel("Epoch")
    acc_ax.set_ylabel("Accuracy")
    acc_ax.legend()

    param_info = "\n".join(f"{key}: {val}" for key, val in params.items())
    fig.text(
        0.5,
        0.01,
        param_info,
        ha="center",
        fontsize=10,
        bbox={"facecolor": "white", "alpha": 0.5, "pad": 5},
    )

    fig.savefig(plot_path, bbox_inches="tight")
    # Close explicitly: one figure is created per combination inside a loop.
    plt.close(fig)


results = []

# `image_directory` was referenced but never defined anywhere in the notebook,
# which raised NameError when the first plot was saved.
image_directory = "images"
os.makedirs(image_directory, exist_ok=True)

# Must be the same checkpoint the tokenizer was loaded from.
pretrained_name = "bert-base-uncased"
# Remember the head width before `linear` is rebuilt inside the loop.
num_classes = linear.out_features

optimizer = None

for values in param_combinations:
    params = dict(zip(param_names, values))
    print("Training with parameters:", params)

    # Release the previous run's optimizer state before building the next model.
    optimizer = None

    # Start every combination from the pretrained weights and a fresh head;
    # otherwise each run would continue from the previous run's fine-tuned
    # weights and the combinations could not be compared.
    # The dropout rates are passed at load time because the dropout layers are
    # built from the config when the model is constructed; editing
    # `model.config` afterwards does not change them.
    model = BertModel.from_pretrained(
        pretrained_name,
        hidden_dropout_prob=params["dropout_prob"],
        attention_probs_dropout_prob=params["dropout_prob"],
    )
    linear = nn.Linear(model.config.hidden_size, num_classes)
    model.to(device)
    linear.to(device)

    optimizer = AdamW(
        [{"params": model.parameters()}, {"params": linear.parameters()}],
        lr=params["learning_rate"],
        weight_decay=params["weight_decay"],
    )

    history = {
        "train_loss": [],
        "train_accuracy": [],
        "valid_loss": [],
        "valid_accuracy": [],
    }

    for epoch in range(NUM_EPOCHS):
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}")
        print("Training...")
        train_dataloader.shuffle_data()
        train_metrics = train_loop(
            train_dataloader,
            tokenizer,
            model,
            linear,
            loss_fn,
            optimizer,
            device,
            PRINT_EVERY,
        )
        # Index 0 is used as the loss and index 1 as the accuracy.
        train_loss, train_accuracy = train_metrics[0], train_metrics[1]
        history["train_loss"].append(train_loss)
        history["train_accuracy"].append(train_accuracy)

        print("Validating...")
        valid_metrics = valid_loop(
            dev_dataloader, tokenizer, model, linear, loss_fn, device, PRINT_EVERY
        )
        valid_loss, valid_accuracy = valid_metrics[0], valid_metrics[1]
        history["valid_loss"].append(valid_loss)
        history["valid_accuracy"].append(valid_accuracy)

        # One row per (combination, epoch); turned into a DataFrame later.
        results.append(
            {
                **params,
                "epoch": epoch + 1,
                "train_loss": train_loss,
                "train_accuracy": train_accuracy,
                "valid_loss": valid_loss,
                "valid_accuracy": valid_accuracy,
            }
        )

    plot_path = os.path.join(
        image_directory,
        f'epoch_metrics_{params["learning_rate"]}_{params["weight_decay"]}_{params["dropout_prob"]}.png',
    )
    _save_epoch_plots(history, params, plot_path)
    print(f"Saved plot to {plot_path}")

Training with parameters: {'learning_rate': 1e-05, 'weight_decay': 0.01, 'dropout_prob': 0.1}
Epoch 1/5
Training...




Batch 100/794: Loss = 1.3926, Accuracy = 0.2466, Time = 52.27s
Batch 200/794: Loss = 1.3890, Accuracy = 0.2659, Time = 53.84s
Batch 300/794: Loss = 1.3366, Accuracy = 0.3291, Time = 52.57s
Batch 400/794: Loss = 1.2534, Accuracy = 0.4041, Time = 53.73s


In [None]:
# Persist the per-epoch metrics of every hyperparameter combination as a CSV.
# NOTE(review): this is an absolute path at the filesystem root, unlike the
# relative "data/..." inputs; confirm that this location is intended.
results_path = "/results/hyperparam_results.csv"
# to_csv does not create missing directories, so make sure the target exists.
os.makedirs(os.path.dirname(results_path), exist_ok=True)

results_df = pd.DataFrame(results)
results_df.to_csv(results_path, index=False)
print(f"Saved hyperparameter tuning results to '{results_path}'.")