### Load

In [None]:
import pandas as pd

# Read the training sentences and, separately, the file holding their answers.
train_file = "../common-sense/train_data.csv"
df_sentences = pd.read_csv(train_file)

train_labels_file = "../common-sense/train_answers.csv"
df_labels = pd.read_csv(train_labels_file)

In [None]:
# Join every sentence row with its answer on the shared "id" column, then expose
# the "answer" column under the name "label" that the rest of the notebook uses.
df = df_sentences.merge(df_labels, on="id").rename(columns={"answer": "label"})

In [None]:
from datasets import Dataset, load_metric

# Convert the dataframe to a Hugging Face dataset, hold out 30% of it for evaluation
# and encode the "label" column as class ids.
# The split is seeded so that re-running this cell (or the whole notebook) always
# produces the same train/test partition; without a seed every run shuffles differently.
data = Dataset.from_pandas(df).train_test_split(test_size=0.3, seed=42).class_encode_column("label")

In [None]:
# Maps an encoded class id to the name of the column holding that option's text.
labels = {0: 'OptionA', 1: 'OptionB', 2: 'OptionC'}

def show(example):
    """
        Print one dataset example in a human-readable form.

        The sentence, its three candidate options, the encoded label and the
        name of the option column that label refers to are written to stdout.
        `example` must provide the keys 'FalseSent', 'OptionA', 'OptionB',
        'OptionC' and 'label'. Nothing is returned.
    """
    sentence = example['FalseSent']
    print(f"Sentence: {sentence}\n")

    option_a, option_b, option_c = example['OptionA'], example['OptionB'], example['OptionC']
    print(f"Options:\nA) {option_a}\nB) {option_b}\nC) {option_c}")

    gold_label = example['label']
    print(f"Correct label: {gold_label}\n")
    print(f"Ground truth: {labels[gold_label]}")


In [None]:
# Print the training example at index 1 to eyeball the raw data.
show(data["train"][1])

In [None]:
from datasets import Dataset, load_metric

# Convert the dataframe to a Hugging Face dataset, hold out 30% of it for evaluation
# and encode the "label" column as class ids.
# The split is seeded so that executing this cell again reproduces the same
# train/test partition instead of silently reshuffling `data`.
data = Dataset.from_pandas(df).train_test_split(test_size=0.3, seed=42).class_encode_column("label")

### Preprocessing

In [None]:
from transformers import AlbertTokenizer

# Checkpoint name; reused below for the model and for the results directory.
MODEL_NAME = "albert-base-v2"
# Tokenizer loaded from the same checkpoint name.
tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples):
    """ Tokenize a batch of examples for multiple-choice training.

        Every sentence is paired with each of its options, so one example
        yields one (sentence, option) pair per option column.

        # Arguments
            examples: batch of examples as a mapping from column name to a list
                of values (what `Dataset.map(..., batched=True)` passes in).
                Must contain "FalseSent" and every column named in the
                module-level `labels` mapping.
        # Output
            Dict mapping every tokenizer output key (e.g. input_ids,
            attention_mask) to a list with one entry per example; each entry
            is itself a list with one tokenized sequence per option.
    """
    option_columns = list(labels.values())
    # Derived from the option columns rather than hard-coded.
    num_choices = len(option_columns)
    sentences = examples["FalseSent"]

    # Repeat each sentence once per option: [s0, s0, s0, s1, s1, s1, ...]
    first = [sentence for sentence in sentences for _ in range(num_choices)]

    # Matching options in the same flat order: [s0.A, s0.B, s0.C, s1.A, ...]
    second = [
        f"{examples[column][row]}"
        for row in range(len(sentences))
        for column in option_columns
    ]

    # Truncation makes sure the input is not longer than the model maximum;
    # padding is left to the data collator.
    tokenized_examples = tokenizer(first, second, truncation=True)

    # Regroup the flat outputs into chunks of `num_choices`, one per example.
    return {
        key: [values[start : start + num_choices] for start in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }


# Apply the preprocess function to every split of the dataset; batched=True hands
# preprocess_function a batch of examples per call instead of a single example.
tokenized = data.map(preprocess_function, batched=True)

Intermezzo: sanity-check the tokenized output

In [None]:
# Check that tokenization worked: number of training examples (5600 according to the
# original note), number of options per example (3), and the token count of each
# option of the first example (lengths vary because nothing is padded yet).
train_input_ids = tokenized["train"]["input_ids"]
first_example_ids = train_input_ids[0]
print(len(train_input_ids), len(first_example_ids), [len(option_ids) for option_ids in first_example_ids])

In [None]:
# Decode the three tokenized options of one training example so they can be
# compared with the raw text of that example.
idx = 2
example_input_ids = tokenized["train"][idx]["input_ids"]
[tokenizer.decode(example_input_ids[i]) for i in range(3)]


In [None]:
# Print the raw training example at index 2 for comparison with the decoded tokens.
show(data["train"][2])

Seems okay.

### Fine-tuning

In [None]:
from transformers import AlbertTokenizer, AlbertForMultipleChoice, AlbertConfig
from transformers import TrainingArguments, Trainer

# Load the pretrained checkpoint with a multiple-choice head; the configuration the
# model actually uses is the one attached to it.
model = AlbertForMultipleChoice.from_pretrained(MODEL_NAME)
configuration = model.config

# Stand-alone configuration objects. Neither is passed to the model above.
albert_base_v2_config = AlbertConfig()
config = AlbertConfig(hidden_size=768, num_attention_heads=12, intermediate_size=3072)


In [None]:
# Fine-tuning hyper-parameters; evaluation is run once per epoch.
training_args = TrainingArguments(
    output_dir=f"./results/{MODEL_NAME}",
    evaluation_strategy="epoch",
    num_train_epochs=1,  # Default = 3
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
)

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each feature holds one tokenized sequence per choice. The collator flattens
    all choices of all features into one list, lets the tokenizer pad them to a
    common length, and reshapes the result to (batch_size, num_choices, seq_len).
    The input features are not modified.

    Attributes are forwarded unchanged to `tokenizer.pad`:
        tokenizer: tokenizer whose `pad` method performs the padding.
        padding: padding strategy.
        max_length: optional maximum length to pad to.
        pad_to_multiple_of: optional multiple the padded length is rounded up to.
    """

    # Forward references: the class can be defined without importing transformers.
    tokenizer: "PreTrainedTokenizerBase"
    padding: Union[bool, str, "PaddingStrategy"] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Collate a list of features into a padded batch.

        # Arguments
            features: non-empty list of dicts; every value except the label is
                a list with one entry per choice. The label is stored under
                "label" or "labels".
        # Output
            Dict of tensors shaped (batch_size, num_choices, seq_len) plus a
            "labels" tensor of shape (batch_size,) and dtype long.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without pop() so the caller's dicts stay intact and the
        # same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice) pair, built in a single flat pass.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten: (batch_size * num_choices, seq_len) -> (batch_size, num_choices, seq_len)
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.long)
        return batch

Sanity check: run the data collator on a few examples

In [None]:
# Build a small list of features from the first 10 training examples, keeping only
# the columns listed below, and run it through the collator.
accepted_keys = ["input_ids", "attention_mask", "label"]

features = []
for row in range(10):
    example = tokenized["train"][row]
    features.append({name: value for name, value in example.items() if name in accepted_keys})

collator = DataCollatorForMultipleChoice(tokenizer)
batch = collator(features)

In [None]:
# Decode the three options of batch item 5 to inspect what the collator produced.
[tokenizer.decode(batch["input_ids"][5][i].tolist()) for i in range(3)]

In [None]:
# Print the raw training example at index 5 for comparison with the decoded batch item.
show(data["train"][5])

**TODO:** Padding does not seem to work / has no visible effect — investigate.

In [None]:
import numpy as np

def compute_metrics(eval_predictions):
    """
    Compute the accuracy of a set of predictions.

    `eval_predictions` unpacks into (predictions, label_ids). The predicted class
    of each row is the index of its largest score along axis 1. Returns a dict
    with a single key "accuracy" holding the fraction of correct rows as a float.
    """
    scores, gold = eval_predictions
    predicted = np.argmax(scores, axis=1)
    hits = (predicted == gold).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [None]:
# Wire the model, the tokenized splits, the padding collator and the accuracy
# metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the settings configured in training_args.
trainer.train()

In [None]:
# Save the fine-tuned model. No path is given, so the trainer's default location is
# used (presumably output_dir from training_args — TODO confirm).
trainer.save_model() 