### Load

In [1]:
import pandas as pd

# Load data
# Directory holding the CSV files. Named `data_dir` rather than `dir` so the
# built-in dir() is not shadowed for the rest of the notebook session.
data_dir = "common-sense"

train_file = f"{data_dir}/train_data.csv"
train_labels_file = f"{data_dir}/train_answers.csv"

# One frame with the sentences, one with the answers; they are joined on "id" below.
df_sentences = pd.read_csv(train_file)
df_labels = pd.read_csv(train_labels_file)

In [2]:
# Join every sentence row with its answer row on the shared "id" column and
# expose the "answer" column under the name "label".
df = df_sentences.merge(df_labels, on="id").rename(columns={"answer": "label"})

In [3]:
from datasets import Dataset, load_metric

# Convert the dataframe to a Hugging Face dataset, encode the label column once
# on the full table (so both splits share a single label mapping), then split
# 70/30 into train/test (test is used for evaluation).
# The fixed seed makes the split reproducible across kernel restarts and re-runs.
data = (
    Dataset.from_pandas(df)
    .class_encode_column("label")
    .train_test_split(test_size=0.3, seed=42)
)

Flattening the indices:   0%|          | 0/6 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/6 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ?ba/s]

In [6]:
# Maps the integer class id to the name of the option column it stands for.
labels = dict(enumerate(("OptionA", "OptionB", "OptionC")))

def show(example):
    """
        Print one dataset row: the sentence, its three options,
        the integer label and the option column that label maps to.
    """
    print(f"Sentence: {example['FalseSent']}", end="\n\n")

    option_lines = [f"{letter}) {example['Option' + letter]}" for letter in "ABC"]
    print("\n".join(["Options:", *option_lines]))

    gold_label = example['label']
    print(f"Correct label: {gold_label}", end="\n\n")
    print(f"Ground truth: {labels[gold_label]}")


In [7]:
# Display the training example at index 1 to eyeball the row layout.
show(data["train"][1])

Sentence: Sam lives in space.

Options:
A) Space is vast.
B) Space contains many unknowns.
C) Space is not suitable for human existence
Correct label: 2

Ground truth: OptionC


In [8]:
from datasets import Dataset, load_metric

# Convert the dataframe to a Hugging Face dataset, encode the label column once
# on the full table (so both splits share a single label mapping), then split
# 70/30 into train/test (test is used for evaluation).
# The fixed seed means re-running this cell reproduces the same split instead of
# silently reshuffling the examples that later cells index into.
data = (
    Dataset.from_pandas(df)
    .class_encode_column("label")
    .train_test_split(test_size=0.3, seed=42)
)

Flattening the indices:   0%|          | 0/6 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/6 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ?ba/s]

### Preprocessing

In [9]:
from transformers import AlbertTokenizer

# Checkpoint name; reused further down for the model and the results directory.
MODEL_NAME = "albert-base-v2"
# Tokenizer loaded from that same checkpoint.
tokenizer = AlbertTokenizer.from_pretrained(MODEL_NAME)

def preprocess_function(examples, option_columns=("OptionA", "OptionB", "OptionC")):
    """ Tokenize a batch of multiple-choice examples.

        Every sentence is paired with each of its options, all pairs are
        tokenized in one call, and the result is regrouped so that each
        example owns one list entry per option.

        # Arguments
            examples: batch of rows as a column-name -> list-of-values mapping
                (the form this function is indexed with when used through
                `map(..., batched=True)`). Must contain "FalseSent" and every
                column named in `option_columns`.
            option_columns: names of the columns holding the answer options,
                in label order. Defaults to the three option columns of this
                dataset, so existing single-argument calls behave as before.
        # Output
            Dict with one key per tokenizer output field (e.g. input_ids,
            attention_mask); each value is a list with one entry per example,
            itself a list with one tokenized sequence per option.
    """
    num_choices = len(option_columns)
    sentences = examples["FalseSent"]

    # Repeat every sentence once per option: [s0, s0, s0, s1, s1, s1, ...]
    first = [sentence for sentence in sentences for _ in range(num_choices)]

    # Matching options in the same order: [A0, B0, C0, A1, B1, C1, ...]
    second = [
        f"{examples[column][row]}"
        for row in range(len(sentences))
        for column in option_columns
    ]

    # Truncation ensures no pair is longer than the model's maximum input length
    tokenized_examples = tokenizer(first, second, truncation=True)

    # Regroup the flat list into chunks of `num_choices`, one chunk per example
    return {
        key: [values[start : start + num_choices] for start in range(0, len(values), num_choices)]
        for key, values in tokenized_examples.items()
    }


# Apply the preprocessing function to every split of the dataset.
# batched=True: the function receives batches of rows instead of single rows.
tokenized = data.map(preprocess_function, batched=True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

Intermezzo: sanity-check the tokenized output

In [10]:
# Check that tokenization worked: number of examples (5600), number of options
# per example (3), and the token count of each option (lengths differ because
# nothing has been padded yet).
# Only the first row is fetched; the whole input_ids column is not needed for this.
first_example_ids = tokenized["train"][0]["input_ids"]
print(len(tokenized["train"]), len(first_example_ids), [len(x) for x in first_example_ids])

5600 3 [21, 27, 24]


In [11]:
# Decode the tokenized options of one training example and compare them with
# the ground truth shown in the next cell.
# Fetches just that row and walks over however many options it has, instead of
# pulling the entire input_ids column once per option.
idx = 2
[tokenizer.decode(option_ids) for option_ids in tokenized["train"][idx]["input_ids"]]


['play is for replenishing energy sources people need energy to play games',
 'play is for replenishing energy sources energy is not unlimited consumption',
 'play is for replenishing energy sources playing games consumes energy']

In [12]:
# Raw training example at index 2, for comparison with the decoded tokens above.
show(data["train"][2])

Sentence: Play is for replenishing energy sources

Options:
A) People need energy to play games
B) Energy is not unlimited consumption
C) Playing games consumes energy
Correct label: 2

Ground truth: OptionC


### Fine-tuning

In [25]:
from transformers import AlbertTokenizer, AlbertForMultipleChoice, AlbertConfig
from transformers import TrainingArguments, Trainer

# Load the pretrained encoder with a multiple-choice head on top.
# No hand-built AlbertConfig is needed (the ones previously constructed here
# were never passed to the model): from_pretrained takes the configuration
# from the checkpoint itself.
model = AlbertForMultipleChoice.from_pretrained(MODEL_NAME)

# The configuration actually in effect for `model`, kept for inspection.
configuration = model.config


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertForMultipleChoice: ['predictions.bias', 'predictions.dense.bias', 'predictions.decoder.bias', 'predictions.LayerNorm.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.weight']
- This IS expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of AlbertForMultipleChoice were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on

In [26]:
# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=f"./results/{MODEL_NAME}",  # results are grouped per checkpoint name
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # Default = 3
    weight_decay=0.01,
)

In [27]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Every incoming feature holds, for each tokenizer field, one sequence per
    answer choice. The collator flattens these to one entry per
    (example, choice) pair, pads them together via ``tokenizer.pad`` and then
    reshapes each field to ``(batch_size, num_choices, -1)``.

    The feature dicts passed in are not modified, so the same list of features
    can be collated more than once.
    """

    # Tokenizer whose ``pad`` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: non-empty list of dicts. Each dict maps a field name to a
                list with one sequence per choice, and may carry the gold
                choice under "label" or "labels".

        Returns:
            Dict of tensors shaped ``(batch_size, num_choices, seq_len)``, plus
            a ``"labels"`` tensor of shape ``(batch_size,)`` when the features
            carry a label.
        """
        first = features[0]
        label_name = "label" if "label" in first else "labels"
        has_labels = label_name in first

        batch_size = len(features)
        num_choices = len(first["input_ids"])

        # One dict per (example, choice) pair. The label is skipped here rather
        # than popped, so the caller's dicts stay intact.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Undo the flattening: (batch_size * num_choices, L) -> (batch_size, num_choices, L)
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        if has_labels:
            batch["labels"] = torch.tensor(
                [feature[label_name] for feature in features], dtype=torch.long
            )
        return batch

Sanity check: run the collator on a few training examples and decode the result

In [28]:
# Build a small hand-made batch from the first ten training rows, keeping only
# the fields listed in accepted_keys, and run it through the collator.
accepted_keys = ["input_ids", "attention_mask", "label"]
features = [
    {name: value for name, value in tokenized["train"][row].items() if name in accepted_keys}
    for row in range(10)
]

batch = DataCollatorForMultipleChoice(tokenizer)(features)

In [29]:
# Decode each choice of the collated example at index 5 back to text.
[tokenizer.decode(choice_ids.tolist()) for choice_ids in batch["input_ids"][5]]

["it's right to drink and drive you cannot drive without a driver's license",
 "it's right to drink and drive drunk driving is prone to cause accidents",
 "it's right to drink and drive some people enjoy drinking while others enjoy driving"]

In [30]:
# Raw training example at index 5, for comparison with the decoded batch row above.
show(data["train"][5])

Sentence: It's right to drink and drive

Options:
A) You cannot drive without a driver's license
B) Drunk driving is prone to cause accidents
C) Some people enjoy drinking while others enjoy driving
Correct label: 1

Ground truth: OptionB


Hmm... padding doesn't seem to work: no padding tokens show up in the decoded batch above.

In [31]:
import numpy as np

def compute_metrics(eval_predictions):
    """
    Turn raw evaluation output into an accuracy score.

    `eval_predictions` unpacks into (predictions, label_ids); the predicted
    class of each row is the argmax along axis 1. Returns a dict with the
    single key "accuracy" holding a Python float.
    """
    scores, gold = eval_predictions
    predicted = np.argmax(scores, axis=1)
    hits = (predicted == gold).astype(np.float32)
    accuracy = hits.mean()
    return {"accuracy": accuracy.item()}

In [32]:
# Wire model, hyperparameters, tokenized splits, collator and metric together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    # Pads each batch on the fly and restores the (batch, choices, tokens) layout.
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
)

In [33]:
# Start fine-tuning with the settings from training_args.
trainer.train()

The following columns in the training set don't have a corresponding argument in `AlbertForMultipleChoice.forward` and have been ignored: OptionC, OptionB, OptionA, id, __index_level_0__, FalseSent. If OptionC, OptionB, OptionA, id, __index_level_0__, FalseSent are not expected by `AlbertForMultipleChoice.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5600
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 350
  Number of trainable parameters = 11684353


  0%|          | 0/350 [00:00<?, ?it/s]

## Inference

In [None]:
import torch
from transformers import AlbertTokenizer, AlbertForMultipleChoice

# NOTE(review): this reloads the stock "albert-base-v2" checkpoint and rebinds
# `tokenizer` and `model`, so the model used below is NOT the one trained in the
# Fine-tuning section. Per the load warning shown earlier for this checkpoint,
# its classifier weights are newly initialized — confirm whether the fine-tuned
# model was meant to be used here.
model_name = "albert-base-v2"
tokenizer = AlbertTokenizer.from_pretrained(model_name)
model = AlbertForMultipleChoice.from_pretrained(model_name)

# Toy example with two choices instead of the dataset's three.
prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
choice0 = "It is eaten with a fork and a knife."
choice1 = "It is eaten while held in the hand."
# NOTE(review): this rebinds the global `labels`, which earlier cells (e.g. `show`)
# read as the id -> option-name dict; re-running those cells afterwards will fail.
labels = torch.tensor(0).unsqueeze(0)  # choice0 is correct (according to Wikipedia ;)), batch size 1

In [None]:
# Encode the prompt once per choice; padding=True pads both pairs to a common length.
encoding = tokenizer([prompt, prompt], [choice0, choice1], return_tensors="pt", padding=True)

# Inference only: no gradients are needed, so skip autograd bookkeeping.
with torch.no_grad():
    # unsqueeze(0) adds the batch dimension the model is called with here.
    outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=labels)  # batch size is 1

loss = outputs.loss
logits = outputs.logits

# Pick the best-scoring choice along the choice axis (single example, so .item() is valid).
predicted_class = logits.argmax(dim=-1).item()


In [None]:
# Index of the choice the model scored highest (0 -> choice0, 1 -> choice1).
predicted_class