In [7]:
import pandas as pd
from datasets import Dataset
from transformers import RobertaConfig, RobertaModel, RobertaTokenizer, RobertaForMultipleChoice
from transformers import TrainingArguments, Trainer
from transformers.tokenization_utils_base import PaddingStrategy
from typing import Optional, Union
import torch
import numpy as np

# Hugging Face model identifier; used for both the tokenizer and the model so
# the two can never drift apart.
MODEL_NAME = "roberta-base"
# Name of a Trainer checkpoint directory (not referenced in the visible cells).
CHECKPOINT = "checkpoint-500"


# Dataset columns that hold the candidate answers for each example.
options = ['OptionA', 'OptionB', 'OptionC']

# Load the tokenizer from MODEL_NAME rather than a repeated string literal.
tokenizer = RobertaTokenizer.from_pretrained(MODEL_NAME)

In [2]:
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature holds one encoding per answer choice. The collator
    flattens all choices of all features into a single list, pads them
    together, and reshapes every padded tensor back to
    (batch_size, num_choices, -1).

    # Arguments
        tokenizer: object exposing a `pad(...)` method. When omitted, the
            module-level `tokenizer` is used at call time.
        padding: padding strategy forwarded to `tokenizer.pad`.
        max_length: optional maximum length forwarded to `tokenizer.pad`.
        pad_to_multiple_of: optional multiple forwarded to `tokenizer.pad`.
    """
    # Class-level defaults are kept so existing attribute access still works.
    padding: Union[bool, str, "PaddingStrategy"] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __init__(
        self,
        tokenizer=None,
        padding: Union[bool, str, "PaddingStrategy"] = True,
        max_length: Optional[int] = None,
        pad_to_multiple_of: Optional[int] = None,
    ):
        self.tokenizer = tokenizer
        self.padding = padding
        self.max_length = max_length
        self.pad_to_multiple_of = pad_to_multiple_of

    def __call__(self, features):
        """Pad a list of multiple-choice features into a batch of tensors.

        # Arguments
            features: list of dicts. Each dict carries a "label" or "labels"
                entry plus per-choice lists (e.g. "input_ids").
        # Output
            Dict of tensors reshaped to (batch_size, num_choices, -1), plus a
            "labels" tensor of dtype long.
        """
        # Fall back to the notebook-level tokenizer when none was injected.
        pad_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer

        label_name = "label" if "label" in features[0] else "labels"
        # Read the labels without popping so the caller's dicts stay intact
        # and the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair, built in linear time.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = pad_tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Restore the (batch, choice) structure that was flattened for padding.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.long)
        return batch


In [3]:
def load_data(sentences, labels, test_size=0.3, seed=None):
    """
    Load and merge sentences and labels, then convert to a Huggingface DatasetDict.

    # Arguments
        sentences: path to the CSV with the sentences and answer options.
        labels: path to the CSV with the answers; joined to `sentences` on "id".
        test_size: size of the test split, forwarded to `train_test_split`.
        seed: optional seed for the shuffle in `train_test_split`; pass an int
            for a reproducible split.
    # Output
        DatasetDict with "train" and "test" splits whose "label" column is
        class-encoded.
    """
    df_sentences = pd.read_csv(sentences)
    df_labels = pd.read_csv(labels)

    # Combine sentences and labels dataframes; the answer column becomes the label.
    df = pd.merge(df_sentences, df_labels, on="id").rename(columns={"answer": "label"})

    # Encode the answers as integer class ids on the full dataset *before*
    # splitting, so both splits share one label-to-id mapping even if a class
    # happens to be missing from one of them.
    dataset = Dataset.from_pandas(df).class_encode_column("label")

    return dataset.train_test_split(test_size=test_size, seed=seed)

In [4]:
def preprocess_function(examples):
    """ Tokenize a batch of examples as (sentence, option) pairs.
        # Arguments
            examples: batch of dataset columns (column name -> list of values).
                It must contain "FalseSent" and every column listed in
                the module-level `options`.
        # Output
            Dict mapping each tokenizer output key (e.g. input_ids,
            attention_mask) to one list per example, each holding
            `len(options)` encodings (one per answer choice).
    """
    # Derive the number of choices from `options` instead of a literal 3, so
    # the grouping below always matches the option columns.
    num_choices = len(options)
    option_columns = [examples[opt] for opt in options]

    # Repeat every sentence once per choice so it lines up with its options.
    first = [sentence for sentence in examples["FalseSent"] for _ in range(num_choices)]

    # Flatten the options row by row: all choices of example 0, then example 1, ...
    second = [str(choice) for row in zip(*option_columns) for choice in row]

    # Truncation ensures no input is longer than the model maximum.
    tokenized_examples = tokenizer(first, second, truncation=True)

    # Regroup the flat encodings into chunks of `num_choices` per example.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }


In [5]:
def compute_metrics(eval_predictions):
    """
    Turn raw evaluation output into an accuracy score.

    # Arguments
        eval_predictions: pair of (predictions, label_ids); the predicted
            class is the argmax of `predictions` along axis 1.
    # Output
        Dict with a single "accuracy" entry (Python float).
    """
    scores, references = eval_predictions
    predicted_classes = np.argmax(scores, axis=1)
    # 1.0 where the prediction matches the reference, 0.0 elsewhere.
    hits = (predicted_classes == references).astype(np.float32)
    return {"accuracy": hits.mean().item()}

In [10]:
if __name__ == "__main__":

    # CSV inputs: the sentences/options file and the matching answers file.
    sentences = "common-sense/train_data.csv"
    labels = "common-sense/train_answers.csv"

    # Build the train/test splits, then tokenize every split in batches.
    data = load_data(sentences, labels)
    tokenized = data.map(preprocess_function, batched=True)

    # Stand-alone configuration objects; neither is passed to the model below.
    roberta_base_config = RobertaConfig()
    config = RobertaConfig(
        hidden_size=768,
        num_attention_heads=12,
        intermediate_size=3072,
    )

    # Pretrained encoder with a multiple-choice head on top.
    model = RobertaForMultipleChoice.from_pretrained(MODEL_NAME)
    configuration = model.config

    # Hyperparameters for fine-tuning; evaluation runs once per epoch.
    training_args = TrainingArguments(
        output_dir=f"./results/{MODEL_NAME}",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,  # Default = 3
        weight_decay=0.01,
    )

Flattening the indices:   0%|          | 0/6 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/6 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/3 [00:00<?, ?ba/s]

Casting to class labels:   0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForMultipleChoice: ['lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForMultipleChoice from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predi

In [11]:
# Wire the model, both data splits, the multiple-choice collator and the
# accuracy metric into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    eval_dataset=tokenized["test"],
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(),
    compute_metrics=compute_metrics,
)

# Run fine-tuning; as the last expression, the returned TrainOutput is displayed.
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: OptionB, OptionA, id, FalseSent, __index_level_0__, OptionC. If OptionB, OptionA, id, FalseSent, __index_level_0__, OptionC are not expected by `RobertaForMultipleChoice.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 5600
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1050
  Number of trainable parameters = 124646401


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.380333,0.8675
2,0.456300,0.417506,0.878333
3,0.154400,0.488436,0.884583


The following columns in the evaluation set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: OptionB, OptionA, id, FalseSent, __index_level_0__, OptionC. If OptionB, OptionA, id, FalseSent, __index_level_0__, OptionC are not expected by `RobertaForMultipleChoice.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2400
  Batch size = 16
Saving model checkpoint to ./results/roberta-base\checkpoint-500
Configuration saved in ./results/roberta-base\checkpoint-500\config.json
Model weights saved in ./results/roberta-base\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results/roberta-base\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results/roberta-base\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForMultipleChoice.forward` and have been ignored: OptionB, OptionA, id, FalseSent

TrainOutput(global_step=1050, training_loss=0.2950720078604562, metrics={'train_runtime': 9232.8411, 'train_samples_per_second': 1.82, 'train_steps_per_second': 0.114, 'total_flos': 885303196891488.0, 'train_loss': 0.2950720078604562, 'epoch': 3.0})