In [1]:
!pip install transformers peft datasets accelerate -U



In [2]:
# This cell makes sure modules are auto-loaded when you change external python files
%load_ext autoreload
%autoreload 2

In [3]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Direct to your assignment folder.
%cd /content/drive/MyDrive/project-m3-2024-jim

/content/drive/MyDrive/project-m3-2024-jim


In [5]:
import os

# Set the environment variable
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [6]:
from datasets import load_dataset

dataset = load_dataset('json', data_files='datasets/mcq_aquarat_dataset.jsonl')

# Print dataset information
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['subject', 'question', 'answer'],
        num_rows: 82846
    })
})


In [84]:
from datasets import load_dataset


# Shuffle the entire training dataset
shuffled_dataset = dataset['train'].shuffle(seed=42)

# Define the size for the subsets (e.g., 10% for training, 10% for validation)
subset_size = len(dataset['train']) // 60

# Generate the training subset (first 10%)
train_subset = shuffled_dataset.select(range(0, subset_size))

# Generate the validation subset (next 10%)
validation_subset = shuffled_dataset.select(range(subset_size, 2 * subset_size))

# Print the subset information
print("Training Subset:")
print(train_subset)

print("\nValidation Subset:")
print(validation_subset)

#### Having a small training dataset and validation dataset


Training Subset:
Dataset({
    features: ['subject', 'question', 'answer'],
    num_rows: 1380
})

Validation Subset:
Dataset({
    features: ['subject', 'question', 'answer'],
    num_rows: 1380
})


Map:   0%|          | 0/1218 [00:00<?, ? examples/s]

Map:   0%|          | 0/1218 [00:00<?, ? examples/s]

In [85]:
#### Continue here
#### The idea is to customize a loss function for the MCQA data training:
#### If we just send the prompt (MCQ) to the model,
#### we assume it's able to generate an answer, but not in the right format
#### for example, with the MCQ "What is the biggest? A)2; B)4; C)6; D)8"
#### we assume the model generates an answer like "8 is the biggest"
#### but for MCQA we hope it generates "D"
#### so we apply a post-process function that compares the genrated answer like "8 is the biggest" with all the options (2, 4, 6, 8)
#### and choose the closest option (8) and take that choice (D) as the "final output"
#### so the new loss function would be to compare the label/target (D) and the "final output"

import json
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from sklearn.preprocessing import LabelEncoder


# Initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained("gpt2")


# Initialize LabelEncoder and fit it with the possible choice labels
label_encoder = LabelEncoder()
label_encoder.fit(['A', 'B', 'C', 'D', 'E'])

def tokenize_function(example):
    input_text = example["question"]
    target_text = example["answer"]

    # Extracting choices and their labels
    choices = [line.split(":")[1].strip() for line in input_text.split("\n") if line.startswith(('A:', 'B:', 'C:', 'D:', 'E:'))]
    choice_labels = [line.split(":")[0] for line in input_text.split("\n") if line.startswith(('A:', 'B:', 'C:', 'D:', 'E:'))]

    # Tokenizing input text
    inputs = tokenizer(input_text, padding="max_length", truncation=True, max_length=128, return_tensors="pt")

    # Encode the target text (single letter choice)
    target_label = label_encoder.transform([target_text])[0]

    # Encode choice labels to integers
    encoded_choice_labels = label_encoder.transform(choice_labels)

    return {
        "input_ids": inputs["input_ids"].squeeze(),
        "attention_mask": inputs["attention_mask"].squeeze(),
        "labels": torch.tensor(target_label),
        "choices": choices,
        "choice_labels": torch.tensor(encoded_choice_labels)
    }

# Tokenize the dataset
tokenized_datasets_train = [tokenize_function(item) for item in train_subset]
tokenized_datasets_eval = [tokenize_function(item) for item in validation_subset]


class MCQADataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.data[idx]['input_ids']),
            'attention_mask': torch.tensor(self.data[idx]['attention_mask']),
            'labels': torch.tensor(self.data[idx]['labels']),
            'choices': self.data[idx]['choices'],
            'choice_labels': self.data[idx]['choice_labels']
        }
        return item

    def __len__(self):
        return len(self.data)



# Create the dataset
train_dataset = MCQADataset(tokenized_datasets_train)

eval_dataset = MCQADataset(tokenized_datasets_eval)

# Inspect the first sample
sample = train_dataset[4]
print(sample)


{'input_ids': tensor([24361,    25,  1002,   281,  2708,   318,  2702,   379,  1160,     4,
         7630,  2427,   286,  1478,     4,  7630,    11,   788,   262,  7630,
          561,   307, 12820,    13, 11546,   517,    13,  1867,   318,   262,
         1575,  2756,    30,   198,   198, 29046,    25,   198,    32,    25,
        31273,    13, 12825,   198,    33,    25, 31273,    13,  4751,   198,
           34,    25, 31273,    13, 20343,   198,    35,    25, 31273,    13,
        30123,   198,    36,    25, 31273,    13,    20,  2167,   198, 33706,
           25, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 

  'input_ids': torch.tensor(self.data[idx]['input_ids']),
  'attention_mask': torch.tensor(self.data[idx]['attention_mask']),
  'labels': torch.tensor(self.data[idx]['labels']),


In [86]:
#### this collator was meant to ensure that columns "choices" and "choice labels" are kept
from transformers import DataCollatorWithPadding

class CustomDataCollator(DataCollatorWithPadding):
    def __call__(self, features):
        # Extract the special fields
        choices = [feature.pop('choices') for feature in features]
        choice_labels = [feature.pop('choice_labels') for feature in features]

        # Use the parent class's __call__ method to handle the rest
        batch = super().__call__(features)

        # Add the special fields back to the batch
        batch['choices'] = choices
        batch['choice_labels'] = choice_labels

        return batch

data_collator = CustomDataCollator(tokenizer)

In [87]:

def levenshtein_distance(a, b):
    if len(a) < len(b):
        return levenshtein_distance(b, a)

    if len(b) == 0:
        return len(a)

    previous_row = range(len(b) + 1)
    for i, c1 in enumerate(a):
        current_row = [i + 1]
        for j, c2 in enumerate(b):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]


In [88]:
import torch.nn.functional as F

def custom_loss_function(logits, labels, choices, choice_labels, temperature=1.0):
    # Ensure labels are of type torch.long
    labels = labels.long()

    # Generate the most probable sequence from logits
    generated_ids = torch.argmax(logits, dim=-1)
    generated_answers = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    # Initialize a list to store the new logits
    new_logits_list = []
    valid_labels = []

    # Post-process the generated answers to compute new logits
    for idx, (generated_answer, choice_list, label) in enumerate(zip(generated_answers, choices, labels)):
        if len(choice_list) != 5:
            #print(f"Skipping entry because it does not have exactly 5 choices.")
            continue

        scores = []
        max_len = max(len(generated_answer), max(len(choice) for choice in choice_list))
        for choice in choice_list:
            score = levenshtein_distance(generated_answer, choice)
            normalized_score = score / max_len  # Normalize distance
            inverse_score = 1 / (normalized_score + 1e-5)  # Apply inverse transformation
            scores.append(inverse_score)

        # Apply min-max scaling
        scores = torch.tensor(scores, device=logits.device, dtype=torch.float)
        min_score = scores.min()
        max_score = scores.max()
        epsilon = 1e-5  # Small value to prevent division by zero
        scaled_scores = (scores - min_score) / (max_score - min_score + epsilon)

        # Apply temperature-scaled softmax
        exp_scores = torch.exp(scaled_scores / temperature)
        softmax_scores = exp_scores / exp_scores.sum()
        new_logits_list.append(softmax_scores)
        valid_labels.append(label)

    # Convert the list of logits and labels to tensors
    if new_logits_list:
        new_logits = torch.stack(new_logits_list).requires_grad_(True)  # Ensure gradient tracking
        valid_labels = torch.tensor(valid_labels).long().to(logits.device)

        # Debugging: Print tensor shapes and values
        #print(f"Logits shape: {logits.shape}")
        #print(f"New logits shape: {new_logits.shape}")
        #print(f"Labels: {valid_labels}")
        #print(f"New logits: {new_logits}")

        # Ensure labels are within the valid range
        if valid_labels.max() >= new_logits.size(1) or valid_labels.min() < 0:
            raise ValueError(f"Labels out of range: {valid_labels}")

        # Calculate cross-entropy loss
        loss = F.cross_entropy(new_logits, valid_labels)

    # Calculate cross-entropy loss
    #loss = F.cross_entropy(new_logits, labels)

    return loss

In [89]:
#### now the problem is, as shown by the print of "inputs", it doesn't contain "labels"
# Custom Trainer class
from transformers import Trainer, TrainingArguments, BertForSequenceClassification
from torch.utils.data import DataLoader, SequentialSampler

class CustomTrainer(Trainer):
    def get_train_dataloader(self):
        train_sampler = self._get_train_sampler()
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator,
        )

    def get_eval_dataloader(self, eval_dataset=None):
        eval_dataset = eval_dataset if eval_dataset is not None else self.eval_dataset
        return DataLoader(
            eval_dataset,
            batch_size=self.args.eval_batch_size,
            sampler=SequentialSampler(eval_dataset),
            collate_fn=self.data_collator,
        )
    #print(inputs)
    def compute_loss(self, model, inputs, return_outputs=False):
        #print(inputs)
        input_ids = inputs.pop("input_ids")
        attention_mask = inputs.pop("attention_mask")
        labels = inputs.pop("labels")
        choices = inputs.pop("choices")
        choice_labels = inputs.pop("choice_labels")

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Custom loss computation
        loss = custom_loss_function(logits, labels, choices, choice_labels)

        return (loss, outputs) if return_outputs else loss


In [92]:
#### as long as the function "compute_loss" works, which means we can access columns "labels" in the train_dataset
# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,  # Adjust logging steps as needed
    log_level='info',  # Ensure logging level includes info
)
#print(train_dataset[0])
# Initialize the custom trainer
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

# Start training
trainer.train()

***** Running training *****
  Num examples = 1,380
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 865
  Number of trainable parameters = 124,439,808
  'input_ids': torch.tensor(self.data[idx]['input_ids']),
  'attention_mask': torch.tensor(self.data[idx]['attention_mask']),
  'labels': torch.tensor(self.data[idx]['labels']),


Epoch,Training Loss,Validation Loss
1,1.5996,1.609528
2,1.6216,1.609528
3,1.5989,1.609528
4,1.6062,1.609528
5,1.6102,1.609528


***** Running Evaluation *****
  Num examples = 1380
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1380
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Configuration saved in ./results/checkpoint-500/generation_config.json
Model weights saved in ./results/checkpoint-500/model.safetensors
  'input_ids': torch.tensor(self.data[idx]['input_ids']),
  'attention_mask': torch.tensor(self.data[idx]['attention_mask']),
  'labels': torch.tensor(self.data[idx]['labels']),
***** Running Evaluation *****
  Num examples = 1380
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1380
  Batch size = 8
***** Running Evaluation *****
  Num examples = 1380
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=865, training_loss=1.6047728202246516, metrics={'train_runtime': 304.1112, 'train_samples_per_second': 22.689, 'train_steps_per_second': 2.844, 'total_flos': 450728755200000.0, 'train_loss': 1.6047728202246516, 'epoch': 5.0})

In [93]:
# Save the final model and tokenizer
model.save_pretrained("./MCQA_model_1")
tokenizer.save_pretrained("./MCQA_model_1")

Configuration saved in ./MCQA_model_1/config.json
Configuration saved in ./MCQA_model_1/generation_config.json
Model weights saved in ./MCQA_model_1/model.safetensors
tokenizer config file saved in ./MCQA_model_1/tokenizer_config.json
Special tokens file saved in ./MCQA_model_1/special_tokens_map.json


('./MCQA_model_1/tokenizer_config.json',
 './MCQA_model_1/special_tokens_map.json',
 './MCQA_model_1/vocab.json',
 './MCQA_model_1/merges.txt',
 './MCQA_model_1/added_tokens.json')