In [1]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from datasets import load_dataset


# Load the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=5)  # Assuming 5 possible answer choices

# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

load dataset from disk

In [2]:
from datasets import load_from_disk


# Example to load from the directory named "formatted_dataset"
loaded_dataset = load_from_disk('./formatted_dataset')

# Verify the loaded dataset
print(loaded_dataset['train'][1]['generated_output'])
print(loaded_dataset['validation'][1]['generated_output'])

$answer$ = It's wrong to make generalizations about people in certain situations.
$answer$ = It's good to give someone a ride who needs one.


In [3]:
def remove_answer_prefix(example):
    if 'generated_output' in example:
        example['generated_output'] = [output.replace("$answer$ =", "").strip() for output in example['generated_output']]
    return example

In [4]:
from datasets import load_from_disk

# Apply the function to the dataset
transformed_dataset = loaded_dataset.map(remove_answer_prefix, batched=True)

# Verify the transformation
print(transformed_dataset['train'][0]['generated_output'])
print(transformed_dataset['validation'][0]['generated_output'])

It is good to be aware of your surroundings.



In [5]:
transformed_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'choices', 'answer', 'abstractive_explanation', 'extractive_explanation', 'formatted_question', 'generated_output'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'choices', 'answer', 'abstractive_explanation', 'extractive_explanation', 'formatted_question', 'generated_output'],
        num_rows: 1221
    })
})

In [6]:
def preprocess_function(examples):
    inputs = [
        "question: " + q + " [SEP] " +generated_output+ " [SEP] " +  " ".join(choices) 
        for q, choices, generated_output in zip(examples['question'], examples['choices'], examples['generated_output'])
    ]
    # Map the answer keys from string to the index of the correct choice
    labels = [choices.index(answer) for choices, answer in zip(examples['choices'], examples['answer'])]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding='max_length')
    model_inputs['labels'] = labels
    return model_inputs

tokenized_datasets = transformed_dataset.map(preprocess_function, batched=True)


Map:   0%|          | 0/9741 [00:00<?, ? examples/s]

Map:   0%|          | 0/1221 [00:00<?, ? examples/s]

In [7]:
from torch.utils.data import DataLoader, TensorDataset

def convert_to_tensors(dataset):
    input_ids = torch.tensor(dataset['input_ids'])
    attention_mask = torch.tensor(dataset['attention_mask'])
    labels = torch.tensor(dataset['labels'])
    return TensorDataset(input_ids, attention_mask, labels)

train_dataset = convert_to_tensors(tokenized_datasets["train"])
eval_dataset = convert_to_tensors(tokenized_datasets["validation"])

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=128)
eval_loader = DataLoader(eval_dataset, batch_size=128)


In [8]:
# Define optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
def train_loop(model, loader, optimizer):
    model.train()
    total_loss = 0

    for batch in tqdm(loader, desc="Training"):
        optimizer.zero_grad()
        inputs = {
            'input_ids': batch[0].to(device),
            'attention_mask': batch[1].to(device),
            'labels': batch[2].to(device)
        }
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    return total_loss / len(loader)




In [9]:
from sklearn.metrics import accuracy_score

# Validation loop
def validate_loop(model, loader):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Validation"):
            inputs = {
                'input_ids': batch[0].to(device),
                'attention_mask': batch[1].to(device),
                'labels': batch[2].to(device)
            }
            outputs = model(**inputs)
            loss = outputs.loss
            total_loss += loss.item()

            preds = outputs.logits.argmax(dim=-1).cpu().numpy()
            labels = batch[2].cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels)

    avg_loss = total_loss / len(loader)
    accuracy = accuracy_score(all_labels, all_preds)

    return avg_loss, accuracy




In [10]:

from tqdm import tqdm# Training and validation
num_epochs = 3
for epoch in range(num_epochs):
    train_loss = train_loop(model, train_loader, optimizer)
    val_loss, val_accuracy = validate_loop(model, eval_loader)
    print(f"Epoch {epoch + 1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Validation Accuracy: {val_accuracy:.4f}")

Training:   0%|          | 0/77 [00:00<?, ?it/s]

Training: 100%|██████████| 77/77 [02:05<00:00,  1.63s/it]
Validation: 100%|██████████| 10/10 [00:04<00:00,  2.00it/s]


Epoch 1/3
Train Loss: 1.6160
Validation Loss: 1.6093
Validation Accuracy: 0.1966


Training: 100%|██████████| 77/77 [02:05<00:00,  1.63s/it]
Validation: 100%|██████████| 10/10 [00:05<00:00,  1.99it/s]


Epoch 2/3
Train Loss: 1.6102
Validation Loss: 1.6140
Validation Accuracy: 0.1925


Training: 100%|██████████| 77/77 [02:06<00:00,  1.64s/it]
Validation: 100%|██████████| 10/10 [00:05<00:00,  1.99it/s]

Epoch 3/3
Train Loss: 1.6054
Validation Loss: 1.6108
Validation Accuracy: 0.2048



