In [1]:
import torch
from datasets import load_dataset
from transformers import (
    RobertaTokenizerFast,
    RobertaForMultipleChoice,
    TrainingArguments,
    Trainer,
)
import numpy as np
from dataclasses import dataclass
from typing import Optional, Union
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy

2024-08-14 01:34:25.180154: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-14 01:34:25.180273: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-14 01:34:25.317710: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [3]:
# Load the PIQA dataset
print("Loading PIQA dataset...")
piqa_dataset = load_dataset("piqa")
print("Dataset loaded. Sample:")
print(piqa_dataset['train'][:3])

Loading PIQA dataset...


Downloading builder script:   0%|          | 0.00/5.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/8.41k [00:00<?, ?B/s]

The repository for piqa contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/piqa.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/815k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16113 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3084 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1838 [00:00<?, ? examples/s]

Dataset loaded. Sample:
{'goal': ["When boiling butter, when it's ready, you can", 'To permanently attach metal legs to a chair, you can', 'how do you indent something?'], 'sol1': ['Pour it onto a plate', 'Weld the metal together to get it to stay firmly in place', 'leave a space before starting the writing'], 'sol2': ['Pour it into a jar', 'Nail the metal together to get it to stay firmly in place', 'press the spacebar'], 'label': [1, 0, 0]}


In [4]:
# Load model and tokenizer
print("\nLoading RoBERTa model and tokenizer...")
model_checkpoint = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(model_checkpoint)
model = RobertaForMultipleChoice.from_pretrained(model_checkpoint).to(device)


Loading RoBERTa model and tokenizer...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForMultipleChoice were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Define preprocessing function
def preprocess_function(examples):
    first_sentences = [[context] * 2 for context in examples["goal"]]
    second_sentences = [[sol1, sol2] for sol1, sol2 in zip(examples["sol1"], examples["sol2"])]
    
    # Flatten everything
    first_sentences = sum(first_sentences, [])
    second_sentences = sum(second_sentences, [])
    
    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, padding=False, truncation=True)
    
    # Un-flatten
    result = {
        k: [v[i : i + 2] for i in range(0, len(v), 2)]
        for k, v in tokenized_examples.items()
    }
    
    # Add labels
    if "label" in examples:
        result["labels"] = examples["label"]
    
    return result

In [7]:
# Preprocess the dataset
print("Preprocessing the dataset...")
encoded_datasets = piqa_dataset.map(preprocess_function, batched=True, remove_columns=piqa_dataset["train"].column_names)

# Data collator
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
            for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="np",
        )

        # Un-flatten
        batch = {
            k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()
        }
        # Add back labels
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch

Preprocessing the dataset...


Map:   0%|          | 0/16113 [00:00<?, ? examples/s]

Map:   0%|          | 0/3084 [00:00<?, ? examples/s]

Map:   0%|          | 0/1838 [00:00<?, ? examples/s]

In [8]:
# Preprocess the dataset
print("Preprocessing the dataset...")
encoded_datasets = piqa_dataset.map(preprocess_function, batched=True, remove_columns=piqa_dataset["train"].column_names)


Preprocessing the dataset...


Map:   0%|          | 0/16113 [00:00<?, ? examples/s]

In [9]:
@dataclass
class DataCollatorForMultipleChoice:
    tokenizer: PreTrainedTokenizerBase
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        label_name = "label" if "label" in features[0].keys() else "labels"
        labels = [feature.pop(label_name) for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])
        flattened_features = [
            [{k: v[i] for k, v in feature.items()} for i in range(num_choices)]
            for feature in features
        ]
        flattened_features = sum(flattened_features, [])

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {
            k: v.view(batch_size, num_choices, -1) for k, v in batch.items()
        }
        # Add back labels
        batch["labels"] = torch.tensor(labels, dtype=torch.long)
        return batch

# Create an instance of the data collator
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [10]:
# Set up data collator
data_collator = DataCollatorForMultipleChoice(tokenizer)

In [11]:
from torch.utils.data import DataLoader

# Split the data into train and validation sets
print("Splitting dataset into train and validation sets...")
train_test_split = encoded_datasets["train"].train_test_split(test_size=0.1, seed=42)
encoded_datasets_2 = {
    "train": train_test_split["train"],
    "test": train_test_split["test"],
    "validation": encoded_datasets["validation"]
}

# Prepare datasets for training
print("Preparing datasets for training...")
batch_size = 4  # Reduced batch size

# Create DataLoaders
train_dataset = encoded_datasets_2['train']
val_dataset = encoded_datasets_2['test']

train_dataloader = DataLoader(
    train_dataset, 
    shuffle=True, 
    batch_size=batch_size, 
    collate_fn=data_collator
)

val_dataloader = DataLoader(
    val_dataset, 
    shuffle=False, 
    batch_size=batch_size, 
    collate_fn=data_collator
)

Splitting dataset into train and validation sets...
Preparing datasets for training...


In [None]:
# Load and preprocess the dataset
piqa_dataset = load_dataset("piqa")
encoded_datasets = piqa_dataset.map(preprocess_function, batched=True, remove_columns=piqa_dataset["train"].column_names)

# Set up TrainingArguments
training_args = TrainingArguments(
    output_dir="./results_roberta_base",
    remove_unused_columns=False,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=10,
    weight_decay=0.1,
    adam_beta1=0.9,
    adam_beta2=0.98,
    adam_epsilon=1e-6,
    max_grad_norm=1.0,
    warmup_steps=int(0.06 * (16113 * 10) / 16),
    lr_scheduler_type="polynomial",
    logging_dir="./logs_roberta_base",
    logging_steps=100,
    save_strategy="epoch",
    eval_strategy="epoch",  # Changed from evaluation_strategy to eval_strategy
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=torch.cuda.is_available(),
    gradient_accumulation_steps=8,
    seed=42,
    dataloader_num_workers=4,
    report_to="none",
)

# Define compute_metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = predictions.argmax(axis=1)
    return {"accuracy": (predictions == labels).astype(float).mean().item()}

# Set up Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Train the model
print("Starting model training...")
trainer.train()

Starting model training...


  self.pid = os.fork()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster tha

Epoch,Training Loss,Validation Loss,Accuracy
0,0.6932,0.692911,0.564744


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to enc