In [10]:
import tensorflow as tf
from datasets import load_dataset
from transformers import RobertaTokenizerFast, TFRobertaForMultipleChoice, create_optimizer
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from dataclasses import dataclass
from typing import Optional, Union
import numpy as np

In [11]:
# Fetch every split of PIQA through the `datasets` library, then preview the
# first three training rows as a quick sanity check.
print("Loading PIQA dataset...")
piqa_dataset = load_dataset(
    "piqa",
)
print("Dataset loaded. Sample:")
print(piqa_dataset["train"][:3])

Loading PIQA dataset...
Dataset loaded. Sample:
{'goal': ["When boiling butter, when it's ready, you can", 'To permanently attach metal legs to a chair, you can', 'how do you indent something?'], 'sol1': ['Pour it onto a plate', 'Weld the metal together to get it to stay firmly in place', 'leave a space before starting the writing'], 'sol2': ['Pour it into a jar', 'Nail the metal together to get it to stay firmly in place', 'press the spacebar'], 'label': [1, 0, 0]}


In [12]:
# Instantiate the tokenizer and the multiple-choice model from one shared
# checkpoint name so the two always stay in sync.
print("\nLoading RoBERTa model and tokenizer...")
model_checkpoint = "roberta-base"
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_checkpoint,
)
model = TFRobertaForMultipleChoice.from_pretrained(
    model_checkpoint,
)


Loading RoBERTa model and tokenizer...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForMultipleChoice: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForMultipleChoice from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForMultipleChoice from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForMultipleChoice were not initialized from the PyTorch model and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
# Define preprocessing function
def preprocess_function(examples):
    """Tokenize a batch of PIQA rows into per-choice (goal, solution) pairs.

    Args:
        examples: A batched mapping with parallel lists under ``"goal"``,
            ``"sol1"`` and ``"sol2"``, and optionally ``"label"``.

    Returns:
        A dict with one entry per tokenizer output field. Each entry is a list
        with one item per input row, and each item is a list of two encodings
        (the goal paired with ``sol1``, then the goal paired with ``sol2``).
        When ``"label"`` is present in ``examples`` it is copied through
        unchanged under ``"labels"``.
    """
    num_choices = 2  # PIQA rows carry exactly two candidate solutions: sol1 and sol2

    # Build the flat, interleaved lists directly. Nested comprehensions are linear
    # in the batch size, whereas flattening with sum(list_of_lists, []) re-copies
    # the accumulated list on every addition and is quadratic.
    first_sentences = [
        goal for goal in examples["goal"] for _ in range(num_choices)
    ]
    second_sentences = [
        solution
        for solution_pair in zip(examples["sol1"], examples["sol2"])
        for solution in solution_pair
    ]

    # Tokenize without padding; padding is left to whoever batches the result.
    tokenized_examples = tokenizer(first_sentences, second_sentences, padding=False, truncation=True)

    # Un-flatten: regroup consecutive encodings so each row owns its choices again.
    result = {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

    # Add labels
    if "label" in examples:
        result["labels"] = examples["label"]

    return result

In [24]:
# Tokenize every split in batches. The original columns are removed so that only
# the fields produced by preprocess_function remain.
print("Preprocessing the dataset...")
encoded_datasets = piqa_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=piqa_dataset["train"].column_names,
)

# Data collator
@dataclass
class DataCollatorForMultipleChoice:
    """Pad and batch multiple-choice features.

    Each feature passed to ``__call__`` is a dict whose tokenizer fields (it must
    include ``"input_ids"``) hold one sequence per answer choice, plus a target
    stored under ``"label"`` or ``"labels"``.
    """

    # Tokenizer whose ``pad`` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a dict of batched tensors.

        Args:
            features: Non-empty list of feature dicts as described on the class.
                The dicts are only read; they are not modified.

        Returns:
            A dict in which every padded tokenizer field is reshaped to
            ``(batch_size, num_choices, -1)`` and ``"labels"`` is an int64 tensor
            built from the per-feature targets.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of pop()-ing them, so the caller's feature dicts
        # keep their label and can safely be collated again.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (feature, choice) pair, in feature-major order. A single
        # nested comprehension is linear, unlike flattening with sum(..., []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="np",
        )

        # Un-flatten
        batch = {
            k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()
        }
        # Add back labels
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch

Preprocessing the dataset...


Map:   0%|          | 0/16113 [00:00<?, ? examples/s]

Map:   0%|          | 0/3084 [00:00<?, ? examples/s]

Map:   0%|          | 0/1838 [00:00<?, ? examples/s]

In [25]:
# Run the tokenization over every split in batches, keeping only the columns
# that preprocess_function returns.
print("Preprocessing the dataset...")
encoded_datasets = piqa_dataset.map(
    preprocess_function,
    remove_columns=piqa_dataset["train"].column_names,
    batched=True,
)


Preprocessing the dataset...


In [27]:
# Data collator
@dataclass
class DataCollatorForMultipleChoice:
    """Pad and batch multiple-choice features.

    Each feature passed to ``__call__`` is a dict whose tokenizer fields (it must
    include ``"input_ids"``) hold one sequence per answer choice, plus a target
    stored under ``"label"`` or ``"labels"``.
    """

    # Tokenizer whose ``pad`` method is used to pad the flattened choices.
    tokenizer: PreTrainedTokenizerBase
    # The three options below are forwarded unchanged to ``tokenizer.pad``.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate ``features`` into a dict of batched tensors.

        Args:
            features: Non-empty list of feature dicts as described on the class.
                The dicts are only read; they are not modified.

        Returns:
            A dict in which every padded tokenizer field is reshaped to
            ``(batch_size, num_choices, -1)`` and ``"labels"`` is an int64 tensor
            built from the per-feature targets.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels instead of pop()-ing them, so the caller's feature dicts
        # keep their label and can safely be collated again.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (feature, choice) pair, in feature-major order. A single
        # nested comprehension is linear, unlike flattening with sum(..., []).
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="np",
        )

        # Un-flatten
        batch = {
            k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()
        }
        # Add back labels
        batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
        return batch

In [28]:
# Build the collator around the notebook's tokenizer; the padding options keep
# their dataclass defaults.
data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

In [38]:
# Carve a 10% hold-out from the encoded training split (fixed seed, so the split
# is repeatable).
print("Splitting dataset into train and validation sets...")
train_test_split = encoded_datasets["train"].train_test_split(seed=42, test_size=0.1)

# Note: the "test" key holds the 10% hold-out taken from the training data and is
# what feeds `val_set` below; "validation" is the dataset's own validation split.
encoded_datasets_2 = dict(
    train=train_test_split["train"],
    test=train_test_split["test"],
    validation=encoded_datasets["validation"],
)

# Wrap the splits as batched tf.data pipelines, using a small batch size.
print("Preparing datasets for training...")
batch_size = 4  # Reduced batch size

# Training batches are shuffled.
train_set = model.prepare_tf_dataset(
    encoded_datasets_2["train"],
    batch_size=batch_size,
    shuffle=True,
    collate_fn=data_collator,
)

# Hold-out batches keep their order.
val_set = model.prepare_tf_dataset(
    encoded_datasets_2["test"],
    batch_size=batch_size,
    shuffle=False,
    collate_fn=data_collator,
)

Splitting dataset into train and validation sets...
Preparing datasets for training...


In [39]:
# Model parameters
learning_rate = 2e-5
num_train_epochs = 3
weight_decay = 0.01

# Prepare the optimizer and its learning-rate schedule.
# Total number of batches the training loop will see across all epochs.
num_train_steps = len(train_set) * num_train_epochs

# Keras `model.fit` applies one optimizer update per batch, and nothing in this
# notebook implements gradient accumulation. The schedule must therefore span
# every batch. The previous value of 4 shortened the schedule to a quarter of
# the run, so the decaying learning rate would bottom out long before training
# finished. Keep this at 1 unless a custom accumulating train step is added.
accumulation_steps = 1

optimizer, lr_schedule = create_optimizer(
    init_lr=learning_rate,
    num_warmup_steps=0,
    # Number of optimizer updates the schedule is stretched over.
    num_train_steps=num_train_steps // accumulation_steps,
    weight_decay_rate=weight_decay,
)

In [None]:
# Attach the optimizer and track accuracy. No `loss` argument is passed here.
# NOTE(review): presumably this relies on the model's built-in loss — confirm.
print("Compiling the model...")
model.compile(
    optimizer=optimizer,
    metrics=["accuracy"],
)

# Fit on the training pipeline, evaluating on the hold-out pipeline after each
# epoch; the returned History object is kept for later inspection.
print("Starting model training...")
history = model.fit(
    train_set,
    epochs=num_train_epochs,
    validation_data=val_set,
)

print("Training complete!")

Compiling the model...
Starting model training...
Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: <gast.gast.Expr object at 0x794610156b90>
 749/3625 [=====>........................] - ETA: 13:34 - loss: 0.6953 - accuracy: 0.4930