In [17]:
from transformers import GPT2LMHeadModel
from torchtyping import TensorType
import torch
from transformers import GPT2Tokenizer

In [18]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from torch.utils.data import Dataset

class AdditionDataset(Dataset):
    def __init__(self, tokenizer, examples, max_length=50):
        self.tokenizer = tokenizer
        self.examples = examples
        self.max_length = max_length

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        question, answer = self.examples[idx]
        inputs = self.tokenizer.encode_plus(
            question,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        labels = self.tokenizer.encode_plus(
            answer,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'labels': labels['input_ids'].flatten()
        }

In [19]:
from transformers import TrainingArguments
from sklearn.model_selection import train_test_split

def generate_addition_validation(num_examples=100, max_number=100000000):
    import random
    examples = []
    for _ in range(num_examples):
        a = random.randint(0, max_number)
        b = random.randint(0, max_number)
        question = f"{a}+{b}="
        answer = str(a + b)
        examples.append((question, answer))
    return examples

def generate_random_addition_training(num_examples=100, max_number=100000000):
    import random
    examples = []
    for _ in range(num_examples):
        a = random.randint(0, max_number)
        b = random.randint(0, max_number)
        question = f"{a}+{b}="
        answer = str(a + b)
        examples.append((question, answer))
    return examples

def generate_ordered_addition_training(num_examples=100, max_number=100000000):
    import random
    examples = []
    for _ in range(num_examples):
        a = random.randint(0, max_number)
        b = random.randint(0, max_number)
        question = f"{a}+{b}="
        answer = str(a + b)
        examples.append((question, answer))
    # Sort the examples based on the sum of the numbers in the question
    examples.sort(key=lambda x: sum(int(num) for num in x[0].split('+') if num.strip().isdigit()))
    return examples

def generate_reverse_ordered_addition_training(num_examples=100, max_number=100000000):
    import random
    examples = []
    for _ in range(num_examples):
        a = random.randint(0, max_number)
        b = random.randint(0, max_number)
        question = f"{a}+{b}="
        answer = str(a + b)
        examples.append((question, answer))
    # Sort the examples based on the sum of the numbers in the question in reverse order
    examples.sort(key=lambda x: sum(int(num) for num in x[0].split('+') if num.strip().isdigit()), reverse=True)
    return examples

In [None]:
import matplotlib.pyplot as plt


datasets = {
    "random": generate_random_addition_training(),
    "ordered": generate_ordered_addition_training(),
    "reverse_ordered": generate_reverse_ordered_addition_training()
}

training_losses = {}

# Initialize the GPT2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token  # Add this line

examples = generate_addition_validation()
val_dataset = AdditionDataset(tokenizer, examples)

for dataset_name, examples in datasets.items():

    # Create a AdditionDataset instance for the validation set
    train_dataset = AdditionDataset(tokenizer, examples)

    model = GPT2LMHeadModel.from_pretrained('gpt2')

    def compute_metrics(eval_pred):
        logits, labels = eval_pred
        predictions = logits.argmax(-1)
        accuracy = (predictions == labels).mean().item()
        return {"accuracy": accuracy}

    training_args = TrainingArguments(
        output_dir="./results",  # output directory
        num_train_epochs=30,  # total number of training epochs
        per_device_train_batch_size=2,  # batch size per device during training
        per_device_eval_batch_size=8,  # batch size for evaluation
        warmup_steps=100,  # number of warmup steps for learning rate scheduler
        weight_decay=0.01,  # strength of weight decay
        logging_dir="./logs",  # directory for storing logs
        evaluation_strategy="epoch"
    )

    # Define the trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train the model
    trainer_state = trainer.train()

    # Store the training losses
    training_losses[dataset_name] = [log.get('eval_loss', None) for log in trainer.state.log_history if 'eval_loss' in log]

# Plot the training losses
for dataset_name, losses in training_losses.items():
    epochs = range(1, len(losses) + 1)
    plt.plot(epochs, losses, label=f'{dataset_name} training loss')

for dataset_name, losses in training_losses.items():
    print(f"Final loss for {dataset_name}: {losses[-1]}")

plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

  3%|▎         | 50/1500 [00:42<20:27,  1.18it/s]

[A
[A
[A
[A

OutOfMemoryError: CUDA out of memory. Tried to allocate 768.00 MiB (GPU 0; 8.00 GiB total capacity; 5.16 GiB already allocated; 440.13 MiB free; 5.78 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF


[A