<a href="https://colab.research.google.com/github/kiankyars/Ultra-Scale-Playbook-Series/blob/main/notebooks/bonus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Google Colab Exercise Notebook: Transformer Training with Gradient Accumulation

# This notebook explores training a GPT-2 model on a single GPU, with an exercise on gradient accumulation.

!pip install torch transformers

import torch
import torch.nn as nn
from transformers import GPT2Config, GPT2LMHeadModel, GPT2Model



In [None]:
# Exercise:
# Complete the train_with_accumulation function by:
# 1. Calculating micro_steps
# 2. Implementing the forward/backward pass with loss accumulation
# 3. Adding the optimizer step after all micro-batches

# Define a standard GPT-2 model
config = GPT2Config(
    n_embd=768,        # Standard hidden dimension
    n_layer=12,        # 12 layers
    n_head=12,         # 12 attention heads
    vocab_size=50257,  # Full vocab size
    n_positions=1024,  # Standard sequence length
)
# Prefer the GPU, but fall back to CPU so this cell still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# GPT2Model is the bare transformer without an LM head; train_step in this cell
# reads its `last_hidden_state` output to build a dummy loss.
model = GPT2Model(config).to(device)

# Training step function
def train_step(model, optimizer, input_ids):
    """Perform a single optimizer update and return the loss as a Python float.

    Args:
        model: Callable module whose output exposes ``last_hidden_state``.
        optimizer: Optimizer managing the model's parameters.
        input_ids: Batch of token ids passed straight to ``model``.

    Returns:
        The scalar dummy loss (mean of the last hidden state) for this step.
    """
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    hidden_states = model(input_ids).last_hidden_state
    dummy_loss = hidden_states.mean()  # Dummy loss for simplicity
    dummy_loss.backward()
    optimizer.step()
    return dummy_loss.item()

# Batch size experiment
def run_training(model, batch_size, seq_len=1024, steps=5):
    """Run a few training steps on random token batches of a given batch size.

    A fresh Adam optimizer is created on every call, and each step's loss is printed.

    Args:
        model: Module to train; must have at least one parameter.
        batch_size: Number of sequences per step.
        seq_len: Number of tokens per sequence.
        steps: Number of optimizer steps to run.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    # Create inputs directly on the model's device: avoids a host-to-device copy
    # per step and does not assume the model lives on a CUDA device.
    device = next(model.parameters()).device
    # Sample ids from the model's own vocabulary when it exposes one; otherwise
    # keep the GPT-2 default of 50257.
    vocab_size = getattr(getattr(model, "config", None), "vocab_size", 50257)
    for step in range(steps):
        input_ids = torch.randint(0, vocab_size, (batch_size, seq_len), device=device)
        loss = train_step(model, optimizer, input_ids)
        print(f"Step {step}, Batch Size {batch_size}, Loss: {loss:.4f}")

# Gradient accumulation exercise
def train_with_accumulation(model, total_batch_size=8, micro_batch_size=2, seq_len=1024, steps=5):
    """Exercise scaffold: train with gradient accumulation over micro-batches.

    The TODOs below are left for the reader to complete. Until ``micro_steps`` is
    filled in, calling this function raises ``NotImplementedError``.

    Args:
        model: Module to train; must have at least one parameter.
        total_batch_size: Effective batch size per optimizer step.
        micro_batch_size: Number of sequences per forward/backward pass.
        seq_len: Number of tokens per sequence.
        steps: Number of optimizer steps to run.

    Raises:
        NotImplementedError: If ``micro_steps`` has not been implemented yet.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    micro_steps = None  # TODO: Calculate number of micro-batches
    if micro_steps is None:
        # Fail with an actionable message instead of the opaque TypeError from range(None).
        raise NotImplementedError(
            "Exercise not completed: compute micro_steps from total_batch_size and micro_batch_size"
        )

    # Create inputs on the model's device rather than assuming CUDA.
    device = next(model.parameters()).device
    # Use the model's own vocabulary size when available; otherwise the GPT-2 default.
    vocab_size = getattr(getattr(model, "config", None), "vocab_size", 50257)

    for step in range(steps):
        optimizer.zero_grad()
        total_loss = 0

        for _ in range(micro_steps):
            input_ids = torch.randint(0, vocab_size, (micro_batch_size, seq_len), device=device)
            # TODO: Implement forward pass, backward pass, and loss accumulation
            # Hint: divide the loss by micro_steps before backward() so the accumulated
            # gradient is the mean over the whole effective batch.
            # Do not step the optimizer until all micro-batches are processed

        # TODO: Perform the optimizer step after accumulating gradients
        print(f"Step {step}, Average Loss: {total_loss/micro_steps:.4f}")

    print("Finished training with gradient accumulation")

# Main execution
if __name__ == "__main__":
    print("Starting GPT-2 training experiments...", "\nBatch Size Effects:", sep="\n")

    # Sweep batch sizes 1 through 10 on the same model instance.
    for batch_size in range(1, 11):
        run_training(model, batch_size=batch_size)

    # Then run the gradient accumulation exercise with its default settings.
    print("\nGradient Accumulation Exercise:")
    train_with_accumulation(model)

In [None]:
# Solution:
# Reference implementation of the train_with_accumulation exercise, which:
# 1. Calculates micro_steps
# 2. Implements the forward/backward pass with loss accumulation
# 3. Performs the optimizer step after all micro-batches

# Define a standard GPT-2 model
config = GPT2Config(
    n_embd=768,        # Standard hidden dimension
    n_layer=12,        # 12 layers
    n_head=12,         # 12 attention heads
    vocab_size=50257,  # Full vocab size
    n_positions=1024,  # Standard sequence length
)
# Prefer the GPU, but fall back to CPU so this cell still runs on a runtime without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The LM-head variant is used here because the training code below reads `outputs.loss`.
model = GPT2LMHeadModel(config).to(device)

# Training step function
def train_step(model, optimizer, input_ids):
    """Run one next-token-prediction training step and return the loss as a float.

    Args:
        model: Causal LM called as ``model(input_ids, labels=...)`` whose output
            exposes ``loss``.
        optimizer: Optimizer managing the model's parameters.
        input_ids: Batch of token ids.

    Returns:
        The scalar loss reported by the model for this step.
    """
    optimizer.zero_grad()
    # GPT2LMHeadModel shifts the labels internally for next-token prediction, so the
    # labels are the unshifted inputs. Slicing them here as well would shift twice
    # and train the model to predict the token two positions ahead.
    outputs = model(input_ids, labels=input_ids)
    loss = outputs.loss  # Cross-entropy loss from model
    loss.backward()
    optimizer.step()
    return loss.item()

# Gradient accumulation implementation
def train_with_accumulation(model, total_batch_size=8, micro_batch_size=2, seq_len=1024, steps=20):
    """Train on random token batches using gradient accumulation.

    Each optimizer step processes ``total_batch_size // micro_batch_size``
    micro-batches, accumulating gradients before a single update. If
    ``total_batch_size`` is not a multiple of ``micro_batch_size``, the remainder
    is dropped by the floor division.

    Args:
        model: Causal LM called as ``model(input_ids, labels=...)`` whose output
            exposes ``loss``; must have at least one parameter.
        total_batch_size: Effective batch size per optimizer step.
        micro_batch_size: Number of sequences per forward/backward pass.
        seq_len: Number of tokens per sequence.
        steps: Number of optimizer steps to run.

    Raises:
        ValueError: If the sizes would yield zero micro-batches per step.
    """
    if micro_batch_size < 1 or total_batch_size < micro_batch_size:
        # Otherwise micro_steps would be 0 and the average-loss print would divide by zero.
        raise ValueError(
            "micro_batch_size must be >= 1 and <= total_batch_size "
            f"(got total_batch_size={total_batch_size}, micro_batch_size={micro_batch_size})"
        )

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    micro_steps = total_batch_size // micro_batch_size
    # Create inputs directly on the model's device rather than assuming CUDA.
    device = next(model.parameters()).device
    # Use the model's own vocabulary size when available; otherwise the GPT-2 default.
    vocab_size = getattr(getattr(model, "config", None), "vocab_size", 50257)

    for step in range(steps):
        optimizer.zero_grad()
        total_loss = 0

        for _ in range(micro_steps):
            input_ids = torch.randint(0, vocab_size, (micro_batch_size, seq_len), device=device)
            # GPT2LMHeadModel shifts labels internally; pass the unshifted inputs
            # as labels so the targets are not shifted twice.
            outputs = model(input_ids, labels=input_ids)
            loss = outputs.loss
            # Scale before backward so the accumulated gradient is the mean over the
            # effective batch, matching a single large-batch step.
            (loss / micro_steps).backward()  # Gradients accumulate
            total_loss += loss.item()  # Report the unscaled per-micro-batch loss

        optimizer.step()  # Update after all micro-batches
        print(f"Step {step}, Average Loss: {total_loss/micro_steps:.4f}")

    print("Finished training with gradient accumulation")

# Main execution
if __name__ == "__main__":
    # Announce the run, then execute the completed accumulation loop with its defaults.
    print("Starting GPT-2 training experiments...", "\nGradient Accumulation Exercise:", sep="\n")
    train_with_accumulation(model)

Starting GPT-2 training experiments...

Gradient Accumulation Exercise:
Step 0, Average Loss: 10.9998
Step 1, Average Loss: 10.9842
Step 2, Average Loss: 10.9824
Step 3, Average Loss: 11.0622
Step 4, Average Loss: 11.0066
Step 5, Average Loss: 11.0169
Step 6, Average Loss: 11.0006
Step 7, Average Loss: 10.9932
Step 8, Average Loss: 11.0006
Step 9, Average Loss: 11.0247
Step 10, Average Loss: 11.0560
Step 11, Average Loss: 11.0714
Step 12, Average Loss: 11.0651
Step 13, Average Loss: 11.0603
Step 14, Average Loss: 11.0382
Step 15, Average Loss: 11.0371
Step 16, Average Loss: 11.0261
Step 17, Average Loss: 11.0205
Step 18, Average Loss: 11.0276
Step 19, Average Loss: 11.0250
Finished training with gradient accumulation
