In [1]:
# Install required packages optimized for CUDA 12.x
!pip install transformers
!pip install peft
!pip install datasets
!pip install accelerate
!pip install matplotlib
!pip install tqdm



In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.amp import autocast, GradScaler
from torch.utils.data import DataLoader

from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    GPT2Config,
    get_linear_schedule_with_warmup,
    set_seed
)

from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset

import matplotlib.pyplot as plt
import time
import gc
from tqdm.auto import tqdm
import numpy as np

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA Version: {torch.version.cuda}")

# Enable mixed precision
scaler = GradScaler()

# Set random seeds for reproducibility
set_seed(42)

Using device: cuda
GPU: Tesla T4
CUDA Version: 12.6


In [4]:
# Memory-optimized hyperparameters
BATCH_SIZE = 4  # Reduced from 32
GRADIENT_ACCUMULATION_STEPS = 8  # This gives effective batch size of 32
NUM_BATCHES = 50  # Reduced for memory
EPOCHS = 10 # Can be set to a higher value for better results
MAX_SEQUENCE_LENGTH = 64  # Reduced from 128
MAX_GENERATION_LENGTH = 700
LEARNING_RATE = 5e-5
WEIGHT_DECAY = 0.01

GPT2_MODEL_NAME = "gpt2"

# LoRA-specific hyperparameters
LORA_RANK = 4
LORA_ALPHA = 32
LORA_DROPOUT = 0.1

# Set memory optimization FIRST
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'

print(f"Effective batch size: {BATCH_SIZE * GRADIENT_ACCUMULATION_STEPS}")

Effective batch size: 32


In [5]:
# Load Reddit TIFU dataset
try:
    dataset = load_dataset("Fredithefish/Reddit-TIFU", split="train")

    # Map to expected 'documents' field if needed
    if 'documents' not in dataset.column_names:
        def map_to_documents(example):
            return {'documents': example.get('selftext', example.get('title', ''))}
        dataset = dataset.map(map_to_documents)

except:
    # Fallback to custom dataset if download fails
    from datasets import Dataset
    texts = ["TIFU by accidentally sending an embarrassing text to the wrong person."] * (NUM_BATCHES * BATCH_SIZE)
    dataset = Dataset.from_dict({'documents': texts})

print(f"Dataset size: {len(dataset)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tifu_collection.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/619 [00:00<?, ? examples/s]

Map:   0%|          | 0/619 [00:00<?, ? examples/s]

Dataset size: 619


In [6]:
# Examine dataset structure
sample = dataset[0]
print("Sample document:")
print(sample['documents'][:500] + "...")
print("\nSample title:")
print(sample['title'])

Sample document:
TIFU by raising the flag upside down on a military base and causing local farmers to think the base was in distress....

Sample title:
TIFU by raising the flag upside down on a military base and causing local farmers to think the base was in distress.


In [7]:
# Initialize tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(GPT2_MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Preprocess dataset
def tokenize_function(examples):
    # Use only the documents field
    texts = examples['documents']
    return tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=MAX_SEQUENCE_LENGTH,
        return_tensors='pt'
    )

# Adjust sample size based on available data
available_samples = len(dataset)
total_needed = NUM_BATCHES * BATCH_SIZE
actual_samples = min(available_samples, total_needed)

print(f"Dataset has {available_samples} examples")
print(f"Using {actual_samples} examples")

# Take subset and tokenize
small_dataset = dataset.select(range(actual_samples))
tokenized_dataset = small_dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=small_dataset.column_names
)

# Convert to PyTorch dataset
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Create DataLoader
train_dataloader = DataLoader(
    tokenized_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True
)

print(f"Training batches: {len(train_dataloader)}")

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Dataset has 619 examples
Using 200 examples


Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Training batches: 50


In [8]:
class GPUMemoryTracker:
    def __init__(self, target_batches, print_stats=False):
        self.target_batches = target_batches
        self.print_stats = print_stats
        self.memory_usage = []
        self.labels = []

    def _compute_memory_usage(self):
        if torch.cuda.is_available():
            # Convert bytes to GB
            peak_usage = torch.cuda.max_memory_allocated() / (2**30)
            self.memory_usage.append(round(peak_usage, 3))

            if self.print_stats:
                current_usage = torch.cuda.memory_allocated() / (2**30)
                print(f"Current memory: {current_usage:.3f}GB, Peak memory: {peak_usage:.3f}GB")

    def on_epoch_begin(self, epoch):
        self._compute_memory_usage()
        self.labels.append(f"epoch {epoch} start")

    def on_batch_begin(self, batch):
        if batch in self.target_batches:
            self._compute_memory_usage()
            self.labels.append(f"batch {batch}")

    def on_epoch_end(self, epoch):
        self._compute_memory_usage()
        self.labels.append(f"epoch {epoch} end")

In [9]:
def generate_text(model, tokenizer, input_text, max_length=200, device='cuda'):
    start = time.time()

    model.eval()

    # Tokenize input
    input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_p=0.95
        )

    # Decode output
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    print("\nOutput:")
    print(generated_text)

    end = time.time()
    print(f"Total Time Elapsed: {end - start:.2f}s")

In [10]:
def get_optimizer_and_scheduler(model, num_training_steps):
    # Separate parameters for weight decay
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": WEIGHT_DECAY,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
    ]

    optimizer = optim.AdamW(
        optimizer_grouped_parameters,
        lr=LEARNING_RATE,
        eps=1e-6
    )

    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=100,
        num_training_steps=num_training_steps
    )

    return optimizer, scheduler

In [11]:
# Load GPT-2 model
gpt2_model = GPT2LMHeadModel.from_pretrained(GPT2_MODEL_NAME)
gpt2_model = gpt2_model.to(device)

print(f"Model loaded to {device}")
print(f"Total parameters: {sum(p.numel() for p in gpt2_model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in gpt2_model.parameters() if p.requires_grad):,}")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Model loaded to cuda
Total parameters: 124,439,808
Trainable parameters: 124,439,808


In [12]:
# Initialize memory tracker
gpu_memory_tracker = GPUMemoryTracker(
    target_batches=[5, 10, 25, 50, 100, 150, 200, 300, 400, 500],
    print_stats=True,
)

# Calculate total training steps
num_training_steps = len(train_dataloader) * EPOCHS

# Get optimizer and scheduler
optimizer, scheduler = get_optimizer_and_scheduler(gpt2_model, num_training_steps)

print(f"Total training steps: {num_training_steps}")

Total training steps: 500


In [15]:
'''
# COMPLETE GPU MEMORY CLEARANCE CODE BLOCK
# Run this in a new cell to completely clear GPU memory

import gc
import torch
import os

print("Starting GPU memory cleanup...")

# Step 1: Delete all model-related variables
variables_to_delete = [
    'gpt2_model', 'trained_gpt2', 'base_model', 'lora_model', 'loaded_lora_model',
    'optimizer', 'scheduler', 'lora_optimizer', 'lora_scheduler',
    'train_dataloader', 'dataset', 'tokenized_dataset', 'small_dataset',
    'tokenizer', 'scaler', 'gpu_memory_tracker', 'lora_memory_tracker',
    'outputs', 'loss', 'input_ids', 'attention_mask', 'labels'
]

deleted_count = 0
for var_name in variables_to_delete:
    if var_name in locals():
        exec(f"del {var_name}")
        deleted_count += 1
        print(f"   ✓ Deleted {var_name}")
    elif var_name in globals():
        exec(f"del {var_name}")
        deleted_count += 1
        print(f"   ✓ Deleted {var_name} (global)")

print(f"Deleted {deleted_count} variables")

# Step 2: Force garbage collection
print("Running garbage collection...")
for i in range(3):  # Run multiple times for thorough cleanup
    collected = gc.collect()
    print(f"   Cycle {i+1}: Collected {collected} objects")

# Step 3: Clear CUDA cache
if torch.cuda.is_available():
    print("Clearing CUDA cache...")
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

    # Reset CUDA memory stats
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()

    print("Current GPU memory status:")
    allocated = torch.cuda.memory_allocated() / (1024**3)  # GB
    reserved = torch.cuda.memory_reserved() / (1024**3)   # GB
    max_allocated = torch.cuda.max_memory_allocated() / (1024**3)  # GB

    print(f"   Allocated: {allocated:.2f} GB")
    print(f"   Reserved: {reserved:.2f} GB")
    print(f"   Max allocated: {max_allocated:.2f} GB")
    print(f"   Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / (1024**3):.2f} GB")

    # Calculate free memory
    free_memory = (torch.cuda.get_device_properties(0).total_memory / (1024**3)) - reserved
    print(f"Free memory: {free_memory:.2f} GB")

else:
    print("CUDA not available")

# Step 4: Set memory optimization environment variables
print("Setting memory optimization flags...")
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'  # For debugging if needed

print("GPU memory cleanup completed!")
print("\n You can now run your training code with a clean GPU state.")
print("\n Recommended next steps:")
print("   1. Restart your kernel for the cleanest start (optional but recommended)")
print("   2. Use the memory-optimized hyperparameters:")
print("      - BATCH_SIZE = 2 or 4")
print("      - MAX_SEQUENCE_LENGTH = 64")
print("      - Use gradient accumulation")
'''

'\n# COMPLETE GPU MEMORY CLEARANCE CODE BLOCK\n# Run this in a new cell to completely clear GPU memory\n\nimport gc\nimport torch\nimport os\n\nprint("Starting GPU memory cleanup...")\n\n# Step 1: Delete all model-related variables\nvariables_to_delete = [\n    \'gpt2_model\', \'trained_gpt2\', \'base_model\', \'lora_model\', \'loaded_lora_model\',\n    \'optimizer\', \'scheduler\', \'lora_optimizer\', \'lora_scheduler\',\n    \'train_dataloader\', \'dataset\', \'tokenized_dataset\', \'small_dataset\',\n    \'tokenizer\', \'scaler\', \'gpu_memory_tracker\', \'lora_memory_tracker\',\n    \'outputs\', \'loss\', \'input_ids\', \'attention_mask\', \'labels\'\n]\n\ndeleted_count = 0\nfor var_name in variables_to_delete:\n    if var_name in locals():\n        exec(f"del {var_name}")\n        deleted_count += 1\n        print(f"   ✓ Deleted {var_name}")\n    elif var_name in globals():\n        exec(f"del {var_name}")\n        deleted_count += 1\n        print(f"   ✓ Deleted {var_name} (globa

In [13]:
def train_model_memory_optimized(model, dataloader, optimizer, scheduler, memory_tracker, epochs=1):
    model.train()

    # Enable gradient checkpointing to save memory
    if hasattr(model, 'gradient_checkpointing_enable'):
        model.gradient_checkpointing_enable()

    for epoch in range(epochs):
        memory_tracker.on_epoch_begin(epoch)
        epoch_loss = 0

        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")

        # Initialize gradient accumulation
        optimizer.zero_grad()

        for batch_idx, batch in enumerate(progress_bar):
            memory_tracker.on_batch_begin(batch_idx)

            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = input_ids.clone()

            # Forward pass with mixed precision
            with autocast('cuda'):
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    labels=labels
                )
                loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS  # Scale loss

            # Backward pass
            scaler.scale(loss).backward()

            # Only step optimizer every GRADIENT_ACCUMULATION_STEPS
            if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0 or (batch_idx + 1) == len(dataloader):
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            epoch_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
            progress_bar.set_postfix({'loss': loss.item() * GRADIENT_ACCUMULATION_STEPS})

            # Clear cache periodically
            if batch_idx % 5 == 0:  # More frequent clearing
                torch.cuda.empty_cache()

        memory_tracker.on_epoch_end(epoch)
        avg_epoch_loss = epoch_loss / len(dataloader)
        print(f"Epoch {epoch+1} average loss: {avg_epoch_loss:.4f}")

    return model, memory_tracker.memory_usage


In [16]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup
import torch.optim as optim

# 1. Load GPT-2 model & tokenizer
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# 2. Define optimizer & scheduler
optimizer = optim.AdamW(
    [p for p in gpt2_model.parameters() if p.requires_grad],
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)
num_training_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=num_training_steps)

# 3. Train and then save
print("Starting training...")
trained_gpt2, _ = train_model_memory_optimized(
    gpt2_model,
    train_dataloader,
    optimizer,
    scheduler,
    gpu_memory_tracker,
    EPOCHS
)
print("Training completed!")

def save_full_gpt2_model(model, tokenizer, save_path="./my-fine-tuned-gpt2"):
    import os
    os.makedirs(save_path, exist_ok=True)
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)
    print(f"Saved to {save_path}")

save_full_gpt2_model(trained_gpt2, tokenizer)

print("\nTesting:")
generate_text(trained_gpt2, tokenizer, "I like basketball", max_length=MAX_GENERATION_LENGTH, device=device)

Starting training...
Current memory: 0.476GB, Peak memory: 0.952GB


Epoch 1/10:   0%|          | 0/50 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Current memory: 0.985GB, Peak memory: 1.354GB
Current memory: 0.985GB, Peak memory: 1.354GB
Current memory: 0.986GB, Peak memory: 1.358GB
Current memory: 1.447GB, Peak memory: 2.380GB
Epoch 1 average loss: 7.9074
Current memory: 1.447GB, Peak memory: 2.380GB


Epoch 2/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.380GB
Current memory: 1.915GB, Peak memory: 2.380GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 2 average loss: 6.8101
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 3/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 3 average loss: 3.9802
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 4/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 4 average loss: 1.8619
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 5/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 5 average loss: 1.1371
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 6/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 6 average loss: 0.8726
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 7/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 7 average loss: 0.7611
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 8/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 8 average loss: 0.7096
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 9/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 9 average loss: 0.6581
Current memory: 1.447GB, Peak memory: 2.383GB


Epoch 10/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.915GB, Peak memory: 2.383GB
Current memory: 1.916GB, Peak memory: 2.383GB
Current memory: 1.447GB, Peak memory: 2.383GB
Epoch 10 average loss: 0.6199
Training completed!


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Saved to ./my-fine-tuned-gpt2

Testing:

Output:
I like basketball and I think I'm getting a little too caught up in my own life."
Total Time Elapsed: 0.76s


In [18]:
print("Generating text with base GPT-2...")
generate_text(gpt2_model, tokenizer, "I like basketball", max_length=MAX_GENERATION_LENGTH, device=device)
#generate_text(gpt2_model, tokenizer, "That Italian restaurant is", max_length=MAX_GENERATION_LENGTH, device=device)

Generating text with base GPT-2...

Output:
I like basketball with my mom.
Total Time Elapsed: 0.05s


In [22]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
# Load from disk
model = GPT2LMHeadModel.from_pretrained("./my-fine-tuned-gpt2").to(device)
tokenizer = GPT2Tokenizer.from_pretrained("./my-fine-tuned-gpt2")
tokenizer.pad_token = tokenizer.eos_token  # if needed
# Then generate
print("Generating text with loaded fine-tuned GPT-2…")
generate_text(model, tokenizer, "I like Chocolate", max_length=MAX_GENERATION_LENGTH, device=device)

Generating text with loaded fine-tuned GPT-2…

Output:
I like Chocolate with my teeth
Total Time Elapsed: 0.04s


In [23]:
# Clean up memory from previous model
del trained_gpt2
del optimizer
del scheduler
gc.collect()
torch.cuda.empty_cache()

print("Memory cleared. Loading fresh GPT-2 for LoRA...")

Memory cleared. Loading fresh GPT-2 for LoRA...


In [24]:
# Load a fresh GPT-2 model for LoRA
base_model = GPT2LMHeadModel.from_pretrained(GPT2_MODEL_NAME)

# Configure LoRA
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=LORA_RANK,  # rank
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=["c_attn", "c_proj"],  # target attention modules in GPT-2
)

# Create LoRA model
lora_model = get_peft_model(base_model, lora_config)
lora_model = lora_model.to(device)

# Print trainable parameters
lora_model.print_trainable_parameters()



trainable params: 405,504 || all params: 124,845,312 || trainable%: 0.3248


In [25]:
# Test forward pass
test_input = tokenizer("LoRA is very useful for quick LLM finetuning", return_tensors='pt')
test_input = {k: v.to(device) for k, v in test_input.items()}

with torch.no_grad():
    outputs = lora_model(**test_input)

print(f"Forward pass successful! Output shape: {outputs.logits.shape}")

Forward pass successful! Output shape: torch.Size([1, 12, 50257])


In [26]:
# Initialize memory tracker for LoRA
lora_memory_tracker = GPUMemoryTracker(
    target_batches=[5, 10, 25, 50, 100, 150, 200, 300, 400, 500],
    print_stats=True,
)

print("Creating LoRA model...")

# STEP 1: Load base model
base_model = GPT2LMHeadModel.from_pretrained(GPT2_MODEL_NAME)
base_model = base_model.to(device)

# STEP 2: CRITICAL FIX - Enable input gradients BEFORE applying LoRA
if hasattr(base_model, "enable_input_require_grads"):
    base_model.enable_input_require_grads()
else:
    def make_inputs_require_grad(module, input, output):
        output.requires_grad_(True)
    base_model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

print("Input gradients enabled")

# STEP 3: Create LoRA configuration
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=["c_attn", "c_proj", "c_fc"],
    bias="none",
)

# STEP 4: Apply LoRA to the model
lora_model = get_peft_model(base_model, lora_config)
lora_model.train()

# STEP 5: Verify trainable parameters
lora_model.print_trainable_parameters()

print(f"LoRA Model created successfully!")
print(f"Total parameters: {sum(p.numel() for p in lora_model.parameters()):,}")
print(f"Trainable parameters: {sum(p.numel() for p in lora_model.parameters() if p.requires_grad):,}")

# Get optimizer and scheduler for LoRA model
lora_optimizer, lora_scheduler = get_optimizer_and_scheduler(lora_model, num_training_steps)

print("Starting LoRA GPT-2 training...")
trained_lora_model, lora_memory_usage = train_model_memory_optimized(
    lora_model, train_dataloader, lora_optimizer, lora_scheduler, lora_memory_tracker, EPOCHS
)
print("LoRA GPT-2 training completed!")

Creating LoRA model...
Input gradients enabled
trainable params: 589,824 || all params: 125,029,632 || trainable%: 0.4717
LoRA Model created successfully!
Total parameters: 125,029,632
Trainable parameters: 589,824
Starting LoRA GPT-2 training...
Current memory: 1.451GB, Peak memory: 2.383GB


Epoch 1/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.479GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 1 average loss: 7.8713
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 2/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 2 average loss: 7.8622
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 3/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 3 average loss: 7.8176
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 4/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 4 average loss: 7.6686
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 5/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 5 average loss: 7.3795
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 6/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 6 average loss: 7.0520
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 7/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 7 average loss: 6.5195
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 8/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 8 average loss: 5.6640
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 9/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 9 average loss: 4.5784
Current memory: 1.481GB, Peak memory: 2.383GB


Epoch 10/10:   0%|          | 0/50 [00:00<?, ?it/s]

Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.483GB, Peak memory: 2.383GB
Current memory: 1.481GB, Peak memory: 2.383GB
Epoch 10 average loss: 3.4724
LoRA GPT-2 training completed!


In [28]:
print("Generating text with LoRA fine-tuned GPT-2...")
generate_text(trained_lora_model, tokenizer, "I like basketball", max_length=MAX_GENERATION_LENGTH, device=device)
generate_text(trained_lora_model, tokenizer, "That Italian restaurant is", max_length=MAX_GENERATION_LENGTH, device=device)

Generating text with LoRA fine-tuned GPT-2...

Output:
I like basketball. I like to get to the rim, run, and then go after the ball. I like the way they run and the way they pass. That's the way I want to play. I'm not trying to get too much of anything out of it.

"I'm trying to get to the basket. I'm trying to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and I want to get to the rim and 

In [29]:
# Save LoRA adapters
trained_lora_model.save_pretrained("./gpt2-lora-reddit")
print("LoRA adapters saved to ./gpt2-lora-reddit")

# Calculate adapter size
import os
adapter_size = sum(
    os.path.getsize(os.path.join("./gpt2-lora-reddit", f))
    for f in os.listdir("./gpt2-lora-reddit")
    if os.path.isfile(os.path.join("./gpt2-lora-reddit", f))
) / (1024 * 1024)  # Convert to MB

print(f"LoRA adapter size: {adapter_size:.2f} MB")
print("Compare this to the full GPT-2 model which is ~500MB!")

LoRA adapters saved to ./gpt2-lora-reddit
LoRA adapter size: 2.27 MB
Compare this to the full GPT-2 model which is ~500MB!


In [30]:
from peft import PeftModel

# Load base model
base_model_for_inference = GPT2LMHeadModel.from_pretrained(GPT2_MODEL_NAME)

# Load LoRA adapters
loaded_lora_model = PeftModel.from_pretrained(base_model_for_inference, "./gpt2-lora-reddit")
loaded_lora_model = loaded_lora_model.to(device)

print("LoRA model loaded successfully!")

# Test generation
generate_text(loaded_lora_model, tokenizer, "Today I learned that", max_length=150, device=device)

LoRA model loaded successfully!

Output:
Today I learned that I had an illness in the early 90's, and I was just trying to keep from getting sick. I had some kind of anxiety, so I was trying to get help. It was really hard to get help, and I was struggling to make ends meet, so I went to the hospital. I felt really bad for myself, but I was trying to find other ways to get better, and that's when my doctor came in. I was in a very strong mood, and he said, "You're not going to make any more progress, you're going to die."

The doctor said, "I have the disease and I'm going to die. I want to die." I said, "Well,
Total Time Elapsed: 2.68s
