# Colab Setup

In [None]:
# Install dependencies (Colab environments usually need these)
!pip install datasets transformers tqdm faiss-cpu sentence-transformers adafactor

In [None]:
# Attach Google Drive so the trained artifacts can be stored permanently
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

# Fine-Tune GPT-2 Large on FineWeb-Edu

This notebook fine-tunes the **GPT-2 Large (774M params)** pre-trained model on high-quality educational content from FineWeb-Edu.

**Hardware**: Optimized for **T4 GPU (15GB VRAM)** on Google Colab.

In [None]:
import torch
from datasets import load_dataset, Dataset
from transformers import (
    GPT2LMHeadModel,
    GPT2TokenizerFast,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
import os
from tqdm import tqdm

In [None]:
# Query CUDA availability once, pick the device, then report what was found.
cuda_ready = torch.cuda.is_available()
device = "cuda" if cuda_ready else "cpu"

print(f"Using device: {device}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; convert to GiB for display
    print(f"VRAM: {gpu_props.total_memory / 1024**3:.1f} GB")

## 1. Load Pre-trained GPT-2 Large Model & Tokenizer

We upgrade to **GPT-2 Large (774M params)** for much better reasoning and knowledge capacity.

In [None]:
MODEL_NAME = "gpt2-large"
print(f"Loading pre-trained model: {MODEL_NAME}...")

# Tokenizer: reuse the end-of-sequence token as the padding token.
tokenizer = GPT2TokenizerFast.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Model: gradient checkpointing is switched on so the 774M parameters
# fit in 15GB of VRAM during training.
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)
model.gradient_checkpointing_enable()

# Count every parameter tensor's elements for the summary line below.
num_params = 0
for weight in model.parameters():
    num_params += weight.numel()

print(f"Model loaded: {num_params / 1e6:.1f}M parameters")
print(f"Max context length: {model.config.n_positions}")

## 2. Load & Materialize FineWeb-Edu Dataset

Using **1,000,000 samples** for the larger model's training.

In [None]:
NUM_SAMPLES = 1_000_000

print(f"Loading {NUM_SAMPLES:,} samples from FineWeb-Edu (streaming)...")
dataset = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    split="train",
    streaming=True
)

# Materializing.
# Only the "text" field is consumed by the tokenization and RAG cells, so the
# remaining per-row fields are dropped here instead of being held in RAM for
# every one of the NUM_SAMPLES rows.
subset_iter = dataset.take(NUM_SAMPLES)
data_list = [
    {"text": row["text"]}
    for row in tqdm(subset_iter, total=NUM_SAMPLES, desc="Materializing dataset")
]
print(f"Total samples materialized: {len(data_list):,}")

# Fail with a clear message rather than an IndexError on data_list[0].
if not data_list:
    raise RuntimeError("No samples were materialized from the FineWeb-Edu stream.")
print(f"Sample keys: {data_list[0].keys()}")

## 3. Tokenize the Dataset

In [None]:
MAX_LENGTH = 1024

def tokenize_function(examples):
    """Tokenize a batch of rows, truncating each text to MAX_LENGTH tokens.

    Args:
        examples: a batch mapping as passed by `Dataset.map(batched=True)`;
            only its "text" column is read.

    Returns:
        The tokenizer output for the batch (unpadded, variable-length sequences).

    No padding is applied here on purpose: the DataCollatorForLanguageModeling
    used for training pads every batch to its longest sequence and masks the
    padded label positions. Padding every document to MAX_LENGTH up front would
    only inflate the stored dataset and spend compute on pad tokens.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=MAX_LENGTH
    )

print("Converting to HuggingFace Dataset...")
hf_dataset = Dataset.from_list(data_list)

print("Tokenizing (this may take 15-30 mins for 1M samples)...")
tokenized_dataset = hf_dataset.map(
    tokenize_function,
    batched=True,
    batch_size=1000,
    # drop the raw columns so only the tokenizer outputs remain
    remove_columns=hf_dataset.column_names,
    desc="Tokenizing"
)

print(f"Tokenized dataset: {len(tokenized_dataset):,} samples")
print(f"Max token sequence length: {MAX_LENGTH}")

## 4. Configure Training

Optimized for **T4 15GB VRAM**:
- **Batch size 2** + accumulation 8 = effective batch 16
- **Adafactor** + **FP16** + **Gradient Checkpointing** enabled

In [None]:
output_dir = "out/models/gpt2_large_finetuned"
os.makedirs(output_dir, exist_ok=True)

# mlm=False selects the causal (next-token) language-modeling objective.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# Do not request more DataLoader worker processes than the runtime has CPU
# cores; oversubscribing them makes PyTorch warn and can slow loading down.
num_loader_workers = min(4, os.cpu_count() or 1)

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=2,   # Increased for T4
    gradient_accumulation_steps=8,   # Effective batch 16
    learning_rate=2e-5,
    num_train_epochs=1,
    fp16=True,
    gradient_checkpointing=True,
    optim="adafactor",
    logging_steps=100,
    save_steps=2000,
    save_total_limit=2,              # keep only the two most recent checkpoints
    report_to="none",
    dataloader_num_workers=num_loader_workers
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Samples consumed per optimizer step = per-device batch * accumulation steps.
effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)

print("Training config:")
print(f"  Effective batch size: {effective_batch_size}")
print(f"  Total steps: {len(tokenized_dataset) // effective_batch_size}")
print(f"  Optimizer: {training_args.optim}")
print(f"  FP16: {training_args.fp16}")
print(f"  Gradient checkpointing: {training_args.gradient_checkpointing}")

## 5. Train

In [None]:
from transformers.trainer_utils import get_last_checkpoint

# The run writes a checkpoint to output_dir every `save_steps` steps. If the
# kernel restarted mid-run, continue from the newest checkpoint instead of
# starting over. With no checkpoint present this is a normal fresh run.
# NOTE: clear output_dir first if its checkpoints belong to a different experiment.
last_checkpoint = get_last_checkpoint(output_dir)
if last_checkpoint is None:
    print("Starting fine-tuning on T4 cloud...")
else:
    print(f"Resuming fine-tuning from checkpoint: {last_checkpoint}")
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# Write the fine-tuned model and the tokenizer into the same output directory
print("Saving model and tokenizer...")
for save_artifact in (trainer.save_model, tokenizer.save_pretrained):
    save_artifact(output_dir)
print(f"Model saved to: {output_dir}")
print("Fine-tuning complete!")

## 6. Build RAG Index

In [None]:
import faiss
from sentence_transformers import SentenceTransformer
import numpy as np

RAG_SAMPLES = 100_000
RAG_DIR = "out/rag_index"
CHUNK_CHARS = 500       # width of each passage window, in characters
MIN_CHUNK_CHARS = 50    # windows of this length or shorter are discarded
os.makedirs(RAG_DIR, exist_ok=True)

# Cut each document into consecutive, non-overlapping character windows.
passages = []
for row in tqdm(data_list[:RAG_SAMPLES], desc="Extracting passages"):
    text = row["text"].strip()
    for start in range(0, len(text), CHUNK_CHARS):
        chunk = text[start:start + CHUNK_CHARS].strip()
        if len(chunk) > MIN_CHUNK_CHARS:
            passages.append(chunk)

print(f"Total passages: {len(passages):,}")
# An empty passage list would only fail later with an opaque shape error.
if not passages:
    raise ValueError("No passages were extracted; cannot build the RAG index.")

print("Loading sentence-transformer model...")
embedder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

print("Encoding passages (this may take a few minutes)...")
embeddings = embedder.encode(passages, show_progress_bar=True, batch_size=256, convert_to_numpy=True)
# FAISS operates on contiguous float32 arrays; this is a no-op when already so.
embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
dimension = embeddings.shape[1]

print("Building FAISS index...")
index = faiss.IndexFlatIP(dimension)
# Normalize in place so that inner-product search ranks by cosine similarity.
faiss.normalize_L2(embeddings)
index.add(embeddings)

faiss.write_index(index, os.path.join(RAG_DIR, "faiss_index.bin"))
# Stored as an object array, so loading it back requires np.load(..., allow_pickle=True).
np.save(os.path.join(RAG_DIR, "passages.npy"), np.array(passages, dtype=object))
print("RAG Index Built!")
print(f"RAG index saved to {RAG_DIR}/")
print(f"  Index: {index.ntotal:,} vectors, {dimension}D")
print(f"  Passages: {len(passages):,}")
print("Done!")

## 7. Save to Google Drive
Run this to copy your fine-tuned model and the RAG index to Google Drive, so they persist after the Colab session ends and you can download them locally.

In [None]:
import shutil

drive_path = "/content/drive/MyDrive/fineweb_edu_gpt2_large"
os.makedirs(drive_path, exist_ok=True)

print("Copying model to Google Drive...")
# Each local artifact directory is mirrored into its own sub-folder on Drive;
# dirs_exist_ok lets a re-run copy over an earlier upload.
for local_dir, drive_subdir in ((output_dir, "model"), (RAG_DIR, "rag_index")):
    shutil.copytree(local_dir, os.path.join(drive_path, drive_subdir), dirs_exist_ok=True)
print(f"All files saved to: {drive_path}")