# Character-level GPT-2 Training Notebook
This notebook combines the model definition, the dataset definition, and the training loop for a character-level GPT-2 model. It trains on a GPU when one is available and falls back to the CPU otherwise.

In [None]:
# Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from pathlib import Path
import tqdm

In [None]:
# Model Definition (from model.py)
class SelfAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    The embedding dimension is split evenly across ``heads``; each head gets
    its own slice of size ``head_dim = embed_size // heads``. The same
    per-head linear projections (no bias) are shared by all heads.

    Args:
        embed_size: Size of the last dimension of the inputs and the output.
        heads: Number of attention heads; must divide ``embed_size``.
    """

    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        assert self.head_dim * heads == embed_size, "Embed size must be divisible by heads"
        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(embed_size, embed_size)

    def forward(self, values, keys, query, mask):
        """Attend from ``query`` over ``keys``/``values``.

        Args:
            values: Tensor reshaped here to ``(N, value_len, heads, head_dim)``.
            keys: Tensor reshaped here to ``(N, key_len, heads, head_dim)``.
            query: Tensor reshaped here to ``(N, query_len, heads, head_dim)``.
            mask: Optional tensor broadcastable to
                ``(N, heads, query_len, key_len)``. Positions where the mask
                equals 0 are excluded from attention. ``None`` disables
                masking.

        Returns:
            Tensor of shape ``(N, query_len, embed_size)``.
        """
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # Split the embedding into heads, then apply the learned per-head
        # projections. Without this step the three Linear layers created in
        # __init__ would never receive gradients.
        values = self.values(values.reshape(N, value_len, self.heads, self.head_dim))
        keys = self.keys(keys.reshape(N, key_len, self.heads, self.head_dim))
        queries = self.queries(query.reshape(N, query_len, self.heads, self.head_dim))

        # Dot products are taken over head_dim, so that is the dimension the
        # scores are normalised by (sqrt(d_k)).
        energy = torch.einsum("nqhd,nkhd->nhqk", queries, keys) / (self.head_dim ** 0.5)

        if mask is not None:
            # Use the most negative value the dtype can hold rather than a
            # hard-coded constant, so the fill also works in half precision.
            energy = energy.masked_fill(mask == 0, torch.finfo(energy.dtype).min)

        attention = torch.softmax(energy, dim=3)
        out = torch.einsum("nhql,nlhd->nqhd", attention, values)
        # Merge the heads back into a single embedding dimension.
        out = out.reshape(N, query_len, self.embed_size)
        return self.fc_out(out)

class TransformerBlock(nn.Module):
    """One transformer layer: attention and a feed-forward network.

    Each sub-layer is followed by a residual addition, a LayerNorm and
    dropout, in that order.

    Args:
        embed_size: Width of the inputs and outputs.
        heads: Number of attention heads passed to ``SelfAttention``.
        dropout: Dropout probability used after both LayerNorms.
        forward_expansion: Multiplier giving the hidden width of the
            feed-forward network (``forward_expansion * embed_size``).
    """

    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super().__init__()
        hidden_size = forward_expansion * embed_size
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.ffn = nn.Sequential(
            nn.Linear(embed_size, hidden_size),
            nn.ReLU(),
            nn.Linear(hidden_size, embed_size),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        """Run attention then the feed-forward network.

        The residual connection of the attention sub-layer is taken from
        ``query``.
        """
        attended = self.attention(value, key, query, mask)
        normed_attention = self.norm1(attended + query)
        x = self.dropout(normed_attention)

        normed_ffn = self.norm2(self.ffn(x) + x)
        return self.dropout(normed_ffn)

class GPT2(nn.Module):
    """Small GPT-2 style model: embeddings, transformer blocks, output head.

    Args:
        vocab_size: Number of distinct token ids; also the size of the
            output logits.
        embed_size: Width of the token/position embeddings and of every block.
        num_layers: Number of ``TransformerBlock`` layers.
        heads: Number of attention heads per block.
        dropout: Dropout probability used on the embeddings and in each block.
        forward_expansion: Feed-forward width multiplier for each block.
        max_length: Number of rows in the position-embedding table, i.e. the
            longest sequence the model can process.
    """

    def __init__(self, vocab_size, embed_size=128, num_layers=2, heads=2, dropout=0.1, forward_expansion=2, max_length=64):
        super(GPT2, self).__init__()
        self.embed_size = embed_size
        self.token_embedding = nn.Embedding(vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList([
            TransformerBlock(embed_size, heads, dropout, forward_expansion)
            for _ in range(num_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(embed_size, vocab_size)
        self.max_length = max_length

    def forward(self, x, mask=None):
        """Compute logits for every position of ``x``.

        Args:
            x: Integer tensor of token ids with shape ``(N, seq_length)``.
            mask: Optional attention mask forwarded unchanged to every block.
                With the default ``None`` no masking is applied, so every
                position attends to every other position.

        Returns:
            Tensor of shape ``(N, seq_length, vocab_size)``.

        Raises:
            ValueError: If ``seq_length`` exceeds ``max_length``.
        """
        N, seq_length = x.shape
        if seq_length > self.max_length:
            # Fail early with a readable message; otherwise the position
            # embedding lookup fails with an opaque index error.
            raise ValueError(
                f"Sequence length {seq_length} exceeds max_length {self.max_length}"
            )

        # Build the position ids directly on the input's device to avoid a
        # host-to-device copy on every forward pass.
        positions = torch.arange(seq_length, device=x.device)
        token_embeds = self.token_embedding(x)
        # (seq_length, embed_size) broadcasts across the batch dimension.
        pos_embeds = self.position_embedding(positions)
        x = self.dropout(token_embeds + pos_embeds)
        for layer in self.layers:
            # Self-attention: the same tensor is used as value, key and query.
            x = layer(x, x, x, mask)
        return self.fc_out(x)

In [None]:
# Dataset Definition (from train.py)
class TextDataset(Dataset):
    """Character-level next-character prediction dataset.

    Each item is a window of ``seq_length`` consecutive character ids and the
    id of the single character that follows the window.

    Args:
        text: The full training text.
        seq_length: Number of characters in each input window.
        pad_token: Extra vocabulary entry added alongside the characters of
            ``text``; its id is stored in ``pad_idx``.
    """

    def __init__(self, text, seq_length=64, pad_token='[PAD]'):
        self.text = text
        self.seq_length = seq_length
        # Sorted so that the character <-> id mapping is deterministic.
        self.vocab = sorted(set(text) | {pad_token})
        self.vocab_size = len(self.vocab)
        self.char_to_idx = {ch: i for i, ch in enumerate(self.vocab)}
        self.idx_to_char = {i: ch for i, ch in enumerate(self.vocab)}
        self.pad_token = pad_token
        self.pad_idx = self.char_to_idx[pad_token]
        self.data = [self.char_to_idx[ch] for ch in text]

    def __len__(self):
        """Return the number of (window, next-character) pairs.

        Clamped at 0 so a text shorter than ``seq_length`` yields an empty
        dataset instead of making ``len()`` raise on a negative value.
        """
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return ``(x, y)`` for the window starting at ``idx``.

        Returns:
            ``x``: long tensor of shape ``(seq_length,)`` with the window ids.
            ``y``: scalar long tensor with the id of the following character.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            # Support Python-style negative indexing; without this the slice
            # below would silently return a wrong (often empty) window.
            idx += size
        if not 0 <= idx < size:
            raise IndexError(f"index out of range for dataset of size {size}")

        end = idx + self.seq_length
        x = self.data[idx:end]
        y = self.data[end]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

In [None]:
# Load Data
# Place your data.txt file in the same directory as this notebook or update the path below.
data_path = Path("data.txt")
# Decode explicitly as UTF-8: the character vocabulary is built from the
# decoded text, so relying on the platform's default encoding would make the
# vocabulary (and any saved checkpoint) differ between machines.
with open(data_path, "r", encoding="utf-8") as f:
    data = f.read()

In [None]:
# Training Setup
# Hyperparameters and run configuration.
seq_length, batch_size, num_epochs = 64, 32, 20
lr = 3e-4
model_save_path = "gpt2_char_model.pt"
# Train on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Data pipeline: shuffled batches, dropping the final incomplete batch.
dataset = TextDataset(data, seq_length=seq_length)
train_loader = DataLoader(
    dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

# Model, optimizer and loss. The position table is sized to the window
# length, so the model handles sequences of up to seq_length characters.
model = GPT2(
    vocab_size=dataset.vocab_size,
    embed_size=128,
    num_layers=2,
    heads=2,
    dropout=0.1,
    forward_expansion=2,
    max_length=seq_length,
).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

In [None]:
# Training Loop
for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0.0
    # Count the samples actually processed: the loader may yield fewer
    # samples than len(dataset) (e.g. when it drops an incomplete final
    # batch), so dividing by len(dataset) would under-report the loss.
    samples_seen = 0
    pbar = tqdm.tqdm(train_loader, desc=f"Epoch {epoch}")
    for x, y in pbar:
        x = x.to(device)
        y = y.to(device)
        optimizer.zero_grad()
        logits = model(x)
        # Only the last position is trained: it predicts the character that
        # follows the input window.
        logits = logits[:, -1, :]
        loss = criterion(logits, y)
        loss.backward()
        optimizer.step()
        # Read the scalar once; each .item() call forces a device sync.
        batch_loss = loss.item()
        total_loss += batch_loss * x.size(0)
        samples_seen += x.size(0)
        pbar.set_postfix({"loss": batch_loss})
    # max(..., 1) guards against an empty loader.
    avg_loss = total_loss / max(samples_seen, 1)
    print(f"Epoch {epoch} - Loss: {avg_loss:.4f}")
    # Checkpoint after every epoch; the same file is overwritten each time.
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': epoch,
        'vocab': dataset.vocab,
        'char_to_idx': dataset.char_to_idx,
        'idx_to_char': dataset.idx_to_char
    }, model_save_path)
    # Evaluation every 10 epochs.
    # NOTE: this re-uses train_loader, so it reports the training-set loss
    # in eval mode (dropout disabled), not a held-out validation loss.
    if epoch % 10 == 0:
        model.eval()
        eval_loss = 0.0
        eval_samples = 0
        with torch.no_grad():
            for x, y in tqdm.tqdm(train_loader, desc="Evaluating"):
                x = x.to(device)
                y = y.to(device)
                logits = model(x)
                logits = logits[:, -1, :]
                loss = criterion(logits, y)
                eval_loss += loss.item() * x.size(0)
                eval_samples += x.size(0)
        eval_loss /= max(eval_samples, 1)
        print(f"Evaluation Loss after {epoch} epochs: {eval_loss:.4f}")

# Notes
- Make sure your `data.txt` file is present in the same directory as this notebook.
- Adjust hyperparameters as needed for your GPU platform.
- The model and optimizer state, together with the vocabulary mappings, are saved to `gpt2_char_model.pt` after each epoch, overwriting the previous checkpoint.