# Build your own GPT

### 1. Assess Compute

In [5]:
import torch, platform

# Print the PyTorch version, CUDA availability, and host CPU/OS details.
print("Torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    # Only query the device name when CUDA is reported as available.
    print("GPU:", torch.cuda.get_device_name(0))
print("CPU:", platform.processor())
print("Platform:", platform.platform())


Torch: 2.10.0+cpu
CUDA available: False
CPU: Intel64 Family 6 Model 78 Stepping 3, GenuineIntel
Platform: Windows-10-10.0.19045-SP0


### 2. Choose and Download Dataset (Project Gutenberg)

In [6]:
from urllib import request

# Plain-text Project Gutenberg books that make up the corpus; they are
# downloaded and concatenated in list order.
BOOK_URLS = [
    "https://www.gutenberg.org/cache/epub/2701/pg2701.txt",  # Moby Dick; Or, The Whale
    "https://www.gutenberg.org/cache/epub/11/pg11.txt",    # Alice's Adventures in Wonderland
]

def download_text(url: str) -> str:
    """Download a Project Gutenberg plain-text book and trim its boilerplate.

    The response body is decoded as UTF-8, with undecodable bytes replaced
    rather than raising. When both Gutenberg markers are present, only the
    span from the ``*** START OF`` marker up to (not including) the following
    ``*** END OF`` marker is returned; the START marker line itself is kept.
    If either marker is missing, the full decoded text is returned unchanged.

    Args:
        url: Location of the book, in any scheme ``urllib.request`` can open.

    Returns:
        The decoded (and, when both markers are found, trimmed) text.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # Close the response deterministically instead of leaking the connection.
    with request.urlopen(url) as response:
        raw = response.read()
    text = raw.decode("utf-8", errors="replace")

    start = text.find("*** START OF")
    # Look for the END marker only after START, so a stray earlier occurrence
    # cannot produce an empty (inverted) slice.
    end = text.find("*** END OF", start) if start != -1 else -1
    if start != -1 and end != -1:
        text = text[start:end]

    return text

# Fetch every book in BOOK_URLS (in order) and merge them into one corpus
# string, with a blank line between consecutive books.
text = "\n\n".join(map(download_text, BOOK_URLS))

# Report the corpus size and preview its first 500 characters.
print("Dataset characters:", len(text))
print(text[:500])


Dataset characters: 1389023
*** START OF THE PROJECT GUTENBERG EBOOK MOBY DICK; OR, THE WHALE ***




MOBY-DICK;

or, THE WHALE.

By Herman Melville



CONTENTS

ETYMOLOGY.

EXTRACTS (Supplied by a Sub-Sub-Librarian).

CHAPTER 1. Loomings.

CHAPTER 2. The Carpet-Bag.

CHAPTER 3. The Spouter-Inn.

CHAPTER 4. The Counterpane.

CHAPTER 5. Breakfast.

CHAPTER 6. The Street.

CHAPTER 7. The Chapel.

CHAPTER 8. The Pulpit.

CHAPTER 9. The Sermon.

CHAPTER 10. A Bosom Friend.

CHAPTER 11. Ni


### 3. Preprocess: vocabulary + encode/decode

In [7]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
chars = sorted(set(text))
vocab_size = len(chars)
print("Vocab size:", vocab_size)

# Lookup tables between characters and their integer ids.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Return the list of token ids for string ``s`` (KeyError on unseen characters)."""
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled by the iterable of token ids ``l``."""
    return "".join(itos[i] for i in l)


# Round-trip check: encoding then decoding must reproduce the input.
test_str = "the princess smiled"
print(encode(test_str)[:40])
print(decode(encode(test_str)))


Vocab size: 103
[74, 62, 59, 2, 70, 72, 63, 68, 57, 59, 73, 73, 2, 73, 67, 63, 66, 59, 58]
the princess smiled


### 4. Tokenize entire dataset + train/val split

In [8]:
import torch

# Encode the whole corpus as a single tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print("Total tokens:", data.numel())

# First 90% of the tokens train the model; the remaining tail validates it.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

print("Train tokens:", train_data.numel())
print("Val tokens:", val_data.numel())


Total tokens: 1389023
Train tokens: 1250120
Val tokens: 138903


### 5. Batching

In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Initial hyperparameters (the same values are assigned again in section 6).
batch_size = 16  # number of sequences drawn per batch by get_batch
block_size = 32  # tokens per sequence drawn by get_batch

def get_batch(split: str):
    """Sample a random batch of input/target windows from one data split.

    ``"train"`` selects ``train_data``; any other value selects ``val_data``.
    Each of the ``batch_size`` rows is a ``block_size``-token window starting
    at a random offset; the matching target row is the same window shifted
    one token to the right.

    Returns:
        ``(x, y)``: two tensors of shape ``(batch_size, block_size)``, both
        moved to ``device``.
    """
    source = val_data
    if split == "train":
        source = train_data

    # The exclusive upper bound keeps the shifted target window inside the data.
    last_start = len(source) - block_size
    starts = torch.randint(last_start, (batch_size,)).tolist()

    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return inputs.to(device), targets.to(device)

# Sanity check: draw one training batch and confirm that inputs and targets
# both come back with shape (batch_size, block_size).
xb, yb = get_batch("train")
print("x batch shape:", xb.shape)
print("y batch shape:", yb.shape)


x batch shape: torch.Size([16, 32])
y batch shape: torch.Size([16, 32])


### 6. Hyperparameters (CPU)

In [10]:
import torch
# Seed PyTorch's global RNG so later random draws are repeatable.
torch.manual_seed(1337)

# CPU-friendly hyperparameters
batch_size = 16  # sequences per batch (read by get_batch)
block_size = 32  # tokens per sequence; also the size of the position-embedding table

max_iters = 3000  # presumably the total number of training steps -- training loop not shown here
eval_interval = 300  # presumably steps between loss evaluations -- confirm in the training loop
learning_rate = 1e-3  # presumably the optimizer step size -- optimizer not shown here
eval_iters = 100  # batches averaged per split by estimate_loss

n_embd = 128  # embedding width used throughout the model
n_head = 4  # attention heads per Block; each head has size n_embd // n_head
n_layer = 4  # number of Blocks stacked in GPTLanguageModel
dropout = 0.0  # probability passed to every nn.Dropout in the model

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)


Device: cpu


### 7. Loss estimation

In [11]:
@torch.no_grad()
def estimate_loss(model):
    """Estimate the mean loss of ``model`` on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and their losses are averaged. Gradients are disabled and the model is in
    evaluation mode for the duration of the call. Afterwards the model is put
    back into whichever mode (train or eval) it was in on entry, even if
    evaluation raises.

    Args:
        model: Module called as ``model(X, Y)`` that returns a pair whose
            second element is the loss.

    Returns:
        Dict with keys ``"train"`` and ``"val"``, each mapping to the mean
        loss over ``eval_iters`` batches as a Python float.
    """
    out = {}
    # Remember the caller's mode so evaluation has no lasting side effect.
    was_training = model.training
    model.eval()
    try:
        for split in ["train", "val"]:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                X, Y = get_batch(split)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean().item()
    finally:
        # Restore the original mode instead of unconditionally enabling training.
        model.train(was_training)
    return out


### 8. Transformer blocks

In [12]:
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    """A single causal self-attention head.

    Projects the input into keys, queries and values of width ``head_size``
    and lets each position attend only to itself and earlier positions.
    """

    def __init__(self, head_size: int):
        super().__init__()
        # Bias-free projections from the embedding width to the head width.
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower-triangular matrix of ones, stored as a buffer (not a parameter).
        causal_mask = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer("tril", causal_mask)

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over ``x`` of shape (B, T, C); return shape (B, T, head_size)."""
        _, seq_len, _ = x.shape
        keys = self.key(x)        # (B, T, head_size)
        queries = self.query(x)   # (B, T, head_size)

        # Scaled dot-product scores between every query and every key.
        scale = keys.shape[-1] ** -0.5
        scores = queries @ keys.transpose(-2, -1) * scale  # (B, T, T)

        # Hide future positions, then normalise each row into attention weights.
        is_future = self.tril[:seq_len, :seq_len] == 0
        scores = scores.masked_fill(is_future, float("-inf"))
        weights = self.dropout(F.softmax(scores, dim=-1))

        values = self.value(x)    # (B, T, head_size)
        return weights @ values   # (B, T, head_size)


class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side and merged by a linear layer."""

    def __init__(self, num_heads: int, head_size: int):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        # Mixes the concatenated head outputs; expects width n_embd.
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply every head to ``x``, concatenate on the last axis, project, drop out."""
        per_head = [head(x) for head in self.heads]
        merged = torch.cat(per_head, dim=-1)  # (B, T, C)
        return self.dropout(self.proj(merged))


class FeedForward(nn.Module):
    """Two-layer MLP on the last dimension, with a 4x wider hidden layer."""

    def __init__(self):
        super().__init__()
        hidden = 4 * n_embd
        layers = [
            nn.Linear(n_embd, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_embd),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through expand -> ReLU -> project -> dropout."""
        return self.net(x)


class Block(nn.Module):
    """Transformer layer: self-attention, then an MLP, each normalised first and added residually."""

    def __init__(self):
        super().__init__()
        # Split the embedding width evenly across the attention heads.
        self.sa = MultiHeadAttention(n_head, n_embd // n_head)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed


### 9. GPT Language Model

In [13]:
class GPTLanguageModel(nn.Module):
    """Transformer language model over integer token ids.

    Token embeddings and learned position embeddings are summed, passed
    through ``n_layer`` ``Block`` modules and a final LayerNorm, and projected
    to ``vocab_size`` logits per position.
    """

    def __init__(self):
        super().__init__()
        # embeddings
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)

        # transformer
        self.blocks = nn.Sequential(*[Block() for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)  # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

        # Re-initialise every Linear/Embedding submodule created above.
        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Draw Linear/Embedding weights from N(0, 0.02) and zero Linear biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Integer token ids of shape (B, T). T must not exceed
                ``block_size``, because positions index an embedding table
                with ``block_size`` rows.
            targets: Optional token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (B, T, vocab_size) and ``loss`` is ``None``. With targets,
            ``logits`` is flattened to (B*T, vocab_size) and ``loss`` is the
            cross-entropy between those logits and the flattened targets.
        """
        # idx: (B, T)
        B, T = idx.shape

        tok_emb = self.token_embedding_table(idx)  # (B, T, C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))  # (T, C)
        x = tok_emb + pos_emb  # (B, T, C); pos_emb broadcasts over the batch

        x = self.blocks(x)     # (B, T, C)
        x = self.ln_f(x)       # (B, T, C)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            # reshape (not view) so non-contiguous targets, e.g. strided
            # slices, are accepted as well.
            targets = targets.reshape(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens: int):
        """Sample ``max_new_tokens`` tokens one at a time and append them to ``idx``.

        Runs with gradient tracking disabled, since sampling never needs a
        backward pass. The module's train/eval mode is left untouched; call
        ``model.eval()`` first if dropout should be inactive while sampling.

        Args:
            idx: Integer token ids of shape (B, T) used as the starting context.
            max_new_tokens: Number of tokens to sample per sequence.

        Returns:
            Tensor of shape (B, T + max_new_tokens): the input followed by the
            sampled tokens.
        """
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]  # crop context to the last block_size tokens
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]        # last time step: (B, vocab_size)
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            idx = torch.cat((idx, idx_next), dim=1)             # (B, T+1)
        return idx


In [14]:
# Build the model, move it to the selected device, and report its size
# in millions of parameters.
model = GPTLanguageModel()
model = model.to(device)
print(sum(p.numel() for p in model.parameters()) / 1e6, "M parameters")


0.822375 M parameters
