In this notebook, we will be working through [Building makemore Part 2: Training Diagnostics](https://www.youtube.com/watch?v=P6sfmUTpUmc) by Andrej Karpathy. This is the fourth video in the "Neural Networks: Zero to Hero" series and covers part 2 of it.

In this video we will look at diagnostic tools that we can use to see if our models are training optimally.


# Section 1: "Pytorchify" Our Model

In the previous notebook, we had a very haphazard way of creating the model. Let's first package those pieces into layers, similar to how PyTorch does it.


In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
%matplotlib inline

In [5]:
# lets copy over some stuff from the previous notebook

# Shared module-level settings used by the cells below
DATASET_PATH = '../names.txt'  # file read line by line in get_dataset()
SPECIAL_TOKEN = "."  # token appended to every word in build_dataset(); first entry of the vocabulary
g = torch.Generator().manual_seed(2147483647)  # seeded RNG so random initialisation is reproducible
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when torch reports one

def get_dataset():
    """Return every line of DATASET_PATH with surrounding whitespace stripped."""
    with open(DATASET_PATH, 'r') as handle:
        # iterating the handle yields the same lines as readlines()
        return [line.strip() for line in handle]

# Read the raw names from disk
words = get_dataset()
print(f"{len(words)} names loaded")
print(f"Examples: {words[:8]}")

# Character <-> index lookup tables, identical to lesson 2.
# SPECIAL_TOKEN ('.') is the start/end marker at index 0; the remaining
# characters follow in sorted order (a=1, b=2, ..., z=26 for this dataset).
all_characters = [SPECIAL_TOKEN, *sorted(set(''.join(words)))]
stoi = {ch: idx for idx, ch in enumerate(all_characters)}
itos = {idx: ch for ch, idx in stoi.items()}
vocab_size = len(itos)
print(f"Vocabulary size: {vocab_size}")
print(f"Mappings: {itos}")
print(f"Running on: {device}")

32033 names loaded
Examples: ['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
Vocabulary size: 27
Mappings: {0: '.', 1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z'}
Running on: cuda


In [6]:
def build_dataset(words, block_size=3):
    """
    Turn a list of words into next-character training pairs.

    Each word is terminated with SPECIAL_TOKEN and scanned left to right. At
    every position the previous `block_size` character indices (left-padded
    with index 0) form one row of X, and the index of the current character is
    the matching entry of Y.

    Returns:
        X: tensor of context windows, one row of `block_size` indices per example
        Y: tensor of target character indices, one per example

    Used once per split (train / validation / test).
    """
    contexts, targets = [], []
    for word in words:
        window = [0] * block_size  # every word starts from an all-padding context
        for char in word + SPECIAL_TOKEN:
            target = stoi[char]
            contexts.append(window)
            targets.append(target)
            # slide the window: drop the oldest index, append the newest
            window = [*window[1:], target]
    return torch.tensor(contexts), torch.tensor(targets)

import random

# Shuffle once with a fixed seed so the split below is reproducible
random.seed(42)
random.shuffle(words)

# Word-level split boundaries: 80% train / 10% validation / 10% test
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

# Build each split with a 3-character context and move it straight onto `device`
Xtr, Ytr = (t.to(device) for t in build_dataset(words[:n1], block_size=3))      # training
Xdev, Ydev = (t.to(device) for t in build_dataset(words[n1:n2], block_size=3))  # validation ("dev" set)
Xte, Yte = (t.to(device) for t in build_dataset(words[n2:], block_size=3))      # test

# Report how many (context, target) examples each split produced
print(f"Training:   {Xtr.shape[0]:>7,} examples from {n1:,} names")
print(f"Validation: {Xdev.shape[0]:>7,} examples from {n2-n1:,} names")
print(f"Test:       {Xte.shape[0]:>7,} examples from {len(words)-n2:,} names")

Training:   182,625 examples from 25,626 names
Validation:  22,655 examples from 3,203 names
Test:        22,866 examples from 3,204 names


In [7]:
# Now this is the code we had at the end of the previous notebook

def get_params(embed_dim, block_size, n_hidden):
    """
    Create the tensors for the single-hidden-layer MLP with batch normalisation.

    All random tensors are drawn, in a fixed order, from a locally seeded
    generator, so repeated calls with the same arguments give the same values.

    Returns:
        (bnstd_running, bnmean_running, parameters) where `parameters` is
        [C, W1, b1, W2, b2, bngain, bnbias], all with requires_grad=True.
        The two running-statistics tensors are not in `parameters` and do not
        have requires_grad set.
    """
    fan_in = embed_dim * block_size
    w1_scale = (5./3) / fan_in ** 0.5   # Kaiming scaling with the tanh gain
    w2_scale = 1 / n_hidden ** 0.5      # Xavier-style scaling (gain=1, no activation after)

    rng = torch.Generator().manual_seed(2147483647)
    # NOTE: the draw order below determines the generated values
    C = torch.randn((vocab_size, embed_dim), generator=rng)
    W1 = torch.randn((fan_in, n_hidden), generator=rng) * w1_scale
    b1 = torch.randn(n_hidden, generator=rng) * 0.01      # small, near zero
    W2 = torch.randn((n_hidden, vocab_size), generator=rng) * w2_scale
    b2 = torch.randn(vocab_size, generator=rng) * 0.01    # small, near zero

    # batch-norm scale and shift, trained together with the weights
    bngain = torch.ones((1, n_hidden))
    bnbias = torch.zeros((1, n_hidden))

    # running statistics: tracked on the side, never trained by gradient descent
    bnstd_running = torch.ones((1, n_hidden))
    bnmean_running = torch.zeros((1, n_hidden))

    parameters = [C, W1, b1, W2, b2, bngain, bnbias]
    for tensor in parameters:
        tensor.requires_grad_(True)

    return bnstd_running, bnmean_running, parameters

def train_model(params, n_steps=200000, batch_size=32):
    """
    Train the single-hidden-layer MLP with batch normalisation by minibatch SGD.

    Minibatches are sampled from the module-level Xtr / Ytr tensors.

    Args:
        params: (bnstd_running, bnmean_running, parameters) in the layout
            produced by get_params
        n_steps: number of SGD steps to run
        batch_size: number of examples sampled (with replacement) per step

    Returns:
        stepi, lossi, loglossi: per-step index, loss, and log10(loss)
        hidden_layer_preactivation, h, logits: tensors from the last step's forward pass
        bnmean_running, bnstd_running: the updated running batch-norm statistics
    """
    bnstd_running, bnmean_running, parameters = params
    C, W1, b1, W2, b2, bngain, bnbias = parameters

    stepi, lossi, loglossi = [], [], []

    pbar = tqdm(range(n_steps), desc="Training")
    for step in pbar:
        # ---- sample a random minibatch ----
        batch_idx = torch.randint(0, Xtr.shape[0], (batch_size,))
        batch_x = Xtr[batch_idx]
        batch_y = Ytr[batch_idx]

        # ---- forward pass ----
        emb = C[batch_x]                          # (batch, block_size, embed_dim)
        flat = emb.view(emb.shape[0], -1)         # (batch, block_size * embed_dim)
        raw_preact = flat @ W1 + b1               # (batch, n_hidden)

        # batch statistics, taken across the examples in the minibatch
        batch_mean = raw_preact.mean(axis=0, keepdim=True)   # (1, n_hidden)
        batch_std = raw_preact.std(axis=0, keepdim=True)     # (1, n_hidden)

        # nudge the running statistics slightly towards the current batch statistics
        with torch.no_grad():
            bnmean_running = 0.999 * bnmean_running + 0.001 * batch_mean
            bnstd_running = 0.999 * bnstd_running + 0.001 * batch_std

        # standardise to unit gaussian, then apply the learned scale and shift
        normalised = (raw_preact - batch_mean) / batch_std
        hidden_layer_preactivation = normalised*bngain + bnbias

        h = torch.tanh(hidden_layer_preactivation)           # (batch, n_hidden)
        logits = h @ W2 + b2                                 # (batch, vocab_size)
        loss = F.cross_entropy(logits, batch_y)

        # ---- backward pass ----
        for p in parameters:
            p.grad = None
        loss.backward()

        # ---- SGD update with step decay: 0.1 for the first 100K steps, then 0.01 ----
        if step < 100000:
            lr = 0.1
        else:
            lr = 0.01
        for p in parameters:
            p.data += -lr * p.grad

        # ---- bookkeeping ----
        stepi.append(step)
        lossi.append(loss.item())
        loglossi.append(loss.log10().item())

        if step % 10 == 0:
            pbar.set_postfix(loss=f"{loss.data:.4f}")

    return stepi, lossi, loglossi, hidden_layer_preactivation, h, logits, bnmean_running, bnstd_running

# now we can use these running mean and std during eval
@torch.no_grad()
def eval_loss(X, Y, params, embed_dim, block_size):
    """
    Compute the cross-entropy loss of the batch-normalised MLP on (X, Y).

    The running mean/std from `params` are used for normalisation instead of
    batch statistics. `embed_dim` and `block_size` are accepted but not used.

    Args:
        X: tensor of context-window indices
        Y: tensor of target indices
        params: (bnstd_running, bnmean_running, parameters) with
            parameters = [C, W1, b1, W2, b2, bngain, bnbias]

    Returns:
        The loss as a Python float.
    """
    running_std, running_mean, weights = params
    C, W1, b1, W2, b2, bngain, bnbias = weights

    embedded = C[X]
    flat = embedded.view(embedded.shape[0], -1)

    # linear -> normalise with running statistics -> scale and shift
    preact = flat @ W1 + b1
    preact = (preact - running_mean) / running_std
    preact = preact*bngain + bnbias

    logits = torch.tanh(preact) @ W2 + b2
    return F.cross_entropy(logits, Y).item()

# bnstd_running, bnmean_running, parameters = get_params(embed_dim, block_size, n_hidden)

# stepi, lossi, loglossi, hidden_layer_preactivation, h, logits, bnmean_running, bnstd_running = \
#     train_model([bnstd_running, bnmean_running, parameters])

# print(f"Final Training loss: {eval_loss(Xtr, Ytr,[bnstd_running, bnmean_running, parameters], embed_dim, block_size)}, \
#     Validation Loss: {eval_loss(Xdev, Ydev, [bnstd_running, bnmean_running, parameters], embed_dim, block_size)}")

In [11]:
# Create Linear Layer

class Linear:
    """
    Fully connected layer computing ``x @ weights`` plus an optional bias.

    Weights are drawn from the module-level generator `g` and divided by
    sqrt(num_input_features) (fan-in scaling). The bias, when enabled, starts
    at zero. The most recent output is kept in `self.out` so it can be
    inspected after the forward pass.

    Args:
        num_input_features: size of the last dimension of the input
        num_output_features: size of the last dimension of the output
        bias: when False, no bias tensor is created and `self.bias` is None
        device: device the layer's tensors are placed on
    """

    def __init__(self, num_input_features, num_output_features, bias=True, device='cpu'):

        initialization_factor = num_input_features ** 0.5 # fan-in (Kaiming-style) scaling

        # Sample on the generator's own device, then move to the target device:
        # torch.randn raises when the generator and the requested device differ
        # (e.g. a CPU generator combined with device='cuda').
        self.weights = (
            torch.randn((num_input_features, num_output_features), generator=g, device=g.device)
            / initialization_factor
        ).to(device)
        self.bias = torch.zeros(num_output_features, device=device) if bias else None

    def __call__(self, x):
        """Apply the layer to `x` and return (and cache) the result."""
        self.out = x @ self.weights # kept on self so that we can access it later for diagnostics
        # Compare against None explicitly: the truth value of a tensor with more
        # than one element is ambiguous and raises a RuntimeError.
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        """Return the layer's tensors: [weights, bias], or [weights] when there is no bias."""
        if self.bias is None:
            return [self.weights]
        return [self.weights, self.bias]

In [12]:
# Create batchnorm layer

class BatchNorm1d:
    """
    Batch normalisation over the first (batch) dimension of a 2-D input.

    In training mode (`self.training` is True) the input is normalised with
    the mean and variance of the current batch, and exponential moving
    averages of those statistics are updated. Otherwise the stored running
    statistics are used. The normalised values are then scaled by `gamma` and
    shifted by `beta`. The most recent output is kept in `self.out`.

    Args:
        num_input_features: number of features (size of the last dimension)
        momentum: weight given to the current batch statistics in the EMA update
        eps: constant added to the variance before the square root, for
            numerical stability
        device: device the layer's tensors are placed on
    """

    def __init__(self, num_input_features, momentum=0.1, eps=1e-5, device='cpu'):

        self.eps = eps
        self.momentum = momentum

        # learnable scale and shift parameters (gain and bias)
        self.gamma = torch.ones(num_input_features, device=device)
        self.beta = torch.zeros(num_input_features, device=device)

        # Exponential moving average (EMA) tracking.
        # The variance buffer must use the same attribute name here, in the
        # eval branch, and in the EMA update; a mismatch makes the first
        # training-mode call fail with AttributeError.
        self.running_mean = torch.zeros(num_input_features, device=device)
        self.running_vars = torch.ones(num_input_features, device=device)

        self.training = True # batchnorm behaves different during training vs during inference/eval

    def __call__(self, x: torch.Tensor):
        """Normalise `x`, apply scale and shift, and return (and cache) the result."""

        if self.training:
            xmean = x.mean(axis=0, keepdim=True) # mean across batch
            xvars = x.var(axis=0, keepdim=True) # variance across batch
        else:
            xmean = self.running_mean
            xvars = self.running_vars

        # normalize
        normalized = (x - xmean) / torch.sqrt(xvars + self.eps)

        # scale and shift
        self.out = self.gamma * normalized + self.beta

        # EMA update (training mode only)
        if self.training:
            with torch.no_grad(): # these are not learnable params
                self.running_mean = self.running_mean * (1 - self.momentum) + xmean * self.momentum
                self.running_vars = self.running_vars * (1 - self.momentum) + xvars * self.momentum

        return self.out

    def parameters(self):
        """Return the learnable tensors [gamma, beta]; running statistics are excluded."""
        return [self.gamma, self.beta]

In [13]:
# Tanh

class Tanh:
    """Element-wise tanh activation with no trainable tensors."""

    def __call__(self, x):
        """Return tanh(x); the result is also stored on `self.out`."""
        activated = torch.tanh(x)
        self.out = activated  # cached so it can be inspected after the forward pass
        return activated

    def parameters(self):
        """Return an empty list: this layer has nothing to train."""
        return list()

In [None]:
# make a fn to create a stack of layers

def setup_model(embed_dim,
                block_size,
                n_hidden_per_layer,
                num_hidden_layers = 4,
                bias = True,
                tanh = True,
                batchnorm = True,
                output_gain = 0.1,
                tanh_gain = 5./3,
                device = 'cpu'
                ):
    """
    Build an embedding table plus a stack of layers for the character-level MLP.

    The stack is one input block followed by `num_hidden_layers` hidden blocks,
    each made of Linear [-> BatchNorm1d] [-> Tanh], and a final Linear that
    maps to `vocab_size` outputs.

    Args:
        embed_dim: size of each character embedding
        block_size: number of context characters (the input to the first Linear
            has embed_dim * block_size features)
        n_hidden_per_layer: width of the input block and every hidden block
        num_hidden_layers: number of hidden blocks after the input block
        bias: whether the Linear layers get a bias tensor
        tanh: whether each block ends with a Tanh
        batchnorm: whether each block has a BatchNorm1d after its Linear
        output_gain: factor applied to the output layer's initial weights
        tanh_gain: factor applied to the initial weights of every Linear that
            feeds a Tanh (only when `tanh` is True)
        device: device all tensors are placed on

    Returns:
        (layers, parameters): the list of layer objects, and a flat list of all
        trainable tensors with requires_grad=True. The embedding table C is
        parameters[0].
    """

    # create embedding lookup
    # Sample on the generator's own device, then move: torch.randn raises when
    # the generator and the requested device differ (CPU generator + 'cuda').
    C = torch.randn((vocab_size, embed_dim), generator=g, device=g.device).to(device)

    flattened_input_features = embed_dim * block_size

    layers = []

    # input block followed by the hidden blocks; only the fan-in of the first differs
    fan_in = flattened_input_features
    for _ in range(num_hidden_layers + 1):
        layers.append(Linear(fan_in, n_hidden_per_layer, bias=bias, device=device))
        if batchnorm:
            layers.append(BatchNorm1d(n_hidden_per_layer, device=device))
        if tanh:
            layers.append(Tanh())
        fan_in = n_hidden_per_layer

    # setup output layer
    output_layer = Linear(n_hidden_per_layer, vocab_size, bias=bias, device=device)
    layers.append(output_layer)

    # Now lets play with the gain a bit to see how it affects activations and gradients

    with torch.no_grad():

        # make last layer less confident for starting with uniform distribution
        output_layer.weights *= output_gain

        if tanh:
            # Every Linear except the output layer is followed by a Tanh, and that
            # includes the input layer, so the gain is applied to all of them.
            for layer in layers[:-1]:
                if isinstance(layer, Linear):
                    layer.weights *= tanh_gain

    parameters = [C] + [param for layer in layers for param in layer.parameters()]
    print(f"Total number of parameters in model: {sum(p.nelement() for p in parameters)}")

    for p in parameters:
        p.requires_grad = True

    return layers, parameters

Let's also port over our model training code, with a few changes:

1. We need to tell PyTorch to retain gradients for the intermediate (non-leaf) tensors. By default, only leaf tensors (tensors created directly by the user rather than produced by an operation) that have requires_grad=True retain their gradients. We want to access these gradients later for diagnostics.

2. The forward pass looks much simpler now: we simply go through each layer sequentially.

3. One important diagnostic metric that we want to track is how big the update to a parameter is. In each update step, we reduce a parameter's data by its gradient scaled by the learning rate. The ratio of this update to the data tells us how big or small the update is relative to the data.


In [16]:
def train_model(C, layers, parameters, n_steps = 200000, batch_size = 32, lr = 0.1, decayed_lr = 0.01,
                lr_decay_step = 100000):
    """
    Train the layered model with minibatch SGD and record diagnostics.

    Minibatches are sampled from the module-level Xtr / Ytr tensors. Each
    layer's `out` tensor is asked to retain its gradient so the gradients of
    the intermediate activations can be inspected after training.

    Args:
        C: embedding table indexed by the context indices
        layers: layer objects, applied in order to the flattened embeddings;
            the last one must produce the logits
        parameters: all trainable tensors (expected to include C)
        n_steps: number of SGD steps
        batch_size: examples sampled (with replacement) per step
        lr: learning rate used while step < lr_decay_step
        decayed_lr: learning rate used from lr_decay_step onwards
        lr_decay_step: step index at which the learning rate switches

    Returns:
        stepi, lossi, loglossi: per-step index, loss, and log10(loss)
        update_to_data_ratio: flat list of floats, one entry per parameter per
            step (step-major, in the order of `parameters`), each equal to
            log10(std(lr * grad) / std(data))
    """

    stepi = []
    lossi = []
    loglossi = []
    update_to_data_ratio = []

    pbar = tqdm(range(n_steps), desc="Training")
    for i in pbar:
        # Mini-batch: randomly sample batch_size examples
        ixds = torch.randint(0, Xtr.shape[0], (batch_size,))
        mini_batch_inp, mini_batch_target = Xtr[ixds], Ytr[ixds]

        # Forward pass
        emb = C[mini_batch_inp]
        x = emb.view(emb.shape[0], -1)

        for layer in layers:
            x = layer(x)

        # loss function
        loss = F.cross_entropy(x, mini_batch_target)

        # Backward pass

        # Retain gradients for intermediate layers for diagnostics
        for layer in layers:
            layer.out.retain_grad()

        for p in parameters:
            p.grad = None
        loss.backward()

        # Learning rate step decay. A separate local is used so the `lr`
        # argument is not overwritten inside the loop.
        current_lr = lr if i < lr_decay_step else decayed_lr
        for p in parameters:
            p.data += -current_lr * p.grad

        # Track loss
        stepi.append(i)
        lossi.append(loss.item())
        loglossi.append(loss.log10().item())

        if i % 10 == 0:
            pbar.set_postfix(loss=f"{loss.data:.4f}")

        with torch.no_grad():
            for p in parameters:
                # log10 is taken of the ratio itself, and the value is stored as a
                # plain float so the list does not keep tensors alive.
                update_std = (current_lr * p.grad).std()
                data_std = p.data.std()
                update_to_data_ratio.append((update_std / data_std).log10().item())

    return stepi, lossi, loglossi, update_to_data_ratio
