In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F 
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
words = open('../names.txt', 'r').read().splitlines()

chrs = sorted(list(set(''.join(words))))
stoi = {s: i+1 for i, s in enumerate(chrs)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}


In [3]:
# parameters
vocab_size = len(itos)
block_size = 5
batch_size = 32
g = torch.Generator().manual_seed(1337)
n_embd = 10
n_hidden = 100
C = torch.randn((vocab_size, n_embd), generator=g)

In [4]:
# build dataset

def build_dataset(words, block_size=block_size):
    X, Y = [], []
    for w in words:
        context = [0] * block_size
        for ch in w + '.':
            ix = stoi[ch]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.size(), Y.size())
    return X, Y

random.seed(1337)
random.shuffle(words)
n1 = int(len(words)*0.8)
n2 = int(len(words)*0.9)

Xtr, Ytr = build_dataset(words[:n1], block_size)
Xval, Yval = build_dataset(words[n1:n2], block_size)
Xte, Yte = build_dataset(words[n2:], block_size)
        

torch.Size([182552, 5]) torch.Size([182552])
torch.Size([22737, 5]) torch.Size([22737])
torch.Size([22857, 5]) torch.Size([22857])


In [5]:
class Linear:

    def __init__(self, fan_in, fan_out, bias=True):
        self.weight = torch.randn((fan_in, fan_out), generator=g)
        self.bias = torch.randn(fan_out) if bias else None

    def __call__(self, x):
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out
    
    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        self.momentum = momentum
        self.training = True
        # parameters
        self.gamma = torch.ones(dim)  #stdev 
        self.beta = torch.zeros(dim)  #mean
        # memory storage for re-production
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)

    def __call__(self, x):
        # calculate the forward pass
        if self.training:
            xmean = x.mean(0, keepdim=True)
            xvar = x.var(0, keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_var
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xhat + self.beta
        # update the memoru
        if self.training:
            with torch.no_grad():
                self.running_mean = (1-self.momentum) * self.running_mean * self.momentum * xmean
                self.running_var = (1-self.momentum) * self.running_var * self.momentum * xvar
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]
    

class Tanh:

    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []

In [6]:
# model construct
layers = [
    Linear(n_embd * block_size, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(           n_hidden, n_hidden, bias=False), BatchNorm1d(n_hidden), Tanh(),
    Linear(           n_hidden, vocab_size, bias=False), BatchNorm1d(vocab_size), 
]

with torch.no_grad():
    layers[-1].gamma *= 0.1  #output layer 
    for layer in layers[:-1]:
        if isinstance(layer, Linear):
            layer.weight *= 1  #5/3

parameters = [C] + [p for layer in layers for p in layer.parameters()]
print(sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad = True

49024


- `The Kaiming (He) initialization` for ReLU-like nonlinearities sometimes recommends scaling by sqrt(5/3)^2 ≈ 2.78 when using tanh or similar.

In [None]:
# Optimization
max_steps = 70000
lossi = []
ud = []  # update-to-data ratio statistics
best_val_loss = float('inf')
best_weights = None

for step in range(max_steps):

    # mini batch
    b_ix = torch.randint(0, Xtr.size(0), (batch_size,), generator=g)
    Xb, Yb = Xtr[b_ix], Ytr[b_ix]

    # forward pass
    emb = C[Xb]
    x = emb.view(-1, n_embd * block_size)
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)

    # backward
    for layer in layers:
        layer.out.retain_grad()
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    lr = 0.3 if step < 10000 else 0.2
    for p in parameters:
        p.data += -lr * p.grad
    
    # track stats
    if step % 1000 == 0:
        

        val_loss = 0.0
        with torch.no_grad():
            # emb = C[Xval]
            # x = emb.view(-1, n_embd * block_size)
            # for layer in layers:
            #     x = layer(x)
            # val_loss += F.cross_entropy(x, Yval, reduction='sum').item()

            for i in range(0, Xval.size(0), batch_size):
                batch_X = Xval[i:i+batch_size]
                batch_Y = Yval[i:i+batch_size]
                emb = C[batch_X]
                x = emb.view(-1, n_embd * block_size)
                for layer in layers:
                    x = layer(x)
                val_loss += F.cross_entropy(x, batch_Y, reduction='sum').item()
        
        val_loss /= Xval.size(0)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_weights = [p.data.clone() for p in parameters]
            torch.save({
                'step': step,
                'val_loss': best_val_loss,
                'weights': best_weights
            }, 'best_model.pth')
            print(f"✅ New best val loss: {best_val_loss:.4f} at step {step}")
        
        print(f"{step:7d}/{max_steps:7d}: {loss.item():.4f}: lr={lr} | valuation loss {val_loss:.4f}")

    lossi.append(loss.log10().item())
    with torch.no_grad():
        ud.append([(lr*p.grad.std() / p.data.std()).log10().item() for p in parameters])

    # break
    # if step > 1000:
    #     break


In [None]:
plt.figure(figsize=(15,5))
legends = []
for i, layer in enumerate(layers[:-1]):
    if isinstance(layer, Tanh):
        t = layer.out
        print('layer %d {10%s}: mean %+.2f, std %.2f, satured: %2.f%%' % (i, layer.__class__.__name__, t.mean(), t.std(), (t.abs() > 0.97).float().mean()*100))
        hy, hx = torch.histogram(t, density=True)
        plt.plot(hx[:-1].detach(), hy.detach());
        legends.append(f"layer {i} ({layer.__class__.__name__})")
plt.legend(legends);
plt.title('activation distribution');

In [None]:
# gradient distirubtion
plt.figure(figsize=(15,6))
legends =[]
for i, layer in enumerate(layers[:-1]):
    if isinstance(layer, Tanh):
        t = layer.out.grad
        print('layer %d (%10s): mean %+f, std %e' % (i, layer.__class__.__name__,t.mean(), t.std()))
        hy, hx = torch.histogram(t, density=True)
        plt.plot(hx[:-1].detach(), hy.detach());
        legends.append(f"layer {i} ({layer.__class__.__name__})")
plt.legend(legends);
plt.title('gradient distribution');



In [None]:
# visualize weights
plt.figure(figsize=(15,5))
legends = []
for i, p in enumerate(parameters):
    t = p.grad
    if p.ndim == 2:
        print('weight %10s | mean %+fstd %e | grad:data ratio %e' % (tuple(p.shape), t.mean(), t.std(), t.std()/p.std()))
        hy, hx = torch.histogram(t, density=True)
        plt.plot(hx[:-1].detach(), hy.detach());
        legends.append(f"{i} {tuple(p.shape)}")

plt.legend(legends);
plt.title("visualize weights");

In [None]:
# plot update to data ratio

plt.figure(figsize=(20,4))
legends=[]
for i, p in enumerate(parameters):
    if p.ndim == 2:
        plt.plot([ud[j][i] for j in range(len(ud))])
        legends.append('param %d' % i)
plt.plot([0, len(ud)], [-3,-3], 'k')
plt.legend(legends);

Log₁₀ Ratio	Update/Weight Ratio	Behavior

| Log₁₀ Ratio      | Update/Weight Ratio | Behavior                      |
|------------------|---------------------|-------------------------------|
| `-2 to -4`       | 0.01 to 0.0001      | ✅ Healthy training range      |
| `> -2`           | > 1%                | ⚠️ Possibly too aggressive      |
| `< -4`           | < 0.01%             | ⚠️ May be too small to learn   |

In [7]:
checkpoint = torch.load('best_model.pth')
best_weights = checkpoint['weights']

@torch.no_grad()  #disable gradient tracking
def split_loss(split, parameters, best_weights):
    x, y = {
        'train': (Xtr, Ytr),
        'val': (Xval, Yval),
        'test': (Xte, Yte),
    }[split]
    emb = C[x]
    x = emb.view(emb.shape[0], -1)
    for p, w in zip(parameters, best_weights):
        p.data = w.clone()
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, y)
    print(split, loss.item())

# put layers into eval mode
for layer in layers:
    layer.training = False
split_loss('train', parameters, best_weights)
split_loss('val', parameters, best_weights)

train 27.428760528564453
val 23.497825622558594


In [9]:

# Load checkpoint
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
try:
    checkpoint = torch.load('best_model.pth', map_location=device)
    best_weights = checkpoint['weights']
    saved_val_loss = checkpoint['val_loss']
except FileNotFoundError:
    print("Error: best_model.pth not found.")
    exit(1)

# Verify parameters and best_weights
if len(parameters) != len(best_weights):
    raise ValueError(f"Mismatch: {len(parameters)} parameters vs {len(best_weights)} weights")
for i, (p, w) in enumerate(zip(parameters, best_weights)):
    if p.shape != w.shape:
        raise ValueError(f"Shape mismatch for parameter {i}: {p.shape} vs {w.shape}")

# Verify parameters are linked to layers
param_set = set(p for p in parameters)
for i, layer in enumerate(layers):
    if hasattr(layer, 'weight'):
        assert layer.weight in param_set, f"Layer {i} weight not in parameters"
    if hasattr(layer, 'gamma'):
        assert layer.gamma in param_set, f"Layer {i} gamma not in parameters"
        assert layer.beta in param_set, f"Layer {i} beta not in parameters"
print("All layer parameters are in parameters list")

@torch.no_grad()
def split_loss(split, parameters, layers, C, best_weights, batch_size=32):
    x, y = {
        'train': (Xtr, Ytr),
        'val': (Xval, Yval),
        'test': (Xte, Yte),
    }[split]
    
    # Debug: Print parameters and layer weights before
    print(f"{split}: Before update, Parameter 0: {parameters[0].data[:5]}")
    for i, layer in enumerate(layers):
        if hasattr(layer, 'weight'):
            print(f"{split}: Before update, Layer {i} weight: {layer.weight.data[:5]}")
        if hasattr(layer, 'gamma'):
            print(f"{split}: Before update, Layer {i} gamma: {layer.gamma.data[:5]}")
    
    # Apply best_weights
    for p, w in zip(parameters, best_weights):
        p.data = w.clone()
    
    # Verify update
    for i, (p, w) in enumerate(zip(parameters, best_weights)):
        if not torch.allclose(p.data, w, atol=1e-6):
            raise ValueError(f"Parameter {i} failed to update: {p.data[:5]} vs {w[:5]}")
    
    # Debug: Print parameters and layer weights after
    print(f"{split}: After update, Parameter 0: {parameters[0].data[:5]}")
    for i, layer in enumerate(layers):
        if hasattr(layer, 'weight'):
            print(f"{split}: After update, Layer {i} weight: {layer.weight.data[:5]}")
        if hasattr(layer, 'gamma'):
            print(f"{split}: After update, Layer {i} gamma: {layer.gamma.data[:5]}")
    print(f"{split}: All parameters updated to match best_weights")
    
    # Forward pass with batching
    total_loss = 0.0
    num_samples = 0
    for i in range(0, x.size(0), batch_size):
        batch_x = x[i:i+batch_size]
        batch_y = y[i:i+batch_size]
        emb = C[batch_x]
        x_flat = emb.view(emb.shape[0], -1)
        for layer in layers:
            x_flat = layer(x_flat)
        loss = F.cross_entropy(x_flat, batch_y, reduction='sum')
        total_loss += loss.item()
        num_samples += batch_x.size(0)
    
    avg_loss = total_loss / num_samples
    print(f"{split} loss: {avg_loss:.4f}")
    return avg_loss

# Put layers into eval mode
for layer in layers:
    layer.training = False

# Evaluate
train_loss = split_loss('train', parameters, layers, C, best_weights)
val_loss = split_loss('val', parameters, layers, C, best_weights)
test_loss = split_loss('test', parameters, layers, C, best_weights)

# Verify validation loss
if abs(val_loss - saved_val_loss) > 1e-4:
    print(f"Warning: Computed val loss ({val_loss:.4f}) differs from saved val loss ({saved_val_loss:.4f})")

All layer parameters are in parameters list
train: Before update, Parameter 0: tensor([[ 5.0020e-01,  1.0707e+00,  1.0352e-01, -2.4076e+00,  1.0652e+00,
         -1.2974e+00,  1.1662e+00,  2.0856e-01,  2.0339e+00,  7.8339e-01],
        [-2.2706e+00, -1.0396e+00, -1.0836e+00, -4.0938e-02,  4.7228e-01,
          1.5634e+00,  1.8517e+00, -1.3171e+00,  1.2081e+00,  7.7794e-01],
        [-1.4056e+00,  1.1612e+00,  1.6762e+00,  5.4983e-01, -3.2597e-01,
         -1.9864e+00, -1.6509e+00, -8.3637e-01, -2.2062e-02, -8.4794e-01],
        [ 1.2834e-01,  2.4400e+00,  2.5167e-02,  5.1116e-01,  3.1126e-01,
          6.7762e-02,  6.9277e-01,  2.0365e+00, -5.8081e-01,  6.4823e-01],
        [ 7.3445e-02,  1.8815e-03,  1.6354e+00,  1.9116e-01, -1.3753e+00,
         -5.5210e-01, -7.6302e-01, -9.8557e-01,  1.2531e-01,  4.8384e-01]])
train: Before update, Layer 0 weight: tensor([[-0.8687, -1.2667, -0.0874,  0.5083, -0.4486,  0.1166, -0.4242, -0.3788,
         -0.7302,  0.5242,  0.1521, -0.3134,  0.4775,  0

In [None]:
for _ in range(10):

    out = []
    context = [0] * block_size
    while True:
        emb = C[torch.tensor([context])]
        x = emb.view(-1, n_embd * block_size)
        for layer in layers:
            x = layer(x)
        logits = x
        probs = torch.softmax(logits, dim=1)
        ix = torch.multinomial(probs, num_samples=1, generator=g).item()
        context = context[1:] + [ix]
        out.append(ix)

        if ix == 0:
            break
    
    print(''.join(itos[i] for i in out))

#### 🚀 Track Update-to-Data Ratio Statistics for Each Parameter

In the training loop, we calculate the **update-to-data ratio** to monitor how significant each parameter update is relative to the parameter’s current scale. This is useful for diagnosing training stability and tuning learning rates.

- `p.grad.std()` → the standard deviation of the gradient  
  *Represents the size of the update.*

- `p.data.std()` → the standard deviation of the parameter values  
  *Represents the scale of the weights.*

- `lr * p.grad.std()` → the magnitude of the update  
  *Combines learning rate with gradient scale.*

- The ratio:
  
  ```python
  (lr * p.grad.std()) / p.data.std()
- `log10` -> This compresses the scale and makes it easier to visualize (since these ratios can span several orders of magnitude).