In [1]:
import torch
import random
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import torch.nn.functional as F

In [5]:
from layers_part_5 import Linear, BatchNorm1d, Tanh

In [2]:
# Read the names file: one name per line, surrounding whitespace removed
with open("names.txt", "r") as infile:
    words = [line.strip() for line in infile]

# Character <-> integer lookup tables.
# Index 0 is reserved for the boundary marker; every other character is
# numbered from 1 upwards in sorted order.
special = "."
chars = sorted(set("".join(words)))
STOI = dict(zip(chars, range(1, len(chars) + 1)))
STOI[special] = 0
ITOS = {index: char for char, index in STOI.items()}

# Number of distinct symbols the model has to choose between
vocab_size = len(STOI)

In [3]:
# Shuffle the names in place with a fixed seed so the resulting order
# (and therefore any split taken from it) is reproducible on a fresh kernel.
random.seed(42)
random.shuffle(words)

In [4]:
# Context length: how many preceding characters are used to predict the next one
block_size = 3

In [None]:
# Construct the dataset


def build_dataset(
    words, *, special: str = ".", block_size: int = 3, stoi=None
) -> "tuple[torch.Tensor, torch.Tensor]":
    """Turn a list of words into (context, next-character) training pairs.

    Every word is terminated with ``special``. For each character of the
    terminated word one example is emitted: the ``block_size`` preceding
    character indices (left-padded with the index of ``special``) and the index
    of the character itself.

    Args:
        words: Iterable of strings to encode.
        special: Boundary marker appended to every word and used for padding.
        block_size: Number of context characters per example (must be >= 1).
        stoi: Optional character -> integer mapping. Defaults to the
            notebook-level ``STOI`` table when omitted.

    Returns:
        ``(X, Y)`` where ``X`` is an int64 tensor of shape
        ``(n_examples, block_size)`` and ``Y`` an int64 tensor of shape
        ``(n_examples,)``.

    Raises:
        ValueError: If ``block_size`` is smaller than 1.
        KeyError: If a character (or ``special``) is missing from the mapping.
    """
    if block_size < 1:
        raise ValueError(f"block_size must be >= 1, got {block_size}")
    if stoi is None:
        stoi = STOI  # fall back to the vocabulary built earlier in the notebook

    # Pad with the boundary marker's own index instead of assuming it is 0
    pad_ix = stoi[special]
    contexts, targets = [], []

    for word in words:
        context = [pad_ix] * block_size
        for char in word + special:
            ix = stoi[char]
            contexts.append(context)
            targets.append(ix)
            # Slide the window; this builds a new list, so the one just
            # appended to `contexts` is not modified.
            context = context[1:] + [ix]

    # Explicit dtype and shape keep the result well-formed even when `words`
    # is empty (otherwise X would be a float tensor of shape (0,)).
    X = torch.tensor(contexts, dtype=torch.long).reshape(len(contexts), block_size)
    Y = torch.tensor(targets, dtype=torch.long)
    print(f"Constructed dataset: X: {X.shape}, Y: {Y.shape}")
    return X, Y


# Train, Development, Test split (80% / 10% / 10% of the words).
# `words` has already been shuffled with a fixed seed earlier in the notebook.
# It is deliberately not shuffled again here: an in-place re-shuffle would make
# this cell non-idempotent (every re-run would produce a different split).
n1 = int(0.8 * len(words))
n2 = int(0.9 * len(words))

# Pass the notebook's block_size explicitly so the datasets always match the
# model's input width (fan_in = n_embed * block_size) instead of silently
# relying on build_dataset's default.
Xtr, Ytr = build_dataset(words[:n1], block_size=block_size)
Xdev, Ydev = build_dataset(words[n1:n2], block_size=block_size)
Xtest, Ytest = build_dataset(words[n2:], block_size=block_size)

In [None]:
# Seed PyTorch's global RNG so the random draws made with torch below
# (e.g. torch.randn, torch.randint, torch.multinomial) are repeatable.
torch.manual_seed(42)

In [None]:
# Re-create the training loop with an improved model instantiation
# This cell builds the embedding table and the layer stack, then collects all
# trainable tensors into `parameters`.

n_embed = 10  # length of each character's embedding vector (second dim of C)
n_hidden = 200  # output width of the first Linear / input width of the last one
fan_in = n_embed * block_size  # width of one flattened context (block_size embeddings)
C = torch.randn((vocab_size, n_embed))  # embedding table, one row per vocabulary index

# NOTE(review): bias=False on the first Linear is presumably because the
# following BatchNorm1d provides its own shift — confirm in layers_part_5.
layers = [
    Linear(fan_in=fan_in, fan_out=n_hidden, bias=False),
    BatchNorm1d(n_hidden),
    Tanh(),
    Linear(fan_in=n_hidden, fan_out=vocab_size),
]

# Scale the output layer's weights down in place, outside of autograd tracking
with torch.no_grad():
    layers[-1].weight *= 0.1  # slightly less confident

# NOTE(review): `layer.parameters` is read as an attribute (no call), so it must
# be a property or plain iterable on the layers_part_5 classes — confirm.
parameters = [C] + [p for layer in layers for p in layer.parameters]
n_parameters = sum(p.nelement() for p in parameters)
print(f"{n_parameters} parameters")
# Enable gradient tracking on every collected tensor
for p in parameters:
    p.requires_grad = True

In [None]:
# Optimization loop: mini-batch SGD with a step-wise learning-rate decay.
max_epoch = 200_000  # number of mini-batch update steps
optimal_rate = 0.1  # base learning rate used during the first half of training
batch_size = 32
loss_i = list()  # log10 of the mini-batch loss, one entry per step
update_info = list()  # not populated by this cell

for epoch in range(max_epoch):

    # Construct a mini-batch by sampling row indices with replacement
    ix = torch.randint(0, Xtr.shape[0], (batch_size,))
    Xb, Yb = Xtr[ix], Ytr[ix]

    # Forward pass
    emb = C[Xb]  # Embed the characters into vectors
    x = emb.view(emb.shape[0], -1)  # flatten each context into a single row
    for layer in layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)

    # Backward pass (clear stale gradients first)
    for p in parameters:
        p.grad = None
    loss.backward()

    # Update step with learning rate decay.
    # The thresholds form one contiguous chain of upper bounds so that every
    # step falls into exactly one bucket. Checking both a strict lower and a
    # strict upper bound would leave the boundary steps (exactly 50% and 75%
    # of max_epoch) unmatched and drop them into the smallest rate.
    if epoch < 0.5 * max_epoch:
        current_rate = optimal_rate
    elif epoch < 0.75 * max_epoch:
        current_rate = optimal_rate / 10
    elif epoch < 0.95 * max_epoch:
        current_rate = optimal_rate / 100
    else:
        current_rate = optimal_rate / 1000
    for p in parameters:
        p.data += -current_rate * p.grad

    # Track stats
    if epoch % 10_000 == 0:
        print(f"{epoch:7d}/{max_epoch:7d}: {loss.item():.4f}")
    loss_i.append(loss.log10().item())

In [None]:
# Sample from the model
n_samples = 10
end_ix = STOI[special]  # index of the boundary marker: pads the start, ends a name

# Temporarily switch every layer that is currently in training mode to
# inference mode, and restore it afterwards so re-running the training cell
# behaves as before.
# NOTE(review): assumes the layers from layers_part_5 expose a boolean
# `training` flag that BatchNorm1d uses to choose between batch statistics and
# running statistics (as torch.nn modules do) — confirm. Without inference
# mode, a single-example forward pass has no meaningful batch statistics.
training_layers = [layer for layer in layers if getattr(layer, "training", False)]
for layer in training_layers:
    layer.training = False

try:
    with torch.no_grad():
        for _ in range(n_samples):
            out = []
            context = [end_ix] * block_size  # start from an all-padding context
            while True:
                # Forward pass on a batch of one, using the same 2-D layout
                # as the training loop
                emb = C[torch.tensor([context])]  # (1, block_size, n_embed)
                x = emb.view(emb.shape[0], -1)  # (1, block_size * n_embed)
                for layer in layers:
                    x = layer(x)
                probs = F.softmax(x, dim=1)  # normalise over the vocabulary axis
                ix = torch.multinomial(probs, num_samples=1).item()
                context = context[1:] + [ix]
                out.append(ix)
                if ix == end_ix:
                    break

            print("".join(ITOS[i] for i in out))
finally:
    for layer in training_layers:
        layer.training = True