In [39]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

In [40]:
# Load data
with open("names.txt", "r", encoding="utf-8") as f:
    names = f.read().splitlines()

chars = sorted(list(set("".join(names))))

stoi = {s: i + 1 for i, s in enumerate(chars)}
stoi["."] = 0
itos = {i: s for s, i in stoi.items()}

In [41]:
# Build the dataset
block_size = 3

def build_dataset(words):
    X, Y = [], []

    for word in words:
        context = [0] * block_size

        for char in word + ".":
            token = stoi[char]
            X.append(context)
            Y.append(token)
            context = context[1:] + [token]

    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

In [42]:
# Split up training, validation, test sets

import random
random.shuffle(names)
n1 = int(0.8 * len(names))
n2 = int(0.9 * len(names))

Xtr, Ytr = build_dataset(names[:n1])
Xdev, Ydev = build_dataset(names[n1:n2])
Xte, Yte = build_dataset(names[n2:]) 

torch.Size([182476, 3]) torch.Size([182476])
torch.Size([22784, 3]) torch.Size([22784])
torch.Size([22886, 3]) torch.Size([22886])


In [60]:
class Linear:

    def __init__(self, fan_in: int, fan_out: int, bias:bool=True):
        self.weight = torch.randn((fan_in, fan_out)) / fan_in**0.5
        self.bias = torch.randn(fan_out) if bias else None

    def __call__(self, x: torch.Tensor) -> torch.Tensor:
        self.out = x @ self.weight
        if self.bias is not None:
            self.out += self.bias
        return self.out

    def parameters(self):
        return [self.weight] + ([] if self.bias is None else [self.bias])

class BatchNorm1d:

    def __init__(self, dim: int, eps: float=1e-5, momentum: float=0.1):
        self.momentum = momentum
        self.eps = eps
        self.training = True
        self.running_mean = torch.ones((1, dim))
        self.running_var = torch.zeros((1, dim))
        self.gain = torch.ones((1, dim))
        self.bias = torch.zeros((1, dim))

    def __call__(self, x: torch.Tensor):
        if self.training:
            x_mean = x.mean(0, keepdim=True)
            x_var = x.var(0, keepdim=True)
        else:
            x_mean = self.running_mean
            x_var = self.running_var

        # Normalize
        xhat = (x - x_mean) / torch.sqrt(x_var + self.eps)
        self.out = self.gain * xhat + self.bias

        if self.training:
            # Running stats
            with torch.no_grad():
                self.running_mean = (1 - self.momentum) * self.running_mean + self.momentum * x_mean
                self.running_var = (1 - self.momentum) * self.running_var + self.momentum * x_var
        
        return self.gain * (x - x_mean) / x_var + self.bias

    def parameters(self) -> list[torch.Tensor]:
        return [self.gain, self.bias]

class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
        

    def parameters(self):
        return []
    
class Embedding:
    def __init__(self, num_items, embedding_size):
        self.weight = torch.randn((num_items, embedding_size))
    
    def __call__(self, IX):
        self.out = self.weight[IX]
        return self.out
    
    def parameters(self):
        return [self.weight]
    
class Flatten:
    def __call__(self, X):
        self.out = X.view(X.shape[0], -1)
        return self.out

    def parameters(self):
        return []
    

class Sequential:

    def __init__(self, layers):
        self.layers = layers
    
    def __call__(self, x):
        for layer in self.layers:
            x = layer(x)
        self.out = x
        return self.out

    def parameters(self):
        return [p for layer in self.layers for p in layer.parameters()]

In [61]:
vocab_size = len(stoi.keys())
embedding_size = 10
hidden_size = 200

model = Sequential([
    Embedding(vocab_size, embedding_size),
    Flatten(),
    Linear(embedding_size * block_size, hidden_size, bias=False), BatchNorm1d(hidden_size), Tanh(),
    Linear(hidden_size, vocab_size),
])

# Param init
with torch.no_grad():
    model.layers[-1].weight *= 0.1 # Make the last layer less confident

print(sum(p.nelement() for p in model.parameters()))
for p in model.parameters():
    p.requires_grad = True

12097


In [50]:
steps = []
losses = []

max_steps = 2000000
minibatch_size = 32

# Training
for i in range(max_steps):
    # Construct minibatch
    minibatch_ixs = torch.randint(0, Xtr.shape[0], (minibatch_size,))
    Xb, Yb = Xtr[minibatch_ixs], Ytr[minibatch_ixs]
    
    # Forward pass
    logits = model(Xb)
    loss = F.cross_entropy(logits, Yb)

    # Backward
    for p in model.parameters():
        p.grad = None
    loss.backward()

    # Update
    lr = 0.1 if i < 150000 else 0.01
    for p in model.parameters():
        p.data += -lr * p.grad

    # Track stats
    losses.append(loss.log10().item())
    steps.append(i)
    if i % 10000 == 0:
        print(f"{i:7d}/{max_steps:7d}: {loss.item():0.4f}")

      0/2000000: 4.3681


KeyboardInterrupt: 

In [62]:
# Post training
for layer in model.layers:
    layer.training = False

In [63]:
@torch.no_grad()
def split_loss(X, Y):
    logits = model(X)
    loss = F.cross_entropy(logits, Y)
    return loss.item()

print(f"Training: {split_loss(Xtr, Ytr):.4f}")
print(f"Dev: {split_loss(Xdev, Ydev):.4f}")

Training: 3.5906
Dev: 3.5893


In [64]:
# Sample
for _ in range(10):
    name = []
    context = [0] * block_size

    while True:
        logits = model(torch.tensor([context]))
        probs = F.softmax(logits, dim=1)
        token = torch.multinomial(probs, num_samples=1).item()
        context = context[1:] + [token]
        if token == 0:
            break
        else:
            name.append(itos[token])

    print("".join(name))

akjpkjventxtopvdgnvxnttwqnwtbchmtwtbtqtqwtqatatxhxt
qtgf
dehtqtvtiokcqt
qpvttybaatqhatwtjqoagmggnwkd
kifsh
twfwtd
wvvwwucmjtdtayvxmgwdiptct
tfdwijtdtqtwjooqtqgvxdbov
qawaqinqnqkmtkmt
tqtdwvddt
