In [1]:
import torch
import torch.nn.functional as F
import matplotlib.pyplot as plt
%matplotlib inline

use_mps = False

if use_mps and torch.backends.mps.is_available(): # mps turned out to be much slower for some reason
    pt_device = torch.device("mps")
    print("torch using mps")
else:
    pt_device="cpu"
    
torch.set_default_device(pt_device)    

In [2]:
words = open("names.txt").read().splitlines()
print(len(words))
words[:5]

32033


['emma', 'olivia', 'ava', 'isabella', 'sophia']

In [3]:
chars = sorted(list(set(''.join(words))))
stoi = {ch: i+1 for i, ch in enumerate(chars)}
stoi['.'] = 0   
itos = {i: ch for ch, i in stoi.items()}
vocab_size = len(itos)
print(stoi)
print(itos)

{'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26, '.': 0}
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}


In [4]:
def build_data(words, block_size):
    X, Y = [], [] 
    for w in words:
        context = [0] * block_size
        for c in w + '.':
            ix = stoi[c]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    return torch.tensor(X), torch.tensor(Y)

import random

random.shuffle(words)
n1 = int(len(words) * 0.8)
n2 = int(len(words) * 0.9)

block_size = 3

X_train, Y_train = build_data(words[:n1], block_size)
X_dev, Y_dev = build_data(words[n1:n2], block_size)
X_test, Y_test = build_data(words[n2:], block_size)

print(X_train.shape, X_dev.shape, X_test.shape)
print(Y_train.shape, Y_dev.shape, Y_test.shape)

torch.Size([182407, 3]) torch.Size([22815, 3]) torch.Size([22924, 3])
torch.Size([182407]) torch.Size([22815]) torch.Size([22924])


In [5]:
n_embd = 10 
n_hidden = 200

C = torch.rand((vocab_size,n_embd))
W1 = torch.rand((n_embd * block_size, n_hidden)) * (5/3)/((n_embd * block_size)**0.5)
b1 = torch.rand((n_hidden,)) * 0.01 
W2 = torch.randn(n_hidden, vocab_size) * 0.01 
b2 = torch.randn(vocab_size) * 0
bngain = torch.ones((1, n_hidden))
bnbias = torch.zeros((1, n_hidden))

parameters = [C, W1, b1, W2, b2, bngain, bnbias]

for p in parameters:
    p.requires_grad = True

print(f"{sum(p.numel() for p in parameters)} parameters")

12297 parameters


In [8]:
%%time

max_steps = 200000
batch_size = 32
lossi = []

print(f"expected initial loss {-torch.tensor(1/vocab_size).log()}")

for i in range(max_steps):
    # mini batch, run the whole fordward back ward update in just a small batch
    ix = torch.randint(0, X_train.shape[0], (batch_size,))
    X_batch, Y_batch = X_train[ix], Y_train[ix]

    # forward pas
    emb = C[X_batch] # embed the characters 
    emb_cat = emb.view(emb.shape[0], -1) 
    hpreact = emb_cat @ W1 + b1 # hidden layer preactivation
    hpreact = bngain * (hpreact - hpreact.mean(0, keepdim=True)) / hpreact.std(0, keepdim=True) + bnbias # batch norm 

    h = torch.tanh(hpreact) 
    logits = h @ W2 + b2 # (32, 27)
    # counts = logits.exp()
    # prob = counts / counts.sum(1, keepdim=True)
    # loss = -prob[torch.arange(32), Y].log().mean()
    loss = F.cross_entropy(logits, Y_batch) # exactly the same as the above, just much more efficient
    # backward pass
    for p in parameters:
        p.grad = None
    loss.backward()

    # update
    learning_rate = 0.1 if i < max_steps/2 else 0.01

    for p in parameters:
        p.data += -learning_rate * p.grad

    if i % 10000 == 0:
        print(f"{i:7d}/{max_steps:7d} loss: {loss.item():.4f}")

    lossi.append(loss.log10().item())

expected initial loss 3.295836925506592
      0/ 200000 loss: 1.9799
  10000/ 200000 loss: 1.9323
  20000/ 200000 loss: 2.1351
  30000/ 200000 loss: 2.7393
  40000/ 200000 loss: 1.9193
  50000/ 200000 loss: 1.8307
  60000/ 200000 loss: 2.2794
  70000/ 200000 loss: 2.4838
  80000/ 200000 loss: 2.1302
  90000/ 200000 loss: 2.2680
 100000/ 200000 loss: 2.3333
 110000/ 200000 loss: 1.7238
 120000/ 200000 loss: 2.2617
 130000/ 200000 loss: 2.3168
 140000/ 200000 loss: 1.8460
 150000/ 200000 loss: 2.0502
 160000/ 200000 loss: 2.2080
 170000/ 200000 loss: 2.4302
 180000/ 200000 loss: 1.9603
 190000/ 200000 loss: 2.5170
CPU times: user 6min 1s, sys: 10min 7s, total: 16min 9s
Wall time: 1min 54s


In [None]:
# calibrate the batch norm at the end of training

with torch.no_grad():
    emb = C[X_train]
    emb_cat = emb.view(emb.shape[0], -1)
    hpreact = emb_cat @ W1 + b1
    bngain = 1/(hpreact.std(0, keepdim=True) + 1e-3)
    bnbias.data = -hpreact.mean(0, keepdim=True) * bngain

In [7]:
@torch.no_grad 

def split_loss(split):
    x, y = {
        'train': (X_train, Y_train),
        'val': (X_dev, Y_dev),
        'test': (X_test, Y_test)
    }[split]
    emb = C[x]
    emb_cat = emb.view(emb.shape[0], -1) 
    hpreact = emb_cat @ W1 + b1 # hidden layer preactivation
    hpreact = bngain * (hpreact - hpreact.mean(0, keepdim=True)) / hpreact.std(0, keepdim=True) + bnbias # batch norm 
    h = torch.tanh(hpreact) 
    logits = h @ W2 + b2
    loss = F.cross_entropy(logits, y) 
    print(f"loss for the {split} dataset", loss.item())

split_loss('train')
split_loss('val')

loss for the train dataset 2.0622000694274902
loss for the val dataset 2.116655111312866
