# makemore reimplementation

Code written while following along with: https://www.youtube.com/watch?v=PaCmpygFfXo

# Counting approach, creating a feature by counting occurrences

In [1]:
import torch
from torch import nn
from torchvision import datasets
from torchvision.transforms import ToTensor
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import string

stoi = {ch: i+1 for i, ch in enumerate(string.ascii_lowercase)}
stoi['.'] = 0
itos = {i: ch for ch, i in stoi.items()}

In [3]:
words = open('data/names.txt', 'r').read().splitlines()

In [4]:
N = torch.zeros((27, 27), dtype=torch.int32)

chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}

xs, ys = [], []
for w in words:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    xs.append(stoi[ch1])
    ys.append(stoi[ch2])
    N[ix1, ix2] += 1

P = (N+1).float()
P /= P.sum(1, keepdims=True)

xs = torch.Tensor(xs).long()
ys = torch.Tensor(ys).long()

In [5]:
ys = ys.long()

In [6]:
g = torch.Generator().manual_seed(2147483647)

for i in range(1):

  out = []
  ix = 0
  while True:
    p = P[ix]
    ix = torch.multinomial(p, num_samples=1, replacement=True, generator=g).item()
    out.append(itos[ix])
    if ix == 0:
      break
  print(''.join(out))

cexze.


In [7]:
loss = 0
n = 0
for w in words[:3]:
  chs = ['.'] + list(w) + ['.']
  for ch1, ch2 in zip(chs, chs[1:]):
    n += 1
    ix1 = stoi[ch1]
    ix2 = stoi[ch2]
    prob = P[ix1, ix2]
    logprob = torch.log(prob)
    loss += logprob

print(f'normalized negative log likelihood: {-loss/n:.4f}')

normalized negative log likelihood: 2.4255


# Neural Net approach w/ manual weight matrix

In [8]:
import torch.nn.functional as F


xs_enc = F.one_hot(xs, 27).float()
ys_enc = F.one_hot(ys, 27).float()

In [9]:
W = torch.rand((27, 27), requires_grad=True)

In [10]:
epochs = 20
num = xs.nelement()
for i in range(epochs):
  # The unnormalized (before activation fxn) output of a neuron is a "logit"
  # These are not probabilities and therefore hard to interpret
  # 18000, 27 @ 27, 27 -> 18000, 27
  logits = xs_enc @ W

  # Softmax converts from logits to a probability distribution over a discrete
  # variable with n possible outputs.
  counts = logits.exp()
  probs = counts / counts.sum(1, keepdim=True)

  # Negative log likelihood
  loss = -probs[torch.arange(num), ys].log().mean() + 0.01*(W**2).mean()
  print(f'Loss: {loss:.4f}')

  W.grad = None
  loss.backward()
  W.data += -50 * W.grad

tensor([[0.2316, 0.2670, 0.5199,  ..., 0.5978, 0.2065, 0.7383],
        [0.2000, 0.4254, 0.0926,  ..., 0.7594, 0.0087, 0.7833],
        [0.6711, 0.9686, 0.2646,  ..., 0.3547, 0.3227, 0.0031],
        ...,
        [0.7150, 0.6141, 0.0686,  ..., 0.6213, 0.4833, 0.5559],
        [0.5584, 0.6218, 0.3626,  ..., 0.0862, 0.3065, 0.9438],
        [0.4465, 0.8683, 0.1276,  ..., 0.2699, 0.7839, 0.6016]],
       grad_fn=<MmBackward0>)
tensor([[1.2606, 1.3060, 1.6819,  ..., 1.8181, 1.2294, 2.0925],
        [1.2215, 1.5302, 1.0970,  ..., 2.1369, 1.0087, 2.1887],
        [1.9565, 2.6342, 1.3029,  ..., 1.4257, 1.3808, 1.0031],
        ...,
        [2.0442, 1.8481, 1.0710,  ..., 1.8613, 1.6214, 1.7435],
        [1.7479, 1.8624, 1.4371,  ..., 1.0900, 1.3587, 2.5698],
        [1.5628, 2.3828, 1.1361,  ..., 1.3098, 2.1901, 1.8251]],
       grad_fn=<ExpBackward0>)


In [178]:
for _ in range(10):
    idx = 0
    out = []
    while True:
        xenc = F.one_hot(torch.tensor([idx]), 27).float()
        logits = xenc @ W
        counts = logits.exp()
        probs = counts / counts.sum(1, keepdims=True)
        
        char = torch.multinomial(probs, num_samples=1, replacement=True).item()
        out.append(itos[char])
        idx = char
        if idx == 0:
            break
    
    print(''.join(out))

ccaun.
lormo.
kalllie.
fara.
rilon.
lian.
janatayn.
chemie.
etliz.
jolem.


# Neural Net approach w/ Pytorch

In [191]:
print(xs_enc.shape, ys_enc.shape)

torch.Size([228146, 27]) torch.Size([228146, 27])


In [196]:
from torch.utils.data import Dataset, DataLoader, random_split

class BiGramDataset(Dataset):
    def __init__(self, x, y):
        super(BiGramDataset, self).__init__()
        assert x.shape[0] == y.shape[0] # assuming shape[0] = dataset size
        self.x = x
        self.y = y

    def __len__(self):
        return self.y.shape[0]

    def __getitem__(self, index):
        return self.x[index], self.y[index]

all_data = BiGramDataset(xs_enc, ys)
train_data, dev_data, test_data = random_split(all_data, [0.8, 0.1, 0.1])

In [197]:
batch_size = 1000

train_dataloader = DataLoader(train_data, batch_size=batch_size)
dev_dataloader = DataLoader(dev_data,batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

In [225]:
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"

class Model(nn.Module):
  def __init__(self):
    super().__init__()
    self.linear_relu_stack = nn.Sequential(
        nn.Linear(27, 27),
        
        # A more complex model just makes the predictions/loss worse
        #nn.ReLU(),
        #nn.Linear(512, 512),
        #nn.ReLU(),
        #nn.Linear(512, 27),
        
        # NLLL loss fxn requires the network output to be LogSoftmax
        # Can remove this layer if using CrossEntropyLoss 
        nn.LogSoftmax()
    )

  def forward(self, x):
    logits = self.linear_relu_stack(x)
    return logits

model = Model().to(device)

In [232]:
def train_loop(dataloader, model, loss_fn, optimizer):
  size = len(dataloader.dataset)
  model.train()
  for batch, (X, y) in enumerate(dataloader):
    pred = model(X.to(device))
    loss = loss_fn(pred, y.to(device))

    loss.backward()
    optimizer.step()
    optimizer.zero_grad()

    if batch % 20 == 0:
      loss, current = loss.item(), batch * batch_size
      print(f'loss: {loss} [{current} / {size}]')

def test_loop(dataloader, model, loss_fn):
  model.eval()
  size = len(dataloader.dataset)
  num_batches = len(dataloader)
  test_loss, correct = 0, 0

  with torch.no_grad():
    for X, y in dataloader:
      X,y = X.to(device), y.to(device)
      pred = model(X)
      test_loss += loss_fn(pred, y).item()
      correct += (pred.argmax(1) == y).type(torch.float).sum().item()

  test_loss /= num_batches
  correct /= size
  print(f'test erorr -- acc: {100*correct}, avg loss: {test_loss}')


learning_rate = 10
epochs = 20
loss_fn = nn.NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for t in range(epochs):
  print(f'Epoch: {t+1}\n---------------------------------')
  train_loop(train_dataloader, model, loss_fn, optimizer)
  # Validation test
  test_loop(dev_dataloader, model, loss_fn)

# Final test
test_loop(test_dataloader, model, loss_fn)

Epoch: 1
---------------------------------
loss: 2.4062838554382324 [0 / 182517]
loss: 2.3683180809020996 [20000 / 182517]
loss: 2.4546115398406982 [40000 / 182517]
loss: 2.419706344604492 [60000 / 182517]
loss: 2.495737314224243 [80000 / 182517]
loss: 2.451923131942749 [100000 / 182517]
loss: 2.4929842948913574 [120000 / 182517]
loss: 2.4703428745269775 [140000 / 182517]
loss: 2.4828622341156006 [160000 / 182517]
loss: 2.4792494773864746 [180000 / 182517]
test erorr -- acc: 22.388779311856236, avg loss: 2.4775272244992466
Epoch: 2
---------------------------------
loss: 2.406228542327881 [0 / 182517]
loss: 2.3683016300201416 [20000 / 182517]
loss: 2.4544739723205566 [40000 / 182517]
loss: 2.4196372032165527 [60000 / 182517]
loss: 2.495696783065796 [80000 / 182517]
loss: 2.4518816471099854 [100000 / 182517]
loss: 2.4929213523864746 [120000 / 182517]
loss: 2.470343828201294 [140000 / 182517]
loss: 2.482787609100342 [160000 / 182517]
loss: 2.479241132736206 [180000 / 182517]
test erorr -

In [256]:
for _ in range(15):
    idx = 0
    out = []
    while True:
        xenc = F.one_hot(torch.tensor([idx]), 27).float()
        probs = model(xenc.to(device)).exp()
        
        char = torch.multinomial(probs, num_samples=1, replacement=True).item()
        out.append(itos[char])
        idx = char
        if idx == 0:
            break

    if len(out) > 3:
        print(''.join(out))

khonn.
tyar.
thofrency.
lanahazan.
kyon.
juri.
ama.
aeiri.
rissaldamahrahleh.
jeshillil.
