In [1]:
!wget https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

--2023-04-28 15:03:50--  https://raw.githubusercontent.com/karpathy/makemore/master/names.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 228145 (223K) [text/plain]
Saving to: ‘names.txt’


2023-04-28 15:03:50 (30.4 MB/s) - ‘names.txt’ saved [228145/228145]



In [4]:
import torch
import torch.nn.functional as F
import torch.nn as nn

In [5]:
words = open('names.txt').read().split()

In [7]:
vocab = sorted(list(set(''.join(words))))
stoi = {ch:i+1 for i,ch in enumerate(vocab)}
stoi['.'] = 0
itos = {i:ch for ch,i in stoi.items()}
encode = lambda x: [stoi[i] for i in x]
decode = lambda x: ''.join([itos[i] for i in x])

In [8]:
block_size = 3
def build_data(words):
    X, Y = [], []
    for word in words:
        context = [0] * block_size
        for char in word + '.':
            ix = stoi[char]
            X.append(context)
            Y.append(ix)
            context = context[1:] + [ix]
    X = torch.tensor(X)
    Y = torch.tensor(Y)
    print(X.shape, Y.shape)
    return X, Y

n1 = int(len(words) * 0.8)
n2 = int(len(words) * 0.9)
Xtr, Ytr = build_data(words[:n1])
Xte, Yte =  build_data(words[n1:n2])
Xdev , Ydev = build_data(words[n2:])

torch.Size([182778, 3]) torch.Size([182778])
torch.Size([22633, 3]) torch.Size([22633])
torch.Size([22735, 3]) torch.Size([22735])


In [9]:
class Linear:
    def __init__(self, IN, OUT, bias=True):
        self.w = torch.randn(IN, OUT) / IN ** 0.5 
        self.b = torch.zeros(OUT) if bias else None
    
    def __call__(self, x):
        self.out = x @ self.w 
        if self.b is not None:
            self.out += self.b
        return self.out
            
    def parameters(self):
        return [self.w] + ([] if self.b is None else [self.b])

class BatchNorm1d:
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.momentum = momentum
        self.training = True
        self.eps = eps
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
        self.running_mean = torch.zeros(dim)
        self.running_var = torch.ones(dim)
        
    def __call__(self, x):
        if self.training:
            xmean = x.mean(0,keepdim=True)
            xvar = x.var(0,keepdim=True)
        else:
            xmean = self.running_mean
            xvar = self.running_mean
        xnorm = (x - xmean) / torch.sqrt(xvar + self.eps)
        self.out = self.gamma * xnorm + self.beta
        if self.training:
            self.running_mean = self.momentum * self.running_mean +  self.momentum * xmean
            self.running_var = self.momentum * self.running_var +  self.momentum * xvar
        return self.out
    
    def parameters(self):
        return [self.gamma, self.beta]
            
class Tanh:
    def __call__(self, x):
        self.out = torch.tanh(x)
        return self.out
    
    def parameters(self):
        return []

In [10]:
n_emb = 10
n_hidden = 200
vocab_size = len(stoi)
g = torch.Generator().manual_seed(42)
C = torch.randn(vocab_size, n_emb, generator=g)
Layers = [Linear(n_emb * block_size, n_hidden), BatchNorm1d(n_hidden), Tanh(), 
          Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden), Tanh(),
          Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden), Tanh(),
          Linear(n_hidden, n_hidden), BatchNorm1d(n_hidden),Tanh(),
          Linear(n_hidden, n_hidden)]

parameters = [C] + [p for layer in Layers for p in layer.parameters()]
print(f"Total parameters:", sum(p.numel() for p in parameters))
for p in parameters:
    p.requires_grad = True 

Total parameters: 168870


In [11]:
steps = 20000
batch_size = 25
lr = 0.001
for i in range(steps):
    #get minibatch
    ix = torch.randint(0, Xtr.shape[0], (batch_size,), generator=g)
    Xb , Yb = Xtr[ix], Ytr[ix]
    # forward pass
    emb = C[Xb]
    x = emb.view(emb.shape[0], -1)
    for layer in Layers:
        x = layer(x)
    loss = F.cross_entropy(x, Yb)
    #backward pass
    for p in parameters:
        p.grad = None
    loss.backward()
    # update parameters
    for p in parameters:
        p.data += - lr * p.grad
    if i%1000==0:
        print(f". {i}/{steps}   : {loss.item()}")

. 0/20000   : 5.5347442626953125
. 1000/20000   : 4.730920791625977
. 2000/20000   : 3.9596426486968994
. 3000/20000   : 3.2961580753326416
. 4000/20000   : 3.987142324447632
. 5000/20000   : 3.265638828277588
. 6000/20000   : 3.225757122039795
. 7000/20000   : 3.1466031074523926
. 8000/20000   : 3.5089707374572754
. 9000/20000   : 2.958566188812256
. 10000/20000   : 3.0267727375030518
. 11000/20000   : 2.728703498840332
. 12000/20000   : 2.8094024658203125
. 13000/20000   : 3.125795602798462
. 14000/20000   : 2.973686933517456
. 15000/20000   : 3.3151891231536865
. 16000/20000   : 2.3588993549346924
. 17000/20000   : 2.698923110961914
. 18000/20000   : 2.6301627159118652
. 19000/20000   : 2.8570263385772705


In [None]:
@torch.no_grad() # this decorator disables gradient tracking
def split_loss(split):
  x,y = {
    'train': (Xtr, Ytr),
    'val': (Xdev, Ydev),
    'test': (Xte, Yte),
  }[split]
  emb = C[x] # (N, block_size, n_embd)
  x = emb.view(emb.shape[0], -1) # concat into (N, block_size * n_embd)
  for layer in Layers:
    x = layer(x)
  loss = F.cross_entropy(x, y)
  print(split, loss.item())

split_loss('train')
split_loss('val')

In [None]:
g = torch.Generator().manual_seed(5)

for _ in range(20):
    out = []
    context = [0] * block_size # initialize with all ...
    while True:
      # forward pass the neural net
      emb = C[torch.tensor([context])] # (1,block_size,n_embd)
      x = emb.view(emb.shape[0], -1) # concatenate the vectors
      

      hpreact = x @ W1 
      hpreact = bngain * (hpreact - bnmean_running) / bnstd_running + bnbias
      h = torch.tanh(hpreact) # (N, n_hidden)
      logits = h @ W2 + b2 # (N, vocab_size)
    
  
      probs = F.softmax(logits, dim=1)
      # sample from the distribution
      ix = torch.multinomial(probs, num_samples=1).item()
      # shift the context window and track the samples
      context = context[1:] + [ix]
      # if we sample the special '.' token, break
      if ix == 0:
        break
      out.append(ix)
    print(''.join(itos[i] for i in out))