<a href="https://colab.research.google.com/github/kaifkh20/nn_z_to_h/blob/main/makemore_batch_norm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-only cell: upload the names dataset into the runtime.
# The recorded output shows the file being saved as names(1).txt, which is
# the name the data-loading cell opens.
from google.colab import files
uploaded = files.upload()

Saving names(1).txt to names(1).txt


In [None]:
import torch
import torch.nn.functional as F

In [None]:
# Read the dataset: one name per line, newlines stripped.
# The context manager closes the file handle as soon as the read finishes
# instead of leaving an open handle behind in the kernel.
with open('names(1).txt','r') as f:
  words = f.read().splitlines()
len(words)

32033

In [None]:
# Character vocabulary: each distinct character found in the names gets an
# id starting at 1 (in sorted order); id 0 is reserved for the '.' token.
chars = sorted(set(''.join(words)))
stoi = dict(zip(chars, range(1, len(chars) + 1)))
stoi['.'] = 0
# Reverse lookup: id -> character.
itos = dict((idx, ch) for ch, idx in stoi.items())
vocab_size = len(itos)

In [None]:
#building the dataset

block_size = 3 #context length

# Slide a window of block_size ids over every name (terminated by '.'):
# each window is one input row, the id that follows it is the target.
inputs, targets = [], []

for w in words:
  context = [0] * block_size #start from a window filled with id 0 ('.')
  for ch in w + '.':
    ix = stoi[ch]
    inputs.append(context)
    targets.append(ix)
    context = [*context[1:], ix] #drop the oldest id, append the newest

X = torch.tensor(inputs)
Y = torch.tensor(targets)

#build training, dev and test dataset

def build_dataset(words, context_length=None):
  """Turn a list of names into (context, next-character) training pairs.

  Args:
    words: iterable of strings; every character must be a key of the
      module-level ``stoi`` mapping.
    context_length: number of preceding character ids in each input row.
      When omitted, the module-level ``block_size`` is used, so the dataset
      stays in sync with everything else that is sized from ``block_size``.

  Returns:
    (X, Y): ``X`` is an int64 tensor of shape (num_examples, context_length)
    holding the context ids; ``Y`` is an int64 tensor with the id that
    follows each context. Empty input yields tensors with zero examples.
  """
  if context_length is None:
    context_length = block_size #fall back to the notebook-wide setting
  X,Y = [],[]

  for w in words:
    context = [0] * context_length #pad with id 0 ('.')
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] #slide the window forward by one

  # Explicit dtype/shape so an empty split still gives (0, context_length)
  # integer tensors rather than a float tensor of shape (0,).
  X = torch.tensor(X, dtype=torch.long).reshape(len(Y), context_length)
  Y = torch.tensor(Y, dtype=torch.long)

  return X,Y

import random
# Shuffle with a fixed seed, then split 80% / 10% / 10% into train/dev/test.
# NOTE: the shuffle mutates `words` in place, so re-running this cell
# reshuffles an already-shuffled list and produces a different split.
random.seed(42)
random.shuffle(words)
n1, n2 = (int(frac * len(words)) for frac in (0.8, 0.9))

splits = (words[:n1], words[n1:n2], words[n2:])
(Xtr, Ytr), (Xdev, Ydev), (Xte, Yte) = map(build_dataset, splits)


In [None]:
#MLP revisited

n_embd = 10 # the dimensionality of the character embedding vectors
n_hidden = 200 # the number of neurons in the hidden layer of MLP

# Every random draw below uses the same seeded generator, so the order of
# the randn calls determines the initial values.
g = torch.Generator().manual_seed(2147483647)
C = torch.randn((vocab_size,n_embd),generator=g) # embedding table
W1 = torch.randn((n_embd*block_size,n_hidden),generator=g)*0.1
# b1 is added right before the batch-norm mean subtraction in the training
# loop, so its value is cancelled there; it is kept in `parameters` as-is.
b1 = torch.randn(n_hidden,generator=g)*0.01
W2 = torch.randn((n_hidden,vocab_size),generator=g)*0.01
b2 = torch.randn(vocab_size,generator=g)*0

# Batch-norm scale and shift (trained by gradient descent).
bngain = torch.ones((1,n_hidden))*0.1
bnbias = torch.zeros((1,n_hidden))
# Batch-norm running statistics (updated by moving average, not trained).
# The running mean starts at 0 and the running std at 1; starting the mean
# at 1 would bias the estimate until the moving average washes it out.
bnmean_running = torch.zeros((1,n_hidden))
bnstd_running = torch.ones((1,n_hidden))
parameters = [C,W1,b1,b2,W2,bngain,bnbias]
print(sum(p.nelement() for p in parameters)) #parameters in total
for p in parameters:
  p.requires_grad = True

11897


The running batch-norm statistics are updated with the formula (0.999 × old_value) + (0.001 × new_value), which means:

You keep 99.9% of the previous running estimate
You incorporate 0.1% of the new information from the current batch

In [None]:
# Train the MLP with minibatch SGD. Each step samples a batch, runs the
# forward pass (embedding -> linear -> batch norm -> tanh -> linear),
# backpropagates the cross-entropy loss and applies a plain SGD update.
max_steps = 200000
batch_size = 32
lossi = [] #log10 of the minibatch loss, one entry per step

for i in range(max_steps):
  # minibatch construct
  ix = torch.randint(0,Xtr.shape[0],(batch_size,),generator=g)
  Xb, Yb = Xtr[ix],Ytr[ix] #batch X,Y

  #forward pass
  emb = C[Xb] #embed the characters into vectors
  embcat = emb.view(emb.shape[0],-1) #flatten each example's embeddings into one row
  #linear layer
  hpreact = embcat @ W1 + b1

  #batch-norm layer
  bnmean = hpreact.mean(0,keepdim=True)
  bnstd = hpreact.std(0,keepdim=True)
  # NOTE: no epsilon is added to bnstd, so a unit with zero variance across
  # the batch would divide by zero here.
  hpreact = bngain * (hpreact - bnmean) / bnstd + bnbias

  # Moving-average update of the running statistics; done under no_grad so
  # the running buffers never become part of the autograd graph.
  with torch.no_grad():
    bnstd_running = 0.999 * bnstd_running + 0.001 * bnstd
    bnmean_running = 0.999 * bnmean_running + 0.001 * bnmean

  #non-linear layer
  h = torch.tanh(hpreact)
  logits = h @ W2 + b2
  loss = F.cross_entropy(logits,Yb)

  #backward pass
  for p in parameters:
    p.grad = None
  loss.backward()

  #update
  lr = 0.1 if i < 100000 else 0.01 #step learning-rate decay
  # In-place SGD step under no_grad instead of writing through `.data`,
  # so autograd still tracks that the parameter tensors were modified.
  with torch.no_grad():
    for p in parameters:
      p += -lr * p.grad

  #track stats
  if i%1000==0:
    print(f'{i:7d}/{max_steps:7d}: {loss.item():.4f}')
  lossi.append(loss.log10().item())




      0/ 200000: 3.2984
   1000/ 200000: 2.4533
   2000/ 200000: 2.4724
   3000/ 200000: 2.4339
   4000/ 200000: 2.2701
   5000/ 200000: 2.4084
   6000/ 200000: 2.3369
   7000/ 200000: 2.4261
   8000/ 200000: 2.0707
   9000/ 200000: 2.4729
  10000/ 200000: 2.2635
  11000/ 200000: 1.9317
  12000/ 200000: 2.3945
  13000/ 200000: 2.3702
  14000/ 200000: 2.7574
  15000/ 200000: 2.3690
  16000/ 200000: 2.4723
  17000/ 200000: 2.4310
  18000/ 200000: 2.3219
  19000/ 200000: 2.5272
  20000/ 200000: 2.4784
  21000/ 200000: 2.5291
  22000/ 200000: 1.9709
  23000/ 200000: 2.0386
  24000/ 200000: 2.5464
  25000/ 200000: 2.3428
  26000/ 200000: 2.5498
  27000/ 200000: 2.5549
  28000/ 200000: 2.1970
  29000/ 200000: 2.8467
  30000/ 200000: 2.5924
  31000/ 200000: 2.6017
  32000/ 200000: 2.6269
  33000/ 200000: 2.4820
  34000/ 200000: 2.1813
  35000/ 200000: 2.4278
  36000/ 200000: 2.1416
  37000/ 200000: 2.1655
  38000/ 200000: 2.7646
  39000/ 200000: 2.2638
  40000/ 200000: 2.0942
  41000/ 200000: