# Autograd
This notebook is to test my autograd engine. This notebooks is inspired by [Andrej Karpathy](https://www.youtube.com/watch?v=q8SA3rM6ckI&list=PLAqhIrjkxbuWI23v9cThsA9GvCAUhRvKZ&index=5)

In [1]:
%cd ..

e:\KTorch


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


In [2]:
# Imports 
import torch
import torch.nn.functional as F
from autograd.engine import Tensor
import numpy as np

In [3]:
# !curl -O https://raw.githubusercontent.com/karpathy/makemore/master/names.txt

In [4]:
#### Boilerplate code from Andrej Karpathy
# read in all the words
words = open('names.txt', 'r').read().splitlines()
print(len(words))
print(max(len(w) for w in words))
print(words[:8])

# build the vocabulary of characters and mappings to/from integers
chars = sorted(list(set(''.join(words))))
stoi = {s:i+1 for i,s in enumerate(chars)}
stoi['.'] = 0
itos = {i:s for s,i in stoi.items()}
vocab_size = len(itos)
print(itos)
print(vocab_size)

# build the dataset
block_size = 3 # context length: how many characters do we take to predict the next one?

def build_dataset(words):  
  X, Y = [], []
  
  for w in words:
    context = [0] * block_size
    for ch in w + '.':
      ix = stoi[ch]
      X.append(context)
      Y.append(ix)
      context = context[1:] + [ix] # crop and append

  X = torch.tensor(X)
  Y = torch.tensor(Y)
  print(X.shape, Y.shape)
  return X, Y

import random
random.seed(42)
random.shuffle(words)
n1 = int(0.8*len(words))
n2 = int(0.9*len(words))

Xtr,  Ytr  = build_dataset(words[:n1])     # 80%
Xdev, Ydev = build_dataset(words[n1:n2])   # 10%
Xte,  Yte  = build_dataset(words[n2:])     # 10%

# utility function we will use later when comparing manual gradients to PyTorch gradients
def cmp(s, dt, t):
  ex = torch.all(dt == t.grad).item()
  app = torch.allclose(dt, t.grad)
  maxdiff = (dt - t.grad).abs().max().item()
  print(f'{s:15s} | exact: {str(ex):5s} | approximate: {str(app):5s} | maxdiff: {maxdiff}')

32033
15
['emma', 'olivia', 'ava', 'isabella', 'sophia', 'charlotte', 'mia', 'amelia']
{1: 'a', 2: 'b', 3: 'c', 4: 'd', 5: 'e', 6: 'f', 7: 'g', 8: 'h', 9: 'i', 10: 'j', 11: 'k', 12: 'l', 13: 'm', 14: 'n', 15: 'o', 16: 'p', 17: 'q', 18: 'r', 19: 's', 20: 't', 21: 'u', 22: 'v', 23: 'w', 24: 'x', 25: 'y', 26: 'z', 0: '.'}
27
torch.Size([182625, 3]) torch.Size([182625])
torch.Size([22655, 3]) torch.Size([22655])
torch.Size([22866, 3]) torch.Size([22866])


In [5]:
# Creating the values
np.random.seed(0)
vocab_size, n_embd, block_size, n_hidden, batch_size = 27, 10, 3, 64, 32
n = batch_size # for convenience
C = np.random.rand(vocab_size, n_embd)

# Layer 1
W1 = np.random.rand((n_embd * block_size), n_hidden)
b1 = np.random.rand(n_hidden)

# Layer 2
W2 = np.random.rand(n_hidden, vocab_size)
b2 = np.random.rand(vocab_size)

# Batchnorm
bngain = np.random.rand(1, n_hidden)*0.1 + 1.0
bnbias = np.random.rand(1, n_hidden)*0.1
C.shape, W1.shape, b1.shape, W2.shape, b2.shape, bngain.shape, bnbias.shape

((27, 10), (30, 64), (64,), (64, 27), (27,), (1, 64), (1, 64))

In [6]:
# Creating the my tensors
C_k = Tensor(C)
W1_k = Tensor(W1)
b1_k = Tensor(b1)
W2_k = Tensor(W2)
b2_k = Tensor(b2)
bngain_k = Tensor(bngain)
bnbias_k = Tensor(bnbias)
parameters_k = [C_k, W1_k, b1_k, W2_k, b2_k, bngain_k, bnbias_k]

# Creating the pytorch tensors
C_t = torch.tensor(C, requires_grad=True, dtype=torch.float32)
W1_t = torch.tensor(W1, requires_grad=True, dtype=torch.float32)
b1_t = torch.tensor(b1, requires_grad=True, dtype=torch.float32)
W2_t = torch.tensor(W2, requires_grad=True, dtype=torch.float32)
b2_t = torch.tensor(b2, requires_grad=True, dtype=torch.float32)
bngain_t = torch.tensor(bngain, requires_grad=True, dtype=torch.float32)
bnbias_t = torch.tensor(bnbias, requires_grad=True, dtype=torch.float32)
parameters_t = [C_t, W1_t, b1_t, W2_t, b2_t, bngain_t, bnbias_t]

# Checking the shapes
print(C_k.shape == C_t.shape)
print(W1_k.shape == W1_t.shape)
print(b1_k.shape == b1_t.shape)
print(W2_k.shape == W2_t.shape)
print(b2_k.shape == b2_t.shape)
print(bngain_k.shape == bngain_t.shape)
print(bnbias_k.shape == bnbias_t.shape)

True
True
True
True
True
True
True


In [7]:
# Get input
np.random.seed(0)
ix = np.random.randint(0, len(Xtr), batch_size)
Xb, Yb = Xtr[ix], Ytr[ix] # batch X,Y

In [8]:
# Forward pass for pytroch
emb_t = C_t[Xb]
embcat_t = emb_t.view(emb_t.shape[0], -1)

# Linear layer 1
hprebn_t = embcat_t @ W1_t + b1_t

# Batchnorm
bnmeani_t = 1/n * hprebn_t.sum(dim=0, keepdim=True)
bndiff_t = hprebn_t - bnmeani_t
bndiff2_t = bndiff_t**2
bnvar_t = 1/(n-1) * bndiff2_t.sum(dim=0, keepdim=True)
bnvar_inv_t = (bnvar_t + 1e-5)**-0.5

bnraw_t = bndiff_t * bnvar_inv_t
hpreact_t = bngain_t * bnraw_t + bnbias_t
# Non-linearity
h_t = torch.tanh(hpreact_t) # hidden layer
# Linear layer 2
logits_t = h_t @ W2_t + b2_t # output layer
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes_t = logits_t.max(1, keepdim=True).values
norm_logits_t = logits_t - logit_maxes_t # subtract max for numerical stability
counts_t = norm_logits_t.exp()
counts_sum_t = counts_t.sum(1, keepdims=True)
counts_sum_inv_t = counts_sum_t**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
probs_t = counts_t * counts_sum_inv_t
logprobs_t = probs_t.log()
loss_t = -logprobs_t[range(n), Yb].mean()

# PyTorch backward pass
for p in parameters_t:
  p.grad = None
for t in [logprobs_t, probs_t, counts_t, counts_sum_t, counts_sum_inv_t, norm_logits_t, logit_maxes_t, logits_t, h_t, hpreact_t, bnraw_t,
          bnvar_inv_t, bnvar_t, bndiff2_t, bndiff_t, hprebn_t, bnmeani_t,
          embcat_t, emb_t]:
  t.retain_grad()
loss_t.backward()
loss_t

tensor(4.4975, grad_fn=<NegBackward0>)

In [9]:
# forward pass for my tensors
emb_k = C_k[Xb]
embcat_k = emb_k.view(emb_k.shape[0], -1)
# Linear layer 1

hprebn_k = embcat_k @ W1_k + b1_k

# Batchnorm
bnmeani_k = 1/n * hprebn_k.sum(axis=0, keepdims=True)
bndiff_k = hprebn_k - bnmeani_k
bndiff2_k = bndiff_k**2
bnvar_k = 1/(n-1) * bndiff2_k.sum(axis=0, keepdims=True)
bnvar_inv_k = (bnvar_k + 1e-5)**-0.5
bnraw_k = bndiff_k * bnvar_inv_k
hpreact_k = bngain_k * bnraw_k + bnbias_k
# Non-linearity
h_k = hpreact_k.tanh() # hidden layer

# Linear layer 2
logits_k = h_k @ W2_k + b2_k # output layer
# cross entropy loss (same as F.cross_entropy(logits, Yb))
logit_maxes_k = logits_k.max(axis=1, keepdims=True).data
norm_logits_k = logits_k - logit_maxes_k # subtract max for numerical stability
counts_k = norm_logits_k.exp()
counts_sum_k = counts_k.sum(axis=1, keepdims=True)
counts_sum_inv_k = counts_sum_k**-1 # if I use (1.0 / counts_sum) instead then I can't get backprop to be bit exact...
probs_k = counts_k * counts_sum_inv_k
logprobs_k = probs_k.log()
loss_k = -logprobs_k[range(n), Yb].mean()

# My backward pass
for p in parameters_k:
  p.zero_grad()

loss_k.backward()
loss_k


tensor: 4.497500419616699

In [10]:
loss_k.grad, loss_t.grad
 

  loss_k.grad, loss_t.grad


(array(1., dtype=float32), None)

In [11]:
(logprobs_k.grad == logprobs_t.grad.detach().numpy()).all()

True

In [12]:
np.all(np.isclose(probs_k.grad, probs_t.grad.detach().numpy()))

True

In [13]:
np.all(np.isclose(counts_sum_inv_k.grad, counts_sum_inv_t.grad.detach().numpy()))

True

In [14]:
np.all(np.isclose(counts_k.grad, counts_t.grad.detach().numpy(), atol=1e-5))

True

In [15]:
np.all(np.isclose(norm_logits_k.grad, norm_logits_t.grad.detach().numpy(), atol=1e-5))

True

In [16]:
np.all(np.isclose(logits_k.grad, logits_t.grad.detach().numpy(), atol=1e-5))

True

In [17]:
np.all(np.isclose(h_k.grad, h_t.grad.detach().numpy(), atol=1e-5))

True

In [18]:
np.all(np.isclose(hpreact_k.grad, hpreact_t.grad.detach().numpy(), atol=1e-5))

True

In [19]:
np.all(np.isclose(bnraw_k.grad, bnraw_t.grad.detach().numpy(), atol=1e-5)) 

True

In [20]:
np.all(np.isclose(bnvar_inv_k.grad, bnvar_inv_t.grad.detach().numpy(), atol=1e-5))

True

In [21]:
np.all(np.isclose(bndiff2_k.grad, bndiff2_t.grad.detach().numpy(), atol=1e-5))

True

In [22]:
np.all(np.isclose(bndiff_k.grad, bndiff_t.grad.detach().numpy(), atol=1e-5))

True

In [23]:
np.all(np.isclose(bnvar_k.grad, bnvar_t.grad.detach().numpy(), atol=1e-5))

True

In [24]:
np.all(np.isclose(bnmeani_k.grad, bnmeani_t.grad.detach().numpy(), atol=1e-5))

True

In [25]:
np.all(np.isclose(hprebn_k.grad, hprebn_t.grad.detach().numpy(), atol=1e-5))

True