<a href="https://colab.research.google.com/github/hatttruong/fastai-playbook/blob/main/13_Backpropagation_MLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In this lesson, I learned:
- Implementing backpropagation from scratch:
    - with simple functions
    - with classes
    - using PyTorch
- Log softmax: from the standard version to an optimized version

In [1]:
import pickle,gzip,math,os,time,shutil,torch,matplotlib as mpl, numpy as np
from pathlib import Path
from urllib.request import urlretrieve
from torch import tensor
from fastcore.test import test_close
# Seed torch's RNG so the torch.randn weight initialisation below is reproducible.
torch.manual_seed(42)

# Default colormap for images: grayscale.
mpl.rcParams['image.cmap'] = 'gray'
# Compact printing for tensors and arrays: 2 decimals, 125-char lines;
# sci_mode=False turns off scientific notation for torch tensors.
torch.set_printoptions(precision=2, linewidth=125, sci_mode=False)
np.set_printoptions(precision=2, linewidth=125)


In [2]:
MNIST_URL='https://github.com/mnielsen/neural-networks-and-deep-learning/blob/master/data/mnist.pkl.gz?raw=true'
path_data = Path('data')
path_data.mkdir(exist_ok=True)
path_gz = path_data/'mnist.pkl.gz'

# Download only when there is no local copy yet, so re-running the cell does not
# fetch the archive again. (`urlretrieve` is imported in the top import cell.)
if not path_gz.exists(): urlretrieve(MNIST_URL, path_gz)

# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# The archive comes from the network, so only run this against a source you trust.
with gzip.open(path_gz, 'rb') as f:
    # The pickle holds three (x, y) pairs; the third one is discarded here.
    ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')

# Convert the four loaded arrays to torch tensors.
x_train, y_train, x_valid, y_valid = map(tensor, [x_train, y_train, x_valid, y_valid])

### Basic architecture

In [3]:
# n = number of training rows, m = number of features per row.
n,m = x_train.shape
# Largest label + 1 (the class count, assuming labels start at 0).
# Note this stays a 0-dim tensor rather than a Python int.
c = y_train.max()+1
n,m,c


(50000, 784, tensor(10))

In [4]:
# number of units in the hidden layer (second dimension of w1, first of w2)
nh = 50

In [5]:
# Layer 1 maps m inputs -> nh hidden units; layer 2 maps nh hidden units -> 1 output column.
# Weights are unscaled standard-normal samples; biases start at zero.
w1 = torch.randn(m,nh)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)
b2 = torch.zeros(1)

In [6]:
def lin(x, w, b):
    """Affine (linear) layer: matrix-multiply `x` by `w`, then add the bias `b`."""
    projected = x @ w
    return projected + b

In [7]:
# Sanity check of the first linear layer on the validation set:
# the result should have one row per sample and nh columns.
t = lin(x_valid, w1, b1)
t.shape

torch.Size([10000, 50])

In [8]:
def relu(x):
    """Rectified linear unit: returns a tensor with every negative entry of `x` replaced by 0."""
    return x.clamp(min=0.)

In [9]:
# Apply ReLU to the layer-1 activations; entries that were negative become 0.
t = relu(t)
t

tensor([[ 0.00, 11.87,  0.00,  ...,  5.48,  2.14, 15.30],
        [ 5.38, 10.21,  0.00,  ...,  0.88,  0.08, 20.23],
        [ 3.31,  0.12,  3.10,  ..., 16.89,  0.00, 24.74],
        ...,
        [ 4.01, 10.35,  0.00,  ...,  0.23,  0.00, 18.28],
        [10.62,  0.00, 10.72,  ...,  0.00,  0.00, 18.23],
        [ 2.84,  0.00,  1.43,  ...,  0.00,  5.75,  2.12]])

In [10]:
def model(xb):
    """Forward pass of the two-layer MLP: linear -> ReLU -> linear.

    Reads the notebook-level parameters `w1`, `b1`, `w2`, `b2`.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [11]:
# Full forward pass on the validation set. w2 has a single column,
# so the model produces one output value per sample.
res = model(x_valid)
res.shape

torch.Size([10000, 1])

### Loss function: MSE

In [14]:
# Predictions carry a trailing unit axis (N, 1) while the targets are flat (N,);
# the extra axis has to be dropped before the two can be subtracted element-wise.
res.shape,y_valid.shape

(torch.Size([10000, 1]), torch.Size([10000]))

In [12]:
y_valid

tensor([3, 8, 6,  ..., 5, 6, 8])

In [24]:
res

tensor([[  25.75],
        [ -13.06],
        [-114.79],
        ...,
        [ -67.44],
        [ -74.48],
        [ -60.19]])

In [13]:
# Option 1 for dropping the trailing unit axis: index column 0 explicitly.
res[:,0]

tensor([  25.75,  -13.06, -114.79,  ...,  -67.44,  -74.48,  -60.19])

In [16]:
# Option 2: squeeze() removes every size-1 dimension, giving the same flat vector here.
res.squeeze()

tensor([  25.75,  -13.06, -114.79,  ...,  -67.44,  -74.48,  -60.19])

In [15]:
# With the unit axis removed both operands are (N,), so this is a per-sample difference.
(res[:,0] - y_valid)

tensor([  22.75,  -21.06, -120.79,  ...,  -72.44,  -80.48,  -68.19])

In [17]:
# Cast the integer labels to float so they can serve as MSE regression targets.
y_train,y_valid = y_train.float(),y_valid.float()

# Forward pass on the full training set; one prediction per row.
preds = model(x_train)
preds.shape

torch.Size([50000, 1])

In [18]:
def mse(output, targ):
    """Mean squared error between column 0 of `output` and the flat target vector `targ`."""
    err = output[:, 0] - targ
    return err.square().mean()

In [19]:
mse(preds, y_train)

tensor(4308.76)

### Gradients and backward pass

In [20]:
from sympy import symbols, diff

In [21]:
# Symbolic check with sympy: differentiate x**2 with respect to x.
x, y = symbols('x y')
diff(x**2, x)

2*x

In [22]:
# unsqueeze demo. NOTE: this rebinds `x`, which was a sympy symbol in the previous cell.
x = torch.tensor([1, 2, 3, 4])
x.dim() # 1 (not displayed: only the cell's last expression is shown)
# unsqueeze(-1) appends a new trailing axis: shape (4,) -> (4, 1)
x.unsqueeze(-1)

tensor([[1],
        [2],
        [3],
        [4]])

In [23]:
x.unsqueeze(1)

tensor([[1],
        [2],
        [3],
        [4]])

In [25]:
def lin_grad(inp, out, w, b):
    """Backward pass for the linear layer `out = inp @ w + b`.

    Reads the upstream gradient from `out.g` and stores the resulting
    gradients as `.g` attributes on `inp`, `w` and `b`. Returns nothing.

    Expects 2-D tensors: `inp` (batch, n_in), `w` (n_in, n_out),
    `out.g` (batch, n_out).
    """
    # Gradient w.r.t. the layer input: push the upstream gradient back through w.
    inp.g = out.g @ w.t()
    # Gradient w.r.t. the weights: the batch-sum of outer products inp[i] x out.g[i].
    # A single matmul gives the same (n_in, n_out) result as broadcasting
    # inp[:, :, None] * out.g[:, None, :] and summing over the batch, but without
    # materialising the (batch, n_in, n_out) intermediate tensor.
    w.g = inp.t() @ out.g
    # Gradient w.r.t. the bias: the bias is added to every row, so sum over the batch.
    b.g = out.g.sum(0)

In [63]:
# Run the forward pass step by step, keeping every intermediate activation
# so the backward pass can attach a gradient to each of them.
inp = x_train
targ = y_train
# forward pass:
l1 = lin(inp, w1, b1)
l2 = relu(l1)
out = lin(l2, w2, b2)
# NOTE: this rebinds `diff`, shadowing the function imported from sympy earlier.
diff = out[:,0]-targ
# Same computation as mse(out, targ).
loss = diff.pow(2).mean()


In [67]:
# diff[:, None] and diff.unsqueeze(1) both insert a unit axis at position 1: (N,) -> (N, 1).
print(diff.shape, diff[:, None].shape, diff.unsqueeze(1).shape)

torch.Size([50000]) torch.Size([50000, 1]) torch.Size([50000, 1])


In [66]:
diff[:, None]

tensor([[-35.97],
        [-99.38],
        [  4.72],
        ...,
        [-60.12],
        [-50.25],
        [-12.35]])

In [68]:
diff.unsqueeze(1)

tensor([[-35.97],
        [-99.38],
        [  4.72],
        ...,
        [-60.12],
        [-50.25],
        [-12.35]])

In [72]:
# Scratch demo: arbitrary attributes such as `.g` can be attached to a tensor.
# The value 8 is a placeholder; the backward-pass cell below overwrites out.g
# with the real gradient before lin_grad reads it.
out.g = 8

In [None]:
# backward pass:
# loss = mean(diff**2), so d(loss)/d(out) = 2 * diff / N with N = inp.shape[0].
# diff[:, None] restores the trailing unit axis so the gradient is (N, 1) like `out`.
out.g = 2.*diff[:,None] / inp.shape[0]


In [None]:
# Walk the layers in reverse. The second linear layer sets l2.g, w2.g and b2.g.
lin_grad(l2, out, w2, b2)
# ReLU gradient: pass the upstream gradient where the pre-activation was positive, 0 elsewhere.
l1.g = (l1>0).float() * l2.g
# The first linear layer sets w1.g and b1.g (and also inp.g, on the input data).
lin_grad(inp, l1, w1, b1)


## Refactor Model

## Auto grad