# Fully Connected

### The Forward and backward passes

In [0]:
#export
from pathlib import Path
from IPython.core.debugger import set_trace
from fastai import datasets
import pickle, gzip, math, torch, matplotlib as mpl
import matplotlib.pyplot as plt
from torch import tensor
import fastai

MNIST_URL='http://deeplearning.net/data/mnist/mnist.pkl'

In [0]:
def test(a,b,cmp,cname=None):
    if cname is None: cname=cmp.__name__
    assert cmp(a,b),f"{cname}:\n{a}\n{b}"

def test_eq(a,b): test(a,b,operator.eq,'==')
def near(a,b): return torch.allclose(a, b, rtol=1e-3, atol=1e-5)
def test_near(a,b): test(a,b,near)

In [0]:
def get_data():
    """Download the pickled MNIST archive and return its arrays as tensors.

    Returns a tuple `(x_train, y_train, x_valid, y_valid)`; the third split
    stored in the pickle is discarded.
    """
    path = datasets.download_data(MNIST_URL, ext='.gz')
    # SECURITY: unpickling can execute arbitrary code, and MNIST_URL is a plain
    # http address -- only run this against a source you trust.
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    # A tuple (rather than a one-shot `map` iterator) can be unpacked, indexed
    # and iterated more than once.
    return tuple(map(tensor, (x_train,y_train,x_valid,y_valid)))

- First we normalize the datasets. Note that the validation set is normalized with the training set's mean and standard deviation, so both sets go through exactly the same transformation. Normalizing means that the (training) data ends up with a mean of 0 and a standard deviation of 1.

In [0]:
def normalize(x, m, s):
    "Standardize `x`: subtract the mean `m`, then divide by the std `s`."
    centered = x - m
    return centered / s

In [0]:
x_train,y_train,x_valid,y_valid = get_data()

In [118]:
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.1304), tensor(0.3073))

In [0]:
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [120]:
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std

(tensor(0.0001), tensor(1.))

In [0]:
def test_near_zero(a, tol=1e-3):
    assert a.abs()<tol, f'Near zero: {a}'

In [0]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [123]:
n,m = x_train.shape
c = y_train.max()+1
n,m,c

(50000, 784, tensor(10))

## Basic Architecture

- we will create a simple NN with one hidden layer and one output layer.

In [0]:
nh = 50

- simplified kaiming init / he init
    - By dividing by math.sqrt(m), the outputs of the first linear layer keep roughly mean 0 and standard deviation 1 (the second layer does the same with math.sqrt(nh)). This is very important for making models train well.

In [0]:
# Simplified init: standard-normal weights scaled by 1/sqrt(number of inputs
# to the layer); biases start at zero.
# `m` is the per-sample input width and `nh` the hidden width set above.
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.zeros(nh)
# second layer maps the `nh` hidden units to a single output
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

In [0]:
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/math.sqrt(m))

In [127]:
# This should be ~ (0,1) (mean,std)...
x_valid.mean(),x_valid.std()

(tensor(-0.0057), tensor(0.9924))

- The mean of dataset should be 0 and the standard deviation 1

In [0]:
def lin(x, w, b):
    "Affine layer: matrix-multiply `x` by `w`, then add the bias `b`."
    projected = x @ w
    return projected + b

In [0]:
t = lin(x_valid, w1, b1)

In [131]:
#...so should this, because we used kaiming init, which is designed to do this
t.mean(), t.std()

(tensor(0.0300), tensor(0.9758))

In [0]:
def relu(x):
    "Rectified linear unit: every entry of `x` below 0 is replaced by 0."
    rectified = x.clamp_min(0.)
    return rectified

In [0]:
t = relu(lin(x_valid, w1, b1))

In [134]:
t.mean(), t.std()

(tensor(0.4012), tensor(0.5932))

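- Before the ReLU, the output of the first linear layer still has mean ≈ 0 and standard deviation ≈ 1 because we divide by math.sqrt(m); after the ReLU (output above) this is no longer the case.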


**Issue with Relu and BN**
- The problem is that ReLU changes the mean and the standard deviation.
- If we zero out everything below zero, the result can no longer have a mean of 0 or a standard deviation of 1.
- ReLU roughly halves the second moment (mean of squares) of the activations at every layer, so not much signal is left after a couple of layers. We compensate by using 2/m instead of 1/m inside the square root:

    `math.sqrt(2/m)`

In [0]:
# kaiming init / he init for relu
w1 = torch.randn(m,nh)*math.sqrt(2/m)

In [136]:
w1.mean(),w1.std()

(tensor(-0.0005), tensor(0.0507))

In [137]:
t = relu(lin(x_valid, w1, b1))
t.mean(),t.std()

(tensor(0.5239), tensor(0.8329))

#### These things are already implemented in PyTorch so we can now use those.

In [0]:
#export
from torch.nn import init

In [0]:
w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1, mode='fan_out')
t = relu(lin(x_valid, w1, b1))

- fan_in vs. fan_out decides whether the scale uses the square root of m (the number of inputs) or of nh (the number of outputs). Using m keeps the variance at 1 during the forward pass, while using nh keeps it at 1 during the backward pass. Nominally fan_out means nh, so why are we using it here, and how can it give the same result as using m?

In [0]:
init.kaiming_normal_??

In [140]:
w1.mean(),w1.std()

(tensor(4.0896e-05), tensor(0.0504))

In [141]:
t.mean(),t.std()

(tensor(0.5577), tensor(0.7994))

In [142]:
w1.shape

torch.Size([784, 50])

In [0]:
import torch.nn

In [144]:
torch.nn.Linear(m,nh).weight.shape

torch.Size([50, 784])

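- The answer to the question above is that PyTorch stores the weight of nn.Linear transposed, with shape (out_features, in_features), so for our (m, nh) matrix we have to think about fan_in/fan_out the other way round.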

In [0]:
torch.nn.Linear.forward??

In [0]:
torch.nn.functional.linear??

In [0]:
torch.nn.Conv2d??

In [0]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [0]:
# what if...?
def relu(x):
    "Experimental ReLU variant: clamp at 0, then shift the result down by 0.5."
    rectified = x.clamp_min(0.)
    return rectified - 0.5

In [146]:
# kaiming init / he init for relu
w1 = torch.randn(m,nh)*math.sqrt(2./m )
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(0.0137), tensor(0.7944))

In [0]:
def model(xb):
    "Forward pass linear -> relu -> linear using the notebook-level w1/b1/w2/b2."
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [148]:
%timeit -n 10 _=model(x_valid)

10 loops, best of 3: 18.9 ms per loop


In [0]:
assert model(x_valid).shape==torch.Size([x_valid.shape[0],1])

In [44]:
model(x_valid).shape

torch.Size([10000, 1])

In [0]:
#export
def mse(output, targ):
    "Mean squared error of `output` (trailing unit axis dropped) against `targ`."
    diff = output.squeeze(-1) - targ
    return diff.pow(2).mean()

- .squeeze(-1) removes the last axis when it has size 1 (a bare .squeeze() would remove every size-1 axis).

In [0]:
y_train,y_valid = y_train.float(),y_valid.float()

In [0]:
preds = model(x_train)

In [152]:
preds.shape

torch.Size([50000, 1])

In [153]:
mse(preds, y_train)

tensor(25.6745)

## Gradients and backward pass

- The derivative of something² is 2*something (times the derivative of something itself). The input of mse is the output of the previous layer, so we store the gradient on that input tensor, where the previous layer can pick it up later.

In [0]:
def mse_grad(inp, targ):
    "Store the gradient of the mse loss with respect to `inp` on `inp.g`."
    batch_size = inp.shape[0]
    residual = inp.squeeze() - targ
    # restore the trailing unit axis so the gradient lines up with `inp`
    inp.g = 2. * residual.unsqueeze(-1) / batch_size

- The previous layer multiplies the gradient calculated above by ReLU's gradient, which is 0 where the input is less than or equal to zero and 1 elsewhere.

In [0]:
def relu_grad(inp, out):
    "Backprop through ReLU: keep `out.g` where `inp` was positive, zero elsewhere."
    positive_mask = (inp>0).float()
    inp.g = positive_mask * out.g

- The same thing is done for the linear layer.

In [0]:
def lin_grad(inp, out, w, b):
    "Backprop through `inp @ w + b`: fills `inp.g`, `w.g` and `b.g` from `out.g`."
    upstream = out.g
    inp.g = upstream @ w.t()
    # broadcast product of inp[..., None] and upstream[:, None, ...], summed over axis 0
    per_sample = inp.unsqueeze(-1) * upstream.unsqueeze(1)
    w.g = per_sample.sum(0)
    b.g = upstream.sum(0)

- Combining the forward pass and the backward pass gives the following code.

In [0]:
def forward_and_backward(inp, targ):
    "Run the forward pass, then fill in `.g` on every tensor, last layer first."
    # forward pass
    pre_act = inp @ w1 + b1
    act = relu(pre_act)
    out = act @ w2 + b2

    # the loss value itself is not used by the backward pass below
    loss = mse(out, targ)

    # backward pass, in reverse order of the forward pass
    mse_grad(out, targ)
    lin_grad(act, out, w2, b2)
    relu_grad(pre_act, act)
    lin_grad(inp, pre_act, w1, b1)

In [0]:
forward_and_backward(x_train, y_train)

In [0]:
# Save for testing against later
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig  = x_train.g.clone()

We cheat a little bit and use PyTorch autograd to check our results.

In [0]:
# Independent copies of the input and parameters with gradient tracking enabled,
# so autograd results can be compared with the manually computed `.g` values.
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)

In [0]:
def forward(inp, targ):
    "Forward pass on the gradient-tracking copies w12/b12/w22/b22; returns the mse loss."
    hidden = relu(inp @ w12 + b12)
    prediction = hidden @ w22 + b22
    return mse(prediction, targ)

In [0]:
loss = forward(xt2, y_train)

In [0]:
loss.backward()

In [0]:
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig )

## Refactor Model

#### Layers as classes

In [0]:
class Relu():
    "ReLU shifted down by 0.5 that remembers its input and output for `backward`."
    def __call__(self, inp):
        self.inp = inp
        shifted = inp.clamp_min(0.)-0.5
        self.out = shifted
        return shifted

    def backward(self):
        # gradient flows only where the stored input was positive
        mask = (self.inp>0).float()
        self.inp.g = mask * self.out.g

In [0]:
class Lin():
    "Affine layer `inp @ w + b` that caches its input/output for `backward`."
    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        self.inp = inp
        result = inp @ self.w + self.b
        self.out = result
        return result

    def backward(self):
        upstream = self.out.g
        self.inp.g = upstream @ self.w.t()
        # broadcast product of the cached input and the upstream gradient, summed over axis 0
        per_sample = self.inp.unsqueeze(-1) * upstream.unsqueeze(1)
        self.w.g = per_sample.sum(0)
        self.b.g = upstream.sum(0)

In [0]:
class Mse():
    "Mean-squared-error loss that caches its input and target for `backward`."
    def __call__(self, inp, targ):
        self.inp, self.targ = inp, targ
        residual = inp.squeeze() - targ
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        n_targets = self.targ.shape[0]
        residual = self.inp.squeeze() - self.targ
        self.inp.g = 2. * residual.unsqueeze(-1) / n_targets

In [0]:
class Model():
    "Lin -> Relu -> Lin network with an Mse loss, built from the given parameters."
    def __init__(self, w1, b1, w2, b2):
        self.loss = Mse()
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]

    def __call__(self, x, targ):
        act = x
        for layer in self.layers: act = layer(act)
        return self.loss(act, targ)

    def backward(self):
        # loss first, then the layers in reverse order of the forward pass
        self.loss.backward()
        for layer in self.layers[::-1]: layer.backward()

In [0]:
w1.g, b1.g, w2.g, b2.g = [None] * 4

In [0]:
model = Model(w1, b1, w2, b2)

In [171]:
%time loss = model(x_train, y_train)

CPU times: user 114 ms, sys: 210 µs, total: 114 ms
Wall time: 115 ms


In [172]:
%time model.backward()

CPU times: user 3.17 s, sys: 6.48 s, total: 9.65 s
Wall time: 9.67 s


In [0]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

#### Module.forward()

In [0]:
class Module():
    """Minimal layer base class: caches call arguments and output for backprop.

    Subclasses implement `forward(*args)` and `bwd(out, *args)`; `backward()`
    replays the cached output and arguments into `bwd`.
    """
    def __call__(self, *args):
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        # Accept whatever `__call__` forwards, so an un-overridden forward fails
        # with the intended error rather than a TypeError about argument count.
        raise NotImplementedError('Not Implemented')

    def backward(self): self.bwd(self.out, *self.args)

In [0]:
class Relu(Module):
    "ReLU shifted down by 0.5, written against the `forward`/`bwd` protocol."
    def forward(self, inp):
        return inp.clamp_min(0.) - 0.5

    def bwd(self, out, inp):
        mask = (inp>0).float()
        inp.g = mask * out.g

In [0]:
class Lin(Module):
    "Affine layer whose weight gradient is computed with `torch.einsum`."
    def __init__(self, w, b):
        self.w,self.b = w,b

    def forward(self, inp):
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        grad_out = out.g
        inp.g = grad_out @ self.w.t()
        # sum over the shared first axis `b` of inp[b,i] * grad_out[b,j]
        self.w.g = torch.einsum("bi,bj->ij", inp, grad_out)
        self.b.g = grad_out.sum(0)

In [0]:
class Mse(Module):
    "Mean-squared-error loss written against the `forward`/`bwd` protocol."
    def forward(self, inp, targ):
        residual = inp.squeeze() - targ
        return residual.pow(2).mean()

    def bwd(self, out, inp, targ):
        n_targets = targ.shape[0]
        inp.g = 2*(inp.squeeze()-targ).unsqueeze(-1) / n_targets

In [0]:
class Model():
    "Lin -> Relu -> Lin network with an Mse loss, built from the notebook-level w1/b1/w2/b2."
    def __init__(self):
        self.loss = Mse()
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]

    def __call__(self, x, targ):
        act = x
        for layer in self.layers: act = layer(act)
        return self.loss(act, targ)

    def backward(self):
        # loss first, then the layers in reverse order of the forward pass
        self.loss.backward()
        for layer in self.layers[::-1]: layer.backward()

In [0]:
w1.g, b1.g, w2.g, b2.g = [None] * 4
model = Model()

In [181]:
%time loss = model(x_train, y_train)

CPU times: user 116 ms, sys: 468 µs, total: 116 ms
Wall time: 117 ms


In [182]:
%time model.backward()

CPU times: user 247 ms, sys: 9.53 ms, total: 256 ms
Wall time: 259 ms


In [0]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

## Without einsum

In [0]:
class Lin(Module):
    "Affine layer whose weight gradient is a plain matrix product (no einsum)."
    def __init__(self, w, b):
        self.w,self.b = w,b

    def forward(self, inp):
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        grad_out = out.g
        inp.g = grad_out @ self.w.t()
        self.w.g = inp.t() @ grad_out
        self.b.g = grad_out.sum(0)

In [0]:
class Model():
    "Lin -> Relu -> Lin network with an Mse loss, built from the notebook-level w1/b1/w2/b2."
    def __init__(self):
        self.loss = Mse()
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]

    def __call__(self, x, targ):
        act = x
        for layer in self.layers: act = layer(act)
        return self.loss(act, targ)

    def backward(self):
        # loss first, then the layers in reverse order of the forward pass
        self.loss.backward()
        for layer in self.layers[::-1]: layer.backward()

In [0]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [105]:
%time loss = model(x_train, y_train)

CPU times: user 111 ms, sys: 1.44 ms, total: 112 ms
Wall time: 115 ms


In [106]:
%time model.backward()

CPU times: user 235 ms, sys: 5.08 ms, total: 240 ms
Wall time: 242 ms


In [0]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

## nn.Linear and nn.Module

In [0]:
#export
from torch import nn

In [0]:
class Model(nn.Module):
    """Two-layer MLP built from `nn.Linear`/`nn.ReLU` that returns the loss directly.

    Calling `model(x, targ)` runs the layers in order and returns `mse` of the
    squeezed output against `targ`.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (unlike a plain list) registers the layers as sub-modules,
        # so their weights show up in `parameters()` / `state_dict()` and follow `.to()`.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])
        self.loss = mse

    def forward(self, x, targ):
        # Implement `forward` instead of overriding `__call__`, so nn.Module's own
        # `__call__` (and any registered hooks) stays in effect; `model(x, targ)`
        # still works exactly as before.
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), targ)

In [0]:
model = Model(m, nh, 1)

In [111]:
%time loss = model(x_train, y_train)

CPU times: user 98.8 ms, sys: 397 µs, total: 99.2 ms
Wall time: 114 ms


In [112]:
%time loss.backward()

CPU times: user 78.6 ms, sys: 1.26 ms, total: 79.8 ms
Wall time: 85 ms
