In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from google.colab import drive
drive.mount("/content/gdrive")
%cd gdrive/MyDrive/Ml/nbs/dl2/

Mounted at /content/gdrive
/content/gdrive/MyDrive/Ml/nbs/dl2


## The forward and backward passes

[Jump_to lesson 8 video](https://course19.fast.ai/videos/?lesson=8&t=4960)

In [3]:
#export
from exp.nb_01 import *

def get_data():
    """Download the pickled MNIST archive and return its train/validation arrays as tensors.

    Returns a lazy ``map`` that yields ``x_train, y_train, x_valid, y_valid`` (in
    that order), each passed through ``tensor``. The third split stored in the
    pickle is ignored.
    """
    archive = datasets.download_data(MNIST_URL, ext='.gz')
    # NOTE(review): pickle.load can execute arbitrary code from the file it reads;
    # only use this with a trusted download source.
    with gzip.open(archive, 'rb') as fh:
        splits = pickle.load(fh, encoding='latin-1')
    (x_trn, y_trn), (x_val, y_val), _ = splits
    return map(tensor, (x_trn, y_trn, x_val, y_val))

def normalize(x, m, s):
    """Standardize ``x`` by subtracting ``m`` and dividing the result by ``s``."""
    centered = x - m
    return centered / s

In [4]:
from six.moves import urllib
# Build a urllib opener that sends a browser-like User-agent header and install
# it globally, so later urllib.request downloads in this process go through it.
# NOTE(review): presumably needed because the dataset host rejects the default
# Python user agent -- confirm.
opener = urllib.request.build_opener()
opener.addheaders = [('User-agent', 'Mozilla/5.0')]
urllib.request.install_opener(opener)


def get_data():
    """Download MNIST through torchvision and return a 50,000/10,000 train/validation split.

    The 60,000-image torchvision training set is split into the first 50,000
    examples (training) and the last 10,000 (validation).

    Returns:
        A tuple ``(x_train, y_train, x_valid, y_valid)``. Images are flattened to
        one row per example and divided by 256.0; labels are converted to float.
    """
    import os
    import torchvision.datasets as datasets

    # Override the download locations (and their md5 checksums) used by
    # torchvision's MNIST class. This mutates the class attribute, so it
    # affects every later use of datasets.MNIST in this process.
    datasets.MNIST.resources = [
        ('https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz', 'f68b3c2dcbeaaa9fbdd348bbdeb94873'),
        ('https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz', 'd53e105ee54ea40749a09fcbcd1e9432'),
        ('https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz', '9fb629c4189551a2d022fa330f9573f3'),
        ('https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz', 'ec29112dd5afa0611ce80d1b7f02629c')
    ]

    root = '../data'
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(root, exist_ok=True)

    # Only the training set is used below; the previously constructed test set
    # was never read, so it is no longer loaded.
    train_set = datasets.MNIST(root=root, train=True, download=True)

    n_train, n_valid = 50000, 10000
    x_train, x_valid = train_set.data.split([n_train, n_valid])
    y_train, y_valid = train_set.targets.split([n_train, n_valid])
    return (x_train.view(n_train, -1) / 256.0), y_train.float(), (x_valid.view(n_valid, -1))/ 256.0, y_valid.float()

x_train,y_train,x_valid,y_valid = get_data()

def normalize(x, m, s):
    """Return ``(x - m) / s``: shift ``x`` by ``m``, then scale by ``1/s``."""
    shifted = x - m
    scaled = shifted / s
    return scaled

In [71]:
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std
# the raw data's mean and std are not those of a standard normal distribution (mean 0, std 1)

(tensor(3.8966e-08), tensor(1.))

In [72]:
x_train = normalize(x_train, train_mean, train_std)
# NB: Use training, not validation mean for validation set (very important: normalize with the training set's statistics)
x_valid = normalize(x_valid, train_mean, train_std)

# The model is fit on x_train, so x_valid must be normalized with the x_train mean and std.
# If the two sets were scaled with different statistics they would no longer be
# on the same scale, and we could not compare them correctly.

In [7]:
train_mean,train_std = x_train.mean(),x_train.std()
train_mean,train_std
# now we get 0 mean, 1 std

(tensor(3.8966e-08), tensor(1.))

In [8]:
#export
def test_near_zero(a,tol=1e-3): assert a.abs()<tol, f"Near zero: {a}"

In [9]:
test_near_zero(x_train.mean())
test_near_zero(1-x_train.std())

In [10]:
n,m = x_train.shape
c = y_train.max()+1
n,m,c

# n is the number of training examples
# m is the number of input features per example
# c is the number of classes

(50000, 784, tensor(10.))

## Foundations version

### Basic architecture

[Jump_to lesson 8 video](https://course19.fast.ai/videos/?lesson=8&t=5128)

In [11]:
# num hidden
nh = 50
# nh is the number of hidden units (the width of the single hidden layer)

[Tinker practice](https://course19.fast.ai/videos/?lesson=8&t=5255)

In [19]:
import torch
torch.randn(m,nh).mean()
# the mean is close to 0 because the values are sampled from a standard normal distribution

tensor(-0.0084)

In [13]:
math.sqrt(m)

28.0

In [14]:
# standard xavier init
# Dividing by sqrt(m) is meant to keep the input to the second layer at roughly
# mean 0, std 1 (the original note refers to this as "kaiming init").
w1 = torch.randn(m,nh)/math.sqrt(m)
b1 = torch.zeros(nh)
w2 = torch.randn(nh,1)/math.sqrt(nh)
b2 = torch.zeros(1)

# With this scaling, the activations should keep a stable scale through the matrix multiplies.

In [16]:
test_near_zero(w1.mean())
test_near_zero(w1.std()-1/math.sqrt(m))

In [20]:
# This should be ~ (0,1) (mean,std)...
x_valid.mean(),x_valid.std()


(tensor(-0.0059), tensor(0.9924))

In [21]:
def lin(x, w, b):
    """Affine layer: matrix-multiply ``x`` by ``w`` and add the bias ``b``."""
    projected = x @ w
    return projected + b

In [22]:
t = lin(x_valid, w1, b1)

In [23]:
#...so should this, because we used xavier init, which is designed to do this
t.mean(),t.std()

(tensor(0.0341), tensor(1.0163))

In [24]:
def relu(x):
    """Rectified linear unit: values below 0 are replaced by 0."""
    rectified = x.clamp_min(0.)
    return rectified

In [25]:
t = relu(lin(x_valid, w1, b1))

In [26]:
#...actually it really should be this!
t.mean(),t.std()

# but xavier init is not enough once we apply a non-linear function such as relu

(tensor(0.4165), tensor(0.5869))

From pytorch docs: `a: the negative slope of the rectifier used after this layer (0 for ReLU by default)`

$$\text{std} = \sqrt{\frac{2}{(1 + a^2) \times \text{fan_in}}}$$

This was introduced in the paper that described the Imagenet-winning approach from *He et al*: [Delving Deep into Rectifiers](https://arxiv.org/abs/1502.01852), which was also the first paper that claimed "super-human performance" on Imagenet. (The same authors introduced resnets later that year in a separate paper, [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385).)

[Jump_to lesson 8 video](https://course19.fast.ai/videos/?lesson=8&t=5128)

In [43]:
# kaiming init / he init for relu
w1 = torch.randn(m,nh)*math.sqrt(2/m)
# w1 = torch.randn(m,nh)

In [44]:
w1.mean(),w1.std()

(tensor(-0.0002), tensor(0.0509))

In [45]:
t = relu(lin(x_valid, w1, b1))
t.mean(),t.std()
# With kaiming init the std does not come out close to 1 on every run,
# but in most runs it ends up near 1.

(tensor(0.5188), tensor(0.7710))

In [46]:
#export
from torch.nn import init

In [47]:
w1 = torch.zeros(m,nh)
init.kaiming_normal_(w1, mode='fan_out')

t = relu(lin(x_valid, w1, b1))

#mode – either 'fan_in' (default) or 'fan_out'. 
#Choosing 'fan_in' preserves the magnitude of the variance of the weights in the forward pass. 
#Choosing 'fan_out' preserves the magnitudes in the backwards pass.

In [48]:
init.kaiming_normal_??

In [49]:
w1.mean(),w1.std()

(tensor(-0.0001), tensor(0.0505))

In [50]:
t.mean(),t.std()

(tensor(0.5666), tensor(0.8149))

In [51]:
w1.shape

torch.Size([784, 50])

In [52]:
import torch.nn

In [53]:
torch.nn.Linear(m,nh).weight.shape 
# This is the transpose of w1.shape, which is (784, 50):
# nn.Linear stores its weight as (out_features, in_features).

torch.Size([50, 784])

In [None]:
torch.nn.Linear.forward??

In [None]:
torch.nn.functional.linear??
 #F = torch.nn.functional

In [None]:
torch.nn.Conv2d??

In [None]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [54]:
# what if...?
def relu(x):
    """ReLU shifted down by 0.5: clamp values below 0 to 0, then subtract 0.5."""
    # Subtracting 0.5 gave better results; this follows Jeremy's recommendation.
    rectified = x.clamp_min(0.)
    return rectified - 0.5

In [55]:
# kaiming init / he init for relu
w1 = torch.randn(m,nh)*math.sqrt(2./m )
t1 = relu(lin(x_valid, w1, b1))
t1.mean(),t1.std()

(tensor(-0.0066), tensor(0.7704))

In [56]:
def model(xb):
    """Forward pass of the two-layer net: linear (w1, b1) -> relu -> linear (w2, b2).

    Uses the module-level ``lin``, ``relu`` and the weights ``w1, b1, w2, b2``.
    """
    hidden = relu(lin(xb, w1, b1))
    return lin(hidden, w2, b2)

In [57]:
%timeit -n 10 _=model(x_valid)

10 loops, best of 5: 19.2 ms per loop


In [58]:
assert model(x_valid).shape==torch.Size([x_valid.shape[0],1])

### Loss function: MSE

[Jump_to lesson 8 video](https://course19.fast.ai/videos/?lesson=8&t=6372)

In [59]:
model(x_valid).shape

torch.Size([10000, 1])

We need `squeeze()` to get rid of that trailing (,1), in order to use `mse`. (Of course, `mse` is not a suitable loss function for multi-class classification; we'll use a better loss function soon. We'll use `mse` for now to keep things simple.)

In [60]:
#export
def mse(output, targ):
    """Mean squared error between ``output`` (trailing unit axis removed) and ``targ``."""
    residual = output.squeeze(-1) - targ
    return residual.pow(2).mean()

In [61]:
y_train,y_valid = y_train.float(),y_valid.float()
# the targets must be converted to float; otherwise mse does not work properly, because it needs floating-point values

In [63]:
preds = model(x_train)

In [64]:
preds.shape

torch.Size([50000, 1])

In [65]:
mse(preds, y_train)

tensor(33.7839)

In [66]:
preds

tensor([[-0.8297],
        [ 0.7798],
        [ 0.9191],
        ...,
        [-1.3331],
        [-0.0762],
        [-1.0307]])

### Gradients and backward pass

In [69]:
def mse_grad(inp, targ):
    """Store the gradient of the MSE loss with respect to ``inp`` on ``inp.g``.

    ``inp`` is the output of the previous layer. This is the derivative of the
    loss with respect to the prediction: ``2 * (inp - targ) / n``, where ``n``
    is ``inp.shape[0]``, with a trailing axis added back.
    """
    n_rows = inp.shape[0]
    residual = inp.squeeze() - targ
    inp.g = 2. * residual.unsqueeze(-1) / n_rows
    
mse_grad(preds,y_train)
# get the mse_grad which is scalar gradient

In [70]:
# preds.squeeze() - y_train
# preds.shape[0]

50000

[Jump_to lesson 8 video](https://course19.fast.ai/videos/?lesson=8&t=6493)

In [73]:
def mse_grad(inp, targ):
    """Attach to ``inp.g`` the gradient of the MSE loss with respect to ``inp``.

    ``inp`` is the previous layer's output. The gradient is the derivative of
    the loss with respect to the prediction, ``2 * (inp - targ) / inp.shape[0]``,
    reshaped to carry a trailing unit axis.
    """
    diff = inp.squeeze() - targ
    doubled = 2. * diff.unsqueeze(-1)
    inp.g = doubled / inp.shape[0]

In [74]:
def relu_grad(inp, out):
    """Store on ``inp.g`` the gradient of relu with respect to its input activations.

    The upstream gradient ``out.g`` is passed through where ``inp > 0`` and
    zeroed elsewhere.
    """
    positive_mask = (inp > 0).float()
    inp.g = positive_mask * out.g

In [75]:
def lin_grad(inp, out, w, b):
    """Backward pass of ``out = inp @ w + b``; gradients are stored on ``.g`` attributes.

    Reads the upstream gradient ``out.g`` and writes ``inp.g``, ``w.g`` and ``b.g``.

    Args:
        inp: input to the linear layer (2-D: one row per example).
        out: output of the linear layer, with ``out.g`` already set.
        w: weight matrix of the layer.
        b: bias vector of the layer.
    """
    # grad of matmul with respect to input
    inp.g = out.g @ w.t()
    # The weight gradient is the sum over the batch of the outer products
    # inp[i, :, None] * out.g[i, None, :]. Computing it as one matrix multiply
    # gives the same result without materializing the (batch, n_in, n_out)
    # intermediate that unsqueeze/multiply/sum would allocate.
    w.g = inp.t() @ out.g
    # The bias is added to every row, so its gradient sums over the batch.
    b.g = out.g.sum(0)

In [82]:
def forward_and_backward(inp, targ):
    """Run the two-layer net forward, then back-propagate by hand.

    Uses the module-level weights ``w1, b1, w2, b2``. Returns nothing; the
    gradients are left on the ``.g`` attributes of the weights, the
    intermediate activations and ``inp``.
    """
    # forward pass
    pre_act = inp @ w1 + b1
    act = relu(pre_act)
    out = act @ w2 + b2
    # The loss value is computed here but is not needed by the backward pass.
    loss = mse(out, targ)

    # backward pass: walk the layers in reverse order, each step reading the
    # .g of its output and writing the .g of its input.
    mse_grad(out, targ)
    lin_grad(act, out, w2, b2)
    relu_grad(pre_act, act)
    lin_grad(inp, pre_act, w1, b1)

In [83]:
# h1 = torch.randn(50000,50,1)
# h2 = torch.randn(50000,1,1)
# (h1 @ h2).shape
# (h1 * h2).shape

In [84]:
forward_and_backward(x_train, y_train)

tensor([[-0.5000, -0.5000, -0.0816,  ..., -0.5000,  0.7529,  1.4795],
        [-0.5000, -0.5000,  1.9227,  ...,  0.4268, -0.4806,  0.2168],
        [ 0.9644, -0.5000, -0.4297,  ..., -0.5000, -0.5000, -0.5000],
        ...,
        [-0.5000, -0.5000, -0.5000,  ..., -0.5000,  0.2336,  0.6432],
        [ 0.1190, -0.5000, -0.5000,  ..., -0.3029,  0.0968,  1.9720],
        [ 1.8500, -0.5000, -0.5000,  ..., -0.5000, -0.5000,  0.8112]]) -- torch.Size([50000, 50]) torch.Size([50000, 50, 1])
tensor([[-2.3319e-04],
        [ 3.1192e-05],
        [-1.2324e-04],
        ...,
        [-3.7332e-04],
        [-1.6305e-04],
        [-3.6123e-04]]) -- torch.Size([50000, 1]) torch.Size([50000, 1, 1])
torch.Size([50000, 50, 1])
DDDD torch.Size([50, 1])
tensor([[-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        [-0.4245, -0.4245, -0.4245,  ..., -0.4245, -0.4245, -0.4245],
        ...,
        [-0.4245, -0.4245, -0.424

In [85]:
# Save for testing against later
w1g = w1.g.clone()
w2g = w2.g.clone()
b1g = b1.g.clone()
b2g = b2.g.clone()
ig  = x_train.g.clone()

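## `.g` is not a built-in tensor attribute; the backward functions above attach it to each tensor to hold its gradient, which is why `w1.g` can be read here.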

We cheat a little bit and use PyTorch autograd to check our results.

In [86]:
xt2 = x_train.clone().requires_grad_(True)
w12 = w1.clone().requires_grad_(True)
w22 = w2.clone().requires_grad_(True)
b12 = b1.clone().requires_grad_(True)
b22 = b2.clone().requires_grad_(True)
# These lines are what is needed to use PyTorch autograd.
# What is a PyTorch grad?
# A tensor with requires_grad=True is one whose gradient must be computed. If x was created with requires_grad=True, then after z.backward() is called the gradient dz/dx is stored in x.grad.
# Setting x.requires_grad to True automatically sets requires_grad to True on every tensor derived from x, so y and z derived from x also have requires_grad=True.
# However, a gradient that is computed is not always stored: it is stored in .grad only when the tensor has is_leaf=True and requires_grad=True.
# A tensor whose requires_grad was set to True directly by the user is a leaf; a tensor derived from a requires_grad=True tensor has is_leaf=False. So x is a leaf, while y and z are not.
# Because y and z are not leaves, y.grad and z.grad are still None after z.backward(), even though their requires_grad is True.
# Computing x.grad needs the gradients of y and z through the chain rule, so they must be computed even though they are not stored; that is why y.requires_grad and z.requires_grad are True.
# https://teamdable.github.io/techblog/PyTorch-Autograd

In [87]:
def forward(inp, targ):
    """Forward pass using the autograd-enabled weight copies; returns the MSE loss.

    Uses the module-level ``w12, b12, w22, b22`` so that calling ``backward()``
    on the returned loss fills in their ``.grad``.
    """
    hidden = relu(inp @ w12 + b12)
    out = hidden @ w22 + b22
    return mse(out, targ)

In [88]:
loss = forward(xt2, y_train)

In [89]:
loss.backward()

In [90]:
test_near(w22.grad, w2g)
test_near(b22.grad, b2g)
test_near(w12.grad, w1g)
test_near(b12.grad, b1g)
test_near(xt2.grad, ig )

## Refactor model

### Layers as classes

[Jump_to lesson 8 video](https://course19.fast.ai/videos/?lesson=8&t=7112)

In [None]:
class Relu():
    """ReLU shifted down by 0.5, as a layer object that remembers its input and output."""

    def __call__(self, inp):
        # Keep both tensors: backward() reads self.out.g and writes self.inp.g.
        self.inp = inp
        self.out = inp.clamp_min(0.) - 0.5
        return self.out

    def backward(self):
        # The gradient flows through only where the input was positive.
        positive_mask = (self.inp > 0).float()
        self.inp.g = positive_mask * self.out.g

In [None]:
class Lin():
    """Linear layer ``inp @ w + b`` with a hand-written backward pass."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def __call__(self, inp):
        # Keep input and output: backward() reads self.out.g and writes self.inp.g.
        self.inp = inp
        self.out = inp @ self.w + self.b
        return self.out

    def backward(self):
        grad_out = self.out.g
        self.inp.g = grad_out @ self.w.t()
        # Creating a giant outer product, just to sum it, is inefficient!
        per_example = self.inp.unsqueeze(-1) * grad_out.unsqueeze(1)
        self.w.g = per_example.sum(0)
        self.b.g = grad_out.sum(0)

In [None]:
class Mse():
    """Mean-squared-error loss as a layer object with a hand-written backward pass."""

    def __call__(self, inp, targ):
        # Keep the arguments: backward() needs both to form the gradient.
        self.inp, self.targ = inp, targ
        residual = inp.squeeze() - targ
        self.out = residual.pow(2).mean()
        return self.out

    def backward(self):
        residual = self.inp.squeeze() - self.targ
        n_targets = self.targ.shape[0]
        self.inp.g = 2. * residual.unsqueeze(-1) / n_targets

In [None]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss, built from layer objects."""

    def __init__(self, w1, b1, w2, b2):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        # Feed the activation through each layer in turn, then compute the loss.
        act = x
        for layer in self.layers:
            act = layer(act)
        return self.loss(act, targ)

    def backward(self):
        # Back-propagate from the loss, then through the layers last-to-first.
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

# This mirrors how a torch module is organized:
# each layer has a forward function and a backward function.

In [None]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model(w1, b1, w2, b2)
# 

In [93]:
w1.g,b1.g,w2.g,b2.g = [None] * 4
# [None] * 4 builds a four-element list so it can be unpacked into the four targets; without the * 4 the unpacking raises an error

In [None]:
%time loss = model(x_train, y_train)

CPU times: user 137 ms, sys: 4.95 ms, total: 142 ms
Wall time: 70.7 ms


In [None]:
%time model.backward()

CPU times: user 2.84 s, sys: 3.86 s, total: 6.71 s
Wall time: 3.4 s


In [None]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### Module.forward()

In [None]:
class Module():
    """Basic layer frame: caches the call arguments and output for the backward pass.

    Subclasses implement ``forward(*args)`` to compute the output and
    ``bwd(out, *args)`` to store gradients; ``backward()`` calls ``bwd`` with
    the output and arguments remembered from the most recent call.
    """
    def __call__(self, *args):
        # Remember inputs and output so backward() can hand them to bwd().
        self.args = args
        self.out = self.forward(*args)
        return self.out

    def forward(self, *args):
        # Accept the call arguments so that an unimplemented subclass fails with
        # this error instead of a TypeError about unexpected positional arguments.
        raise NotImplementedError('not implemented')

    def backward(self): self.bwd(self.out, *self.args)

In [None]:
class Relu(Module):
    """ReLU shifted down by 0.5, expressed through the Module forward/bwd hooks."""

    def forward(self, inp):
        return inp.clamp_min(0.) - 0.5

    def bwd(self, out, inp):
        # The gradient flows through only where the input was positive.
        positive_mask = (inp > 0).float()
        inp.g = positive_mask * out.g

In [None]:
class Lin(Module):
    """Linear layer ``inp @ w + b``; the weight gradient is computed with einsum."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def forward(self, inp):
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        grad_out = out.g
        inp.g = grad_out @ self.w.t()
        # "bi,bj->ij": multiply inp[b, i] by grad_out[b, j] and sum over the batch index b.
        self.w.g = torch.einsum("bi,bj->ij", inp, grad_out)
        self.b.g = grad_out.sum(0)

In [None]:
# more about einsum
#https://baekyeongmin.github.io/dev/einsum/

In [None]:
class Mse(Module):
    """Mean-squared-error loss expressed through the Module forward/bwd hooks."""

    def forward(self, inp, targ):
        residual = inp.squeeze() - targ
        return residual.pow(2).mean()

    def bwd(self, out, inp, targ):
        residual = inp.squeeze() - targ
        inp.g = 2 * residual.unsqueeze(-1) / targ.shape[0]

In [None]:
class Model():
    """Two-layer network (Lin -> Relu -> Lin) with an Mse loss.

    The layers are built from the module-level weights ``w1, b1, w2, b2``.
    """

    def __init__(self):
        self.layers = [Lin(w1, b1), Relu(), Lin(w2, b2)]
        self.loss = Mse()

    def __call__(self, x, targ):
        # Feed the activation through each layer in turn, then compute the loss.
        act = x
        for layer in self.layers:
            act = layer(act)
        return self.loss(act, targ)

    def backward(self):
        # Back-propagate from the loss, then through the layers last-to-first.
        self.loss.backward()
        for layer in self.layers[::-1]:
            layer.backward()

In [None]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [None]:
%time loss = model(x_train, y_train)
# 94.2

CPU times: user 86 ms, sys: 8.25 ms, total: 94.2 ms
Wall time: 46.3 ms


In [None]:
%time model.backward()
# 280 + 94.2 = 374.2

CPU times: user 193 ms, sys: 87.6 ms, total: 280 ms
Wall time: 140 ms


In [None]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### Without einsum

[Jump_to lesson 8 video](https://course19.fast.ai/videos/?lesson=8&t=7484)

In [None]:
class Lin(Module):
    """Linear layer ``inp @ w + b``; the weight gradient is a plain matrix multiply."""

    def __init__(self, w, b):
        self.w = w
        self.b = b

    def forward(self, inp):
        return inp @ self.w + self.b

    def bwd(self, out, inp):
        grad_out = out.g
        inp.g = grad_out @ self.w.t()
        # Transposed input times the upstream gradient sums over the batch axis.
        self.w.g = inp.t() @ grad_out
        self.b.g = grad_out.sum(0)

In [None]:
w1.g,b1.g,w2.g,b2.g = [None]*4
model = Model()

In [None]:
%time loss = model(x_train, y_train)

CPU times: user 88.6 ms, sys: 5.04 ms, total: 93.6 ms
Wall time: 46.4 ms


In [None]:
%time model.backward()

# compared with the einsum version above, the timing is almost the same

CPU times: user 197 ms, sys: 83.9 ms, total: 281 ms
Wall time: 140 ms


In [None]:
test_near(w2g, w2.g)
test_near(b2g, b2.g)
test_near(w1g, w1.g)
test_near(b1g, b1.g)
test_near(ig, x_train.g)

### nn.Linear and nn.Module

In [None]:
#export
from torch import nn

In [None]:
class Model(nn.Module):
    """Two-layer fully connected net (Linear -> ReLU -> Linear) that returns its MSE loss.

    Args:
        n_in: number of input features.
        nh: number of hidden units.
        n_out: number of outputs.

    Calling the model with ``(x, targ)`` runs the layers on ``x`` and returns
    ``mse`` of the squeezed output against ``targ``.
    """
    def __init__(self, n_in, nh, n_out):
        super().__init__()
        # nn.ModuleList (rather than a plain list) registers the layers with the
        # module, so parameters(), state_dict(), to() and similar can see them.
        self.layers = nn.ModuleList([nn.Linear(n_in,nh), nn.ReLU(), nn.Linear(nh,n_out)])
        self.loss = mse

    def forward(self, x, targ):
        # Defining forward (instead of overriding __call__) keeps nn.Module's own
        # call machinery in place; model(x, targ) still works as before.
        for l in self.layers: x = l(x)
        return self.loss(x.squeeze(), targ)

In [None]:
model = Model(m, nh, 1)

In [None]:
%time loss = model(x_train, y_train)

CPU times: user 85.1 ms, sys: 8.16 ms, total: 93.3 ms
Wall time: 46.3 ms


In [None]:
%time loss.backward()
# PyTorch's autograd backward pass is even faster

CPU times: user 135 ms, sys: 78.1 ms, total: 213 ms
Wall time: 71.1 ms


## Export

In [None]:
!./notebook2script.py 02_fully_connected.ipynb

Converted 02_fully_connected.ipynb to nb_02.py
