In [1]:
from exp.nb_01 import *

# Load the MNIST inputs and labels. `get_mnist_data` is not defined in this
# notebook; presumably it comes from the star import above.
x, y = get_mnist_data()
# Append a trailing unit dimension to the labels so they line up with the
# (N, 1) model output that the MSE loss is applied to further down.
y = y.unsqueeze(-1)


def normalize(x, m, s):
    """Standardize `x`: subtract the mean `m`, then divide by the std `s`."""
    centered = x - m
    return centered / s

# Number of leading samples used for training; the remainder is the test split.
# Named once so the four slices below cannot drift apart.
n_train = 50000

x_train, x_test = x[:n_train], x[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [29]:
# Mean and std of the training inputs before any normalization is applied.
x_train.mean(), x_train.std()

(tensor(0.1310), tensor(0.3085))

In [30]:
# Both splits are standardized with the *training* statistics, so the test
# data goes through exactly the same transformation as the training data.
m = x_train.mean()
s = x_train.std()

x_train = normalize(x_train, m, s)
x_test = normalize(x_test, m, s)

In [31]:
# Same statistics after normalization: mean should be ~0 and std ~1.
x_train.mean(), x_train.std()

(tensor(2.1126e-08), tensor(1.))

In [32]:
# Shape of a single training example (a flat feature vector).
x_train[0].shape

torch.Size([784])

### Linear layer

Initialization is the key to successfully training neural nets

* [Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification](https://arxiv.org/abs/1502.01852)
* [Understanding the difficulty of training deep feedforward neural networks](http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf)

Basically speaking, the output of each layer should ideally have an N(0, 1) distribution

In [33]:
def lin(x, w, b):
    """Affine map: multiply `x` by the weight matrix `w` and add the bias `b`."""
    projected = x @ w
    return projected + b


def relu(x):
    """Rectified linear unit: every value below zero is replaced by zero."""
    return x.clamp(min=0.)

In [34]:
# Layer sizes: 784 input features, 128 hidden units, one regression output.
input_dim, hidden_dim, output_dim = 784, 128, 1

# Weights are standard-normal draws scaled by sqrt(2 / fan_in), the scheme from
# the "Delving Deep into Rectifiers" paper linked above; biases start at zero.
w1 = math.sqrt(2 / input_dim) * torch.randn(input_dim, hidden_dim)
b1 = torch.zeros(hidden_dim)

w2 = math.sqrt(2 / hidden_dim) * torch.randn(hidden_dim, output_dim)
b2 = torch.zeros(output_dim)

In [35]:
# Hidden activations after the first layer + ReLU; inspect their mean/std
# against the N(0, 1) target mentioned above.
h = relu(lin(x_test, w1, b1))
h.mean(), h.std()

(tensor(0.5765), tensor(0.8422))

### Model

In [36]:
def model(x):
    """Forward pass linear -> ReLU -> linear using the global w1/b1/w2/b2."""
    hidden = relu(lin(x, w1, b1))
    return lin(hidden, w2, b2)

In [37]:
# Time the forward pass over the whole test split (10 loops per run).
%timeit -n 10 _=model(x_test)

24.9 ms ± 829 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Loss function

In [38]:
# The elementwise loss below needs predictions and targets of identical shape.
model(x_test).shape, y_test.shape

(torch.Size([10000, 1]), torch.Size([10000, 1]))

In [41]:
def mse(y_pred, y_true):
    """Mean squared error, averaged over every element of the difference."""
    residual = y_pred - y_true
    return residual.pow(2).mean()

In [42]:
# Loss of the freshly initialized (untrained) model on the test split.
mse(model(x_test), y_test)

tensor(39.7200)

### Backward pass

In [69]:
def mse_grad(inp, y):
    """Store the gradient of `mse(inp, y)` with respect to `inp` in `inp.g`.

    `mse` averages the squared error over every element, so each element's
    gradient is 2 * (inp - y) divided by the element count. Assumes `inp` and
    `y` have the same shape (no broadcasting).
    """
    diff = inp - y
    # The 1/N factor comes from the `.mean()` in the forward pass. Leaving it
    # out only happens to give the right answer for a single-element batch.
    inp.g = 2 * diff / diff.numel()


def lin_grad(inp, out, w, b):
    """Backward pass of `out = inp @ w + b`, given `out.g`.

    Writes the gradients into `inp.g`, `w.g` and `b.g`. Expects `inp` to be a
    2-D (batch, in_features) tensor and `out.g` to be (batch, out_features).
    """
    grad_out = out.g
    inp.g = grad_out @ w.t()

    # Each sample contributes the outer product inp[i]^T * grad_out[i] to the
    # weight gradient; summing those over the batch is exactly inp^T @ grad_out.
    # Using the matmul avoids materializing a (batch, in, out) intermediate.
    w.g = inp.t() @ grad_out
    # The bias is added to every sample, so its gradient is the batch sum.
    b.g = grad_out.sum(0)


def relu_grad(inp, out):
    """Backward pass of ReLU: pass `out.g` through only where `inp` was positive."""
    positive = (inp > 0).float()
    inp.g = positive * out.g

In [70]:
def forward_and_backward(x, y, verbose=False):
    """Run one forward pass followed by the hand-written backward pass.

    Nothing is returned: each backward helper stores its result in a `.g`
    attribute on the tensors involved (activations, weights, biases and `x`).
    With `verbose=True` the shapes of the activations and of their gradients
    are printed.
    """
    # Forward pass.
    pre_act = lin(x, w1, b1)
    hidden = relu(pre_act)
    pred = lin(hidden, w2, b2)
    loss = mse(pred, y)  # only used for the shape report below

    if verbose:
        print('x', x.shape, 'y', y.shape)
        for label, tensor in (('s1', pre_act), ('l1', hidden), ('s2', pred), ('out', loss)):
            print(label, tensor.shape)

    # Backward pass: walk the layers in reverse; each step reads the `.g`
    # written by the step before it.
    mse_grad(pred, y)
    lin_grad(hidden, pred, w2, b2)
    relu_grad(pre_act, hidden)
    lin_grad(x, pre_act, w1, b1)

    if verbose:
        print()
        for label, tensor in (('s2', pred), ('l1', hidden), ('s1', pre_act), ('x', x)):
            print(label, tensor.g.shape)

In [71]:
# Single-example run with shape printing enabled.
forward_and_backward(x_test[:1], y_test[:1], True)

x torch.Size([1, 784]) y torch.Size([1, 1])
s1 torch.Size([1, 128])
l1 torch.Size([1, 128])
s2 torch.Size([1, 1])
out torch.Size([])

s2 torch.Size([1, 1])
l1 torch.Size([1, 128])
s1 torch.Size([1, 128])
x torch.Size([1, 784])


#### Compare with PyTorch autograd

In [76]:
# Snapshot the hand-computed gradients so they can be compared with autograd.
w1g, b1g, w2g, b2g = (param.g.clone() for param in (w1, b1, w2, b2))

In [83]:
# Independent copies of the parameters that autograd is allowed to track.
w1c, b1c, w2c, b2c = (
    param.clone().requires_grad_(True) for param in (w1, b1, w2, b2)
)

In [84]:
def model(x):
    """Same forward pass as before, but on the autograd-tracked parameter copies."""
    hidden = relu(lin(x, w1c, b1c))
    return lin(hidden, w2c, b2c)

In [85]:
# Forward and backward through autograd on the same single example.
out = model(x_test[:1])
loss = mse(out, y_test[:1])

loss.backward()

# Check every hand-computed gradient, not just the first-layer weights.
all(
    near(manual, param.grad)
    for manual, param in ((w1g, w1c), (b1g, b1c), (w2g, w2c), (b2g, b2c))
)

True

### Modules 

Let's refactor the previous model

In [105]:
class Linear:
    """Fully connected layer with a hand-written backward pass.

    Weights are standard-normal draws scaled by sqrt(2 / in_features); the bias
    starts at zero.
    """

    def __init__(self, in_features, out_features):
        self.w = torch.randn(in_features, out_features) * math.sqrt(2 / in_features)
        self.b = torch.zeros(out_features)

    def __call__(self, x):
        # Keep input and output so backward() can read/write their `.g` attributes.
        self.inp = x
        self.out = x.matmul(self.w) + self.b
        return self.out

    def backward(self):
        """Propagate `self.out.g` into `self.inp.g`, `self.w.g` and `self.b.g`.

        Expects `self.out.g` to have been set already (by the next layer or the
        loss) and the stored input to be a 2-D (batch, in_features) tensor.
        """
        grad_out = self.out.g
        self.inp.g = grad_out @ self.w.t()

        # Sum over the batch of the per-sample outer products inp[i]^T * grad_out[i],
        # written as one matmul so no (batch, in, out) intermediate is allocated.
        self.w.g = self.inp.t() @ grad_out
        self.b.g = grad_out.sum(0)

In [106]:
class Relu:
    """ReLU activation that remembers its input/output for the backward pass."""

    def __call__(self, x):
        self.inp, self.out = x, x.clamp(min=0.)
        return self.out

    def backward(self):
        # The gradient flows only through positions where the input was positive.
        positive = (self.inp > 0).float()
        self.inp.g = positive * self.out.g

In [107]:
class MSE:
    """Mean squared error loss with a hand-written backward pass."""

    def __call__(self, output, target):
        # Keep both operands; backward() needs them to form the gradient.
        self.inp = output
        self.target = target
        self.out = (output - target).pow(2).mean()
        return self.out

    def backward(self):
        """Store d(loss)/d(output) in `self.inp.g`.

        The forward pass averages over every element, so the gradient carries a
        1/N factor. Assumes output and target have the same shape.
        """
        diff = self.inp - self.target
        # Without the division the result is only right for a single element.
        self.inp.g = 2 * diff / diff.numel()

In [108]:
class Model():
    """Sequential stack linear -> ReLU -> linear built from the layer classes above."""

    def __init__(self):
        first = Linear(input_dim, hidden_dim)
        last = Linear(hidden_dim, output_dim)
        self.layers = [first, Relu(), last]

    def __call__(self, x):
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

    def backward(self):
        # Layers are visited last-to-first so each one finds its output's `.g` set.
        for layer in reversed(self.layers):
            layer.backward()

In [109]:
# Fresh model (new random weights) and the matching loss object.
model = Model()

criteria = MSE()

In [110]:
# Forward pass on a single test example.
output = model(x_test[:1])

In [111]:
# Calling the loss object also caches `output` and the target for its backward().
loss = criteria(output, y_test[:1])

In [112]:
# Order matters: the loss backward writes `output.g`, which the last layer's
# backward then reads as its incoming gradient.
criteria.backward()
model.backward()

Compare to autograd

In [114]:
# Snapshot the hand-computed first-layer gradients of the module-based model.
w1g, b1g = (param.g.clone() for param in (model.layers[0].w, model.layers[0].b))

In [115]:
# Autograd-tracked copies of both linear layers' parameters.
w1c, b1c = (
    param.clone().requires_grad_(True)
    for param in (model.layers[0].w, model.layers[0].b)
)

w2c, b2c = (
    param.clone().requires_grad_(True)
    for param in (model.layers[2].w, model.layers[2].b)
)

In [117]:
def model(x):
    """Functional forward pass on the autograd-tracked copies w1c/b1c/w2c/b2c."""
    hidden = relu(lin(x, w1c, b1c))
    return lin(hidden, w2c, b2c)

In [118]:
# Forward and backward through autograd on the same single example.
out = model(x_test[:1])
loss = mse(out, y_test[:1])

loss.backward()

# Compare both first-layer gradients that were snapshotted for this model
# (weights and bias), not only the weights.
all(near(manual, param.grad) for manual, param in ((w1g, w1c), (b1g, b1c)))

True

#### Continue refactoring

In [127]:
class Module:
    """Base class that caches call arguments and output for the backward pass.

    Subclasses implement `forward(*args)` and `_backward(out, *args)`.
    """

    def __call__(self, *args):
        self.inps = args
        result = self.forward(*args)
        self.out = result
        return result

    def backward(self):
        # Hand the cached output and inputs of the last call to the subclass hook.
        cached_inputs = self.inps
        return self._backward(self.out, *cached_inputs)

In [133]:
class Linear(Module):
    """Fully connected layer; weights ~ N(0, 1) * sqrt(2 / in_features), zero bias."""

    def __init__(self, in_features, out_features):
        self.w = torch.randn(in_features, out_features) * math.sqrt(2 / in_features)
        self.b = torch.zeros(out_features)

    def forward(self, x):
        return x @ self.w + self.b

    def _backward(self, out, inp):
        """Propagate `out.g` into `inp.g`, `self.w.g` and `self.b.g`.

        Expects `inp` to be a 2-D (batch, in_features) tensor.
        """
        grad_out = out.g
        inp.g = grad_out @ self.w.t()

        # Sum over the batch of the per-sample outer products inp[i]^T * grad_out[i],
        # written as one matmul so no (batch, in, out) intermediate is allocated.
        self.w.g = inp.t() @ grad_out
        self.b.g = grad_out.sum(0)
    
    
class Relu(Module):
    """ReLU activation: forward clamps negatives to zero, backward gates the gradient."""

    def forward(self, x):
        return x.clamp(min=0.)

    def _backward(self, out, inp):
        # The gradient flows only through positions where the input was positive.
        gate = (inp > 0).float()
        inp.g = gate * out.g
    
    
class MSE(Module):
    """Mean squared error loss, averaged over every element."""

    def forward(self, output, target):
        return (output-target).pow(2).mean()

    def _backward(self, out, inp, target):
        """Store d(loss)/d(inp) in `inp.g`; assumes `inp` and `target` share a shape."""
        diff = inp - target
        # The 1/N factor mirrors the `.mean()` in forward(); without it the
        # gradient is only right for a single element.
        inp.g = 2 * diff / diff.numel()

In [134]:
class Model(Module):
    """Sequential stack linear -> ReLU -> linear built on the Module base class."""

    def __init__(self):
        first = Linear(input_dim, hidden_dim)
        last = Linear(hidden_dim, output_dim)
        self.layers = [first, Relu(), last]

    def forward(self, x):
        out = x
        for layer in self.layers:
            out = layer(out)
        return out

    def backward(self):
        # Overrides Module.backward: delegate to the layers, last one first.
        for layer in self.layers[::-1]:
            layer.backward()

In [135]:
# Fresh model (new random weights) and loss built from the Module-based classes.
model = Model()
criteria = MSE()

In [136]:
# Forward pass and loss on a single test example; both calls cache their
# inputs/outputs for the backward pass.
output = model(x_test[:1])
loss = criteria(output, y_test[:1])

In [137]:
# Order matters: the loss backward writes `output.g`, which the last layer's
# backward then reads as its incoming gradient.
criteria.backward()
model.backward()

In [138]:
# Snapshot the hand-computed first-layer gradients ...
w1g, b1g = (param.g.clone() for param in (model.layers[0].w, model.layers[0].b))

# ... and make autograd-tracked copies of both linear layers' parameters.
w1c, b1c = (
    param.clone().requires_grad_(True)
    for param in (model.layers[0].w, model.layers[0].b)
)

w2c, b2c = (
    param.clone().requires_grad_(True)
    for param in (model.layers[2].w, model.layers[2].b)
)

def model(x):
    """Functional forward pass on the autograd-tracked copies w1c/b1c/w2c/b2c."""
    hidden = relu(lin(x, w1c, b1c))
    return lin(hidden, w2c, b2c)

# Forward and backward through autograd on the same single example.
out = model(x_test[:1])
loss = mse(out, y_test[:1])

loss.backward()

# Compare both first-layer gradients that were snapshotted for this model
# (weights and bias), not only the weights.
all(near(manual, param.grad) for manual, param in ((w1g, w1c), (b1g, b1c)))

True