In [45]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Does nn.Conv2d init work well?

In [46]:
#export
from exp.nb_02 import *

def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

In [47]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [48]:
# load and normalize the MNIST data
x_train,y_train,x_valid,y_valid = get_data()
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [49]:
x_valid.shape

torch.Size([10000, 784])

In [50]:
# training set consists of 50,000 images of size 28x28
# validation set consists of 10,000 images of size 28x28
# view reshapes an array
x_train = x_train.view(-1,1,28,28)
x_valid = x_valid.view(-1,1,28,28)
x_train.shape,x_valid.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [52]:
# n is the number of training examples
n,*_ = x_train.shape
# c is number of digits
c = y_train.max()+1
# nh is number of channels in the hidden layer
nh = 32
n,c

(50000, tensor(10))

In [53]:
# linear layer nh = 32 channels, 5x5 kernel
l1 = nn.Conv2d(1, nh, 5)

In [54]:
# subset of validation set
x = x_valid[:100]

In [55]:
x.shape

torch.Size([100, 1, 28, 28])

In [56]:
def stats(x): return x.mean(),x.std()

In [57]:
# n_output, n_input, 5, 5
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [58]:
l1.weight

Parameter containing:
tensor([[[[-1.9272e-01, -5.9605e-02, -1.0969e-01,  8.7362e-02,  1.1057e-01],
          [ 7.2869e-02,  1.8587e-01,  4.2541e-02, -1.3276e-01,  7.4849e-02],
          [ 1.4712e-01,  1.5383e-01, -7.0912e-02, -1.0803e-01, -1.4747e-01],
          [ 1.2613e-01, -5.3487e-02, -2.1475e-02, -1.4186e-01,  1.7312e-01],
          [-4.1480e-02, -1.1257e-01,  3.1724e-02, -1.0660e-03,  2.7187e-03]]],


        [[[ 9.6423e-02,  1.5486e-01, -1.0322e-01,  1.2410e-01, -1.3959e-01],
          [-9.4860e-02, -3.3992e-02,  1.8347e-01,  1.3069e-01,  4.0130e-02],
          [-1.3026e-01, -7.1020e-02, -6.7133e-02,  1.1190e-01,  5.6897e-02],
          [ 1.7928e-01, -1.1067e-01,  3.3082e-02,  1.6363e-01,  1.0432e-01],
          [-5.9715e-02, -8.8557e-02, -3.9847e-03, -1.2043e-02,  5.5731e-02]]],


        [[[ 1.7915e-01, -3.3733e-02, -8.3825e-02, -4.0408e-02, -1.6013e-04],
          [ 1.7347e-01,  1.8882e-01, -1.4995e-01,  4.4772e-03,  1.0159e-01],
          [-6.1053e-02, -9.9347e-02, -1.0995e-

In [59]:
# mean and std
stats(l1.weight),stats(l1.bias)

((tensor(0.0052, grad_fn=<MeanBackward0>),
  tensor(0.1110, grad_fn=<StdBackward0>)),
 (tensor(-0.0361, grad_fn=<MeanBackward0>),
  tensor(0.1063, grad_fn=<StdBackward0>)))

In [60]:
# apply the linear layer to the reduced validation set input data
# why 24 x 24, something to do with padding and stride 
t = l1(x)
t.shape

torch.Size([100, 32, 24, 24])

In [61]:
# mean and std
stats(t)

(tensor(-0.0235, grad_fn=<MeanBackward0>),
 tensor(0.5853, grad_fn=<StdBackward0>))

In [62]:
# for linear layer a = 1
# a is the slope of the negative branch of the relu
init.kaiming_normal_(l1.weight, a=1.)
# initialized layer is close to zero mean and unit variance
stats(l1(x))

(tensor(-0.0553, grad_fn=<MeanBackward0>),
 tensor(1.0808, grad_fn=<StdBackward0>))

In [63]:
import torch.nn.functional as F

In [64]:
# define a standard relu (a = 0)
def f1(x,a=0): return F.leaky_relu(l1(x),a)

In [65]:
# relu
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

(tensor(0.5009, grad_fn=<MeanBackward0>),
 tensor(0.8875, grad_fn=<StdBackward0>))

In [21]:
# linear layer
l1 = nn.Conv2d(1, nh, 5)
stats(f1(x))

(tensor(0.2072, grad_fn=<MeanBackward0>),
 tensor(0.3980, grad_fn=<StdBackward0>))

In [66]:
# linear convolution layer
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [67]:
l1.weight[0,0]

tensor([[-0.0194,  0.4017, -0.2740, -0.1100, -0.0630],
        [-0.2318, -0.3643, -0.2879,  0.3252,  0.0431],
        [-0.6038,  0.0480, -0.1527,  0.2660, -0.4643],
        [ 0.1180, -0.0254,  0.3131, -0.1193, -0.1248],
        [ 0.5837,  0.2988,  0.2894,  0.3881,  0.0371]],
       grad_fn=<SelectBackward>)

In [68]:
# receptive field size
rec_fs = l1.weight[0,0].numel()
rec_fs

25

In [69]:
# nf = number of filters (channels)
nf,ni,*_ = l1.weight.shape
nf,ni

(32, 1)

In [70]:
fan_in  = ni*rec_fs
fan_out = nf*rec_fs
fan_in,fan_out

(25, 800)

In [72]:
# what is the origin of this formula? 
# for linear layer a=1, so gain = 1
# for relu, a = 0, so gain = sqrt(2)
# when a = sqrt(5), gain = 1/sqrt(3) = 0.577
def gain(a): return math.sqrt(2.0 / (1 + a**2))

In [74]:
gain(1),gain(0),gain(0.01),gain(0.1),gain(math.sqrt(5.))

(1.0,
 1.4142135623730951,
 1.4141428569978354,
 1.4071950894605838,
 0.5773502691896257)

In [75]:
torch.zeros(100000).uniform_(-1,1).std()

tensor(0.5765)

In [76]:
1/math.sqrt(3.)

0.5773502691896258

In [77]:
def kaiming2(x,a, use_fan_out=False):
    nf,ni,*_ = x.shape
    print(nf,ni)
    rec_fs = x[0,0].shape.numel()
    print(rec_fs)
    fan = nf*rec_fs if use_fan_out else ni*rec_fs
    print(fan)
    std = gain(a) / math.sqrt(fan)
    print(std)
    bound = math.sqrt(3.) * std
    x.data.uniform_(-bound,bound)

In [32]:
# rele
kaiming2(l1.weight, a=0);
stats(f1(x))
print(math.sqrt(2)/5)

32 1
25
25
0.282842712474619
0.282842712474619


In [78]:
kaiming2(l1.weight, a=math.sqrt(5.))
stats(f1(x))

32 1
25
25
0.11547005383792515


(tensor(0.1901, grad_fn=<MeanBackward0>),
 tensor(0.3765, grad_fn=<StdBackward0>))

In [79]:
class Flatten(nn.Module):
    def forward(self,x): return x.view(-1)

In [80]:
m = nn.Sequential(
    nn.Conv2d(1,8, 5,stride=2,padding=2), nn.ReLU(),
    nn.Conv2d(8,16,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(16,32,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(32,1,3,stride=2,padding=1),
    nn.AdaptiveAvgPool2d(1),
    Flatten(),
)

In [81]:
y = y_valid[:100].float()

In [82]:
t = m(x)
stats(t)

(tensor(0.0156, grad_fn=<MeanBackward0>),
 tensor(0.0109, grad_fn=<StdBackward0>))

In [83]:
l = mse(t,y)
l.backward()

In [84]:
stats(m[0].weight.grad)

(tensor(-0.0311), tensor(0.0714))

In [40]:
init.kaiming_uniform_??

In [85]:
for l in m:
    if isinstance(l,nn.Conv2d):
        init.kaiming_uniform_(l.weight)
        l.bias.data.zero_()

In [86]:
t = m(x)
stats(t)

(tensor(-0.3667, grad_fn=<MeanBackward0>),
 tensor(0.2069, grad_fn=<StdBackward0>))

In [87]:
l = mse(t,y)
l.backward()
stats(m[0].weight.grad)

(tensor(-0.0595), tensor(0.3972))

## Export

In [44]:

#!./notebook2script.py 02a_why_sqrt5.ipynb
!notebook2script.py 02a_why_sqrt5.ipynb