In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Does nn.Conv2d init work well?

In [2]:
#export
from exp.nb_02 import *

def get_data():
    path = datasets.download_data(MNIST_URL, ext='.gz')
    with gzip.open(path, 'rb') as f:
        ((x_train, y_train), (x_valid, y_valid), _) = pickle.load(f, encoding='latin-1')
    return map(tensor, (x_train,y_train,x_valid,y_valid))

def normalize(x, m, s): return (x-m)/s

In [3]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [4]:
# load and normalize the MNIST data
x_train,y_train,x_valid,y_valid = get_data()
train_mean,train_std = x_train.mean(),x_train.std()
x_train = normalize(x_train, train_mean, train_std)
x_valid = normalize(x_valid, train_mean, train_std)

In [5]:
x_valid.shape

torch.Size([10000, 784])

In [6]:
# training set consists of 50,000 images of size 28x28
# validation set consists of 10,000 images of size 28x28
# view reshapes an array
x_train = x_train.view(-1,1,28,28)
x_valid = x_valid.view(-1,1,28,28)
x_train.shape,x_valid.shape

(torch.Size([50000, 1, 28, 28]), torch.Size([10000, 1, 28, 28]))

In [7]:
# is the number of training examples
n,*_ = x_train.shape
# c is number of digits
c = y_train.max()+1
# nh is number of channels in the hidden layer
nh = 32
n,c

(50000, tensor(10))

In [8]:
# linear layer nh = 32 channels, 5x5 kernel
l1 = nn.Conv2d(1, nh, 5)

In [9]:
# subset of validation set
x = x_valid[:100]

In [10]:
x.shape

torch.Size([100, 1, 28, 28])

In [11]:
def stats(x): return x.mean(),x.std()

In [12]:
# n_output, n_input, 5, 5
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [13]:
l1.weight

Parameter containing:
tensor([[[[-5.2809e-02,  6.4810e-02,  1.4186e-01,  1.2788e-01,  8.2810e-02],
          [-1.9424e-01, -1.5922e-01,  6.6326e-02,  1.9977e-01, -1.3060e-01],
          [ 7.7174e-02, -1.1212e-01,  1.1982e-01, -6.4839e-02,  1.8356e-01],
          [ 9.6833e-02, -1.3729e-01, -4.8811e-02,  1.9416e-01, -1.2793e-02],
          [-1.9664e-01, -9.3973e-03,  3.4571e-02, -2.9812e-02,  1.3348e-01]]],


        [[[-1.1300e-01,  9.9153e-02, -1.5362e-01,  6.6370e-02,  1.3138e-01],
          [ 3.0118e-02,  1.8120e-01, -1.0927e-01, -4.2453e-02,  1.1838e-01],
          [ 9.2759e-02,  4.7240e-02, -9.1664e-03, -6.4132e-02,  1.9716e-01],
          [ 2.8778e-02, -7.4464e-02, -2.5764e-02,  1.1914e-02, -1.9759e-02],
          [-1.8960e-01, -1.3807e-01, -7.8418e-02, -1.8564e-01,  1.8458e-01]]],


        [[[-8.4170e-02, -1.3850e-01,  1.1542e-01,  1.6365e-01,  1.0588e-01],
          [ 1.2976e-01,  5.7654e-02, -1.3596e-01,  1.9529e-01,  6.1403e-02],
          [-1.9773e-01,  3.1471e-02,  1.0674e-

In [14]:
# mean and std
stats(l1.weight),stats(l1.bias)

((tensor(0.0054, grad_fn=<MeanBackward0>),
  tensor(0.1189, grad_fn=<StdBackward0>)),
 (tensor(-0.0071, grad_fn=<MeanBackward0>),
  tensor(0.1102, grad_fn=<StdBackward0>)))

In [15]:
# apply the linear layer to the reduced validation set input data
# why 24 x 24, something to do with padding and stride 
t = l1(x)
t.shape

torch.Size([100, 32, 24, 24])

In [16]:
# mean and std
stats(t)

(tensor(0.0061, grad_fn=<MeanBackward0>),
 tensor(0.6882, grad_fn=<StdBackward0>))

In [17]:
# for linear layer a = 1
# a is the slope of the negative branch of the relu
init.kaiming_normal_(l1.weight, a=1.)
# initialized layer is close to zero mean and unit variance
stats(l1(x))

(tensor(0.0214, grad_fn=<MeanBackward0>),
 tensor(1.0110, grad_fn=<StdBackward0>))

In [18]:
import torch.nn.functional as F

In [19]:
# define a standard relu (a = 0)
def f1(x,a=0): return F.leaky_relu(l1(x),a)

In [20]:
# relu
init.kaiming_normal_(l1.weight, a=0)
stats(f1(x))

(tensor(0.5387, grad_fn=<MeanBackward0>),
 tensor(0.9474, grad_fn=<StdBackward0>))

In [21]:
# linear layer
l1 = nn.Conv2d(1, nh, 5)
stats(f1(x))

(tensor(0.2072, grad_fn=<MeanBackward0>),
 tensor(0.3980, grad_fn=<StdBackward0>))

In [22]:
# linear convolution layer
l1.weight.shape

torch.Size([32, 1, 5, 5])

In [23]:
l1.weight[0,0]

tensor([[ 0.0605,  0.1028, -0.0179,  0.0731, -0.0232],
        [ 0.0402, -0.0461,  0.0979, -0.1891, -0.0516],
        [-0.1229, -0.0199, -0.1008, -0.1345,  0.1178],
        [ 0.1268,  0.1474, -0.0665, -0.0019, -0.1572],
        [-0.1113,  0.0098, -0.0895,  0.1976,  0.1246]],
       grad_fn=<SelectBackward>)

In [24]:
# receptive field size
rec_fs = l1.weight[0,0].numel()
rec_fs

25

In [25]:
# nf = number of filters (channels)
nf,ni,*_ = l1.weight.shape
nf,ni

(32, 1)

In [26]:
fan_in  = ni*rec_fs
fan_out = nf*rec_fs
fan_in,fan_out

(25, 800)

In [27]:
# what is the origin of this formula? 
# for linear layer a=1, so gain = 1
# for relu, a = 0, so gain = sqrt(2)
# when a = sqrt(5), gain = 1/sqrt(3) = 0.577
def gain(a): return math.sqrt(2.0 / (1 + a**2))

In [28]:
gain(1),gain(0),gain(0.01),gain(0.1),gain(math.sqrt(5.))

(1.0,
 1.4142135623730951,
 1.4141428569978354,
 1.4071950894605838,
 0.5773502691896257)

In [29]:
torch.zeros(100000).uniform_(-1,1).std()

tensor(0.5771)

In [30]:
1/math.sqrt(3.)

0.5773502691896258

In [31]:
def kaiming2(x,a, use_fan_out=False):
    nf,ni,*_ = x.shape
    print(nf,ni)
    rec_fs = x[0,0].shape.numel()
    print(rec_fs)
    fan = nf*rec_fs if use_fan_out else ni*rec_fs
    print(fan)
    std = gain(a) / math.sqrt(fan)
    print(std)
    bound = math.sqrt(3.) * std
    x.data.uniform_(-bound,bound)

In [32]:
# rele
kaiming2(l1.weight, a=0);
stats(f1(x))
print(math.sqrt(2)/5)

32 1
25
25
0.282842712474619
0.282842712474619


In [33]:
kaiming2(l1.weight, a=math.sqrt(5.))
stats(f1(x))

32 1
25
25
0.11547005383792515


(tensor(0.1848, grad_fn=<MeanBackward0>),
 tensor(0.3702, grad_fn=<StdBackward0>))

In [34]:
class Flatten(nn.Module):
    def forward(self,x): return x.view(-1)

In [35]:
m = nn.Sequential(
    nn.Conv2d(1,8, 5,stride=2,padding=2), nn.ReLU(),
    nn.Conv2d(8,16,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(16,32,3,stride=2,padding=1), nn.ReLU(),
    nn.Conv2d(32,1,3,stride=2,padding=1),
    nn.AdaptiveAvgPool2d(1),
    Flatten(),
)

In [36]:
y = y_valid[:100].float()

In [37]:
t = m(x)
stats(t)

(tensor(-0.0180, grad_fn=<MeanBackward0>),
 tensor(0.0149, grad_fn=<StdBackward0>))

In [38]:
l = mse(t,y)
l.backward()

In [39]:
stats(m[0].weight.grad)

(tensor(-0.0134), tensor(0.0405))

In [40]:
init.kaiming_uniform_??

In [41]:
for l in m:
    if isinstance(l,nn.Conv2d):
        init.kaiming_uniform_(l.weight)
        l.bias.data.zero_()

In [42]:
t = m(x)
stats(t)

(tensor(-0.1487, grad_fn=<MeanBackward0>),
 tensor(0.2723, grad_fn=<StdBackward0>))

In [43]:
l = mse(t,y)
l.backward()
stats(m[0].weight.grad)

(tensor(-0.0114), tensor(0.4373))

## Export

In [44]:

#!./notebook2script.py 02a_why_sqrt5.ipynb
!notebook2script.py 02a_why_sqrt5.ipynb