In [2]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Does `nn.Conv2D` init work well?

In [64]:
#export
from exp.nb_02_full_connected import *

def get_data(url=MNIST_URL):
    """Download the pickled dataset at `url` and return its splits as tensors.

    The archive is fetched with `datasets.download_data`, opened as gzip and
    unpickled with latin-1 encoding. The pickled object must be a 3-tuple whose
    first two items are (x, y) pairs; the third item is ignored.

    Returns:
        A lazy `map` yielding `torch.tensor` versions of
        x_train, y_train, x_valid, y_valid (in that order).

    NOTE(review): `pickle.load` can execute arbitrary code stored in the file,
    so `url` should only ever point at a trusted source.
    """
    archive = datasets.download_data(url=url, ext=".gz")
    with gzip.open(archive, "rb") as fh:
        train_split, valid_split, _ = pickle.load(fh, encoding="latin-1")
    x_train, y_train = train_split
    x_valid, y_valid = valid_split
    return map(torch.tensor, (x_train, y_train, x_valid, y_valid))

def normalize(inp, mean, std):
    """Standardize `inp`: subtract `mean`, then divide the result by `std`."""
    centered = inp - mean
    return centered / std

def stats(inp):
    """Return the pair `(inp.mean(), inp.std())` as a quick sanity check on `inp`."""
    mean, std = inp.mean(), inp.std()
    return mean, std

In [24]:
torch.nn.modules.conv._ConvNd.reset_parameters??

In [8]:
### Step-1: Get data
x_train, y_train, x_valid, y_valid = get_data(url=MNIST_URL)

### Step-2: Normalize your data accurately
### (the validation set is normalized with the TRAINING mean/std on purpose)
mean_train, std_train = stats(inp=x_train)
x_train, x_valid = (
    normalize(inp=split, mean=mean_train, std=std_train)
    for split in (x_train, x_valid)
)

In [16]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

(torch.Size([50000, 784]),
 torch.Size([50000]),
 torch.Size([10000, 784]),
 torch.Size([10000]))

In [18]:
### Step-3: reshape the flat 784-value rows into (N, 1, 28, 28) images using "view()"
x_train, x_valid = (flat.view(-1, 1, 28, 28) for flat in (x_train, x_valid))

In [19]:
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

(torch.Size([50000, 1, 28, 28]),
 torch.Size([50000]),
 torch.Size([10000, 1, 28, 28]),
 torch.Size([10000]))

In [21]:
### Step-4: Gather all the needed training params as vars
num_train = x_train.shape[0]     ### size of the first (sample) dimension
num_c = y_train.max() + 1        ### largest label + 1 (stays a 0-dim tensor)
num_hidden = 32


In [23]:
(num_train, num_c)

(50000, tensor(10))

In [25]:
layer1 = torch.nn.Conv2d??

In [29]:
layer1 = torch.nn.Conv2d(in_channels=1, out_channels=num_hidden, kernel_size=(5,5), bias=True)

In [30]:
layer1

Conv2d(1, 32, kernel_size=(5, 5), stride=(1, 1))

In [31]:
x = x_valid[:5]

In [32]:
x.shape

torch.Size([5, 1, 28, 28])

In [33]:
layer1.weight.shape

torch.Size([32, 1, 5, 5])

In [65]:
### the stats should be about (0 mean, 1 variance) if its normalized
stats(inp=x)

(tensor(-0.0608), tensor(0.9159))

In [66]:
stats(layer1.weight)

(tensor(0.0057, grad_fn=<MeanBackward0>),
 tensor(0.1155, grad_fn=<StdBackward0>))

In [67]:
stats(layer1.bias)

(tensor(0.0075, grad_fn=<MeanBackward0>),
 tensor(0.1174, grad_fn=<StdBackward0>))

In [68]:
### BUT, the stats of weight & bias are not (0 mean, 1 variance): their std is only ~0.12
### That's not good. So we need to initialize our weights and biases in a better way, so that the
### layer's output ends up close to zero-mean-unit-var

In [69]:
### lets also check the stats of the output of layer1
out1 = layer1(x)
stats(out1)

(tensor(0.0167, grad_fn=<MeanBackward0>),
 tensor(0.6492, grad_fn=<StdBackward0>))

In [76]:
init.kaiming_normal_(tensor=layer1.weight, a=1.)         ### inplace initialization
stats(layer1(x))

### here we'll notice that after we do kaiming_normal weight init, the stats of the layer output improves
### which shows the importance of proper weight initialization

(tensor(0.0022, grad_fn=<MeanBackward0>),
 tensor(1.0853, grad_fn=<StdBackward0>))

In [85]:
### to experiment, let's create a function that gives the output of the conv layer `layer1` followed by a (leaky) ReLU

import torch.nn.functional as F

def f1(inp, a=0.):
    """Run `inp` through the global `layer1`, then a leaky ReLU with negative slope `a`.

    With the default `a=0.` the activation is a plain ReLU. `layer1` is looked
    up at call time, so re-assigning it in a later cell changes what `f1` computes.
    """
    pre_activation = layer1(inp)
    return F.leaky_relu(pre_activation, negative_slope=a)

In [110]:
### Case-1: using kaiming initialization with ReLU
layer1 = torch.nn.Conv2d(in_channels=1, out_channels=num_hidden, kernel_size=(5,5), bias=True)
init.kaiming_normal_(layer1.weight, a=0.)    ### a=0. matches the plain ReLU that f1 applies by default
out1 = f1(x)
stats(out1)

(tensor(0.4895, grad_fn=<MeanBackward0>),
 tensor(0.8645, grad_fn=<StdBackward0>))

In [106]:
### Case-2: NO kaiming initialization (the layer keeps whatever init Conv2d gives it at construction)
layer1 = torch.nn.Conv2d(in_channels=1, out_channels=num_hidden, kernel_size=(5,5), bias=True)
out2 = f1(x)      ### remember: f1 reads the global layer1, so this will use the new layer1 defined above
stats(out2)

(tensor(0.1782, grad_fn=<MeanBackward0>),
 tensor(0.3726, grad_fn=<StdBackward0>))

In [97]:
### The Case-2 results above are not good. They show the importance of kaiming initialization
### when ReLU layers are involved.

### In Case-1: kaiming_normal_ init & a=0 ReLU  -> std ~= 0.86 (close to 1)  => GOOD
###            (the mean is ~0.49 rather than 0, because ReLU zeroes out all negative values)
### In Case-2: default init         & a=0 ReLU  -> std ~= 0.37 (far from 1)  => BAD

### The above experiment demonstrates the importance of KAIMING initialization when ReLU layers are present

In [102]:
### Case-3: kaiming init computed with a=1. while f1 still applies its default a=0 ReLU
layer1 = torch.nn.Conv2d(in_channels=1, out_channels=num_hidden, kernel_size=(5,5), bias=True)
init.kaiming_normal_(layer1.weight, a=1.)    ### the init assumes NO ReLU (since a=1.); f1 below still uses ReLU
out3 = f1(x)
stats(out3)

(tensor(0.3575, grad_fn=<MeanBackward0>),
 tensor(0.6456, grad_fn=<StdBackward0>))

## Designing the Kaiming Initialization:

### Receptive Field Size (`rec_fs`)

In [119]:
### the first two dims of the conv weight are (number of filters, number of input channels)
num_filters, num_inp, *_ = layer1.weight.shape

### receptive field size = number of elements in the kernel of one (filter, input-channel) pair
rec_fs = layer1.weight[0,0].numel()

In [120]:
num_filters, num_inp

(32, 1)

In [121]:
rec_fs

25

In [122]:
### fan_in  = input channels  x receptive field size
### fan_out = output filters  x receptive field size
fan_in  = num_inp     * rec_fs
fan_out = num_filters * rec_fs

In [123]:
fan_in, fan_out

(25, 800)

In [124]:
def gain(a):
    """Return sqrt(2 / (1 + a^2)), the Kaiming gain for a leaky ReLU of negative slope `a`."""
    denominator = 1.0 + a**2
    return math.sqrt(2.0 / denominator)

In [133]:
gain(1), gain(0), gain(0.1), gain(0.01), gain(math.sqrt(5.)) 

(1.0,
 1.4142135623730951,
 1.4071950894605838,
 1.4141428569978354,
 0.5773502691896257)

In [134]:
gain(math.sqrt(5.)), 1/math.sqrt(3.)

(0.5773502691896257, 0.5773502691896258)

In [139]:
### Defining the updated Kaiming Init with math.sqrt(5) factor

def kaiming2(params, a, use_fan_out=False):
    """Re-initialise `params` in place with a Kaiming-style uniform distribution.

    Args:
        params: weight tensor laid out as (num_filters, num_inputs, *kernel_dims).
            It needs at least two dimensions, because `params[0, 0]` is used to
            measure the receptive field size.
        a: negative slope of the leaky ReLU following the layer; forwarded to `gain`.
        use_fan_out: if True scale by fan-out (num_filters * receptive field),
            otherwise by fan-in (num_inputs * receptive field).

    Values are drawn from U(-bound, bound) with bound = sqrt(3) * gain(a) / sqrt(fan),
    i.e. a uniform distribution whose standard deviation is gain(a) / sqrt(fan).
    Nothing is returned; `params` is modified in place.
    """
    num_f, num_i, *_ = params.shape
    rec_fs = params[0, 0].numel()
    fan = (num_f if use_fan_out else num_i) * rec_fs
    std = gain(a) / math.sqrt(fan)
    bound = std * math.sqrt(3.)
    # Overwrite the values under no_grad rather than through `.data`: the write
    # is still excluded from the autograd graph, but autograd is aware of it.
    with torch.no_grad():
        params.uniform_(-bound, bound)
    

In [161]:
kaiming2(layer1.weight, a=0)
stats(f1(x))

(tensor(0.4749, grad_fn=<MeanBackward0>),
 tensor(0.8379, grad_fn=<StdBackward0>))

In [162]:
kaiming2(layer1.weight, a=math.sqrt(5.))
stats(f1(x))

(tensor(0.1973, grad_fn=<MeanBackward0>),
 tensor(0.3415, grad_fn=<StdBackward0>))

In [164]:
class Flatten(torch.nn.Module):
    """Collapse the input into a single 1-D tensor.

    Every dimension is merged, the batch dimension included: an input of
    shape (N, 1, 1, 1) comes out with shape (N,).
    """
    def forward(self, x):
        return x.view(x.numel())

In [166]:
### Small conv net producing one value per image
### (the spatial sizes in the comments below assume 28x28 inputs)
model = torch.nn.Sequential(
    torch.nn.Conv2d(in_channels=1, out_channels=8, kernel_size=5, stride=2, padding=2),      ### 28x28 -> 14x14
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, stride=2, padding=2),     ### 14x14 -> 8x8
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=16, out_channels=32, kernel_size=3, stride=2, padding=2),    ### 8x8 -> 5x5
    torch.nn.ReLU(),
    torch.nn.Conv2d(in_channels=32, out_channels=1, kernel_size=3, stride=2, padding=1),     ### 5x5 -> 3x3, single channel
    torch.nn.AdaptiveAvgPool2d(1),                                                           ### average each map down to 1x1
    Flatten(),                                                                               ### (N, 1, 1, 1) -> (N,)
)

In [170]:
### take the first 100 validation samples as a small batch
x = x_valid[:100]
y = y_valid[:100].float()    ### labels cast to float -- presumably so mse can use them as the target (confirm)
x.shape, y.shape

(torch.Size([100, 1, 28, 28]), torch.Size([100]))

In [171]:
out = model(x)
stats(out)

(tensor(0.0307, grad_fn=<MeanBackward0>),
 tensor(0.0030, grad_fn=<StdBackward0>))

In [175]:
### backward() ADDS to any existing .grad, so clear old gradients first; this keeps the
### stats of model[0].weight.grad meaningful when the forward + backward cells are re-run
model.zero_grad()
loss = mse(preds=out, target=y)
loss.backward()

In [178]:
stats(model[0].weight.grad)

### The above results show that, without Kaiming Initialization, both the model outputs and the
### gradients of the weights have very bad mean, std

(tensor(0.0004), tensor(0.0272))

In [179]:
### So, lets formulate a way to include Kaiming Initialization inside conv layers

for layer in model:
    if isinstance(layer, torch.nn.Conv2d):
        init.kaiming_normal_(layer.weight)     ### default a=0, i.e. the gain for a plain ReLU
        if layer.bias is not None:             ### a conv built with bias=False has no bias to reset
            init.zeros_(layer.bias)            ### in-place zero fill without going through `.data`

In [186]:
out = model(x)
stats(out)

(tensor(-0.2093, grad_fn=<MeanBackward0>),
 tensor(0.1644, grad_fn=<StdBackward0>))

In [187]:
### backward() ADDS to the existing .grad, so drop the gradients left over from the
### pre-Kaiming backward pass; otherwise the stats below would mix both runs
model.zero_grad()
loss = mse(preds=out, target=y)
loss.backward()
stats(model[0].weight.grad)

(tensor(0.2714), tensor(1.1720))

In [188]:
### This above cell shows that the inclusion of Kaiming Init leads to better mean & std even for gradients
### which is a highly necessary factor in proper training

## Export

In [189]:
!python notebook_to_script.py imflash217__02_why_sqrt5.ipynb

Converted imflash217__02_why_sqrt5.ipynb to exp/nb_02_why_sqrt5.py
