In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
#export
from exp.nb_03_batchnorm import *

# Layerwise Sequential Unit Variance (LSUV)

In [3]:
# Load MNIST as flat train/validation tensors (get_data and MNIST_URL come from the earlier notebooks).
x_train, y_train, x_valid, y_valid = get_data(url=MNIST_URL)

# NOTE(review): normalize_to presumably normalizes both splits with the training-set statistics - confirm.
x_train, x_valid = normalize_to(x_train, x_valid)
train_ds = Dataset(x_train, y_train)
valid_ds = Dataset(x_valid, y_valid)

# Plain int. A trailing comma here (`nh = 50,`) would silently bind the tuple (50,) instead.
# NOTE(review): presumably the hidden-layer width; it is not used by the cells visible in this notebook.
nh = 50
bs = 512
# Number of classes = largest label + 1.
c = y_train.max().item() + 1
loss_func = F.cross_entropy

data = DataBunch(*get_dls(train_ds, valid_ds, bs=bs), c=c)


In [4]:
# NOTE(review): view_tfm is defined in an earlier notebook; presumably it reshapes each
# flat input batch to (1, 28, 28) images - confirm.
mnist_view = view_tfm(1, 28, 28)
# Callback classes/factories passed to get_learn_run below via `cbs=`.
cbfs = [Recorder,
        partial(AvgStatsCallback, accuracy),
        partial(BatchTransformXCallback, mnist_view)]

In [5]:
# Output channel count of each successive conv layer (see the Conv2d shapes printed for `mods` below).
nfs = [8, 16, 32, 64, 64]

In [6]:
class ConvLayer(torch.nn.Module):
    """A Conv2d followed by a GeneralReLU, exposing `weight` and `bias` handles.

    `weight` is the convolution's weight tensor. `bias` is NOT the convolution's
    own bias: it is the negated `sub` value of the GeneralReLU, so writing to
    `bias` changes the activation's `sub` setting.
    """

    def __init__(self, ni, nf, ks=3, stride=2, sub=0., **kwargs):
        """Build the conv (`ni` -> `nf` channels, kernel `ks`) and its activation.

        `sub` and any extra keyword arguments are forwarded to GeneralReLU.
        """
        super().__init__()
        half_kernel = ks // 2
        self.conv = torch.nn.Conv2d(ni, nf, ks, stride=stride, padding=half_kernel, bias=True)
        self.relu = GeneralReLU(sub=sub, **kwargs)

    def forward(self, x):
        """Apply the convolution, then the activation."""
        conv_out = self.conv(x)
        return self.relu(conv_out)

    @property
    def weight(self):
        """Weight tensor of the wrapped convolution."""
        return self.conv.weight

    @property
    def bias(self):
        """Negated `sub` value of the activation."""
        return -self.relu.sub

    @bias.setter
    def bias(self, v):
        # Stored with the opposite sign, mirroring the getter.
        self.relu.sub = -v

In [7]:
# Baseline: build a learner/runner from ConvLayer blocks, without LSUV.
learner, run = get_learn_run(data=data, nfs=nfs, lr=0.6, layer=ConvLayer, cbs=cbfs)

In [8]:
# Train the baseline for two epochs to get reference metrics for the LSUV comparison.
run.fit(epochs=2, learner=learner)

epoch=[0/2]:	train: [1.92355484375, tensor(0.3376)]	valid: [0.593430224609375, tensor(0.8023)]
epoch=[1/2]:	train: [0.31706408203125, tensor(0.9010)]	valid: [0.289682470703125, tensor(0.9099)]


```
Now we recreate our model and try with LSUV. 
Hopefully we'll get better results.
```

In [9]:
# Recreate the learner/runner with the same settings so that LSUV is applied to a new model.
learner, run = get_learn_run(data=data, nfs=nfs, lr=0.6, layer=ConvLayer, cbs=cbfs)

```
We only want the outputs of Convolutional or Linear layers. To find them we need a recursive procedure.
We can use sum(list, []) to concatenate the lists that the recursive calls return.
(sum applies the + operator between the elements of the list, starting from the initial value given as the 2nd argument.)

```

In [14]:
#export
def get_batch(dl, run):
    """Fetch the first batch of `dl` and pass it through the runner's "begin_batch" stage.

    The batch is stored on `run` as `run.xb` / `run.yb`, every callback in
    `run.cbs` is attached to `run`, and `run("begin_batch")` is invoked.
    Returns `(run.xb, run.yb)` as they are after that call.
    """
    first_batch = next(iter(dl))
    run.xb, run.yb = first_batch
    for callback in run.cbs:
        callback.set_runner(run)
    run("begin_batch")
    # Read the attributes again: the "begin_batch" stage may have replaced them.
    return run.xb, run.yb

In [15]:
# One training batch, after the runner's "begin_batch" stage has been applied to it.
xb, yb = get_batch(data.train_dl, run)

In [16]:
#export
def find_modules(module, cond):
    """Recursively collect the modules under `module` for which `cond` is true.

    A matching module is returned as a one-element list and is not searched
    any further; otherwise the matches found in each child are concatenated
    in child order.
    """
    if not cond(module):
        per_child = [find_modules(child, cond) for child in module.children()]
        # sum(..., []) concatenates the per-child lists, starting from an empty list.
        return sum(per_child, [])
    return [module]

def is_lin_layer(l):
    """Return True if `l` is a Conv1d/Conv2d/Conv3d, Linear or ReLU module.

    NOTE(review): ReLU is accepted here even though it is not a linear layer - confirm this is intended.
    """
    nn = torch.nn
    return isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear, nn.ReLU))

In [17]:
# Collect every ConvLayer in the model; these are the modules LSUV will initialize.
mods = find_modules(module=learner.model, cond=lambda m: isinstance(m, ConvLayer))

In [18]:
# Display the ConvLayer modules that were found.
mods

[ConvLayer(
   (conv): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
   (relu): GeneralReLU()
 ), ConvLayer(
   (conv): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralReLU()
 ), ConvLayer(
   (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralReLU()
 ), ConvLayer(
   (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralReLU()
 ), ConvLayer(
   (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
   (relu): GeneralReLU()
 )]

```
This is the helper function to grab the mean & std of the output of a hooked layer
```

In [19]:
#export
def append_stat(hook, mod, inp, out):
    """Hook function: store the mean and std of a layer's output on `hook`.

    The cell is tagged for export because the exported `lsuv_module` passes
    this function to `Hook`; without the tag the generated script would not
    define it.

    Args:
        hook: object that receives the `mean` and `std` attributes (Python floats).
        mod:  the hooked module (unused).
        inp:  the module's input (unused).
        out:  the module's output tensor.
    """
    # detach() gives a view with no autograd history; it is the recommended replacement for `.data`.
    d = out.detach()
    hook.mean = d.mean().item()
    hook.std = d.std().item()

```
Now we can look at the mean & std of the Conv layers of the model.
```

In [20]:
# Hook every ConvLayer, run one forward pass, then print each layer's output mean and std.
# NOTE(review): Hooks is used as a context manager; presumably it removes the hooks on exit - confirm.
with Hooks(mods, append_stat) as hooks:
    learner.model(xb)  # forward pass only to trigger the hooks; the output is discarded
    for hook in hooks:
        print(hook.mean, hook.std)

0.4540443420410156 0.9798216819763184
0.36190351843833923 0.68576979637146
0.3136413097381592 0.5385108590126038
0.28830426931381226 0.4851055443286896
0.20897021889686584 0.31631138920783997


```
We first adjust the bias term to make the mean 0, then we adjust the std to make it 1 (each to within a tolerance of 1e-3).
The "... is not None" clause in each while-condition is always true; it is just there to pass "xb" through the model and compute all the activations so that the hook's stats get updated before they are checked.
```

In [21]:
#export
def lsuv_module(module, xb, model=None, tol=1e-3):
    """Apply LSUV initialization to one layer of a model.

    The layer's `bias` is shifted until the mean of its output is within
    `tol` of 0, then its `weight` is rescaled until the std of its output is
    within `tol` of 1. The statistics are measured by hooking `module` and
    running `xb` through the whole model.

    Args:
        module: layer to initialize; must expose `bias` and `weight`.
        xb:     input batch fed to the model.
        model:  model that contains `module`. If omitted, the notebook-level
                `learner.model` is used, which keeps existing calls working.
        tol:    tolerance for both the mean and the std targets.

    Returns:
        `(mean, std)` of the layer's output as last recorded by the hook.

    Raises:
        NameError: if `model` is omitted and no global `learner` is defined,
            e.g. when this function is used from the exported script.
    """
    if model is None:
        try:
            model = learner.model
        except NameError:
            raise NameError(
                "lsuv_module: no global `learner` is defined; pass `model=` explicitly"
            ) from None

    h = Hook(module, append_stat)
    try:
        # `model(xb) is not None` is always true; the call is there to run a forward
        # pass so the hook refreshes h.mean / h.std before each check.
        while model(xb) is not None and abs(h.mean) > tol:
            module.bias -= h.mean
        while model(xb) is not None and abs(h.std - 1) > tol:
            module.weight.data /= h.std
    finally:
        # Always detach the hook, even if a forward pass raises.
        h.remove()
    return h.mean, h.std

```
We execute that initialization on all conv layers in order
```

In [22]:
# Initialize the layers one at a time, in the order find_modules returned them,
# and print the (mean, std) that lsuv_module reports for each.
for mod in mods:
    print(lsuv_module(mod, xb))

(0.009352052584290504, 1.0)
(0.18078379333019257, 1.0)
(0.15545712411403656, 1.0)
(0.14422819018363953, 1.0)
(0.3141314685344696, 1.0)


```
NOTE: the mean does not stay at 0, because we change the std afterwards by scaling the weights.
```

```
NOTE: Now our training is beginning on better grounds... :)
```

In [23]:
%time run.fit(epochs=2, learner=learner)

epoch=[0/2]:	train: [0.5126955078125, tensor(0.8355)]	valid: [0.1474990234375, tensor(0.9528)]
epoch=[1/2]:	train: [0.11361818359375, tensor(0.9644)]	valid: [0.09101828002929688, tensor(0.9719)]
CPU times: user 30.1 s, sys: 8.16 s, total: 38.3 s
Wall time: 13.2 s


```
We can see from the above output that the training & validation accuracy have also improved.
```

```
LSUV is particularly useful for more complex and deeper architectures that are hard to initialize to get unit variance at the last layer
```

# Export

In [24]:
!python notebook_to_script.py imflash217__04_lsuv.ipynb

Converted imflash217__04_lsuv.ipynb to exp/nb_04_lsuv.py
Converted imflash217__04_lsuv.ipynb to exp/nb_04_lsuv.py
Converted imflash217__04_lsuv.ipynb to exp/nb_04_lsuv.py
Converted imflash217__04_lsuv.ipynb to exp/nb_04_lsuv.py
