# Layerwise Sequential Unit Variance (LSUV)

In [7]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
#export
from exp.nb_07 import *

## 1. Get the MNIST data and prepare to build a CNN

[Jump to lesson 11 video](https://course.fast.ai/videos/?lesson=11&t=235)

In [9]:
# Load the MNIST training and validation tensors.
x_train,y_train,x_valid,y_valid = get_data()

# Normalize both splits with `normalize_to`
# (presumably using the training set's mean/std for both -- TODO confirm against exp.nb_07).
x_train,x_valid = normalize_to(x_train,x_valid)
train_ds,valid_ds = Dataset(x_train, y_train),Dataset(x_valid, y_valid)

# nh: presumably the hidden-layer width (not a layer count) -- unused in this notebook.
# bs: batch size handed to get_dls below.
nh,bs = 50,512


# Number of classes: largest label + 1 (assumes labels are 0-based integers).
c = y_train.max().item()+1

# Loss function (kept for reference; this cell only defines the name).
loss_func = F.cross_entropy

# Bundle the train/valid DataLoaders and the class count into a DataBunch.
data = DataBunch(*get_dls(train_ds, valid_ds, bs), c)

In [10]:
# Transform used by BatchTransformXCallback below; presumably reshapes each flattened
# MNIST batch to (batch, 1, 28, 28) so it can feed a Conv2d -- TODO confirm view_tfm.
mnist_view = view_tfm(1,28,28)

# Callback *factories* (classes / partials), not instances: they are passed as
# `cbs=cbfs` to get_learn_run when the learner and runner are created.
cbfs = [Recorder,
        partial(AvgStatsCallback,accuracy),
        CudaCallback,
        partial(BatchTransformXCallback, mnist_view)]

In [11]:
# list number of channels for each desired convolution layer
nfs = [8,16,32,64,64]

## 2. Review documentation from the fastai library 

In [12]:
conv_layer??
Signature: conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs)
Docstring: <no docstring>
Source:   
def conv_layer(ni, nf, ks=3, stride=2, bn=True, **kwargs):
    layers = [nn.Conv2d(ni, nf, ks, padding=ks//2, stride=stride, bias=not bn),
              GeneralRelu(**kwargs)]
    if bn: layers.append(nn.BatchNorm2d(nf, eps=1e-5, momentum=0.1))
    return nn.Sequential(*layers)
File:      c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_07.py
Type:      function

SyntaxError: invalid syntax (<ipython-input-12-6a9cf9ddb02c>, line 3)

In [None]:
nn.Sequential??
Init signature: nn.Sequential(*args)
Source:        
class Sequential(Module):
    r"""A sequential container.
    Modules will be added to it in the order they are passed in the constructor.
    Alternatively, an ordered dict of modules can also be passed in.

    To make it easier to understand, here is a small example::

        # Example of using Sequential
        model = nn.Sequential(
                  nn.Conv2d(1,20,5),
                  nn.ReLU(),
                  nn.Conv2d(20,64,5),
                  nn.ReLU()
                )

        # Example of using Sequential with OrderedDict
        model = nn.Sequential(OrderedDict([
                  ('conv1', nn.Conv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', nn.Conv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))
    """

    def __init__(self, *args):
        super(Sequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def _get_item_by_idx(self, iterator, idx):
        """Get the idx-th item of the iterator"""
        size = len(self)
        idx = operator.index(idx)
        if not -size <= idx < size:
            raise IndexError('index {} is out of range'.format(idx))
        idx %= size
        return next(islice(iterator, idx, None))

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
        else:
            return self._get_item_by_idx(self._modules.values(), idx)

    def __setitem__(self, idx, module):
        key = self._get_item_by_idx(self._modules.keys(), idx)
        return setattr(self, key, module)

    def __delitem__(self, idx):
        if isinstance(idx, slice):
            for key in list(self._modules.keys())[idx]:
                delattr(self, key)
        else:
            key = self._get_item_by_idx(self._modules.keys(), idx)
            delattr(self, key)

    def __len__(self):
        return len(self._modules)

    def __dir__(self):
        keys = super(Sequential, self).__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def forward(self, input):
        for module in self._modules.values():
            input = module(input)
        return input
File:           c:\users\cross-entropy\anaconda3\envs\fastai\lib\site-packages\torch\nn\modules\container.py
Type:           type
Subclasses:     ConvReLU2d, LinearReLU, ConvBn2d, ConvBnReLU2d

In [None]:
GeneralRelu??
Init signature: GeneralRelu(leak=None, sub=None, maxv=None)
Docstring:     
Base class for all neural network modules.

Your models should also subclass this class.

Modules can also contain other Modules, allowing to nest them in
a tree structure. You can assign the submodules as regular attributes::

    import torch.nn as nn
    import torch.nn.functional as F

    class Model(nn.Module):
        def __init__(self):
            super(Model, self).__init__()
            self.conv1 = nn.Conv2d(1, 20, 5)
            self.conv2 = nn.Conv2d(20, 20, 5)

        def forward(self, x):
            x = F.relu(self.conv1(x))
            return F.relu(self.conv2(x))

Submodules assigned in this way will be registered, and will have their
parameters converted too when you call :meth:`to`, etc.
Source:        
class GeneralRelu(nn.Module):
    def __init__(self, leak=None, sub=None, maxv=None):
        super().__init__()
        self.leak,self.sub,self.maxv = leak,sub,maxv

    def forward(self, x):
        x = F.leaky_relu(x,self.leak) if self.leak is not None else F.relu(x)
        if self.sub is not None: x.sub_(self.sub)
        if self.maxv is not None: x.clamp_max_(self.maxv)
        return x
File:           c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_06.py
Type:           type
Subclasses:     

In [None]:
get_learn_run??

Signature:
get_learn_run(
    nfs,
    data,
    lr,
    layer,
    cbs=None,
    opt_func=None,
    uniform=False,
    **kwargs,
)
Docstring: <no docstring>
Source:   
def get_learn_run(nfs, data, lr, layer, cbs=None, opt_func=None, uniform=False, **kwargs):
    model = get_cnn_model(data, nfs, layer, **kwargs)
    init_cnn(model, uniform=uniform)
    return get_runner(model, data, lr=lr, cbs=cbs, opt_func=opt_func)
File:      c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_07.py
Type:      function

In [None]:
get_cnn_model??

Signature: get_cnn_model(data, nfs, layer, **kwargs)
Docstring: <no docstring>
Source:   
def get_cnn_model(data, nfs, layer, **kwargs):
    return nn.Sequential(*get_cnn_layers(data, nfs, layer, **kwargs))
File:      c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_06.py
Type:      function


In [None]:
dir(cbfs[0])

## 3. Build  a Baseline CNN model

In [13]:
class ConvLayer(nn.Module):
    """A `Conv2d` followed by a `GeneralRelu`, exposing `weight` and `bias` for LSUV.

    `weight` is the convolution's weight tensor. `bias` is *not* the convolution's
    own bias: it is the negated `sub` attribute of the `GeneralRelu`, so assigning
    to `bias` changes the `sub` value stored on the activation module.

    Args:
        ni: number of input channels.
        nf: number of output channels (filters).
        ks: kernel size; the padding is `ks//2`.
        stride: convolution stride.
        sub: initial `sub` value forwarded to `GeneralRelu`.
        **kwargs: forwarded unchanged to `GeneralRelu`.
    """

    def __init__(self, ni, nf, ks=3, stride=2, sub=0., **kwargs):
        super().__init__()
        pad = ks // 2
        self.conv = nn.Conv2d(ni, nf, ks, padding=pad, stride=stride, bias=True)
        self.relu = GeneralRelu(sub=sub, **kwargs)

    def forward(self, x):
        """Apply the convolution, then the activation."""
        pre_activation = self.conv(x)
        return self.relu(pre_activation)

    @property
    def bias(self):
        """Negated `sub` of the activation module."""
        return -self.relu.sub

    @bias.setter
    def bias(self, v):
        # Stored with the opposite sign so that `bias` reads back as `v`.
        self.relu.sub = -v

    @property
    def weight(self):
        """Weight tensor of the wrapped convolution."""
        return self.conv.weight

In [31]:
# set up for training
learn,run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

Now we're going to look at the paper [All You Need is a Good Init](https://arxiv.org/pdf/1511.06422.pdf), which introduces *Layer-wise Sequential Unit-Variance* (*LSUV*). We initialize our neural net with the usual technique, then we pass a batch through the model and check the outputs of the linear and convolutional layers. We can then rescale the weights according to the actual variance we observe on the activations, and subtract the mean we observe from the initial bias. That way we will have activations that stay normalized.

We repeat this process until we are satisfied with the mean/variance we observe.

Let's start by looking at a baseline:

In [32]:
%%time
# train
run.fit(2, learn)

train: [2.27331109375, tensor(0.1960, device='cuda:0')]
valid: [2.120380078125, tensor(0.2680, device='cuda:0')]
train: [1.150715546875, tensor(0.6335, device='cuda:0')]
valid: [0.2563557373046875, tensor(0.9246, device='cuda:0')]
Wall time: 2.91 s


## 4. Layer-wise sequential Unit Variance (LSUV)
Now we recreate our model and we'll try again with LSUV. Hopefully, we'll get better results!

In [33]:
# set up for training again
learn,run = get_learn_run(nfs, data, 0.6, ConvLayer, cbs=cbfs)

### break from code to check some things

In [None]:
dir(learn)

In [None]:
learn.model

In [None]:
dir(run)

In [None]:
run.cbs

In [None]:
run.avg_stats

In [None]:
run.batch_transform_x

In [None]:
run.fit

In [None]:
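# why do run.model, run.data, run.loss_func,run.opt return AttributeError: 'Runner' object has no attribute 'learn'
# Answer: these properties all read `self.learn` (see the Runner source below), and `Runner.fit` is the
# only place that assigns it -- at the start of a fit -- before resetting it to None when the fit ends.
# This freshly created `run` has not been fit yet, so the attribute does not exist.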

In [None]:
run.model

In [None]:
run.data

In [None]:
run.loss_func

In [None]:
run.opt

In [None]:
run.all_batches

In [None]:
run.stop

In [None]:
cbfs[0].set_runner

In [None]:
Callback.set_runner??
Signature: Callback.set_runner(self, run)
Docstring: <no docstring>
Source:        def set_runner(self, run): self.run=run
File:      c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_05b.py
Type:      function

In [None]:
cbfs[0].set_runner

In [None]:
run??
Signature:   run(cb_name)
Type:        Runner
String form: <exp.nb_05b.Runner object at 0x000002A0D70316A0>
File:        c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_05b.py
Source:     
class Runner():
    def __init__(self, cbs=None, cb_funcs=None):
        cbs = listify(cbs)
        for cbf in listify(cb_funcs):
            cb = cbf()
            setattr(self, cb.name, cb)
            cbs.append(cb)
        self.stop,self.cbs = False,[TrainEvalCallback()]+cbs

    # Question: how does run have access to the Learner object?
    @property
    def opt(self):       return self.learn.opt
    @property
    def model(self):     return self.learn.model
    @property
    def loss_func(self): return self.learn.loss_func
    @property
    def data(self):      return self.learn.data

    def one_batch(self, xb, yb):
        try:
            self.xb,self.yb = xb,yb
            self('begin_batch')
            self.pred = self.model(self.xb)
            self('after_pred')
            self.loss = self.loss_func(self.pred, self.yb)
            self('after_loss')
            if not self.in_train: return
            self.loss.backward()
            self('after_backward')
            self.opt.step()
            self('after_step')
            self.opt.zero_grad()
        except CancelBatchException: self('after_cancel_batch')
        finally: self('after_batch')

    def all_batches(self, dl):
        self.iters = len(dl)
        try:
            for xb,yb in dl: self.one_batch(xb, yb)
        except CancelEpochException: self('after_cancel_epoch')

    def fit(self, epochs, learn):
        self.epochs,self.learn,self.loss = epochs,learn,tensor(0.) 

        try:
            for cb in self.cbs: cb.set_runner(self)
            self('begin_fit')
            for epoch in range(epochs):
                self.epoch = epoch
                if not self('begin_epoch'): self.all_batches(self.data.train_dl)

                with torch.no_grad():
                    if not self('begin_validate'): self.all_batches(self.data.valid_dl)
                self('after_epoch')

        except CancelTrainException: self('after_cancel_train')
        finally:
            self('after_fit')
            self.learn = None

    def __call__(self, cb_name):
        res = False
        # Question: won't the following line always return res = False?
        for cb in sorted(self.cbs, key=lambda x: x._order): res = cb(cb_name) and res
        return res

### continue with code
Helper function to get one batch of a given dataloader, with the callbacks called to preprocess it.

In [17]:
#export
def get_batch(dl, run):
    """Fetch the first batch of `dl` and run the `begin_batch` callbacks on it.

    The batch is stored on `run` as `run.xb` / `run.yb`, every callback in
    `run.cbs` is attached to `run`, and `run('begin_batch')` is fired so the
    callbacks can preprocess the batch. Returns the resulting `(run.xb, run.yb)`.
    """
    first_x, first_y = next(iter(dl))
    run.xb = first_x
    run.yb = first_y
    # Associate each callback with this Runner before any event is fired.
    for callback in run.cbs:
        callback.set_runner(run)
    run('begin_batch')
    # Read the batch back from `run`: the callbacks may have replaced it.
    return run.xb, run.yb

In [18]:
# get a batch
xb,yb = get_batch(data.train_dl, run)

In [None]:
print(xb.size())
print(xb[0,0,0,:])

We only want the outputs of convolutional or linear layers. To find them, we need a recursive function. We can use `sum(list, [])` to concatenate the lists the function finds (`sum` applies the `+` operator between the elements of the list you pass it, beginning with the initial state in the second argument).

In [40]:
#export
def find_modules(m, cond):
    """Recursively collect the modules under `m` (inclusive) that satisfy `cond`.

    If `cond(m)` is true, `[m]` is returned and `m`'s children are not searched.
    Otherwise the matches found in each child (via `m.children()`) are
    concatenated in order.
    """
    if cond(m):
        return [m]
    matches_per_child = [find_modules(child, cond) for child in m.children()]
    # `sum` with a `[]` start value concatenates the per-child lists.
    return sum(matches_per_child, [])

def is_lin_layer(l):
    """Return True if `l` is a convolutional (1d/2d/3d) or fully-connected layer.

    Only layers that apply a learned linear map are accepted. Activation
    modules such as `nn.ReLU` are deliberately excluded: they have no
    weights to rescale and are not "linear layers" in the sense used by LSUV.
    """
    lin_layers = (nn.Conv1d, nn.Conv2d, nn.Conv3d, nn.Linear)
    return isinstance(l, lin_layers)

In [41]:
mods = find_modules(learn.model, lambda o: isinstance(o,ConvLayer))

In [None]:
mods

This is a helper function to grab the mean and std of the output of a hooked layer.

In [42]:
def append_stat(hook, mod, inp, outp):
    """Forward-hook callback: record the mean and std of a layer's output.

    The statistics are taken over all elements of `outp.data` and stored as
    Python floats in `hook.mean` and `hook.std`, overwriting earlier values.
    `mod` and `inp` are part of the hook signature but are not used.
    """
    activations = outp.data
    mean_value = activations.mean().item()
    std_value = activations.std().item()
    hook.mean = mean_value
    hook.std = std_value

In [43]:
# put model on the GPU
mdl = learn.model.cuda()

## Break from code to look up some docs in the fastai library

In [None]:
dir(mdl)

In [None]:
mdl??
Signature:   mdl(*input, **kwargs)
Type:        Sequential
String form:
Sequential(
  (0): ConvLayer(
    (conv): Conv2d(1, 8, kernel_size=(5, 5), stride=(2, 2), padding=(2, 2))
    (relu): GeneralRelu()
  )
  (1): ConvLayer(
    (conv): Conv2d(8, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (2): ConvLayer(
    (conv): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (3): ConvLayer(
    (conv): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (4): ConvLayer(
    (conv): Conv2d(64, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (relu): GeneralRelu()
  )
  (5): AdaptiveAvgPool2d(output_size=1)
  (6): Lambda()
  (7): Linear(in_features=64, out_features=10, bias=True)
)
Length:      8
File:        c:\users\cross-entropy\anaconda3\envs\fastai\lib\site-packages\torch\nn\modules\container.py
Source:     
class Sequential(Module):
    r"""A sequential container.
    Modules will be added to it in the order they are passed in the constructor.
    Alternatively, an ordered dict of modules can also be passed in.

    To make it easier to understand, here is a small example::

        # Example of using Sequential
        model = nn.Sequential(
                  nn.Conv2d(1,20,5),
                  nn.ReLU(),
                  nn.Conv2d(20,64,5),
                  nn.ReLU()
                )

        # Example of using Sequential with OrderedDict
        model = nn.Sequential(OrderedDict([
                  ('conv1', nn.Conv2d(1,20,5)),
                  ('relu1', nn.ReLU()),
                  ('conv2', nn.Conv2d(20,64,5)),
                  ('relu2', nn.ReLU())
                ]))
    """

    def __init__(self, *args):
        super(Sequential, self).__init__()
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                self.add_module(key, module)
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def _get_item_by_idx(self, iterator, idx):
        """Get the idx-th item of the iterator"""
        size = len(self)
        idx = operator.index(idx)
        if not -size <= idx < size:
            raise IndexError('index {} is out of range'.format(idx))
        idx %= size
        return next(islice(iterator, idx, None))

    def __getitem__(self, idx):
        if isinstance(idx, slice):
            return self.__class__(OrderedDict(list(self._modules.items())[idx]))
        else:
            return self._get_item_by_idx(self._modules.values(), idx)

    def __setitem__(self, idx, module):
        key = self._get_item_by_idx(self._modules.keys(), idx)
        return setattr(self, key, module)

    def __delitem__(self, idx):
        if isinstance(idx, slice):
            for key in list(self._modules.keys())[idx]:
                delattr(self, key)
        else:
            key = self._get_item_by_idx(self._modules.keys(), idx)
            delattr(self, key)

    def __len__(self):
        return len(self._modules)

    def __dir__(self):
        keys = super(Sequential, self).__dir__()
        keys = [key for key in keys if not key.isdigit()]
        return keys

    def forward(self, input):
        for module in self._modules.values():
            input = module(input)
        return input

In [None]:
Hooks??
Init signature: Hooks(ms, f)
Docstring:      <no docstring>
Source:        
class Hooks(ListContainer):
    def __init__(self, ms, f): super().__init__([Hook(m, f) for m in ms])
    def __enter__(self, *args): return self
    def __exit__ (self, *args): self.remove()
    def __del__(self): self.remove()

    def __delitem__(self, i):
        self[i].remove()
        super().__delitem__(i)

    def remove(self):
        for h in self: h.remove()
File:           c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_06.py
Type:           type
Subclasses:     

In [None]:
# look up register_forward_hook 
learn.model.register_forward_hook??
Signature: learn.model.register_forward_hook(hook)
Source:   
    def register_forward_hook(self, hook):
        r"""Registers a forward hook on the module.

        The hook will be called every time after :func:`forward` has computed an output.
        It should have the following signature::

            hook(module, input, output) -> None or modified output

        The hook can modify the output. It can modify the input inplace but
        it will not have effect on forward since this is called after
        :func:`forward` is called.

        Returns:
            :class:`torch.utils.hooks.RemovableHandle`:
                a handle that can be used to remove the added hook by calling
                ``handle.remove()``
        """
        handle = hooks.RemovableHandle(self._forward_hooks)
        self._forward_hooks[handle.id] = hook
        return handle
File:      c:\users\cross-entropy\anaconda3\envs\fastai\lib\site-packages\torch\nn\modules\module.py
Type:      method


In [None]:
Hook??
Init signature: Hook(m, f)
Docstring:      <no docstring>
Source:        
class Hook():
    def __init__(self, m, f): self.hook = m.register_forward_hook(partial(f, self))
    def remove(self): self.hook.remove()
    def __del__(self): self.remove()
File:           c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_06.py
Type:           type
Subclasses:     

In [None]:
ListContainer??
Init signature: ListContainer(items)
Docstring:      <no docstring>
Source:        
class ListContainer():
    def __init__(self, items): self.items = listify(items)
    def __getitem__(self, idx):
        if isinstance(idx, (int,slice)): return self.items[idx]
        if isinstance(idx[0],bool):
            assert len(idx)==len(self) # bool mask
            return [o for m,o in zip(idx,self.items) if m]
        return [self.items[i] for i in idx]
    def __len__(self): return len(self.items)
    def __iter__(self): return iter(self.items)
    def __setitem__(self, i, o): self.items[i] = o
    def __delitem__(self, i): del(self.items[i])
    def __repr__(self):
        res = f'{self.__class__.__name__} ({len(self)} items)\n{self.items[:10]}'
        if len(self)>10: res = res[:-1]+ '...]'
        return res
File:           c:\users\cross-entropy\fastai\course-v3\nbs\dl2\exp\nb_06.py
Type:           type
Subclasses:     Hooks

##  resume with the code
So now we can look at the mean and std of the conv layers of our model.

In [44]:
# Push one batch through the model with a hook on every ConvLayer and print the
# mean/std each hook recorded. This is a forward pass only -- nothing is trained.
with Hooks(mods, append_stat) as hooks:
    # forward pass: triggers append_stat on each hooked module
    mdl(xb)
    for hook in hooks: print(hook.mean,hook.std)

0.4256686270236969 0.8805520534515381
0.34560641646385193 0.7743778228759766
0.27013126015663147 0.5498278141021729
0.32408612966537476 0.5131123661994934
0.2721932530403137 0.3936083912849426


We first rescale the weights to make the stds 1, then we adjust the bias terms to make the means 0 (each to within a tolerance of 1e-3). The `mdl(xb) is not None` clause is just there to pass `xb` through `mdl` and compute all the activations so that the hooks get updated. 

In [27]:
learn.model[7].bias

Parameter containing:
tensor([ 0.0149, -0.2131,  0.1780, -0.0543,  0.0442,  0.1368, -0.0295,  0.0206,
         0.1515,  0.0179], device='cuda:0', requires_grad=True)

In [45]:
#export
# LSUV algorithm
def lsuv_module(m, xb, mdl=None, tol=1e-3, max_iter=100):
    """Apply LSUV initialisation to a single module `m`.

    A hook on `m` records the mean/std of its output while the batch `xb` is
    pushed through the whole model. The weights are rescaled until the output
    std is within `tol` of 1, then the bias is shifted until the output mean is
    within `tol` of 0.

    Args:
        m: module to initialise; must expose `weight` (a tensor) and `bias`.
        xb: input batch that is fed to the full model.
        mdl: the model containing `m`. When omitted, the module-level global
            `mdl` is used, which keeps existing `lsuv_module(m, xb)` calls in
            this notebook working. Pass it explicitly when calling from code
            where no such global exists.
        tol: tolerance on `|std - 1|` and on `|mean|`.
        max_iter: upper bound on the number of adjustments per statistic, so a
            non-converging layer cannot loop forever.

    Returns:
        `(mean, std)` of `m`'s output as recorded by the last forward pass.
    """
    if mdl is None:
        # Backward compatibility: fall back to the notebook-level model.
        mdl = globals()["mdl"]

    h = Hook(m, append_stat)
    try:
        # Variance first, then mean. `mdl(xb) is not None` only serves to run a
        # forward pass so the hook refreshes h.mean / h.std before each test.
        # It is evaluated before the iteration cap, so the statistics are fresh
        # even when the cap is what ends the loop.
        steps = 0
        while mdl(xb) is not None and abs(h.std - 1) > tol and steps < max_iter:
            m.weight.data /= h.std
            steps += 1

        steps = 0
        while mdl(xb) is not None and abs(h.mean) > tol and steps < max_iter:
            m.bias -= h.mean
            steps += 1
    finally:
        # Always detach the hook, even if a forward pass raised.
        h.remove()
    return h.mean, h.std

We execute that initialization on all the conv layers in order:

In [46]:
for m in mods: print(lsuv_module(m, xb))

(0.05774246156215668, 1.0)
(0.07110263407230377, 0.9999999403953552)
(0.17730598151683807, 0.9999998211860657)
(0.1192900538444519, 1.0000001192092896)
(0.2652382254600525, 0.9999999403953552)


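Note that in the output above the mean doesn't stay exactly at 0. That is what happens when the mean is corrected first and the standard deviation is changed afterwards by scaling the weights, so the output shown appears to come from that original ordering.
Note from jcat: But if you modify the code to scale the standard deviations first, then correct the mean (as `lsuv_module` above does), you achieve near-perfect normalization.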

Training then begins on better grounds, and with LSUV we get improved accuracy after 2 epochs compared to the initial run without it.

In [47]:
# train with LSUV
%time run.fit(2, learn)

train: [0.4852893359375, tensor(0.8433, device='cuda:0')]
valid: [0.18036103515625, tensor(0.9479, device='cuda:0')]
train: [0.112610205078125, tensor(0.9645, device='cuda:0')]
valid: [0.09935478515625, tensor(0.9698, device='cuda:0')]
Wall time: 2.83 s


LSUV is particularly useful for more complex and deeper architectures that are hard to initialize to get unit variance at the last layer.

## Export

In [None]:
!python notebook2script.py 07a_lsuv.ipynb