<a href="https://colab.research.google.com/github/lagom-QB/M11/blob/master/Practice_6_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Modern convolutional architectures

# keywords: resnet, inception trick, batchnorm, image normalization

Here is a resnet implementation (see also https://github.com/pytorch/vision/blob/master/torchvision/models/resnet.py).

In [0]:
import torch.nn as nn
import torch.nn.functional as F
import math
import torch

In [0]:
def conv3x3(in_planes, out_planes, stride=1):
    "3x3 convolution with padding"
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=1, bias=True)

class Flatten(nn.Module):
  def forward(self, x):
    batch_size = x.shape[0]
    return x.view(batch_size, -1)
    
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None):
        super(Bottleneck, self).__init__()
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=True)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
                               padding=1, bias=True)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=True)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            residual = self.downsample(x)

        out += residual
        out = self.relu(out)

        return out

class FBResNet(nn.Module):

    def __init__(self, block, layers, num_classes=1000):
        self.inplanes = 16
        # Special attributs
        self.input_space = None
        self.input_size = (299, 299, 3)
        self.mean = None
        self.std = None
        super(FBResNet, self).__init__()
        # Modules
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                                bias=True)
        self.bn1 = nn.BatchNorm2d(16)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.last_linear = nn.Linear(512 * block.expansion, num_classes)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.flatten = Flatten()
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')


    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=True),
                nn.BatchNorm2d(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes))

        return nn.Sequential(*layers)

    def features(self, input):
        x = self.conv1(input)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.avgpool(x)
        x = self.flatten(x)
        return x

    def logits(self, features):
        return self.last_linear(features)

    def forward(self, input):
        x = self.features(input)
        x = self.logits(x)
        return x


In [0]:
def fbresnet18(num_classes=1000):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = FBResNet(BasicBlock, [2, 2, 2, 2], num_classes=num_classes)
    return model


def fbresnet34(num_classes=1000):
    """Constructs a ResNet-34 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = FBResNet(BasicBlock, [3, 4, 6, 3], num_classes=num_classes)
    return model


def fbresnet50(num_classes=1000):
    """Constructs a ResNet-50 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = FBResNet(Bottleneck, [3, 4, 6, 3], num_classes=num_classes)
    return model


def fbresnet101(num_classes=1000):
    """Constructs a ResNet-101 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = FBResNet(Bottleneck, [3, 4, 23, 3], num_classes=num_classes)
    return model

Note: there is no dropout layers here. Usually dropouts and batchnorms does not work well together. You may use dropouts in the dense classifier in the end, but using dropout before the batchnorm is usually a bad idea.

In [0]:
resnet18 = fbresnet18()

Let's count the number of the parameters:

In [6]:
sum(p.numel() for p in resnet18.parameters() if p.requires_grad)

3271496

In [0]:
resnet18

In [0]:
test_batch = torch.ones((4, 3, 224, 224))

In [0]:
resnet18(test_batch)

In [0]:
%load_ext tensorboard


In [0]:
!rm -r logs

In [0]:
%tensorboard --logdir logs

In [0]:
from torch.utils.tensorboard import SummaryWriter


In [0]:
writer = SummaryWriter(log_dir="logs/resnet18_graph")
writer.add_graph(resnet18, test_batch)
writer.close()

In [0]:
resnet18.par

# Image normalization

As there is batchnorm after every layer, it's easier to train the network (lr is similar for all layers). Except the first one.

To fix this we add normalization to test/train transform.
Below is the usual transform for imagenet dataset (it's better to use it if you use pretrained network).

In [0]:
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)
transforms.Normalize(mean, std)

If you train from scratch it's better to compute mean and dispersion for the dataset you are training on.

# Updates for cifar

Perception field of `7` is not needed for `32`x`32` images, also we do not want to decrease our resolution in half, so let's set `stride=1`.



In [0]:
 self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=True)

resnet18 has way too many parameters for the cifar. There is `64` channels after the first convolution. Let's decrease this number to `16`.
Also there are two blocks with `64` channels, two blocks with `128` , with `256` and with `512` channels.
For cifar `16`, `32`, `64`, `128` will be enough.

In [0]:
self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=True)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)

=>

In [0]:
self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1, bias=True)
self.layer1 = self._make_layer(block, 16, layers[0])
self.layer2 = self._make_layer(block, 32, layers[1], stride=2)
self.layer3 = self._make_layer(block, 64, layers[2], stride=2)
self.layer4 = self._make_layer(block, 128, layers[3], stride=2)

Also, we are dividing dimensions of the image by `2` three times.
This means that in the last group of blocks we are working with `4`x`4` images, i.e. our network is almost dense.
It's benefitial to skip `self.layer4` altogether.
To keep network deep enough, let's create `3` blocks in each other layer, instead of `2`.

In [0]:
def fbresnet20(num_classes=1000):
    """Constructs a ResNet-18 model.
    Args:
        pretrained (bool): If True, returns a model pre-trained on ImageNet
    """
    model = FBResNetCifar(BasicBlock, [3, 3, 3], num_classes=num_classes)
    return model

# Assignment[10]

1. Shape arithmetic [2]:

a) How will the layer shape change after apllying `Conv2d(3, 64, kernel_size=3, stride=2, padding=1, bias=True)`?

b) What padding should we use, so that `Conv2d(3, 64, kernel_size=17, stride=2, padding=?, bias=True)` transforms a tensor with shape `[64, 3, 2x, 2x]` to the tensor with shape `[64, 64, x, x]`?

c) What will be dimensions of the tensor `[64, 3, 4, 4]` after we `Flatten` it?

d) In resnet blocks of different layer work with tensors of different shape. How does the number of parameters per block changes between layers1, layers2, layers3, layers4?

2. Implement the changes we discussed in practice in FBResNetCifar and create a resnet20 model. Compute the number of its parameters.[2]

3. Compute mean and standard deviation for each channel for cifar10 train dataset. [2]

4. Train resnet20 network on cifar dataset for at least 80% accuracy. Normalize input according to values computed in (3). [2]

5. Replace 3x3 convolutions in resnet20 by 3x1 and 1x3 convolution, using inception trick. Compute the number of its parameters. Train this model on cifar dataset for the same number of epochs with the same optimizer. Normalize the input. [2]