In [1]:
import torch
import torch.nn as nn
import torchvision
from torchsummary import summary

import os
import json
import h5py
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from collections import Counter

from datasets import CaptionDataset
from models import Encoder, Attention, DecoderWithAttention

%load_ext autoreload
%autoreload 2

# 01 Encoder

## 01-1 Last layers 

In [2]:
# Load torchvision's ResNet-101 with pretrained weights; the cells below inspect its layers.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# `weights=` — confirm against the installed version before changing.
resnet101 = torchvision.models.resnet101(pretrained=True)

In [8]:
# Inspect the pooling layer: it reduces each feature map to 1x1.
resnet101.avgpool

AdaptiveAvgPool2d(output_size=(1, 1))

In [9]:
# Inspect the final fully connected layer (2048 input features -> 1000 outputs).
resnet101.fc

Linear(in_features=2048, out_features=1000, bias=True)

To change the layers we use the `children()` method. As we can see below, `children[:-2]` is in fact all the layers except the last two. So we can remove the `fc` layer completely and replace the pooling layer.

In [12]:
# Materialise the model's top-level child modules so they can be indexed and sliced.
children = [*resnet101.children()]

In [14]:
# Number of top-level child modules in the network.
len(children)

10

In [16]:
# The last two top-level modules: the pooling layer followed by the linear layer.
children[-2], children[-1]

(AdaptiveAvgPool2d(output_size=(1, 1)),
 Linear(in_features=2048, out_features=1000, bias=True))

Is there another way to do this? The method used above seems to be very popular, but we basically wrap the model in `Sequential` twice and mess with the structure of `resnet`. Is there another approach?

If we don't have to remove the `Linear` layer, we can just replace it: `model.fc = Linear(...)`. But in our case we don't need this `Linear` layer at all. So we may use a slightly modified [approach](https://discuss.pytorch.org/t/how-to-delete-layer-in-pretrained-model/17648/2):

```python
class Identity(nn.Module):
    """Pass-through module whose forward pass returns its input unchanged.

    Assigning an instance over an existing layer (e.g. ``model.fc``) makes
    that layer a no-op while leaving the rest of the model intact.
    """

    def __init__(self):
        # No parameters or buffers; only the base-class setup is needed.
        super().__init__()

    def forward(self, x):
        """Return ``x`` as-is (the same object, not a copy)."""
        return x


model = models.resnet18(pretrained=False)
model.fc = Identity()
```

## 01-2 Pooling

We're changing `AdaptiveAvgPool2d(output_size=(1, 1))` to `output_size=(14, 14)`:
- for a `256x256` input, the output of `resnet101` before the original pooling is `(2048, 8, 8)`, and after it - `(2048, 1, 1)`;
- but the output with the new pooling will be `(2048, 14, 14)` - it is not clear at all what we're doing here; since the `(14, 14)` target is larger than the `(8, 8)` input, it's probably not really averaging but rather mostly repeating (or blending neighbouring) values, i.e. upsampling; I'm not sure this is correct; I'd just leave the output as it is if we need this `(8, 8)` shape for some purpose;

In [18]:
# Adaptive average pooling that always yields a 14x14 spatial grid,
# whatever the spatial size of the incoming feature map.
pool14 = nn.AdaptiveAvgPool2d((14, 14))

In [19]:
# Run the new pooling on an all-zero dummy batch of shape (1, 2048, 8, 8);
# only the resulting shape matters here, not the values.
out14 = pool14(torch.zeros((1, 2048, 8, 8)))

In [20]:
# Check the pooled output shape (batch, channels, height, width).
out14.shape

torch.Size([1, 2048, 14, 14])

# Appendix. Summary of the model

In [7]:
# Display the full module tree of the network (long output).
resnet101

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [6]:
# Print per-layer output shapes and parameter counts for a 3x256x256 input.
# NOTE(review): torchsummary is believed to default to device="cuda", and the model
# is not moved to a GPU in the visible cells — confirm whether device="cpu" is needed.
summary(resnet101, (3, 256, 256))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 128, 128]           9,408
       BatchNorm2d-2         [-1, 64, 128, 128]             128
              ReLU-3         [-1, 64, 128, 128]               0
         MaxPool2d-4           [-1, 64, 64, 64]               0
            Conv2d-5           [-1, 64, 64, 64]           4,096
       BatchNorm2d-6           [-1, 64, 64, 64]             128
              ReLU-7           [-1, 64, 64, 64]               0
            Conv2d-8           [-1, 64, 64, 64]          36,864
       BatchNorm2d-9           [-1, 64, 64, 64]             128
             ReLU-10           [-1, 64, 64, 64]               0
           Conv2d-11          [-1, 256, 64, 64]          16,384
      BatchNorm2d-12          [-1, 256, 64, 64]             512
           Conv2d-13          [-1, 256, 64, 64]          16,384
      BatchNorm2d-14          [-1, 256,