In [13]:
from collections import Counter

import nltk
import numpy as np
import torch
import torch.nn as nn
import torch.utils.data as data
import torchvision.models as models
from pycocotools.coco import COCO
from torchvision import transforms

from data_loader import get_loader

# Watch for any changes in model.py, and re-load it automatically.
%load_ext autoreload
%autoreload 2

# Import EncoderCNN and DecoderRNN. 
from model import EncoderCNN, DecoderRNN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## `EncoderCNN`

### model

We're going to create a pretrained `resnet50`, freeze all of its layers, and replace the last linear layer.

In [4]:
# Build torchvision's ResNet-50 and request its pretrained weights.
resnet = models.resnet50(pretrained=True)

We can see that our model contains a linear layer, `resnet.fc`, that generates scores for the ImageNet classes (it has `out_features=1000`). It is not frozen yet, of course.

In [6]:
# Show the final fully connected layer of the network.
resnet.fc

Linear(in_features=2048, out_features=1000, bias=True)

In [7]:
# Check whether the final layer's weights are tracked for gradients (before freezing).
resnet.fc.weight.requires_grad

True

Let's freeze our layers. We can see that the final linear layer is now frozen.

In [8]:
# Turn off gradient tracking on every parameter so the pretrained weights stay fixed.
for param in resnet.parameters():
    param.requires_grad = False

In [9]:
# Re-check the final layer after the freezing loop above has run.
resnet.fc.weight.requires_grad

False

What kind of linear layer do we need instead? Judging from the picture in the 2nd notebook, the output of the `CNN` is used as an input to the `RNN` (not as its hidden state), so we need to set `out_features=embed_size`.

In [10]:
# Collect the top-level children of the ResNet and discard the last one
# (the classification head), keeping everything before it.
modules = list(resnet.children())
del modules[-1]

# Chain the remaining layers into a single feature extractor.
encoder_resnet = nn.Sequential(*modules)

In [11]:
# Width of the image embedding produced by the encoder.
embed_size = 256

# New linear layer mapping the ResNet feature width to the embedding width.
encoder_embed = nn.Linear(in_features=resnet.fc.in_features,
                          out_features=embed_size)

In [12]:
# Show the newly created embedding layer.
encoder_embed

Linear(in_features=2048, out_features=256, bias=True)

### forward pass

Let's now get some data and try to run a forward pass.

In [16]:
# Number of image/caption pairs per batch.
batch_size = 10

# Pre-processing pipeline applied to each training image.
transform_train = transforms.Compose([
    # Resize so that the smaller edge of the image is 256 pixels.
    transforms.Resize(256),
    # Take a 224x224 crop from a random location.
    transforms.RandomCrop(224),
    # Flip the image horizontally with probability 0.5.
    transforms.RandomHorizontalFlip(),
    # Convert the PIL Image to a tensor.
    transforms.ToTensor(),
    # Normalize the image for the pre-trained model.
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
])

# Build the training data loader, loading the vocabulary from file
# (this runs much faster than building it from scratch).
data_loader = get_loader(
    transform=transform_train,
    mode='train',
    batch_size=batch_size,
    vocab_from_file=True,
)


Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.76s)
creating index...


  0%|          | 806/414113 [00:00<00:51, 8057.43it/s]

index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:47<00:00, 8648.21it/s]


In [17]:
# NOTE: `torch.utils.data` (as `data`) is imported in the notebook's top import cell.

# Randomly sample a caption length, and sample indices with that length.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)

# Replace the sampler inside the loader's batch sampler so that the next batch
# is drawn from the sampled indices only.
new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
data_loader.batch_sampler.sampler = new_sampler

# Obtain the batch.
images, captions = next(iter(data_loader))

print('images.shape:', images.shape)
print('captions.shape:', captions.shape)

# (Optional) Uncomment the lines of code below to print the pre-processed images and captions.
# print('images:', images)
# print('captions:', captions)

sampled indices: [217166, 388582, 246112, 277267, 193339, 125984, 105138, 288341, 148747, 103694]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 14])


Let's start with the `CNN` part of the `encoder`. It returns a `4D` tensor that we need to reshape into a `2D` one. We can do this with `torch.flatten()` or manually with `view()`, as in the assignment code.

In [20]:
# Run the image batch through the truncated ResNet (everything except the final linear layer).
cnn_out = encoder_resnet(images)

In [21]:
# Inspect the shape of the raw CNN output.
cnn_out.shape

torch.Size([10, 2048, 1, 1])

In [24]:
# Manual flattening: keep the batch dimension and collapse all remaining ones.
cnn_out.view(cnn_out.shape[0], -1).shape

torch.Size([10, 2048])

In [23]:
# Same result with torch.flatten: collapse every dimension from index 1 onwards.
torch.flatten(cnn_out, start_dim=1).shape

torch.Size([10, 2048])

Finally, let's apply our linear layer.

In [25]:
# Flatten the CNN output to one feature vector per image (batch dimension kept).
cnn_out_flatten = torch.flatten(cnn_out, start_dim=1)

In [26]:
# Project the flattened CNN features through the embedding layer.
cnn_out_embed = encoder_embed(cnn_out_flatten)

In [27]:
# Inspect the shape of the embedded features.
cnn_out_embed.shape

torch.Size([10, 256])

## `DecoderRNN`