In [21]:
import torch
import torch.nn as nn
import torchvision.models as models
from pycocotools.coco import COCO
import nltk
from data_loader import get_loader
from torchvision import transforms
from collections import Counter

# Watch for any changes in model.py, and re-load it automatically.
%load_ext autoreload
%autoreload 2

# Import EncoderCNN and DecoderRNN. 
from model import EncoderCNN, DecoderRNN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## `EncoderCNN`

### model

We're going to load a pretrained `resnet50`, freeze all of its layers, and replace the last linear layer.

In [22]:
resnet = models.resnet50(pretrained=True)

We can see that the model contains a linear layer, `resnet.fc`, that produces scores for the ImageNet classes (it has `out_features=1000`). It is not frozen yet, of course.

In [23]:
resnet.fc

Linear(in_features=2048, out_features=1000, bias=True)

In [24]:
resnet.fc.weight.requires_grad

True

Let's freeze all the layers. We can see that the final linear layer is now frozen.

In [25]:
# Switch off gradient tracking for every pretrained weight so the backbone stays fixed.
for param in resnet.parameters():
    param.requires_grad = False

In [26]:
resnet.fc.weight.requires_grad

False

What kind of linear layer do we need instead? Judging by the picture in the 2nd notebook, the output of the `CNN` is used as an input to the `RNN` (not as its hidden state), so we need to set `out_features=embed_size`.

In [27]:
# Collect the child modules, discard the last one (the `fc` classifier head),
# and chain the remaining layers into a feature extractor.
modules = list(resnet.children())
del modules[-1]
encoder_resnet = nn.Sequential(*modules)

In [28]:
# Size of the image embedding that is fed to the decoder.
embed_size = 256
# Trainable projection from the ResNet feature size to the embedding size.
encoder_embed = nn.Linear(in_features=resnet.fc.in_features,
                          out_features=embed_size)

In [29]:
encoder_embed

Linear(in_features=2048, out_features=256, bias=True)

### forward pass

Let's now get some data and try to run forward pass.

In [30]:
# Pre-processing applied to each training image before it reaches the CNN.
transform_train = transforms.Compose([
    transforms.Resize(size=256),                        # shorter image side -> 256 px
    transforms.RandomCrop(size=224),                    # random 224x224 patch
    transforms.RandomHorizontalFlip(),                  # mirror horizontally with probability 0.5
    transforms.ToTensor(),                              # PIL image -> tensor
    transforms.Normalize(mean=(0.485, 0.456, 0.406),    # statistics expected by the pre-trained model
                         std=(0.229, 0.224, 0.225)),
])

# Number of image/caption pairs per batch.
batch_size = 10

# Build the training data loader, re-using the vocabulary file instead of rebuilding it.
data_loader = get_loader(transform=transform_train,
                         mode='train',
                         batch_size=batch_size,
                         vocab_from_file=True)


Vocabulary successfully loaded from vocab.pkl file!
loading annotations into memory...
Done (t=0.84s)
creating index...


  0%|          | 718/414113 [00:00<00:57, 7179.73it/s]

index created!
Obtaining caption lengths...


100%|██████████| 414113/414113 [00:50<00:00, 8175.10it/s]


In [31]:
import numpy as np
import torch.utils.data as data

# Ask the dataset for one batch of indices: it picks a caption length at random
# and returns indices of captions that have that length.
indices = data_loader.dataset.get_train_indices()
print('sampled indices:', indices)

# Restrict the loader to exactly those indices for the next batch.
new_sampler = data.sampler.SubsetRandomSampler(indices)
data_loader.batch_sampler.sampler = new_sampler

# Pull that single batch and report the tensor shapes.
images, captions = next(iter(data_loader))
print('images.shape:', images.shape)
print('captions.shape:', captions.shape)

sampled indices: [207795, 106154, 170323, 358553, 232329, 370980, 93021, 51322, 74488, 58299]
images.shape: torch.Size([10, 3, 224, 224])
captions.shape: torch.Size([10, 11])


Let's start with the `CNN` part of the `encoder`. It returns a `4D` tensor that we have to reshape into a `2D` one. We can do this with `torch.flatten()` or manually with `view()`, as in the assignment code.

In [32]:
cnn_out = encoder_resnet(images)

In [33]:
cnn_out.shape

torch.Size([10, 2048, 1, 1])

In [34]:
cnn_out.view(cnn_out.size(0), -1).shape

torch.Size([10, 2048])

In [35]:
torch.flatten(cnn_out, 1).shape

torch.Size([10, 2048])

Finally let's apply our linear layer.

In [36]:
cnn_out_flatten = torch.flatten(cnn_out, 1)

In [37]:
cnn_out_embed = encoder_embed(cnn_out_flatten)

In [38]:
cnn_out_embed.shape

torch.Size([10, 256])

## `DecoderRNN`

The output of the `CNN` can be used in different ways. In this assignment we follow the picture in step `4`:

- we remove the `<end>` token from our captions;
- we use the output of the `CNN` as the first word of each caption.

### get features

In [39]:
images.shape, captions.shape

(torch.Size([10, 3, 224, 224]), torch.Size([10, 11]))

In [40]:
embed_size

256

Let's create an encoder and get our `CNN` output.

In [41]:
encoder = EncoderCNN(embed_size=embed_size)

In [58]:
features = encoder(images=images)

In [59]:
features.shape

torch.Size([10, 256])

### captions

Let's first embed our captions.

In [46]:
len(data_loader.dataset.vocab)

9955

In [47]:
vocab_size = len(data_loader.dataset.vocab)

In [48]:
embedding = nn.Embedding(num_embeddings=vocab_size,
                         embedding_dim=embed_size)

In [49]:
captions_embed = embedding(captions)

In [50]:
captions_embed.shape

torch.Size([10, 11, 256])

In [51]:
# Drop the last token of every caption, so that the sequence regains its original
# length once the image feature is prepended.
# Embedding a slice of `captions` (instead of re-slicing `captions_embed`) keeps
# this cell idempotent: re-running it never strips additional tokens.
captions_embed = embedding(captions[:, :-1])

Now let's concatenate them. We can see that after concatenation the first element of `captions_embed` indeed contains our features from the `CNN`.

In [60]:
features.shape, captions_embed.shape

(torch.Size([10, 256]), torch.Size([10, 10, 256]))

In [61]:
# Insert a length-1 sequence axis: (batch, embed) -> (batch, 1, embed).
# reshape (rather than unsqueeze) yields the same shape when the cell is re-run
# on an already 3-D `features`, so the cell is idempotent.
features = features.reshape(features.size(0), 1, -1)

In [62]:
features.shape

torch.Size([10, 1, 256])

In [63]:
features[0, 0, :5]

tensor([-0.2905, -0.1864, -0.1545,  0.1422, -0.0187], grad_fn=<SliceBackward>)

In [64]:
captions_embed[0, 0, :5]

tensor([-0.7915,  0.0590,  1.3937, -1.5682,  0.4048], grad_fn=<SliceBackward>)

In [65]:
# Prepend the image feature as the first "word" of every caption.
# The caption part is rebuilt from `captions` (last token dropped) rather than read
# back from `captions_embed`, so re-running this cell cannot prepend the features
# a second time and silently grow the sequence.
captions_embed = torch.cat((features, embedding(captions[:, :-1])), dim=1)

In [66]:
captions_embed.shape

torch.Size([10, 11, 256])

In [67]:
captions_embed[0, 0, :5]

tensor([-0.2905, -0.1864, -0.1545,  0.1422, -0.0187], grad_fn=<SliceBackward>)

In [68]:
captions_embed[0, 1, :5]

tensor([-0.7915,  0.0590,  1.3937, -1.5682,  0.4048], grad_fn=<SliceBackward>)

### scores

Finally, we have to pass `captions_embed` to our `LSTM` and get `scores` by applying a linear projection to the `LSTM` output. The important point here is that we apply the `linear` layer to a whole sequence. This is possible because `nn.Linear` acts on the last dimension only; its documentation states:

> Input: $(N, *, H_{in})$ where $*$ means any number of additional dimensions and $H_{in} = \text{in\_features}$. Output: $(N, *, H_{out})$ where $H_{out} = \text{out\_features}$.

In [75]:
hidden_size=512

In [70]:
lstm = nn.LSTM(input_size=embed_size,
               hidden_size=hidden_size,
               num_layers=1,
               batch_first=True)

In [71]:
lstm_out, _ = lstm(captions_embed)

In [72]:
lstm_out.shape

torch.Size([10, 11, 512])

In [76]:
linear = nn.Linear(in_features=hidden_size,
                   out_features=vocab_size)

In [78]:
scores = linear(lstm_out)

In [85]:
lstm_out.shape, scores.shape

(torch.Size([10, 11, 512]), torch.Size([10, 11, 9955]))

Let's check that we use the same weights for all outputs in the sequence.

In [80]:
W, b = linear.weight, linear.bias

In [82]:
scores_man = torch.zeros_like(scores)

In [86]:
# Recompute the projection one time step at a time with the layer's own weights.
# The number of steps is read from `lstm_out` instead of being hard-coded: the
# caption length is sampled at random when the batch is drawn, so it can differ
# between runs of the notebook.
for t in range(lstm_out.size(1)):
    scores_man[:, t, :] = torch.mm(lstm_out[:, t, :], W.t()) + b

In [92]:
scores.detach().numpy()[0, 0, :]

array([ 0.00589012,  0.01097454, -0.00724187, ..., -0.01234332,
       -0.02735836,  0.07523744], dtype=float32)

In [93]:
scores_man.detach().numpy()[0, 0, :]

array([ 0.00589011,  0.01097453, -0.00724187, ..., -0.01234331,
       -0.02735836,  0.07523744], dtype=float32)

In [97]:
# Compare the layer output with the manual per-step result, element by element.
torch.allclose(scores, scores_man, atol=1e-06)

True

This concludes our debugging of the model.