In [212]:
# As usual, a bit of setup
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import torch

from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.rnn_layers import *
from cs231n.captioning_solver import CaptioningSolver
from cs231n.classifiers.rnn import CaptioningRNN
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url
from cs231n.layers import *
from cs231n.rnn_layers import *

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between `x` and `y`.

    The relative error of a pair of entries is |x - y| / (|x| + |y|); the
    denominator is floored at 1e-8 so that entries which are both (close to)
    zero do not cause a division by zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


That's a pretty involved question so let's do some additional debugging. Hopefully it will also be useful for tackling captioning in `pytorch` on a real (`20G`) dataset (an upcoming project from `Udacity`).

## Microsoft COCO

Let's first have a look at our dataset:

- first of all, we're working with the 2014 dataset: `80K` training images (we skip our validation statistics here for simplicity);
- we have features from `VGG16` (full and reduced) as described in the assignment; the reduced features are `pca`-reduced;
- we have `5` captions per image: `400K` captions in total; captions are padded or truncated to `17` words and represented as lists of indices into a vocabulary; we can see their content with `decode_captions()`; the files with the somewhat misleading `images` titles contain the index of the image for each caption;
- we have a URL for each training image rather than the image itself, as specified in the assignment;

One last question: is there any difference between these files and the original files (apart from the dimensionality reduction)? That's not quite clear from the files themselves, so let's postpone this question.

### files

In [124]:
data_dir = Path.home() / 'data/coco_captioning'

In [125]:
list(data_dir.glob('*'))

[PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_urls.txt'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/coco2014_vocab.json'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_urls.txt'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_vgg16_fc7.h5'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_vgg16_fc7.h5'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_vgg16_fc7_pca.h5'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_images.txt'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_images.txt'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/coco2014_captions.h5'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_vgg16_fc7_pca.h5')]

In [126]:
list(data_dir.glob('*fc7*'))

[PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_vgg16_fc7.h5'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_vgg16_fc7.h5'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_vgg16_fc7_pca.h5'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_vgg16_fc7_pca.h5')]

In [127]:
list(data_dir.glob('*vocab*'))

[PosixPath('/Users/ilyarudyak/data/coco_captioning/coco2014_vocab.json')]

In [128]:
list(data_dir.glob('*captions*'))

[PosixPath('/Users/ilyarudyak/data/coco_captioning/coco2014_captions.h5')]

In [129]:
list(data_dir.glob('*images*'))

[PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_images.txt'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_images.txt')]

In [130]:
list(data_dir.glob('*urls*'))

[PosixPath('/Users/ilyarudyak/data/coco_captioning/val2014_urls.txt'),
 PosixPath('/Users/ilyarudyak/data/coco_captioning/train2014_urls.txt')]

### data

In [131]:
data = load_coco_data(pca_features=True)

In [132]:
data.keys()

dict_keys(['train_captions', 'train_image_idxs', 'val_captions', 'val_image_idxs', 'train_features', 'val_features', 'idx_to_word', 'word_to_idx', 'train_urls', 'val_urls'])

### features

In [133]:
data['train_features'].shape

(82783, 512)

### vocabulary

In [134]:
len(data['idx_to_word'])

1004

In [135]:
[data['idx_to_word'][i] for i in range(10)]

['<NULL>', '<START>', '<END>', '<UNK>', 'a', 'on', 'of', 'the', 'in', 'with']

### captions

In [136]:
data['train_captions'].shape

(400135, 17)

In [137]:
data['train_captions'][0]

array([  1,   4, 142, 510,  10, 667, 415, 277,  58,   2,   0,   0,   0,
         0,   0,   0,   0], dtype=int32)

In [138]:
decode_captions(data['train_captions'][0], data['idx_to_word'])

'<START> a very clean and well decorated empty bathroom <END>'

In [139]:
data['idx_to_word'][0]

'<NULL>'

### urls

In [140]:
data['train_urls'].shape

(82783,)

In [141]:
data['train_urls'][0]

'http://farm4.staticflickr.com/3153/2970773875_164f0c0b83_z.jpg'

In [142]:
# url = data['train_urls'][0]
# plt.imshow(image_from_url(url));

### images

In [143]:
data['train_image_idxs'].shape

(400135,)

In [144]:
data['train_image_idxs'][:10]

array([53314, 21548, 53314, 21548, 43077, 43077, 53314, 53314, 44042,
       16413], dtype=int32)

In [145]:
# url = data['train_urls'][43077]
# plt.imshow(image_from_url(url));

In [146]:
decode_captions(data['train_captions'][4:6], data['idx_to_word'])

['<START> a <UNK> stop sign across the street from a red car <END>',
 '<START> a <UNK> stop sign and a red <UNK> on the road <END>']

Let's finally extract all captions for this image.

In [147]:
caps = data['train_captions'][data['train_image_idxs'] == 43077]

In [148]:
decode_captions(caps, data['idx_to_word'])

['<START> a <UNK> stop sign across the street from a red car <END>',
 '<START> a <UNK> stop sign and a red <UNK> on the road <END>',
 '<START> a red stop sign with a <UNK> <UNK> <UNK> under the <UNK> stop <END>',
 '<START> a stop sign that has been <UNK> is <UNK> in front of a parked car <END>',
 '<START> a street sign <UNK> to <UNK> stop <UNK> <END>']

### small data

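So it looks like this small dataset contains only 50 captions, each one for a different image (so 50 images with 1 caption per image). Note that only the captions and their image indices are subsampled; the feature and URL arrays keep their full size.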

In [149]:
np.random.seed(231)
small_data = load_coco_data(max_train=50)

In [150]:
small_data.keys()

dict_keys(['train_captions', 'train_image_idxs', 'val_captions', 'val_image_idxs', 'train_features', 'val_features', 'idx_to_word', 'word_to_idx', 'train_urls', 'val_urls'])

In [151]:
small_data['train_features'].shape

(82783, 512)

In [152]:
small_data['train_captions'].shape

(50, 17)

In [153]:
small_data['train_image_idxs'].shape

(50,)

In [154]:
small_data['train_image_idxs']

array([ 2844, 72739, 29132, 24456, 19151,  9391, 70313, 68609, 43887,
        1509, 46755, 12770, 47533, 22781,  2198, 21214, 77700, 19257,
       77186, 45683, 78192, 31889, 55132, 48245,  4805, 17209, 67353,
       32150, 78473, 29969, 30195, 77955, 60566,  4192, 60633, 52886,
       25245, 12033, 41517, 67383, 63836, 14180, 73870, 26231, 43160,
        1457, 62170, 75613, 45350, 49583], dtype=int32)

In [155]:
small_data['train_urls'].shape

(82783,)

In [156]:
len(small_data['idx_to_word'])

1004

In [157]:
# url = small_data['train_urls'][2844]
# plt.imshow(image_from_url(url));

In [158]:
decode_captions(small_data['train_captions'][0], small_data['idx_to_word'])

'<START> a living room with <UNK> of <UNK> <UNK> <END>'

## Training

### model

Let's now try to make a step of the forward pass using this small dataset. When we create a model we only initialize the weights; we have no layer objects like in `pytorch`. All our layers are functions here, not classes. What layers do we have?

- linear layer to get initial hidden state of our `RNN` from `CNN` features; in our case this is a square matrix - `hidden_dim=512` is the same as `fc7` features (with reduced dimensionality);
- embedding layer with shape `(vocab_size, wordvec_dim)` in our case `(1004, 256)`; 
- rnn layer with shapes for `Wh` and `Wx`: (512, 512), (256, 512); we multiply `x` as a row vector: `next_h_lin = prev_h.dot(Wh) + x.dot(Wx) + b`;
- finally we produce our scores using `W_vocab` of shape `(hidden_dim, vocab_size)` or in our case `(512, 1004)`;

In [159]:
# Re-seed and reload the 50-caption subset so this cell is reproducible on
# its own, then build an (untrained) vanilla-RNN captioning model for it.
np.random.seed(231)
small_data = load_coco_data(max_train=50)

# Take the vocabulary and the feature dimensionality from the dataset the
# model is actually used with (`small_data`), so this cell does not depend
# on the separately loaded full `data` dictionary.
small_rnn_model = CaptioningRNN(
    cell_type='rnn',
    word_to_idx=small_data['word_to_idx'],
    input_dim=small_data['train_features'].shape[1],
    hidden_dim=512,
    wordvec_dim=256,
)

In [160]:
small_rnn_model.params.keys()

dict_keys(['W_embed', 'W_proj', 'b_proj', 'Wx', 'Wh', 'b', 'W_vocab', 'b_vocab'])

In [161]:
small_rnn_model.params['W_proj'].shape

(512, 512)

In [162]:
small_rnn_model.params['W_embed'].shape

(1004, 256)

In [163]:
small_rnn_model.params['Wh'].shape, small_rnn_model.params['Wx'].shape

((512, 512), (256, 512))

In [164]:
small_rnn_model.params['W_vocab'].shape

(512, 1004)

### forward pass

#### minibatch

Let's now get a minibatch of data and make a forward pass.

In [165]:
minibatch = sample_coco_minibatch(small_data,
                                  batch_size=25,
                                  split='train')
captions, features, urls = minibatch

In [166]:
captions.shape, features.shape, urls.shape

((25, 17), (25, 512), (25,))

#### captions and mask

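First of all we have to produce captions for the input (`captions_in`) and for the loss (`captions_out`). `captions_in` is the caption without its last token (in most cases that token is just a padding `0`), and `captions_out` is the caption without its first token (`<START>`). This way `captions_out` at position `t` is the word the model should predict after seeing `captions_in` at position `t`.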

In [167]:
# Input to the RNN: every token of each caption except the last one.
captions_in = captions[:, :-1]
# Target for the loss: every token except the first one, i.e. the input
# sequence shifted one position to the left.
captions_out = captions[:, 1:]

In [168]:
captions[0, :]

array([  1,   4,  12, 292,   9,  40, 236, 628,   8,  44,   3,   9, 318,
         2,   0,   0,   0], dtype=int32)

In [169]:
captions_in[0, :]

array([  1,   4,  12, 292,   9,  40, 236, 628,   8,  44,   3,   9, 318,
         2,   0,   0], dtype=int32)

In [170]:
captions_out[0, :]

array([  4,  12, 292,   9,  40, 236, 628,   8,  44,   3,   9, 318,   2,
         0,   0,   0], dtype=int32)

Let's have a look at the `mask` that we use to compute the loss. It allows us to exclude padding from the loss computation. `mask` has the same shape as `captions_out`.

In [171]:
small_rnn_model._null

0

In [172]:
mask = (captions_out != small_rnn_model._null)

In [173]:
mask.shape

(25, 16)

In [174]:
mask[0, :]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False])

Let's unpack our weight matrices.

In [175]:
# Pull each learnable parameter out of the model's parameter dictionary,
# one name per line, so the manual forward pass below can use short names.

# Affine projection: image features -> initial hidden state
W_proj = small_rnn_model.params['W_proj']
b_proj = small_rnn_model.params['b_proj']

# Word embedding lookup table
W_embed = small_rnn_model.params['W_embed']

# RNN cell: input-to-hidden weights, hidden-to-hidden weights and bias
Wx = small_rnn_model.params['Wx']
Wh = small_rnn_model.params['Wh']
b = small_rnn_model.params['b']

# Output projection: hidden state -> vocabulary scores
W_vocab = small_rnn_model.params['W_vocab']
b_vocab = small_rnn_model.params['b_vocab']

# Start with a zero loss and an empty gradient dictionary
loss = 0.0
grads = {}

#### (1) affine transformation

Let's compare output of `affine_forward` with manual computations. Our input is of shape `(batch_size, pca_reduced_size)` and output is of the shape: `(batch_size, hidden_dim)`. This is actually the same shape.

In [176]:
affine_out, affine_cache = affine_forward(features, W_proj, b_proj)

In [177]:
features.shape, W_proj.shape

((25, 512), (512, 512))

In [178]:
affine_out.shape

(25, 512)

In [179]:
affine_out_man = features.dot(W_proj) + b_proj

In [180]:
np.allclose(affine_out, affine_out_man)

True

#### (2) word embedding layer

In [181]:
captions_in_embed, embed_cache = word_embedding_forward(captions_in, W_embed)

In [182]:
captions_in.shape

(25, 16)

In [183]:
captions_in_embed.shape

(25, 16, 256)

Let's check our result on a simple example.

In [184]:
captions_in_embed[0, :5, :2]

array([[-0.00601405, -0.00853269],
       [ 0.00725536, -0.0103831 ],
       [ 0.00829113,  0.00189677],
       [ 0.01076211,  0.0198786 ],
       [ 0.00631156, -0.01850057]], dtype=float32)

In [185]:
captions_in[0]

array([  1,   4,  12, 292,   9,  40, 236, 628,   8,  44,   3,   9, 318,
         2,   0,   0], dtype=int32)

In [186]:
[W_embed[i, :2] for i in captions_in[0][:5]]

[array([-0.00601405, -0.00853269], dtype=float32),
 array([ 0.00725536, -0.0103831 ], dtype=float32),
 array([0.00829113, 0.00189677], dtype=float32),
 array([0.01076211, 0.0198786 ], dtype=float32),
 array([ 0.00631156, -0.01850057], dtype=float32)]

In [187]:
W_embed[captions_in[0], :2][:5]

array([[-0.00601405, -0.00853269],
       [ 0.00725536, -0.0103831 ],
       [ 0.00829113,  0.00189677],
       [ 0.01076211,  0.0198786 ],
       [ 0.00631156, -0.01850057]], dtype=float32)

In [188]:
W_embed[captions_in, :2][0, :5, :]

array([[-0.00601405, -0.00853269],
       [ 0.00725536, -0.0103831 ],
       [ 0.00829113,  0.00189677],
       [ 0.01076211,  0.0198786 ],
       [ 0.00631156, -0.01850057]], dtype=float32)

#### (3) rnn layer

The shape of the hidden state is `(batch_size, hidden_size)` or in our case `(25, 512)`. The `seq_len` in our case is `16` - we removed one token from the full `caption`. We use `affine_out` as our initial hidden state (that's our `CNN` output after the linear layer).

In [189]:
affine_out.shape

(25, 512)

In [190]:
hidden_rnn, rnn_cache = rnn_forward(captions_in_embed, affine_out, Wx, Wh, b)

In [191]:
hidden_rnn.shape

(25, 16, 512)

Let's now try to reproduce this result using `rnn_step_forward()`.

In [192]:
captions_in_embed.shape

(25, 16, 256)

In [193]:
# Unroll the RNN by hand: feed the embedded words one time step at a time and
# carry the hidden state forward, starting from the projected image features.
hidden_rnn_man = np.zeros_like(hidden_rnn)
hidden = affine_out
# Read the number of time steps from the data instead of hard-coding 16, so
# the check keeps working if the caption length changes.
for t in range(captions_in_embed.shape[1]):
    hidden, _ = rnn_step_forward(captions_in_embed[:, t, :], hidden, Wx, Wh, b)
    hidden_rnn_man[:, t, :] = hidden

In [194]:
np.allclose(hidden_rnn, hidden_rnn_man)

True

In [195]:
# Same unrolling as with `rnn_step_forward`, but with the step written out
# explicitly: h_t = tanh(x_t . Wx + h_{t-1} . Wh + b).
hidden_rnn_man2 = np.zeros_like(hidden_rnn)
hidden = affine_out
# Read the number of time steps from the data instead of hard-coding 16.
for t in range(captions_in_embed.shape[1]):
    hidden = np.tanh(captions_in_embed[:, t, :].dot(Wx) + hidden.dot(Wh) + b)
    hidden_rnn_man2[:, t, :] = hidden

In [196]:
np.allclose(hidden_rnn, hidden_rnn_man2)

True

#### (4) (temporal) affine transformation

Let's get our scores. They have shape `(batch_size, seq_len, vocab_size)` or in our case `(25, 16, 1004)`. In other words, each hidden state produces a vector of scores, and every time step uses the **same** `W_vocab`. These are **unnormalized** scores (before `softmax`).

In [197]:
scores, _ = temporal_affine_forward(hidden_rnn, W_vocab, b_vocab)

In [198]:
scores.shape

(25, 16, 1004)

In [199]:
# Apply the same affine map (W_vocab, b_vocab) to the hidden state of every
# time step, one step at a time.
scores_man = np.zeros_like(scores)
# Read the number of time steps from the data instead of hard-coding 16.
for t in range(hidden_rnn.shape[1]):
    scores_man[:, t, :] = hidden_rnn[:, t, :].dot(W_vocab) + b_vocab

In [200]:
np.allclose(scores, scores_man)

True

Let's now try to vectorize this loop. It turns out that we may just use the same `dot` operation in `numpy`.

In [201]:
hidden_rnn.shape, W_vocab.shape, scores.shape

((25, 16, 512), (512, 1004), (25, 16, 1004))

In [202]:
# `dot` contracts the last axis of `hidden_rnn` with the first axis of
# `W_vocab`, so all time steps are handled at once. The bias must be added
# as in the per-step loop; a bias-free product only agrees with `scores`
# when `b_vocab` is (numerically) zero.
scores_man2 = hidden_rnn.dot(W_vocab) + b_vocab

In [203]:
scores_man2.shape

(25, 16, 1004)

In [204]:
np.allclose(scores, scores_man2)

True

#### (5) (temporal) softmax to compute loss

##### actual loss

Finally let's compute our softmax loss. At least it's easy to compare our manual loss to the actual one - it's just a number.

In [207]:
scores.shape, captions_out.shape, mask.shape

((25, 16, 1004), (25, 16), (25, 16))

In [208]:
loss, _ = temporal_softmax_loss(scores, captions_out, mask)

In [209]:
loss

80.14522802655904

##### mask

Situation is complicated by padding - `captions_out` contains padding and for those positions we don't have to compute loss (we have to use `mask` for this purpose). The three last elements are padding and we have `mask == False` for those positions. 

In [210]:
captions_out[0, :]

array([  4,  12, 292,   9,  40, 236, 628,   8,  44,   3,   9, 318,   2,
         0,   0,   0], dtype=int32)

In [211]:
mask[0, :]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True, False, False, False])

##### manual loss

So how do we compute our loss? We have to:

- compute softmax - we know that our scores are **before softmax** (see above);
- then we have to extract scores for correct classes and compute losses for them (do not compute for padding);
- sum those losses and divide by the batch size (so the loss is averaged over captions, not over individual words);

Let's first do this in a loop and then we'll vectorize it.

In [218]:
# (1) softmax
# Normalise along dim=2 (the last axis of `scores`), so the values of each
# (caption, time step) slice sum to one; convert the result back to numpy.
probs = torch.softmax(torch.Tensor(scores), dim=2).numpy()

In [221]:
np.sum(probs, axis=2)[0, :]

array([1.0000001 , 1.        , 1.        , 1.        , 1.0000001 ,
       1.        , 0.99999994, 1.        , 1.        , 1.0000001 ,
       1.        , 0.9999999 , 1.        , 1.0000001 , 1.        ,
       1.        ], dtype=float32)

In [222]:
# (2) extract scores in a loop
probs.shape

(25, 16, 1004)

In [223]:
captions_out.shape

(25, 16)

In [247]:
# Probability the model assigns to the correct next word at every position.
# Padded positions keep the initial value 1.0, so that log(1) = 0 and they
# contribute nothing to the loss.
probs_correct = np.ones_like(captions_out, dtype='float')
# Take the loop bounds from the data instead of hard-coding 25 and 16.
n_captions, n_steps = captions_out.shape
for i in range(n_captions):
    for j in range(n_steps):
        # Use the explicit padding mask rather than the truthiness of the
        # token id, which only works while the <NULL> token has index 0.
        if mask[i, j]:
            probs_correct[i, j] = probs[i, j, captions_out[i, j]]

In [248]:
probs_correct.shape

(25, 16)

In [249]:
probs_correct[0, :]

array([1.08322070e-03, 7.16992130e-04, 6.49803143e-04, 8.29070457e-04,
       2.27524783e-03, 1.13537675e-03, 9.35421151e-04, 1.00416481e-03,
       1.50695874e-03, 9.40954254e-04, 1.44199433e-03, 1.18457165e-03,
       7.34507630e-04, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00])

In [250]:
# (3) compute loss: summed negative log-likelihood of the correct words,
# divided by the batch size (read from the array instead of hard-coding 25)
-np.sum(np.log(probs_correct)) / probs_correct.shape[0]

80.14522793536727

So we can see that our manual loss matches the one returned by `temporal_softmax_loss` above (up to floating-point precision).