In [2]:
# As usual, a bit of setup
import time, os, json
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import torch
from scipy.special import softmax

from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.rnn_layers import *
from cs231n.captioning_solver import CaptioningSolver
from cs231n.classifiers.rnn import CaptioningRNN
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions
from cs231n.image_utils import image_from_url
from cs231n.layers import *
from cs231n.rnn_layers import *

%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

def rel_error(x, y):
    """Return the largest element-wise relative error between ``x`` and ``y``.

    Each element's error is ``|x - y| / max(1e-8, |x| + |y|)``; the ``1e-8``
    floor keeps the division defined when both entries are zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

Let's look at the softmax forward and backward propagation in more detail.

In [3]:
# Seed NumPy's global RNG before anything else in this cell runs.
np.random.seed(231)

# Small COCO subset and a vanilla-RNN captioning model sized to match it.
small_data = load_coco_data(max_train=50)
small_rnn_model = CaptioningRNN(
    cell_type='rnn',
    word_to_idx=small_data['word_to_idx'],
    input_dim=small_data['train_features'].shape[1],
    hidden_dim=512,
    wordvec_dim=256,
)

# One training minibatch, unpacked into its three components.
minibatch = sample_coco_minibatch(small_data, batch_size=25, split='train')
captions, features, urls = minibatch

# Inputs drop the last token, targets drop the first; the mask is True
# wherever the target differs from the model's NULL token.
captions_in, captions_out = captions[:, :-1], captions[:, 1:]
mask = captions_out != small_rnn_model._null

# Pull the individual weight arrays out of the model's parameter dict.
model_params = small_rnn_model.params
W_proj, b_proj = model_params['W_proj'], model_params['b_proj']
W_embed = model_params['W_embed']
Wx, Wh, b = model_params['Wx'], model_params['Wh'], model_params['b']
W_vocab, b_vocab = model_params['W_vocab'], model_params['b_vocab']
loss, grads = 0.0, {}

In [4]:
# forward pass up to softmax loss; every layer returns an (output, cache) pair
# affine layer applied to the image features
# (presumably yields the RNN's initial hidden state — verify in rnn_forward)
affine_out, affine_cache = affine_forward(features, W_proj, b_proj)
# look up the embedding of each input caption token
captions_in_embed, embed_cache = word_embedding_forward(captions_in, W_embed)
# run the RNN over the embedded captions, with affine_out as its second argument
hidden_rnn, rnn_cache = rnn_forward(captions_in_embed, affine_out, Wx, Wh, b)
# per-timestep vocabulary scores (pre-softmax); shape (25, 16, 1004) in this run
scores, scores_cache = temporal_affine_forward(hidden_rnn, W_vocab, b_vocab)

In [5]:
scores.shape

(25, 16, 1004)

## Theory

### forward prop

The best description of the `softmax` loss that I know of is in the `cs224n` assignments. Suppose $\hat{y} = \mathrm{softmax}(x)$; then, if $y$ is one-hot encoded with its single `1` at the correct class $s$, we have:

$$CE(y, \hat{y}) = -\sum_i y_i \log(\hat{y}_i) = -\log(\hat{y}_s) = -\log\left(\frac{e^{x_s}}{\sum_j e^{x_j}}\right)$$

In our case correct classes are in `captions_out`. We need to extract those values from `scores`:

> A temporal version of softmax loss for use in `RNNs`. We assume that we are
    making predictions over a vocabulary of size `V` for each timestep of a
    timeseries of length T, over a minibatch of size `N`. The input `x` gives scores
    for all vocabulary elements at all timesteps, and `y` gives the indices of the
    ground-truth element at each timestep.

In [33]:
scores.shape, captions_out.shape

((25, 16, 1004), (25, 16))

In [34]:
captions_out[0, :]

array([  4,  12, 292,   9,  40, 236, 628,   8,  44,   3,   9, 318,   2,
         0,   0,   0], dtype=int32)

But we also need to somehow combine the losses over timesteps and minibatches:

> We use a cross-entropy loss at each
    timestep, summing the loss over all timesteps and averaging across the
    minibatch.

Finally, we have to apply the mask:

>   As an additional complication, we may want to ignore the model output at some
timesteps, since sequences of different length may have been combined into a
minibatch and padded with NULL tokens. The optional mask argument tells us
which elements should contribute to the loss.

### back prop

What about backprop? We have to use the following result from `cs224n` (where `y` is one-hot encoded and $x$ is the input to the softmax, i.e. our `scores`):

$$\frac{\partial CE(y, \hat{y})}{\partial x} = \hat{y} - y$$

So we just need to subtract `1` from `probs` at the correct classes that are given by `captions_out`.

In [35]:
# `probs` is only assigned further down in the notebook, so compute the
# softmax here instead of relying on a variable from a later cell; the
# probabilities have the same shape as `scores`.
softmax(scores, axis=2).shape

(25, 16, 1004)

We also have to divide by `batch_size`, because the loss is averaged over the minibatch as stated above. For masked values $CE(y, \hat{y})$ is just `0`, so its gradient is `0` as well; we therefore just need to multiply the gradient by the `mask`.

## Forward propagation

So how do we compute our loss? We have to:

- compute softmax - we know that our scores are **before softmax** (see above);
- then extract the probabilities of the correct classes; and
- compute losses for them (do not compute for padding); then sum those losses over timesteps and average them over the minibatch.

### softmax

This is easy. We are not going to compute it manually. Instead we use `scipy`.

In [6]:
# Normalise over the last (vocabulary) axis so each timestep's scores sum to 1.
probs = softmax(scores, axis=-1)

In [7]:
probs.shape

(25, 16, 1004)

In [8]:
np.sum(probs, axis=2)[0, :5]

array([1., 1., 1., 1., 1.])

### extract scores

That's also not too difficult. Let's first do it with a loop and then with indexing. Correct classes are in `captions_out`. We need to extract those classes from `probs`.

In [9]:
captions_out.shape

(25, 16)

In [10]:
captions_out[0]

array([  4,  12, 292,   9,  40, 236, 628,   8,  44,   3,   9, 318,   2,
         0,   0,   0], dtype=int32)

In [11]:
# sample 0, timesteps 0..2: the class indices 4, 12 and 292 are the first
# three entries of captions_out[0] displayed above
probs[0, 0, 4], probs[0, 1, 12], probs[0, 2, 292] 

(0.0010832205554948437, 0.0007169921412785972, 0.0006498031358371554)

In [12]:
# For every timestep of sample 0, pick the probability of its ground-truth
# token. The number of timesteps is read from `probs` instead of being
# hard-coded, so the cell keeps working if the caption length changes.
probs[0, :, :][np.arange(probs.shape[1]), captions_out[0]]

array([0.00108322, 0.00071699, 0.0006498 , 0.00082907, 0.00227525,
       0.00113538, 0.00093542, 0.00100416, 0.00150696, 0.00094095,
       0.00144199, 0.00118457, 0.00073451, 0.00070052, 0.00144163,
       0.00072034])

In [13]:
# Reference (loop) extraction of the probability assigned to the ground-truth
# token at every (sample, timestep) position. Loop bounds come from the data
# rather than from hard-coded batch/sequence sizes.
n_samples, n_steps = captions_out.shape
probs_correct_loop_no_mask = np.zeros_like(captions_out, dtype='float')
for i in range(n_samples):
    for j in range(n_steps):
        probs_correct_loop_no_mask[i, j] = probs[i, j, captions_out[i, j]]

In [14]:
probs_correct_loop_no_mask.shape

(25, 16)

In [15]:
probs_correct_loop_no_mask[0]

array([0.00108322, 0.00071699, 0.0006498 , 0.00082907, 0.00227525,
       0.00113538, 0.00093542, 0.00100416, 0.00150696, 0.00094095,
       0.00144199, 0.00118457, 0.00073451, 0.00070052, 0.00144163,
       0.00072034])

In [16]:
# Cross-check row 0 of the loop result against fancy indexing; the timestep
# count is taken from `probs` instead of being hard-coded.
np.allclose(probs_correct_loop_no_mask[0], probs[0, :, :][np.arange(probs.shape[1]), captions_out[0]])

True

Let's now compute this using indexing into `probs`. We use the solution from [here](https://stackoverflow.com/questions/50785226/extracting-values-from-last-dimension-of-3d-numpy-array). And it seems we get the correct answer!

In [17]:
probs.shape, captions_out.shape

((25, 16, 1004), (25, 16))

In [18]:
# N: minibatch size, T: number of timesteps, V: vocabulary size
N, T, V = probs.shape

In [19]:
# Column of sample indices and row of timestep indices; together with the
# (N, T) array of target classes they broadcast to one lookup per position.
I = np.arange(N)[:, np.newaxis]
J = np.arange(T)[np.newaxis, :]
probs_correct_vect_no_mask = probs[I, J, captions_out]

In [20]:
np.allclose(probs_correct_loop_no_mask, probs_correct_vect_no_mask)

True

### compute loss

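First let's modify our correct score in the loop to account for our mask: masked positions keep a probability of `1`, whose log is `0`, so they add nothing to the loss. Then we can easily compute our loss and compare it with the value returned by `temporal_softmax_loss`.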

In [21]:
# reference loss and gradient w.r.t. scores from the library implementation;
# the manual computations below are compared against these two values
loss, dscores = temporal_softmax_loss(scores, captions_out, mask)

In [22]:
loss

80.14522802655904

In [23]:
# Start from probability 1 everywhere: log(1) == 0, so positions that are
# never overwritten (the masked ones) contribute nothing to the loss.
probs_correct_loop_with_mask = np.ones_like(captions_out, dtype='float')
for i in range(mask.shape[0]):
    for j in range(mask.shape[1]):
        # Consult the mask itself rather than the truthiness of the token id,
        # so the result does not depend on the NULL token being index 0.
        if mask[i, j]:
            probs_correct_loop_with_mask[i, j] = probs[i, j, captions_out[i, j]]

In [24]:
# Negative log-likelihood of the selected probabilities, summed over every
# (sample, timestep) entry and divided by N.
log_probs_loop = np.log(probs_correct_loop_with_mask)
loss_loop = -log_probs_loop.sum() / N

In [25]:
loss_loop

80.14522802655904

In the vectorized case we can simply multiply by the mask.

In [26]:
# small slice of the mask (first two samples, last five timesteps) for a toy demonstration
mask_simple = mask[:2, -5:]

In [27]:
mask_simple

array([[ True,  True, False, False, False],
       [ True,  True,  True, False, False]])

In [28]:
# multiplying by the boolean mask zeroes the False positions and leaves the other values unchanged
mask_simple * 2 * np.ones_like(mask_simple)

array([[2, 2, 0, 0, 0],
       [2, 2, 2, 0, 0]])

In [29]:
mask.shape, probs_correct_vect_no_mask.shape

((25, 16), (25, 16))

In [30]:
# Zero out the log-probabilities of masked positions, then sum everything
# and divide by N.
masked_log_probs = mask * np.log(probs_correct_vect_no_mask)
loss_vect = -masked_log_probs.sum() / N

In [31]:
loss_vect

80.14522802655904

## Backward propagation

As we mentioned in the theory section, we just need to subtract `1` from the probabilities of the correct classes given by `captions_out`, divide by the batch size and multiply by the mask.

In [36]:
scores.shape, dscores.shape

((25, 16, 1004), (25, 16, 1004))

In [58]:
# Build the manual gradient in a single cell that always starts from a fresh
# copy of `probs`, so re-running the cell cannot apply an update twice.
dscores_man = probs.copy()
# gradient of cross-entropy w.r.t. the scores is probs - one_hot(target):
# subtract 1 at the target class of every (sample, timestep) position
dscores_man[I, J, captions_out] -= 1
# the loss is averaged over the minibatch
dscores_man /= N
# masked (padding) positions contribute no gradient
dscores_man *= mask.reshape(N, T, 1)

In [62]:
np.allclose(dscores, dscores_man)

True

This concludes our analysis of the softmax loss.