In [None]:
import numpy as np
%matplotlib inline

## RNN

### Backward step computation for one timestep

I) $$ \frac{\partial \tanh(x)}{\partial x} = 1 - \tanh^2(x)$$

II) $$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (Ux_t)} = 1$$
$$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (Ws_{t-1})} = 1$$
$$ \frac{\partial (Ux_t + Ws_{t-1} + b)}{\partial (b)} = 1$$


III.a) $$ \frac{\partial (Ws_{t-1})}{\partial (W)} = s_{t-1}$$
$$ \frac{\partial (Ws_{t-1})}{\partial (s_{t-1})} = W$$

III.b) $$ \frac{\partial (Ux_t)}{\partial (U)} = x_t$$
$$ \frac{\partial (Ux_t)}{\partial (x_t)} = U$$


In [None]:
def rnn_step_forward(x, prev_s, U_x, W_s, b):
    """Run a single RNN timestep: ``next_s = tanh(x.U_x + prev_s.W_s + b.T)``.

    Args:
        x: Input at this timestep (presumably ``(batch, input_dim)`` --
            TODO confirm against the caller).
        prev_s: State from the previous timestep. A copy is stored in the
            cache, so it must provide ``.copy()``.
        U_x: Input-to-state weights, right-multiplied onto ``x``.
        W_s: State-to-state weights, right-multiplied onto ``prev_s``.
        b: Bias. It is transposed before the addition, so a column vector
            broadcasts as a row; a 1-D bias is unaffected by the transpose.

    Returns:
        tuple: ``(next_s, cache)`` where ``next_s`` is the tanh-activated
        state and ``cache`` is a dict of the values saved for the backward
        pass (keys ``'input'``, ``'prev_s'``, ``'weight_x'``, ``'weight_s'``,
        ``'next_s'``, ``'linear_transform'``).
    """
    input_term = np.dot(x, U_x)
    recurrent_term = np.dot(prev_s, W_s)
    pre_activation = input_term + recurrent_term + b.T
    next_s = np.tanh(pre_activation)

    # Insertion order is kept fixed so that positional unpacking of
    # cache.values() stays valid.
    cache = dict(
        input=x,
        prev_s=prev_s.copy(),  # snapshot: later in-place edits of prev_s must not leak in
        weight_x=U_x,
        weight_s=W_s,
        next_s=next_s,
        linear_transform=pre_activation,
    )
    return next_s, cache

def rnn_step_backward(d_next_s, cache):
    """Backpropagate through one RNN timestep computed by ``rnn_step_forward``.

    The forward step is ``next_s = tanh(x.U_x + prev_s.W_s + b)``; this
    applies the chain rule through the tanh, the sum, and the two matrix
    products.

    Args:
        d_next_s: Upstream gradient with respect to the step's output state;
            must broadcast against the cached ``linear_transform``.
        cache: Dict saved by the forward step. The keys ``'input'``,
            ``'prev_s'``, ``'weight_x'``, ``'weight_s'`` and
            ``'linear_transform'`` are read by name, so key order and any
            additional entries are irrelevant.

    Returns:
        dict: Gradients keyed ``'d_x'``, ``'d_prev_s'``, ``'d_U_x'``,
        ``'d_W_s'`` and ``'d_b'``. ``d_b`` is summed over axis 0.

    Raises:
        KeyError: If ``cache`` lacks one of the required keys.
    """
    # Read entries by key instead of unpacking cache.values() positionally:
    # positional unpacking breaks as soon as the cache gains a field or is
    # built in a different order.
    x = cache['input']
    prev_s = cache['prev_s']
    U_x = cache['weight_x']
    W_s = cache['weight_s']
    linear_transform = cache['linear_transform']

    # I) through the activation: d tanh(z)/dz = 1 - tanh(z)^2
    d_linear_transform = (1 - np.square(np.tanh(linear_transform))) * d_next_s

    # II) the pre-activation is a plain sum, so each summand (Ux, Ws, b)
    # receives the same gradient; the bias gradient is reduced over axis 0.
    d_Ux = d_linear_transform
    d_Ws = d_linear_transform
    d_b = np.sum(d_linear_transform, axis=0)

    # III.a) through prev_s . W_s: gradients for the recurrent weights and
    # for the previous state
    d_W_s = prev_s.T.dot(d_Ws)
    d_prev_s = d_Ws.dot(W_s.T)

    # III.b) through x . U_x: gradients for the input weights and the input
    d_U_x = x.T.dot(d_Ux)
    d_x = d_Ux.dot(U_x.T)

    return {'d_x': d_x,
            'd_prev_s': d_prev_s,
            'd_U_x': d_U_x,
            'd_W_s': d_W_s,
            'd_b': d_b}