In [5]:
import numpy as np

In [6]:
def softmax(x):
    """Column-wise softmax: normalise along axis 0 so each column sums to 1.

    Args:
        x: array-like of scores. Normalisation runs over the first axis, so
            for a 2-D input every column is treated as one independent
            distribution.

    Returns:
        Array of the same shape as ``x`` whose entries along axis 0 are
        non-negative and sum to 1.
    """
    x = np.asarray(x)
    # Shift each column by its OWN maximum. Shifting by the global maximum is
    # mathematically equivalent, but a column lying far below the global max
    # would have every exponential underflow to 0 and come out as 0/0 = NaN.
    shifted = x - np.max(x, axis=0, keepdims=True)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0, keepdims=True)

In [17]:
def RNN_forward_prop(x_t, a_prev, parameters):
    """Run a single RNN time step.

    Computes ``a_next = tanh(Waa @ a_prev + Wax @ x_t + ba)`` followed by
    ``yt_pred = softmax(Wya @ a_next + by)``.

    Args:
        x_t: input for this time step (passed as ``x[:, :, t]`` by
            RNN_forward, i.e. a 2-D slice).
        a_prev: hidden state coming from the previous time step.
        parameters: dict holding the weights and biases under the keys
            "Wax", "Waa", "Wya", "ba" and "by".
            NOTE(review): shapes are presumably Wax (n_a, n_x),
            Waa (n_a, n_a), Wya (n_y, n_a), ba (n_a, 1), by (n_y, 1) --
            confirm against the code that builds ``parameters``.

    Returns:
        Tuple ``(a_next, yt_pred, cache)`` where ``cache`` is
        ``(a_next, a_prev, x_t, parameters)``, the values the backward step
        unpacks.

    Raises:
        KeyError: if any of the five parameter keys is missing.
    """
    Wya = parameters["Wya"]
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    ba = parameters["ba"]
    by = parameters["by"]

    # New hidden state from the previous state and the current input.
    a_next = np.tanh(np.dot(Waa, a_prev) + np.dot(Wax, x_t) + ba)

    # Output distribution for this time step.
    yt_pred = softmax(np.dot(Wya, a_next) + by)

    # Store the function's own argument `x_t`. The previous code referenced
    # `xt`, which is not defined here: it raised NameError on a fresh kernel,
    # or silently captured an unrelated notebook global of that name.
    cache = (a_next, a_prev, x_t, parameters)

    return a_next, yt_pred, cache

In [18]:
def RNN_forward(x, a0, parameters):
    """Unroll the RNN cell over every time step of the input tensor.

    Args:
        x: 3-D input array of shape (n_x, m, T_x); the last axis is time.
        a0: initial hidden state fed to the first time step.
        parameters: dict of weights and biases; "Wya" (shape (n_y, n_a)) is
            read here to size the outputs, and the whole dict is forwarded
            unchanged to RNN_forward_prop.

    Returns:
        Tuple ``(a, y_pred, caches)``:
            a: hidden states for every step, shape (n_a, m, T_x).
            y_pred: predictions for every step, shape (n_y, m, T_x).
            caches: ``(list_of_per_step_caches, x)`` for the backward pass.
    """
    _, batch_size, num_steps = x.shape
    out_dim, hidden_dim = parameters["Wya"].shape

    # Pre-allocate the per-step result tensors.
    a = np.zeros((hidden_dim, batch_size, num_steps))
    y_pred = np.zeros((out_dim, batch_size, num_steps))

    step_caches = []
    hidden = a0
    for step in range(num_steps):
        # Each step consumes the hidden state produced by the previous one.
        hidden, step_pred, step_cache = RNN_forward_prop(x[:, :, step], hidden, parameters)
        a[:, :, step] = hidden
        y_pred[:, :, step] = step_pred
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)

In [20]:
def RNN_backward_prop(da_next, cache):
    """Backward pass for a single RNN time step (through the tanh cell).

    Args:
        da_next: gradient of the loss with respect to this step's hidden
            state ``a_next``; same shape as ``a_next``.
        cache: tuple ``(a_next, a_prev, x_t, parameters)``; ``parameters``
            must contain "Wax" and "Waa".

    Returns:
        Dict with keys:
            "dxt": gradient w.r.t. the step input, ``Wax.T @ dtanh``.
            "da_prev": gradient w.r.t. the previous hidden state,
                ``Waa.T @ dtanh``.
            "dWax": gradient w.r.t. Wax, ``dtanh @ x_t.T``.
            "dWaa": gradient w.r.t. Waa, ``dtanh @ a_prev.T``.
            "dba": gradient w.r.t. the hidden bias, ``dtanh`` summed over
                axis 1 with the axis kept, i.e. one row per hidden unit.
    """
    (a_next, a_prev, x_t, parameters) = cache

    Wax = parameters["Wax"]
    Waa = parameters["Waa"]

    # d/dz tanh(z) = 1 - tanh(z)^2, and a_next already holds tanh(z).
    dtanh = np.multiply(1 - np.square(a_next), da_next)

    dxt = np.dot(Wax.T, dtanh)
    dWax = np.dot(dtanh, x_t.T)
    da_prev = np.dot(Waa.T, dtanh)
    dWaa = np.dot(dtanh, a_prev.T)

    # The bias is added to every column of the pre-activation, so its
    # gradient is the sum of dtanh across columns (axis 1), kept as a column
    # vector. Summing over axis 0 instead would mix different hidden units
    # and return one value per column rather than one per hidden unit.
    dba = dtanh.sum(axis=1, keepdims=True)

    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients

In [23]:
def RNN_backward(da, caches):
    """Backpropagate through time over the whole sequence.

    Args:
        da: gradients of the loss with respect to every hidden state,
            shape (n_a, m, T_x).
        caches: tuple ``(per_step_caches, x)`` as returned by RNN_forward,
            where ``x`` has shape (n_x, m, T_x).

    Returns:
        Dict with keys:
            "dx": gradient w.r.t. the inputs, shape (n_x, m, T_x).
            "da0": gradient w.r.t. the initial hidden state, shape (n_a, m).
            "dWax": accumulated gradient, shape (n_a, n_x).
            "dWaa": accumulated gradient, shape (n_a, n_a).
            "dba": accumulated bias gradient; its shape is the broadcast of
                (n_a, 1) with the per-step "dba" from RNN_backward_prop.
        For an empty sequence (T_x == 0) every entry is all zeros.
    """
    (caches, x) = caches

    n_a, m, T_x = da.shape
    # Read n_x from the stored input tensor rather than from caches[0], so an
    # empty sequence (no per-step caches) does not raise IndexError.
    n_x = x.shape[0]

    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))
    da_prevt = np.zeros((n_a, m))

    # Walk the sequence backwards; the gradient flowing into step t is the
    # direct loss gradient plus what came back from step t + 1.
    for t in reversed(range(T_x)):
        gradients = RNN_backward_prop(da[:, :, t] + da_prevt, caches[t])
        da_prevt = gradients["da_prev"]
        dx[:, :, t] = gradients["dxt"]
        dWax += gradients["dWax"]
        dWaa += gradients["dWaa"]
        # Out-of-place add: the running total takes the broadcast shape of
        # the per-step bias gradient instead of being forced into a fixed
        # buffer shape.
        dba = dba + gradients["dba"]

    # After the loop, the gradient handed back by step 0 belongs to a0.
    da0 = da_prevt
    gradients = {"dx": dx, "da0": da0, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients