<a name='0'></a>
## Packages

In [2]:
import numpy as np


In [3]:
def softmax(x):
    """Softmax normalised along axis 0 (each column sums to 1 for 2-D input).

    Arguments:
    x -- array-like of scores; normalisation is performed over the first axis

    Returns:
    array with the same shape as x whose entries along axis 0 sum to 1
    """
    # Shift every column by its OWN maximum. Shifting by the global maximum is
    # mathematically equivalent, but a column whose entries are all far below
    # the global maximum underflows to exp(.) == 0 everywhere and the
    # normalisation then becomes 0 / 0 = NaN.
    shifted = x - np.max(x, axis=0)
    e_x = np.exp(shifted)
    return e_x / e_x.sum(axis=0)


def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), evaluated element-wise with NumPy."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

In [4]:
def initialize_adam(parameters):
    """Create zero-filled Adam moment accumulators matching the parameters.

    Arguments:
    parameters -- dictionary holding "W1", "b1", ..., "WL", "bL"; the layer
                  count L is taken as len(parameters) // 2

    Returns:
    v -- dictionary with keys "dW1", "db1", ..., zeros shaped like each parameter
    s -- dictionary with the same keys and shapes as v (independent arrays)
    """
    num_layers = len(parameters) // 2
    v, s = {}, {}

    for layer in range(1, num_layers + 1):
        for prefix in ("W", "b"):
            shape = parameters[f"{prefix}{layer}"].shape
            moment_key = f"d{prefix}{layer}"
            # Two separate allocations so v and s never alias each other.
            v[moment_key] = np.zeros(shape)
            s[moment_key] = np.zeros(shape)

    return v, s


def update_parameters_with_adam(parameters, grads, v, s, t, learning_rate = 0.01,
                                beta1 = 0.9, beta2 = 0.999,  epsilon = 1e-8):
    """Apply one Adam step to every "W{l}" / "b{l}" entry of `parameters`.

    Arguments:
    parameters -- dictionary with "W1", "b1", ..., "WL", "bL"; L is len(parameters) // 2
    grads -- dictionary with the matching gradients "dW1", "db1", ...
    v -- first-moment accumulators keyed like grads
    s -- second-moment accumulators keyed like grads
    t -- step counter used for bias correction; must be >= 1, because for
         t == 0 the correction denominators 1 - beta**t are zero
    learning_rate -- step size
    beta1 -- decay rate of the first-moment average
    beta2 -- decay rate of the second-moment average
    epsilon -- small constant added to the denominator to avoid division by zero

    Returns:
    parameters, v, s -- the same dictionary objects that were passed in, with
                        their entries replaced by the updated arrays
    """
    L = len(parameters) // 2                 # number of layers in the neural networks

    # The bias-correction denominators depend only on t, so compute them once.
    v_correction = 1 - beta1 ** t
    s_correction = 1 - beta2 ** t

    # Perform Adam update on all parameters
    for l in range(1, L + 1):
        for prefix in ("W", "b"):
            param_key = prefix + str(l)
            grad_key = "d" + param_key
            grad = grads[grad_key]

            # Exponentially weighted averages of the gradient and its square.
            v[grad_key] = beta1 * v[grad_key] + (1 - beta1) * grad
            s[grad_key] = beta2 * s[grad_key] + (1 - beta2) * (grad ** 2)

            v_corrected = v[grad_key] / v_correction
            s_corrected = s[grad_key] / s_correction

            # epsilon is added to the square root, not inside it: placing it
            # under the root turns the default 1e-8 into an effective 1e-4 and
            # distorts the step size for small gradients.
            parameters[param_key] = parameters[param_key] - learning_rate * v_corrected / (np.sqrt(s_corrected) + epsilon)

    return parameters, v, s

<a name='1'></a>
## 1 - Forward Propagation for the Basic Recurrent Neural Network



<a name='-1'></a>
### rnn_cell_forward


1. The hidden state: <br>
 $a^{\langle t \rangle} = \tanh(W_{aa} a^{\langle t-1 \rangle} + W_{ax} x^{\langle t \rangle} + b_a)$
2. The prediction $\hat{y}^{\langle t \rangle}$: <br>
 $\hat{y}^{\langle t \rangle} = \mathrm{softmax}(W_{ya} a^{\langle t \rangle} + b_y)$



In [5]:


def rnn_cell_forward(xt, a_prev, parameters):
    """Compute one time step of the basic RNN cell.

    Arguments:
    xt -- input data at time step "t", numpy array of shape (n_x, m)
    a_prev -- hidden state at time step "t-1", numpy array of shape (n_a, m)
    parameters -- python dictionary with the keys "Wax", "Waa", "Wya", "ba", "by"

    Returns:
    a_next -- next hidden state, tanh(Waa . a_prev + Wax . xt + ba)
    yt_pred -- prediction at time step "t", softmax(Wya . a_next + by)
    cache -- tuple (a_next, a_prev, xt, parameters) for the backward pass
    """
    # Fetch every weight up front so a missing key fails before any maths runs.
    Wax, Waa, Wya, ba, by = (
        parameters[key] for key in ("Wax", "Waa", "Wya", "ba", "by")
    )

    # Recurrent layer: affine map of previous state and current input, then tanh.
    hidden_pre_activation = np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba
    a_next = np.tanh(hidden_pre_activation)

    # Output layer: affine map of the new hidden state, then softmax.
    output_scores = np.dot(Wya, a_next) + by
    yt_pred = softmax(output_scores)

    return a_next, yt_pred, (a_next, a_prev, xt, parameters)

<a name='-2'></a>
### rnn_forward



In [6]:


def rnn_forward(x, a0, parameters):
    """Run the basic RNN cell over every time step of the input.

    Arguments:
    x -- input data for every time step, of shape (n_x, m, T_x)
    a0 -- initial hidden state, of shape (n_a, m)
    parameters -- python dictionary passed through to rnn_cell_forward; the
                  shape of parameters["Wya"] supplies (n_y, n_a)

    Returns:
    a -- hidden states for every time step, array of shape (n_a, m, T_x)
    y_pred -- predictions for every time step, array of shape (n_y, m, T_x)
    caches -- tuple (list of per-step caches, x) for the backward pass
    """
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    # Pre-allocate the outputs; each time step fills one slice of the last axis.
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))
    step_caches = []

    hidden = a0
    for t in range(T_x):
        hidden, step_prediction, step_cache = rnn_cell_forward(x[:, :, t], hidden, parameters)
        a[:, :, t] = hidden
        y_pred[:, :, t] = step_prediction
        step_caches.append(step_cache)

    return a, y_pred, (step_caches, x)

<a name='2'></a>
## 2 - Long Short-Term Memory (LSTM) Network




### Forget gate $\mathbf{\Gamma}_{f}$



##### Equation

$$\mathbf{\Gamma}_f^{\langle t \rangle} = \sigma(\mathbf{W}_f[\mathbf{a}^{\langle t-1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_f)\tag{1} $$


### Candidate value $\tilde{\mathbf{c}}^{\langle t \rangle}$


##### Equation
$$\mathbf{\tilde{c}}^{\langle t \rangle} = \tanh\left( \mathbf{W}_{c} [\mathbf{a}^{\langle t - 1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_{c} \right) \tag{3}$$


### Update gate $\mathbf{\Gamma}_{i}$



##### Equation

$$\mathbf{\Gamma}_i^{\langle t \rangle} = \sigma(\mathbf{W}_i[\mathbf{a}^{\langle t-1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_i)\tag{2} $$



#### Cell state $\mathbf{c}^{\langle t \rangle}$


##### Equation

$$ \mathbf{c}^{\langle t \rangle} = \mathbf{\Gamma}_f^{\langle t \rangle}* \mathbf{c}^{\langle t-1 \rangle} + \mathbf{\Gamma}_{i}^{\langle t \rangle} *\mathbf{\tilde{c}}^{\langle t \rangle} \tag{4} $$


### Output gate $\mathbf{\Gamma}_{o}$


##### Equation

$$ \mathbf{\Gamma}_o^{\langle t \rangle}=  \sigma(\mathbf{W}_o[\mathbf{a}^{\langle t-1 \rangle}, \mathbf{x}^{\langle t \rangle}] + \mathbf{b}_{o})\tag{5}$$


#### Hidden state $\mathbf{a}^{\langle t \rangle}$

##### Equation

$$ \mathbf{a}^{\langle t \rangle} = \mathbf{\Gamma}_o^{\langle t \rangle} * \tanh(\mathbf{c}^{\langle t \rangle})\tag{6} $$


#### Prediction $\mathbf{y}^{\langle t \rangle}_{pred}$


The equation :
$$\mathbf{y}^{\langle t \rangle}_{pred} = \textrm{softmax}(\mathbf{W}_{y} \mathbf{a}^{\langle t \rangle} + \mathbf{b}_{y})$$


<a name='2-1'></a>
### 2.1 - LSTM Cell

<a name='e-3'></a>
###  lstm_cell_forward



In [8]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """Compute one time step of the LSTM cell.

    Arguments:
    xt -- input at time step "t"; must be 2-D (its shape is unpacked into two values)
    a_prev -- hidden state at time step "t-1"; stacked on top of xt along axis 0
    c_prev -- cell state at time step "t-1"
    parameters -- python dictionary with the keys "Wf", "bf", "Wi", "bi",
                  "Wc", "bc", "Wo", "bo", "Wy", "by"

    Returns:
    a_next -- next hidden state
    c_next -- next cell state
    yt_pred -- prediction at time step "t"
    cache -- tuple (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
             for the backward pass
    """
    (Wf, bf, Wi, bi, Wc, bc, Wo, bo, Wy, by) = (
        parameters[key]
        for key in ("Wf", "bf", "Wi", "bi", "Wc", "bc", "Wo", "bo", "Wy", "by")
    )

    # The unpacked sizes are not used below; the unpacking itself rejects
    # inputs that are not 2-D.
    n_x, m = xt.shape
    n_y, n_a = Wy.shape

    # Every gate acts on the previous hidden state stacked on the current input.
    stacked = np.concatenate((a_prev, xt), axis=0)

    def affine(weight, bias):
        return np.dot(weight, stacked) + bias

    ft = sigmoid(affine(Wf, bf))     # forget gate
    it = sigmoid(affine(Wi, bi))     # update gate
    cct = np.tanh(affine(Wc, bc))    # candidate value
    ot = sigmoid(affine(Wo, bo))     # output gate

    c_next = ft * c_prev + it * cct
    a_next = ot * np.tanh(c_next)

    yt_pred = softmax(np.dot(Wy, a_next) + by)

    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
    return a_next, c_next, yt_pred, cache

<a name='2-2'></a>
### 2.2 - Forward Pass for LSTM

<a name='e-4'></a>    
###  lstm_forward
  

In [9]:
def lstm_forward(x, a0, parameters):
    """Run the LSTM cell over every time step of the input.

    Arguments:
    x -- input data; 3-D array whose shape is unpacked as (n_x, m, T_x)
    a0 -- initial hidden state
    parameters -- python dictionary passed through to lstm_cell_forward; the
                  shape of parameters['Wy'] supplies (n_y, n_a)

    Returns:
    a -- hidden states for every time step, array of shape (n_a, m, T_x)
    y -- predictions for every time step, array of shape (n_y, m, T_x)
    c -- cell states for every time step, array of shape (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) for the backward pass
    """
    n_x, m, T_x = x.shape
    n_y, n_a = parameters['Wy'].shape

    # Pre-allocate the outputs; each time step fills one slice of the last axis.
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))
    step_caches = []

    # The hidden state starts from a0; the cell state starts from zeros.
    hidden = a0
    cell = np.zeros((n_a, m))

    for t in range(T_x):
        hidden, cell, step_prediction, step_cache = lstm_cell_forward(
            x[:, :, t], hidden, cell, parameters
        )
        a[:, :, t] = hidden
        c[:, :, t] = cell
        y[:, :, t] = step_prediction
        step_caches.append(step_cache)

    return a, y, c, (step_caches, x)

<a name='3'></a>    
## 3 - Backpropagation in RNN

##### Equations


\begin{align}
\displaystyle a^{\langle t \rangle} &= \tanh(W_{ax} x^{\langle t \rangle} + W_{aa} a^{\langle t-1 \rangle} + b_{a})\ \\[8pt]
\displaystyle \frac{\partial \tanh(x)} {\partial x} &= 1 - \tanh^2(x) \ \\[8pt]
\displaystyle {dtanh} &= da_{next} * ( 1 - \tanh^2(W_{ax}x^{\langle t \rangle}+W_{aa} a^{\langle t-1 \rangle} + b_{a})) \ \\[8pt]
\displaystyle  {dW_{ax}} &= dtanh \cdot x^{\langle t \rangle T} \\[8pt]
\displaystyle dW_{aa} &= dtanh \cdot a^{\langle t-1 \rangle T} \\[8pt]
\displaystyle db_a& = \sum_{batch}dtanh\ \\[8pt]
\displaystyle dx^{\langle t \rangle} &= { W_{ax}}^T \cdot dtanh\\[8pt]
\displaystyle da_{prev} &= { W_{aa}}^T \cdot dtanh
\end{align}



<a name='e-5'></a>
###  rnn_cell_backward



In [10]:
def rnn_cell_backward(da_next, cache):
    """Backward pass for a single time step of the basic RNN cell.

    Arguments:
    da_next -- gradient of the loss with respect to the next hidden state
    cache -- tuple (a_next, a_prev, xt, parameters) as returned by rnn_cell_forward;
             only "Wax", "Waa" and "ba" are read from parameters

    Returns:
    gradients -- python dictionary containing:
        dxt -- gradient with respect to the input xt
        da_prev -- gradient with respect to the previous hidden state
        dWax -- gradient with respect to Wax
        dWaa -- gradient with respect to Waa
        dba -- gradient with respect to ba, summed over axis 1 with keepdims
    """
    (_, a_prev, xt, parameters) = cache

    # Only the recurrent weights are needed here. The output-layer entries
    # ("Wya", "by") are deliberately not read, so a dictionary without them
    # is accepted.
    Wax = parameters["Wax"]
    Waa = parameters["Waa"]
    ba = parameters["ba"]

    # Back-propagate through tanh: d/dz tanh(z) = 1 - tanh(z)^2
    pre_activation = np.dot(Wax, xt) + np.dot(Waa, a_prev) + ba
    dtanh = da_next * (1 - np.square(np.tanh(pre_activation)))

    # Gradients with respect to the input and its weight matrix
    dxt = np.dot(np.transpose(Wax), dtanh)
    dWax = np.dot(dtanh, np.transpose(xt))

    # Gradients with respect to the previous hidden state and its weight matrix
    da_prev = np.dot(np.transpose(Waa), dtanh)
    dWaa = np.dot(dtanh, np.transpose(a_prev))

    # The bias is shared by every column, so its gradient sums over axis 1.
    dba = np.sum(dtanh, keepdims=True, axis=1)

    gradients = {"dxt": dxt, "da_prev": da_prev, "dWax": dWax, "dWaa": dWaa, "dba": dba}

    return gradients

<a name='e-6'></a>
###  rnn_backward


In [11]:
def rnn_backward(da, caches):
    """Backward pass of the basic RNN over the whole sequence.

    Arguments:
    da -- upstream gradients of all hidden states; 3-D array whose shape is
          unpacked as (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) as returned by rnn_forward

    Returns:
    gradients -- python dictionary containing:
        dx -- gradient with respect to the input, array of shape (n_x, m, T_x)
        da0 -- gradient with respect to the initial hidden state
        dWax -- gradient with respect to Wax, accumulated over all time steps
        dWaa -- gradient with respect to Waa, accumulated over all time steps
        dba -- gradient with respect to ba, accumulated over all time steps
    """
    step_caches, _ = caches
    # The first step's cached input supplies n_x (and the batch size m).
    _, _, first_input, _ = step_caches[0]

    n_a, m, T_x = da.shape
    n_x, m = first_input.shape

    dx = np.zeros((n_x, m, T_x))
    dWax = np.zeros((n_a, n_x))
    dWaa = np.zeros((n_a, n_a))
    dba = np.zeros((n_a, 1))

    # Gradient flowing back from the following time step; zero past the end.
    da_from_future = np.zeros((n_a, m))

    # Walk the sequence from the last time step to the first.
    for t in range(T_x - 1, -1, -1):
        step = rnn_cell_backward(da[:, :, t] + da_from_future, step_caches[t])
        dx[:, :, t] = step["dxt"]
        da_from_future = step["da_prev"]
        dWax += step["dWax"]
        dWaa += step["dWaa"]
        dba += step["dba"]

    # Whatever flows out of the first step is the gradient of the initial state.
    return {"dx": dx, "da0": da_from_future, "dWax": dWax, "dWaa": dWaa, "dba": dba}