# Recurrent Neural Networks with LSTM
This IPython notebook implements forward propagation for a basic Recurrent Neural Network (RNN) and for a Long Short-Term Memory (LSTM) network.

In [1]:
# importing important libraries
import numpy as np

# Basic Functions
This section implements the basic functions, sigmoid and softmax, that the forward-propagation functions below rely on.

In [1]:
def sigmoid(z):
    """Apply the logistic sigmoid 1 / (1 + exp(-z)) element-wise.

    Arguments:
    z -- a scalar or numpy array.

    Returns:
    A value with the same shape as z, each entry lying between 0 and 1.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

In [2]:
def softmax(z):
    """Compute the softmax of z along its first axis.

    The rest of this file passes scores of shape (n_y, m), one column per
    example, so every column has to be normalized on its own. Summing over
    the whole array would only yield valid probabilities when m == 1.

    Arguments:
    z -- scores as a scalar or numpy array; axis 0 indexes the classes.

    Returns:
    An array with the same shape as z whose entries sum to 1 along axis 0.
    For a 1-D vector this is the ordinary softmax of that vector.
    """
    z = np.asarray(z)

    # Degenerate inputs (scalar or empty) have no axis to reduce over;
    # exp(z - z) reproduces what the plain formula gives in those cases.
    if z.ndim == 0 or z.size == 0:
        return np.exp(z - z)

    # Subtracting the per-column maximum leaves the result mathematically
    # unchanged but keeps np.exp from overflowing on large scores.
    shifted = z - np.max(z, axis=0, keepdims=True)
    e_z = np.exp(shifted)
    return e_z / np.sum(e_z, axis=0, keepdims=True)

# Recurrent Cell:
<img src="image1.jpg">

In [4]:
def recurrent_cell_forward(xt, a_prev, parameters):
    """Run a single time step of a basic recurrent cell.

    Arguments:
    xt -- input at the current time step, a numpy array of shape (n_x, m),
          where n_x is the length of a one-hot vector and m is the number
          of examples.
    a_prev -- hidden-state activation from the previous time step, a numpy
              array of shape (n_a, m).
    parameters -- dictionary holding:
        Wax -- weight matrix multiplied by the input xt
        Waa -- weight matrix multiplied by the previous activation a_prev
        Wya -- weight matrix multiplied by a_next to compute the output
        ba  -- bias used to compute a_next
        by  -- bias used to compute yt_pred

    Returns:
    a_next -- hidden-state activation of the current time step
    yt_pred -- softmax output of the current time step
    cache -- tuple (a_next, a_prev, xt, parameters) kept for backpropagation
    """
    # Pull the five parameters out of the dictionary in one go.
    Wax, Waa, Wya, ba, by = (
        parameters[key] for key in ("Wax", "Waa", "Wya", "ba", "by")
    )

    # Hidden state: tanh of the recurrent term plus the input term plus bias.
    pre_activation = np.dot(Waa, a_prev) + np.dot(Wax, xt) + ba
    a_next = np.tanh(pre_activation)

    # Output: softmax applied to the affine read-out of the hidden state.
    scores = np.dot(Wya, a_next) + by
    yt_pred = softmax(scores)

    return a_next, yt_pred, (a_next, a_prev, xt, parameters)

# Forward Propagation in Recurrent Neural Network
<img src="image2.jpg">

<img src="image3.jpg">

In [5]:
def recurrent_forward_propagation(x, a0, parameters):
    """Run the basic recurrent cell over every time step of the input.

    Arguments:
    x -- input of shape (n_x, m, T_x): n_x is the length of one input
         vector, m the number of examples, T_x the number of time steps.
    a0 -- activation for time step 0, of shape (n_a, m) (a matrix of zeros).
    parameters -- parameter dictionary passed through to
                  recurrent_cell_forward; "Wya" is also read here to size
                  the outputs.

    Returns:
    a -- activations for every time step, shape (n_a, m, T_x)
    y_pred -- outputs for every time step, shape (n_y, m, T_x)
    caches -- tuple (list of per-step caches, x) for backward propagation
    """
    # Dimensions come from the input and from the output weight matrix.
    _, m, T_x = x.shape
    n_y, n_a = parameters["Wya"].shape

    # Storage for the activations and outputs of all time steps.
    a = np.zeros((n_a, m, T_x))
    y_pred = np.zeros((n_y, m, T_x))

    step_caches = []
    hidden = a0  # running hidden state, starts from the initial activation

    for t in range(T_x):
        hidden, step_output, step_cache = recurrent_cell_forward(
            x[:, :, t], hidden, parameters
        )
        a[:, :, t] = hidden
        y_pred[:, :, t] = step_output
        step_caches.append(step_cache)

    # Bundle the per-step caches with x for use in backward propagation.
    return a, y_pred, (step_caches, x)

# LSTM Cell
<img src="image4.jpg">

In [None]:
def lstm_cell_forward(xt, a_prev, c_prev, parameters):
    """Compute forward propagation for a single LSTM cell.

    Arguments:
    xt -- input at the current time step, 2-D with shape (n_x, m).
    a_prev -- activation of the previous time step, stored in the first
              n_a rows of the stacked input.
    c_prev -- memory cell of the previous time step.
    parameters -- dictionary holding:
        Wf, bf -- weights and bias of the forget gate
        Wi, bi -- weights and bias of the update gate
        Wc, bc -- weights and bias of the candidate memory value
        Wo, bo -- weights and bias of the output gate
        Wy, by -- weights and bias of the prediction; Wy has shape (n_y, n_a)

    Returns:
    a_next -- activation of the current time step
    c_next -- memory cell of the current time step
    yt_pred -- softmax prediction of the current time step
    cache -- tuple (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt,
             parameters) kept for backpropagation
    """
    Wf, bf, Wi, bi, Wc, bc, Wo, bo, Wy, by = (
        parameters[key]
        for key in ("Wf", "bf", "Wi", "bi", "Wc", "bc", "Wo", "bo", "Wy", "by")
    )

    n_x, m = xt.shape
    _, n_a = Wy.shape

    # Stack a_prev on top of xt so each gate needs only one matrix product.
    stacked = np.zeros((n_a + n_x, m))
    stacked[:n_a, :] = a_prev
    stacked[n_a:, :] = xt

    def affine(weights, bias):
        # Shared affine transform of the stacked [a_prev; xt] input.
        return np.dot(weights, stacked) + bias

    ft = sigmoid(affine(Wf, bf))   # forget gate
    it = sigmoid(affine(Wi, bi))   # update gate
    cct = np.tanh(affine(Wc, bc))  # candidate value of the memory cell
    ot = sigmoid(affine(Wo, bo))   # output gate

    # Keep part of the old memory and add the gated candidate.
    c_next = ft * c_prev + it * cct
    # The output gate decides how much of the squashed memory is exposed.
    a_next = ot * np.tanh(c_next)

    yt_pred = softmax(np.dot(Wy, a_next) + by)

    cache = (a_next, c_next, a_prev, c_prev, ft, it, cct, ot, xt, parameters)
    return a_next, c_next, yt_pred, cache


In [10]:
# Forward Propagation for LSTM.
def lstm_forward_propagation(x, a0, parameters):
    """Compute forward propagation through an LSTM over all time steps.

    Calls lstm_cell_forward once per time step, feeding each step's
    activation and memory cell into the next.

    Arguments:
    x -- input of shape (n_x, m, T_x).
    a0 -- initial hidden state, of shape (n_a, m).
    parameters -- dictionary of parameters passed through to
                  lstm_cell_forward; "Wy" (shape (n_y, n_a)) is also read
                  here to size the outputs.

    Returns:
    a -- activations of every time step, shape (n_a, m, T_x)
    y -- predictions of every time step, shape (n_y, m, T_x)
    c -- memory cells of every time step, shape (n_a, m, T_x)
    caches -- tuple (list of per-step caches, x) for backward propagation
    """
    # Per-time-step caches returned by lstm_cell_forward.
    step_caches = []

    # Retrieve dimensions from the shapes of x and parameters["Wy"].
    n_x, m, T_x = x.shape
    n_y, n_a = parameters["Wy"].shape

    # Storage for the activations, memory cells and predictions of all steps.
    a = np.zeros((n_a, m, T_x))
    c = np.zeros((n_a, m, T_x))
    y = np.zeros((n_y, m, T_x))

    # Running state: activation starts at a0, memory cell starts at zero.
    a_next = a0
    c_next = np.zeros(a_next.shape)

    for t in range(T_x):
        # Advance the state by one step and get the prediction and cache.
        a_next, c_next, yt, cache = lstm_cell_forward(
            x[:, :, t], a_next, c_next, parameters
        )
        a[:, :, t] = a_next
        y[:, :, t] = yt
        c[:, :, t] = c_next
        step_caches.append(cache)

    # Bundle and return only after the loop has covered every time step.
    # Doing this inside the loop would stop after the first step and leave
    # the remaining slices of a, y and c at zero.
    caches = (step_caches, x)

    return a, y, c, caches