In [1]:
import numpy as np

Vanilla RNNs suffer from the vanishing-gradient problem, which leaves the network with only short-term memory and prevents it from learning long-range dependencies properly. LSTM is a variant of the RNN designed specifically to tackle vanishing gradients.


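LSTM adds four gating components on top of a vanilla RNN to control the flow of information through the sequence of data:
- input modulation gate: proposes candidate values (through tanh) to write into memory
- input gate: how much of the candidate to write into memory
- forget gate: how much previous memory to forget
- output gate: how much of the memory to expose as the new hidden state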

These gates can be thought of as filters which control what goes in and what comes out.

At a high level, the architecture of an LSTM looks similar to a vanilla RNN: there are several recurrent units, each producing an output. The difference is that, besides passing the hidden state from one unit to the next, each unit also passes along the cell state.
<img src="https://media.geeksforgeeks.org/wp-content/uploads/20190702161217/yt2.png" width= 800/>


The internal mechanism within an LSTM cell is illustrated below:
<img src="https://media.geeksforgeeks.org/wp-content/uploads/20190702161123/working3.png" width= 800/>

In [2]:
def init_params(hidden_size, vocab_size):
    """Create the initial LSTM parameters.

    Args:
        hidden_size: number of hidden / cell units.
        vocab_size: size of the input and output vocabulary.

    Returns:
        Tuple ``(Wg, Wi, Wf, Wo, Wy, bg, bi, bf, bo, by)`` where
        - ``Wg, Wi, Wf, Wo`` have shape (hidden_size, hidden_size + vocab_size),
        - ``Wy`` has shape (vocab_size, hidden_size),
        - ``bg, bi, bf, bo`` have shape (hidden_size, 1),
        - ``by`` has shape (vocab_size, 1).
        Weights are drawn from N(0, 1) scaled by 0.1; biases are zeros.
    """
    # dimension of the concatenated vector [previous hidden state; new input]
    z_size = hidden_size + vocab_size

    # weight matrices for input modulation, input, forget and output gates
    # (drawn in this order so a seeded RNG yields the same values as before)
    Wg = np.random.randn(hidden_size, z_size) * 0.1
    Wi = np.random.randn(hidden_size, z_size) * 0.1
    Wf = np.random.randn(hidden_size, z_size) * 0.1
    Wo = np.random.randn(hidden_size, z_size) * 0.1

    # weights for prediction - y_hat
    Wy = np.random.randn(vocab_size, hidden_size) * 0.1

    # biases for input modulation, input, forget and output gates
    bg = np.zeros((hidden_size, 1))
    bi = np.zeros((hidden_size, 1))
    bf = np.zeros((hidden_size, 1))
    bo = np.zeros((hidden_size, 1))

    # bias for prediction - y_hat
    # np.zeros takes the shape as ONE tuple; a second positional argument is
    # interpreted as the dtype, so np.zeros(vocab_size, 1) raises TypeError.
    by = np.zeros((vocab_size, 1))

    return Wg, Wi, Wf, Wo, Wy, bg, bi, bf, bo, by

In [3]:
# activation functions
def sigmoid(x, derivative=False):
    """Logistic function, mapping values into the range [0, 1].

    Args:
        x: scalar or NumPy array of inputs.
        derivative: when True, return the derivative of the logistic
            function evaluated at ``x`` instead of the function value.

    Returns:
        ``1 / (1 + exp(-x))``, or ``s * (1 - s)`` with ``s`` being that value
        when ``derivative`` is True.
    """
    activation = 1 / (1 + np.exp(-x))
    if derivative:
        return activation * (1 - activation)
    return activation
    
    
def tanh(x, derivative=False):
    """Hyperbolic tangent, mapping values into the range [-1, 1].

    Uses ``np.tanh`` rather than the explicit
    ``(e^x - e^-x) / (e^x + e^-x)`` ratio: for large ``|x|`` the explicit
    form overflows to ``inf / inf`` and returns NaN, whereas ``np.tanh``
    saturates cleanly at +/-1.

    Args:
        x: scalar or NumPy array of inputs.
        derivative: when True, return ``1 - tanh(x)**2`` instead of
            ``tanh(x)``.

    Returns:
        ``tanh(x)`` or its derivative evaluated at ``x``.
    """
    activation = np.tanh(x)
    if derivative:
        return 1 - activation ** 2
    return activation
    
    
def softmax(x):
    """Convert a vector of scores into probabilities that sum to 1.

    The maximum score is subtracted before exponentiating. This leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing
    (which would otherwise produce ``inf / inf = nan`` for large scores).

    Args:
        x: array-like of scores. Normalisation is over ALL elements of ``x``
            (as in the original implementation), so pass one vector at a time.

    Returns:
        Array of the same shape as ``x`` with non-negative entries summing
        to 1. An empty input is returned as an empty float array.
    """
    scores = np.asarray(x, dtype=float)
    if scores.size == 0:
        # nothing to normalise; np.max would raise on an empty array
        return scores
    exps = np.exp(scores - np.max(scores))
    return exps / np.sum(exps)

In [None]:
# forward propagation 
def forward_propagation(x, hidden_size, vocab_size, params, return_gates=False):
    """Run the LSTM forward over one input sequence.

    Args:
        x: iterable of input column vectors, one per time step. Each vector is
            concatenated below the (hidden_size, 1) hidden state along axis 0,
            so with parameters from ``init_params`` it must have shape
            (vocab_size, 1).
        hidden_size: number of hidden / cell units; sets the shape of the
            initial hidden and cell states.
        vocab_size: size of the vocabulary. Not read inside this function
            (shapes come from ``params``); kept for interface compatibility.
        params: tuple ``(Wg, Wi, Wf, Wo, Wy, bg, bi, bf, bo, by)``.
        return_gates: when True, also return the per-step gate activations.

    Returns:
        ``(hidden_states, cell_states, outputs)``: three lists with one entry
        per time step, where entry ``t`` holds ``h_t``, ``c_t`` and ``y_hat_t``.
        If ``return_gates`` is True, a fourth element is appended: a dict with
        keys ``"f"``, ``"i"``, ``"g"``, ``"o"`` mapping to per-step lists of
        the forget, input, input-modulation and output gate activations.
    """
    # unpack params
    Wg, Wi, Wf, Wo, Wy, bg, bi, bf, bo, by = params

    # per-step hidden states, cell states and predictions
    hidden_states, cell_states, outputs = [], [], []
    # per-step gate activations
    f_s, i_s, g_s, o_s = [], [], [], []

    # initial hidden state & cell state. The shape must be passed as a tuple:
    # np.zeros(hidden_size, 1) would treat 1 as the dtype and raise TypeError.
    h_prev = np.zeros((hidden_size, 1))
    c_prev = np.zeros((hidden_size, 1))  # nothing to forget at t=0

    # iterate over the sequence
    for input_vector in x:
        # concat the hidden state and input vector
        # shape -> (z_size, 1)
        concat_vector = np.concatenate((h_prev, input_vector), axis=0)

        # input modulation gate, shape -> (hidden_size, 1)
        g = tanh(np.dot(Wg, concat_vector) + bg)
        # input gate, shape -> (hidden_size, 1)
        i = sigmoid(np.dot(Wi, concat_vector) + bi)
        # forget gate, shape -> (hidden_size, 1)
        f = sigmoid(np.dot(Wf, concat_vector) + bf)
        # output gate, shape -> (hidden_size, 1)
        o = sigmoid(np.dot(Wo, concat_vector) + bo)

        # new cell state: keep part of the old memory, add the gated candidate
        c_curr = f * c_prev + g * i

        # new hidden state - element-wise product of squashed cell state and output gate
        h_curr = tanh(c_curr) * o

        # output - y_hat, shape -> (vocab_size, 1)
        y_hat = softmax(np.dot(Wy, h_curr) + by)

        # record everything for this time step
        g_s.append(g)
        i_s.append(i)
        f_s.append(f)
        o_s.append(o)
        hidden_states.append(h_curr)
        cell_states.append(c_curr)
        outputs.append(y_hat)

        # carry BOTH recurrent states into the next step; without updating
        # c_prev the cell memory would be reset to zero at every step
        h_prev, c_prev = h_curr, c_curr

    if return_gates:
        gates = {"f": f_s, "i": i_s, "g": g_s, "o": o_s}
        return hidden_states, cell_states, outputs, gates
    return hidden_states, cell_states, outputs
        
    

In [10]:
def cross_entropy_loss(y_hat, y):
    """Cross-entropy between predicted probabilities and target weights.

    Args:
        y_hat: predicted probabilities.
        y: target array broadcastable against ``y_hat`` (e.g. one-hot).

    Returns:
        The negated mean, over all elements, of ``log(y_hat + 1e-12) * y``.
    """
    tiny = 1e-12  # keeps log() finite when a predicted probability is 0
    weighted_log_probs = np.log(y_hat + tiny) * y
    return -np.mean(weighted_log_probs)

def backward_propagation(y, outputs, params, hidden_states, cell_states):
    """Unfinished backward pass: per-step loss and local gate derivatives.

    Walks over ``outputs`` in reverse order, adds up the cross-entropy loss
    and computes derivative terms for the output layer, hidden state, cell
    state and the four gates. In the portion visible here nothing is
    returned and no gradients are accumulated across time steps.

    NOTE(review): the loop body reads ``o``, ``c``, ``f``, ``g``, ``i`` and
    ``c_prev``. None of them is a parameter or a local assigned in this
    function, so the loop raises NameError unless they exist as globals.
    Presumably they are meant to be the per-step gate activations / cell
    states from the forward pass - TODO pass them in explicitly.

    Args:
        y: indexable of targets; ``y[t]`` is subtracted from ``outputs[t]``.
        outputs: list of per-step predictions.
        params: tuple ``(Wg, Wi, Wf, Wo, Wy, bg, bi, bf, bo, by)``.
        hidden_states: list of per-step hidden states; must be non-empty
            because ``hidden_states[0]`` is read unconditionally.
        cell_states: list of per-step cell states.
    """
    # unpack parameters
    Wg, Wi, Wf, Wo, Wy, bg, bi, bf, bo, by = params
    
    
    loss = 0
    # NOTE(review): dh_next is accumulated below but never read in the
    # visible body, so no gradient flows back from later time steps.
    dh_next = np.zeros_like(hidden_states[0])
    
    # NOTE(review): enumerate() numbers the REVERSED sequence from 0, so
    # y_hat is outputs[-1 - t] while y[t], outputs[t], hidden_states[t] and
    # cell_states[t] are indexed from the front - they refer to different
    # time steps whenever the sequence has more than one element.
    for t, y_hat in enumerate(reversed(outputs)):
        loss += cross_entropy_loss(y_hat, y[t])
        
        # derivative of softmax with cross entropy: y_hat - y
        dy = outputs[t].copy()
        dy = dy - y[t]
        
        # derivative of output layer weights
        dWy = np.dot(dy, hidden_states[t].T)
        # NOTE(review): this rebinds the local bias `by` unpacked from params;
        # presumably `dby` was intended - confirm.
        by = dy
        
        # derivative of hidden state
        dh = np.dot(Wy.T, dy)
        dh_next += dh
        # derivative of output gate
        do = dh * tanh(cell_states[t])
        # NOTE(review): sigmoid(..., derivative=True) evaluates
        # sigmoid(x) * (1 - sigmoid(x)), i.e. it expects the pre-activation.
        # If o[t] holds the gate ACTIVATION, the sigmoid is applied twice -
        # confirm what o[t] contains. The same applies to f, i and g below.
        do = sigmoid(o[t], derivative=True) * do
        
        # derivative of cell state
        dc = dh * o[t]
        dc = tanh(c[t], derivative=True) * dc
        
        # derivative of forget gate
        df = dc * c_prev
        df = sigmoid(f[t], derivative=True) * df
        
        # derivative of input gate (i), which multiplies the candidate g
        di = dc * g[t]
        di = sigmoid(i[t], derivative=True) * di
        
        # derivative of input modulation gate (g), which is multiplied by i
        dg = dc * i[t]
        # NOTE(review): forward_propagation computes g with tanh, yet the
        # sigmoid derivative is used here - presumably tanh's derivative was
        # intended; confirm.
        dg = sigmoid(g[t], derivative=True) * dg

        
        
    