# Define LSTM model and forward & backward propagation
We will create the weights for the following cell:
![title](files/lstm.png)


In [1]:
import numpy as np

# activation functions
def sigmoid(x):
    """Return the logistic function 1 / (1 + e^-x), evaluated element-wise on *x*."""
    denominator = 1 + np.exp(-x)
    return 1/denominator
def dtanh(x):
    """Return the derivative of tanh at *x*, computed as 1 / cosh(x)^2."""
    cosh_squared = np.cosh(x)**2
    return 1./cosh_squared
def dsigmoid(x):
    """Return the derivative of the logistic function at *x*: sigmoid(x) * (1 - sigmoid(x))."""
    activation = sigmoid(x)
    complement = 1-activation
    return activation*complement

# create LSTM weights & biases (i.e. model)
def define_lstm(x_dim, state_dim):
    """Create the LSTM parameters (i.e. the model) as a dictionary.

    Args:
        x_dim: length of the input vector.
        state_dim: length of the cell state; the output has the same length.

    Returns:
        dict with the weight matrices 'Wf' (forget gate), 'Wi' (input gate),
        'Wa' (input activation) and 'Wo' (output gate), each of shape
        (state_dim, x_dim + state_dim), followed by the bias vectors 'bf',
        'bi', 'ba' and 'bo', each of length state_dim.
    """
    output_dim = state_dim
    fan_in = x_dim + output_dim

    def scaled_normal(rows):
        # "Xavier" initialization: standard-normal draws divided by sqrt(fan-in)
        return np.random.randn(rows, fan_in) / np.sqrt(fan_in)

    model = {}

    # WEIGHTS -- created in the order forget, input, activation, output so the
    # sequence of random draws stays the same
    weight_rows = (
        ('Wf', state_dim),
        ('Wi', state_dim),
        ('Wa', state_dim),
        ('Wo', output_dim),
    )
    for key, rows in weight_rows:
        model[key] = scaled_normal(rows)

    # BIASES -- unscaled standard-normal draws
    bias_sizes = (
        ('bf', state_dim),
        ('bi', state_dim),
        ('ba', state_dim),
        ('bo', output_dim),
    )
    for key, size in bias_sizes:
        model[key] = np.random.randn(size)

    return model


# FORWARD Prop

def propagate_forward(x,h,state):
    """Run one LSTM step with the parameters stored in the module-level ``model``.

    Args:
        x: input vector for this timestep.
        h: output of the previous timestep; it is concatenated after x.
        state: cell state of the previous timestep.

    Returns:
        list of eleven items, in this order:
        [hf, hi, ha, ho, forget_state, input_state_update, state_out,
         tanh_state, x_out, xh, state]
        i.e. the four gate activations, the intermediate products, the new
        cell state, its tanh, the step output, the concatenated input and the
        incoming cell state (kept for the backward pass).
    """
    joined_input = np.concatenate((x,h))

    # pre-activations of the four dense layers, all fed by the same vector
    z_forget = np.dot(model['Wf'], joined_input) + model['bf']
    z_input = np.dot(model['Wi'], joined_input) + model['bi']
    z_activation = np.dot(model['Wa'], joined_input) + model['ba']
    z_output = np.dot(model['Wo'], joined_input) + model['bo']

    forget_gate = sigmoid(z_forget)
    input_gate = sigmoid(z_input)
    activation = np.tanh(z_activation)
    output_gate = sigmoid(z_output)

    # part of the previous state that is kept
    kept_state = forget_gate * state
    # contribution of the current input to the state
    state_update = input_gate * activation
    # new state = kept part of the old state + contribution of the current input
    new_state = kept_state + state_update

    squashed_state = np.tanh(new_state)
    step_output = output_gate * squashed_state

    return [
        forget_gate, input_gate, activation, output_gate,
        kept_state, state_update, new_state,
        squashed_state, step_output, joined_input, state,
    ]


# BACKPROP

# The main idea: for a multidimensional function y(x), if we want y(x) to move along a direction vector dy,
# the input x has to be moved along dx = y'(x)^T * dy. For a dense layer y = W x + b the Jacobian y'(x) is
# simply W, so dx = W^T dy (which is what np.dot(dy, W) computes).

# If we instead consider THE WEIGHTS to be the inputs, dW = dy * y'(W), and here y'(W) = x, so dW = outer(dy, x).


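# Regarding the activation functions: if y = tanh(Wx+b), then y'(x) = tanh'(Wx+b)*W with tanh'(z) = 1 - tanh(z)^2 = 1 - y^2,
# so dx = W^T (dy * (1 - y^2)). For y = sigmoid(Wx+b) the same holds with sigmoid'(z) = y * (1 - y).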


def dense_backprop(y, dy, W, x, activation):
    """Backpropagate an output error through a dense layer y = activation(W x + b).

    Args:
        y: output of the layer (after the activation).
        dy: error arriving at the layer output.
        W: weight matrix of the layer.
        x: input vector the layer was evaluated on.
        activation: 'tanh' or 'sigmoid'.

    Returns:
        (dw, db, dx): updates for the weights, the bias and the layer input.

    Raises:
        ValueError: if *activation* is neither 'tanh' nor 'sigmoid'.
    """
    # Both derivatives are expressed through the layer output y, so the
    # pre-activation value does not have to be stored.
    if activation == 'tanh':
        d_preactivation_y = dy * (1 - y**2)  # dtanh = 1 - tanh^2
    elif activation == 'sigmoid':
        d_preactivation_y = dy * (y)*(1.-y)  # dsigmoid = sigmoid * (1 - sigmoid)
    else:
        # Fail loudly: returning a placeholder would only surface later as an
        # unrelated unpacking error in the caller.
        raise ValueError(
            "unsupported activation %r (expected 'tanh' or 'sigmoid')" % (activation,)
        )

    # bias update
    db = d_preactivation_y
    # W update
    dw = np.outer(d_preactivation_y, x)
    # 'input update', or how we backpropagate the output error to the input error
    dx = np.dot(d_preactivation_y, W)

    return dw, db, dx

def lstm_backprop(forward_prop_result, d_x_out, d_state_out_given):
    """Backpropagate through one LSTM step, using the module-level ``model`` weights.

    Args:
        forward_prop_result: the eleven-item list returned by propagate_forward
            for this timestep.
        d_x_out: error arriving at the step output.
        d_state_out_given: error arriving at the outgoing cell state from the
            following timestep.

    Returns:
        list [dwf, dbf, dwi, dbi, dwa, dba, dwo, dbo, dx, d_state]: weight and
        bias updates of the four gates, the error on the concatenated input
        and the error on the incoming cell state.
    """
    # values saved by the forward pass (entries not needed here are underscored)
    (hf, hi, ha, ho, _forget_state, _input_state_update,
     state_out, tanh_state, _x_out, xh, state) = forward_prop_result

    # x_out = ho * tanh_state  ->  split the error between the two factors
    d_ho = d_x_out * tanh_state
    d_tanh_state = d_x_out * ho

    # error on the new cell state: through the tanh, plus what the next timestep sent back
    d_state_out = d_tanh_state * dtanh(state_out) + d_state_out_given

    # state_out = forget_state + input_state_update  ->  both terms receive d_state_out
    # forget_state = hf * state
    d_hf = d_state_out * state
    d_state = d_state_out * hf
    # input_state_update = hi * ha
    d_hi = d_state_out * ha
    d_ha = d_state_out * hi

    # push each gate error through its dense layer
    dwf, dbf, dxf = dense_backprop(hf, d_hf, model['Wf'], xh, 'sigmoid')
    dwi, dbi, dxi = dense_backprop(hi, d_hi, model['Wi'], xh, 'sigmoid')
    dwa, dba, dxa = dense_backprop(ha, d_ha, model['Wa'], xh, 'tanh')
    dwo, dbo, dxo = dense_backprop(ho, d_ho, model['Wo'], xh, 'sigmoid')

    # all four layers read the same concatenated input, so their input errors add up
    dx = dxf + dxi + dxa + dxo

    return [dwf, dbf, dwi, dbi, dwa, dba, dwo, dbo, dx, d_state]

def backprop_multiple_timesteps(x,y):
    """Run the LSTM forward over every timestep of *x*, then backpropagate through time.

    Args:
        x: sequence of input vectors, one per timestep.
        y: sequence of target vectors, one per timestep; the length of y[0]
            is used as the state/output size.

    Returns:
        list with one lstm_backprop result per timestep, ordered from the
        LAST timestep to the first.
    """
    # x_dim is not used below; it is still evaluated as in the forward setup
    x_dim, state_dim = len(x[0]), len(y[0])
    timesteps = len(x)

    # ---- forward pass: start from a zero output and a zero cell state ----
    previous_output = np.zeros(state_dim)
    previous_state = np.zeros(state_dim)
    forwardprop_results = []
    for t in range(timesteps):
        step = propagate_forward(x[t], previous_output, previous_state)
        (_hf, _hi, _ha, _ho, _forget_state, _input_state_update,
         state_out, _tanh_state, x_out, _xh, _state) = step
        previous_output, previous_state = x_out, state_out
        forwardprop_results.append(step)

    # ---- backward pass: nothing flows in from beyond the last timestep ----
    dx_next = np.zeros(state_dim)
    d_state = np.zeros(state_dim)
    backprop_results = []
    for t in reversed(range(timesteps)):
        step = forwardprop_results[t]
        (_hf, _hi, _ha, _ho, _forget_state, _input_state_update,
         _state_out, _tanh_state, x_out, _xh, _state) = step

        # output error = (prediction - target) + error sent back by the next timestep
        d_out = x_out - y[t] + dx_next

        backprop_info = lstm_backprop(step, d_out, d_state)
        dx, d_state = backprop_info[8], backprop_info[9]
        # the trailing state_dim entries of dx belong to the previous output h
        dx_next = dx[-state_dim:]

        backprop_results.append(backprop_info)

    return backprop_results


## Check everything is OK (no bugs) with the numerical example here:
##### https://medium.com/@aidangomez/let-s-do-this-f9b699de31d9

In [2]:
# Dimensions of the worked example: two input features, one state/output unit.
x_dim = 2
state_dim = 1
model = define_lstm(x_dim, state_dim)

# Overwrite the random initialization with the fixed values of the numerical
# example (columns: x[0], x[1], previous output h).
model['Wa'][0] = [0.45, 0.25, 0.15]
model['Wi'][0] = [0.95, 0.8, 0.8]
model['Wf'][0] = [0.7, 0.45, 0.1]
model['Wo'][0] = [0.6, 0.4, 0.25]

# Biases are kept as numpy arrays, like the ones define_lstm creates, instead
# of being replaced by plain Python lists.
model['ba'] = np.array([0.2])
model['bi'] = np.array([0.65])
model['bf'] = np.array([0.15])
model['bo'] = np.array([0.1])

# inputs for the two timesteps
x0 = np.array([1., 2.])
x1 = np.array([0.5, 3.])

# targets for the two timesteps
y0 = np.array([0.5])
y1 = np.array([1.25])

In [5]:
# First timestep: there is no previous output or cell state yet, so both start at zero.
h0 = np.zeros(1)
state0 = np.zeros(1)

forward_result0 = propagate_forward(x0,h0,state0)
# Unpack into module-level names; x_out and state_out are fed into the next timestep.
hf, hi, ha, ho, forget_state, input_state_update, state_out, tanh_state, x_out, xh, state = forward_result0

![title](files/x0.png)

In [6]:
# Display the eleven values returned by propagate_forward for the first timestep.
forward_result0

[array([0.8519528]),
 array([0.96083428]),
 array([0.81775408]),
 array([0.81757448]),
 array([0.]),
 array([0.78572615]),
 array([0.78572615]),
 array([0.65598109]),
 array([0.5363134]),
 array([1., 2., 0.]),
 array([0.])]

#### The obtained values and the values in the example picture for x0 match, so probably there are no forwardprop bugs :) 

In [7]:
# Second timestep: feed x1 together with the output and cell state of the first
# timestep (x_out and state_out still hold the values unpacked from forward_result0).
# No zero initial state is needed here, so h0/state0 are not re-created.
forward_result1 = propagate_forward(x1,x_out,state_out)
hf, hi, ha, ho, forget_state, input_state_update, state_out, tanh_state, x_out, xh, state = forward_result1

In [8]:
# Display the eleven values returned by propagate_forward for the second timestep.
forward_result1

[array([0.87030197]),
 array([0.98118397]),
 array([0.84980402]),
 array([0.84993334]),
 array([0.68381901]),
 array([0.83381408]),
 array([1.5176331]),
 array([0.90828429]),
 array([0.77198111]),
 array([0.5      , 3.       , 0.5363134]),
 array([0.78572615])]

#### These match too :) , now, backprop

In [9]:
# Backprop of the second (last) timestep.
# Output error: prediction of the second timestep minus its target.
d_out = x_out - y1
# Nothing comes after the last timestep, so no cell-state error flows in.
d_state = np.zeros(1)

backprop_info = lstm_backprop(forward_result1, d_out, d_state)
# d_state is overwritten with the error on the incoming cell state, to be passed to timestep 0.
[dwf, dbf, dwi, dbi, dwa, dba, dwo, dbo, dx, d_state] = backprop_info
# Display the error on the concatenated input [x, h].
dx

array([-0.04742407, -0.03072765, -0.01827526])

#### 2nd timestep backprop works :)

In [10]:
# Backprop of the first timestep: restore its forward-pass values.
hf, hi, ha, ho, forget_state, input_state_update, state_out, tanh_state, x_out, xh, state = forward_result0

# The trailing state_dim entries of dx (from the second timestep) are the error
# on the previous output h, i.e. on this timestep's output.
dx_next = dx[-state_dim:]
d_out = x_out - y0 + dx_next
# d_state already holds the cell-state error returned by the second timestep's
# backprop; it is passed on unchanged.

backprop_info = lstm_backprop(forward_result0, d_out, d_state)
[dwf, dbf, dwi, dbi, dwa, dba, dwo, dbo, dx, d_state] = backprop_info

In [11]:
# Display the error on the concatenated input [x, h] of the first timestep.
dx

array([-0.00816553, -0.0048668 , -0.00342911])

#### 1st timestep backprop works :)

In [12]:
# Same worked example as before, now packed into sequences for backprop_multiple_timesteps.
x_dim = 2
state_dim = 1
timesteps = 2

# create model
model = define_lstm(x_dim, state_dim)
# fixed weights of the numerical example (columns: x[0], x[1], previous output h)
model['Wa'][0] = [0.45, 0.25, 0.15]
model['Wi'][0] = [0.95, 0.8, 0.8]
model['Wf'][0] = [0.7, 0.45, 0.1]
model['Wo'][0] = [0.6, 0.4, 0.25]

# Biases are kept as numpy arrays, like the ones define_lstm creates, instead
# of being replaced by plain Python lists.
model['ba'] = np.array([0.2])
model['bi'] = np.array([0.65])
model['bf'] = np.array([0.15])
model['bo'] = np.array([0.1])

# inputs & outputs
x0 = np.array([1., 2.])
x1 = np.array([0.5, 3.])

y0 = np.array([0.5])
y1 = np.array([1.25])

# one row per timestep
x = np.array([x0,x1])
y = np.array([y0,y1])



In [13]:
# Run forward + backward over the whole sequence; one result per timestep (last timestep first).
backprop_results = backprop_multiple_timesteps(x,y)

# Accumulators for the weight/bias updates, zero-initialized with the shape of each parameter.
Dwf = np.zeros_like(model['Wf'])
Dbf = np.zeros_like(model['bf'])
Dwi = np.zeros_like(model['Wi'])
Dbi = np.zeros_like(model['bi'])
Dwa = np.zeros_like(model['Wa'])
Dba = np.zeros_like(model['ba'])
Dwo = np.zeros_like(model['Wo'])
Dbo = np.zeros_like(model['bo'])

# The same weights are used at every timestep, so the per-timestep updates are summed.
for result in backprop_results:
    # dx and d_state are unpacked but not accumulated
    [dwf, dbf, dwi, dbi, dwa, dba, dwo, dbo, dx, d_state] = result
    Dwf += dwf
    Dbf += dbf
    Dwi += dwi
    Dbi += dbi
    Dwa += dwa
    Dba += dba
    Dwo += dwo
    Dbo += dbo
    

In [14]:
# Display the forget-gate weight update summed over all timesteps.
Dwf

array([[-0.00315327, -0.01891963, -0.00338228]])

In [15]:
# Display the input-activation weight update summed over all timesteps.
Dwa

array([[-0.02671622, -0.09220113, -0.01039609]])

In [16]:
# Display the output-gate bias update summed over all timesteps.
Dbo

array([-0.05361303])

### Everything works :) Forward prop and backprop are OK