In [2]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import pickle
%matplotlib inline

In [3]:
# SECURITY NOTE: pickle.load can execute arbitrary code while deserialising.
# Only run this cell on a data.pkl that comes from a source you trust.
# The handle is deliberately NOT called `input`, so the builtin input() is not
# rebound for the rest of the notebook session.
with open('data.pkl', 'rb') as data_file:
    # Each pickle.load call reads the next pickled object from the stream, so
    # the order of these calls has to mirror the order used when writing.
    X_train = pickle.load(data_file)
    char_to_indexes = pickle.load(data_file)
    indexes_to_char = pickle.load(data_file)
    data_size = pickle.load(data_file)
    vocab_size = pickle.load(data_file)

### hyperparameters

In [4]:
# --- model dimensions ---
hidden_size = 100                    # width of the hidden layer
z_size = hidden_size + vocab_size    # hidden width plus vocabulary width

# --- training settings ---
seq_length = 25       # presumably characters per training sequence -- TODO confirm
learning_rate = 0.1   # step size for the optimiser

# --- initialisation ---
weight_sd = 0.1       # presumably the std-dev for weight initialisation -- TODO confirm

### activations

In [None]:
def sigmoid(x):
    """Numerically stable logistic function, ``1 / (1 + exp(-x))``.

    The textbook formula evaluates ``exp(-x)``, which overflows for large
    negative ``x`` and emits a RuntimeWarning (or raises under a strict
    ``np.errstate``). This version only ever exponentiates ``-|x|``, so the
    intermediate value stays in ``(0, 1]``.

    Args:
        x: scalar or array-like of real numbers.

    Returns:
        The element-wise sigmoid of ``x``: a NumPy scalar for scalar input,
        otherwise an array with the same shape as ``x``.
    """
    # exp(-|x|) lies in (0, 1] for every finite x, so it cannot overflow.
    decay = np.exp(-np.abs(x))
    for_non_negative = 1 / (1 + decay)      # 1 / (1 + e^-x)    when x >= 0
    for_negative = decay / (1 + decay)      # e^x / (1 + e^x)   when x <  0
    picked = np.where(np.greater_equal(x, 0), for_non_negative, for_negative)
    # Indexing with () unwraps the 0-d array np.where returns for scalar input
    # and is a no-op view for real arrays.
    return picked[()]

def sigmoid_derivative(y):
    """Return ``y * (1 - y)``.

    When ``y`` is the *output* of a sigmoid (not its raw input), this product
    is the derivative of that sigmoid with respect to its input.
    """
    complement = 1 - y
    return y * complement
    
def tanh_derivative(x):
    """Derivative of ``tanh`` evaluated at the raw (pre-activation) input ``x``.

    Unlike ``y * (1 - y)`` for the sigmoid, this takes the value *before* the
    non-linearity and applies ``tanh`` itself.
    """
    activated = np.tanh(x)
    return 1 - activated ** 2
    

## LSTM

In [None]:
class LSTM:
    """A single LSTM cell with hand-written forward/backward passes and RMSprop.

    The hidden state and the cell state each have ``vocab_size`` entries and
    every gate reads a vector ``h_x`` of length ``2 * vocab_size``.  The
    backward pass treats the first ``vocab_size`` entries of ``h_x`` as the
    previous hidden state, so callers are expected to build ``h_x`` with the
    hidden state first.

    Attributes:
        h, c: current hidden state and cell state, shape ``(vocab_size,)``.
        Wf, Wi, Wo, Wc: forget / input / output gate and candidate weights,
            shape ``(vocab_size, 2 * vocab_size)``.
        dWf, dWi, dWo, dWc: zero-initialised gradient buffers shaped like the
            weights (not written by the methods of this class).
        Gf, Gi, Go, Gc: RMSprop running averages of squared gradients.
    """

    # Bound applied to the hidden-state and cell-state gradients so that a
    # single step cannot blow up the back-propagated signal.
    _GRAD_CLIP = 6

    def __init__(self, vocab_size, seq_length, learning_rate):
        """Allocate state, weights, gradient buffers and optimiser caches.

        Args:
            vocab_size: size of the hidden/cell state; gate inputs are twice
                this long.
            seq_length: stored on the instance; not used by the methods here.
            learning_rate: step size used by ``update``.
        """
        self.vocab_size = vocab_size
        self.seq_length = seq_length
        self.learning_rate = learning_rate

        # recurrent state
        self.h = np.zeros(vocab_size)
        self.c = np.zeros(vocab_size)

        # NOTE(review): uniform [0, 1) initialisation makes every weight
        # positive; the notebook also defines `weight_sd`, which suggests a
        # scaled zero-mean initialisation may have been intended -- confirm.
        weight_shape = (vocab_size, 2 * vocab_size)
        # gates
        self.Wf = np.random.random(weight_shape)
        self.Wi = np.random.random(weight_shape)
        self.Wo = np.random.random(weight_shape)
        # cell candidate
        self.Wc = np.random.random(weight_shape)

        # gradient buffers, one per weight matrix
        self.dWf = np.zeros_like(self.Wf)
        self.dWi = np.zeros_like(self.Wi)
        self.dWo = np.zeros_like(self.Wo)
        self.dWc = np.zeros_like(self.Wc)

        # RMSprop caches (running mean of squared gradients) used by update()
        self.Gf = np.zeros_like(self.Wf)
        self.Gi = np.zeros_like(self.Wi)
        self.Go = np.zeros_like(self.Wo)
        self.Gc = np.zeros_like(self.Wc)

    def _step_forward(self, h_x):
        """Advance the cell by one timestep.

        Args:
            h_x: gate input of length ``2 * vocab_size``.

        Returns:
            ``(c, h, f, i, o, c_tilde)``: the new cell state, the new hidden
            state, the forget / input / output gate activations and the tanh
            candidate.  Note that ``_step_backward`` takes the gates in the
            order ``f, i, c, o``.
        """
        f = sigmoid(np.dot(self.Wf, h_x))
        i = sigmoid(np.dot(self.Wi, h_x))
        o = sigmoid(np.dot(self.Wo, h_x))

        c_tilde = np.tanh(np.dot(self.Wc, h_x))

        # Build a fresh array instead of updating self.c in place; otherwise
        # every cell state a caller stored for back-propagation would alias
        # the same buffer and be silently overwritten on the next step.
        self.c = f * self.c + i * c_tilde
        self.h = o * np.tanh(self.c)
        return self.c, self.h, f, i, o, c_tilde

    def _step_backward(self, loss, pcs, f, i, c, o, dfcs, dh, h_x):
        """Back-propagate through one timestep.

        ``self.c`` must hold the cell state produced by the forward pass of
        the timestep being differentiated.

        Args:
            loss: gradient of the loss w.r.t. this step's hidden state.
            pcs: cell state of the previous timestep.
            f, i, o: forget / input / output gate activations of this step.
            c: tanh candidate (``c_tilde``) of this step.
            dfcs: cell-state gradient arriving from the following timestep.
            dh: hidden-state gradient arriving from the following timestep.
            h_x: the gate input that was used in the forward pass.

        Returns:
            ``(fu, iu, cu, ou, dpcs, dphs)``: gradients for ``Wf``, ``Wi``,
            ``Wc`` and ``Wo``, then the gradients to hand to the previous
            timestep's cell state and hidden state.
        """
        limit = self._GRAD_CLIP
        h_x_row = np.atleast_2d(h_x)

        def weight_gradient(delta):
            # outer product of a pre-activation delta with the gate input
            return np.dot(np.atleast_2d(delta).T, h_x_row)

        # total gradient reaching the hidden state, clipped
        dh_total = np.clip(loss + dh, -limit, limit)

        # output gate: h = o * tanh(cell); o is a sigmoid output, so its
        # local derivative is the sigmoid one
        do = np.tanh(self.c) * dh_total * sigmoid_derivative(o)
        ou = weight_gradient(do)

        # cell state: contribution through h plus the gradient from the future
        dcs = np.clip(dh_total * o * tanh_derivative(self.c) + dfcs, -limit, limit)

        # candidate: c is already tanh-activated, hence 1 - c**2
        dc = dcs * i * (1 - c ** 2)
        cu = weight_gradient(dc)

        # input gate
        di = dcs * c * sigmoid_derivative(i)
        iu = weight_gradient(di)

        # forget gate
        df = dcs * pcs * sigmoid_derivative(f)
        fu = weight_gradient(df)

        # gradient for the previous cell state flows through the forget gate
        dpcs = dcs * f

        # gradient w.r.t. the whole gate input; its leading vocab_size entries
        # are taken as the previous hidden state's gradient
        d_h_x = (
            np.dot(dc, self.Wc)
            + np.dot(do, self.Wo)
            + np.dot(di, self.Wi)
            + np.dot(df, self.Wf)
        )
        dphs = d_h_x[:self.vocab_size]

        # return update gradients for forget, input, cell, output, cell state, hidden state
        return fu, iu, cu, ou, dpcs, dphs

    def update(self, fu, iu, cu, ou):
        """Apply one RMSprop step to the four weight matrices, in place.

        Args:
            fu, iu, cu, ou: gradients for ``Wf``, ``Wi``, ``Wc`` and ``Wo``,
                each shaped like the matrix it updates.
        """
        for weights, cache, grad in (
            (self.Wf, self.Gf, fu),
            (self.Wi, self.Gi, iu),
            (self.Wc, self.Gc, cu),
            (self.Wo, self.Go, ou),
        ):
            # running average of the squared gradient: 0.9 * old + 0.1 * new
            cache *= 0.9
            cache += 0.1 * grad ** 2
            # scale the step by the RMS of recent gradients; 1e-8 avoids /0
            weights -= self.learning_rate / np.sqrt(cache + 1e-8) * grad