## Imports

In [1]:
from lstm import LSTM
from tokenizer import Vocabulary
from dense import Dense
from embedding import EmbeddingLayer
import numpy as np

## Get Data

In [2]:
# step 1 -- data
f = open(r"C:\Users\12482\Desktop\opensource\numpy-rnn\data\alice_wonderland.txt", 'r', encoding='utf-8').readlines()

# step 2 -- tokenize
## create vocabulary + tokenize
v = Vocabulary()
token_sequences = v.tokenize(f, 26)

# step 3 -- split into x/y
## create X & Y datasets
X = token_sequences[:,:-1]
y = token_sequences[:,-1]

## Run Test

In [3]:
e = EmbeddingLayer(vocab_size=v.size, hidden_dim=20)
batch1 = e.forward(X[0])

lstm = LSTM(units=100, features=20, seq_length=25)
init_state = {'h':np.zeros((100,)), 'c':np.zeros((100,))}
cache, state = lstm.forward(batch1, init_state)

cache['embedding_inputs'] = np.copy(X[0])

dense = Dense(v.size, input_shape=lstm.hidden_dim)
final_out = dense.forward(state['h'])

init_state_grads = {'h':np.zeros_like(state['h']), 'c':np.zeros_like(state['c'])}

kernel_grads, recurrent_kernel_grads, state_grads, embedding_grads = lstm.backward(prediction=final_out,
                                                                  actual=y[0],
                                                                  state_gradients=init_state_grads,
                                                                  state=state,
                                                                  cache=cache,
                                                                  dense_weights=dense.weights)

In [4]:
print(batch1.shape, state['h'].shape, final_out.shape)

(25, 20) (25, 100) (25, 2855)


This makes sense! inputs=t_steps x dim, lstm_out=t_steps x dim, final_out=t_steps x vocab_size

In [5]:
print('KERNEL f, GRADIENT & OG:', kernel_grads['Wf'].shape, lstm.kernel_f.shape)

print('RECURRENT KERNEL f, GRADIENT & OG:', recurrent_kernel_grads['Uf'].shape, lstm.recurrent_kernel_f.shape)

print('BIAS KERNEL f, GRADIENT & OG:', kernel_grads['bf'].shape, lstm.bias_f.shape)

print('BATCH INPUT X[0], GRADIENT & OG:', embedding_grads['dX'].shape, batch1.shape)

KERNEL f, GRADIENT & OG: (20, 100) (20, 100)
RECURRENT KERNEL f, GRADIENT & OG: (100, 100) (100, 100)
BIAS KERNEL f, GRADIENT & OG: (100,) (100,)
BATCH INPUT X[0], GRADIENT & OG: (25, 20) (25, 20)


This makes sense! The gradients and originals should have the same shape!

**Step Function (SGD)**

In [6]:
def step(lstm, embedding, dense, kernel_grads, cache, recurrent_grads, state_grads, embedding_grads, lr=0.01):
    """
    Update model params using SGD
    """
    
    kernel_f, kernel_i, kernel_c, kernel_o = lstm.kernel_f, lstm.kernel_i, lstm.kernel_c, lstm.kernel_o
    r_kernel_f, r_kernel_i, r_kernel_c, r_kernel_o = lstm.recurrent_kernel_f, lstm.recurrent_kernel_i, lstm.recurrent_kernel_c, lstm.recurrent_kernel_o
    lstm_bias_f, lstm_bias_i, lstm_bias_c, lstm_bias_o = lstm.bias_f, lstm.bias_i, lstm.bias_c, lstm.bias_o
    
    dense_weights, dense_bias = dense.weights, dense.bias
    
    embeddings = embedding.weights
    
    dense_weights -= lr * kernel_grads['Wy']
    dense_bias -= lr * kernel_grads['by']
    
    kernel_f -= lr * kernel_grads['Wf']
    kernel_i -= lr * kernel_grads['Wi']
    kernel_c -= lr * kernel_grads['Wc']
    kernel_o -= lr * kernel_grads['Wo']
    
    r_kernel_f -= lr * recurrent_grads['Uf']
    r_kernel_i -= lr * recurrent_grads['Ui']
    r_kernel_c -= lr * recurrent_grads['Uc']
    r_kernel_o -= lr * recurrent_grads['Uo']
    
    
    embeddings[cache['embedding_inputs']] -= lr * embedding_grads['dX']
    
    print('GREAT WORK!')
    
step(lstm=lstm, embedding=e, dense=dense, kernel_grads=kernel_grads, cache=cache,
     recurrent_grads=recurrent_kernel_grads, state_grads=state_grads, embedding_grads=embedding_grads)

GREAT WORK!


**Calculate Loss**

In [7]:
samples = 25 ## t_steps

correct_logprobs = -np.log(final_out[range(samples),y[0]])
data_loss = np.sum(correct_logprobs)/samples

data_loss

8.470231621786166

## Sequential Model

In [8]:
class LSTMSequential:
    def __init__(self):
        self.network = {}
        
    def add(self, layer):
        self.network[layer.name] = layer
        
    def _init_hidden(self):
        hidden = self.network['LSTM'].hidden_dim
        
        state = {'h':np.zeros((hidden,)), 'c':np.zeros((hidden,))}
    
        return state
    
    def _init_hidden_grads(self):
        hidden, seq_length = self.network['LSTM'].hidden_dim, self.network['LSTM'].seq_length
        
        init_state_grads = {'h':np.zeros((seq_length, hidden)), 'c':np.zeros((seq_length, hidden))}
        
        return init_state_grads
    
    def _calculate_loss(self, predictions, actual):
        samples = self.network['LSTM'].seq_length ## t_steps

        correct_logprobs = -np.log(predictions[range(samples),actual])
        data_loss = np.sum(correct_logprobs)/samples

        return data_loss
    
    def _step(self, kernel_grads, recurrent_grads, state_grads, embedding_grads, cache, lr):
        self.network['Dense'].weights -= lr * kernel_grads['Wy']
        self.network['Dense'].bias -= lr * kernel_grads['by']
        
        self.network['Embedding'].weights[cache['embedding_inputs']] -= lr * embedding_grads['dX']
            
        self.network['LSTM'].kernel_f -= lr * kernel_grads['Wf']
        self.network['LSTM'].kernel_i -= lr * kernel_grads['Wi']
        self.network['LSTM'].kernel_c -= lr * kernel_grads['Wc']
        self.network['LSTM'].kernel_o -= lr * kernel_grads['Wo']

        self.network['LSTM'].recurrent_kernel_f -= lr * recurrent_grads['Uf']
        self.network['LSTM'].recurrent_kernel_i -= lr * recurrent_grads['Ui']
        self.network['LSTM'].recurrent_kernel_c -= lr * recurrent_grads['Uc']
        self.network['LSTM'].recurrent_kernel_o -= lr * recurrent_grads['Uo']
        
        self.network['LSTM'].bias_f -= lr * kernel_grads['bf']
        self.network['LSTM'].bias_i -= lr * kernel_grads['bi']
        self.network['LSTM'].bias_c -= lr * kernel_grads['bc']
        self.network['LSTM'].bias_o -= lr * kernel_grads['bo']
    
    def _train_step(self, X_train, y_train, state, lr):
        assert('Embedding' in self.network and 'LSTM' in self.network and 'Dense' in self.network)
        
        loss = 0
        
        for idx in range(0, X_train.shape[0]):

            state_grads = self._init_hidden_grads()
            
            lstm_inp = self.network['Embedding'].forward(X_train[idx])
            cache, state = self.network['LSTM'].forward(lstm_inp, state)
            final_out = self.network['Dense'].forward(state['h'])
            
            cache['embedding_inputs'] = np.copy(X_train[idx])
            
            l = self._calculate_loss(predictions=final_out, actual=y[idx])
            loss+=l
                
            kernel_grads, r_kernel_grads, state_grads, embed_grads = \
                                        self.network['LSTM'].backward(prediction=final_out, 
                                        actual=y[idx], 
                                        state_gradients=state_grads,
                                        state=state,
                                        cache=cache,
                                        dense_weights=self.network['Dense'].weights)
            
            self._step(kernel_grads=kernel_grads, recurrent_grads=r_kernel_grads,
                      state_grads=state_grads, embedding_grads=embed_grads,
                      cache=cache, lr=lr)
            
        return loss/X_train.shape[0], state

            
    def train(self, X_train, y_train, epochs, lr=0.01):
        init_state = self._init_hidden()

        for e in range(epochs):
            
            if e == 0:
                loss, state = self._train_step(X_train=X_train, y_train=y_train, state=init_state, lr=lr)
            else:
                loss, state = self._train_step(X_train=X_train, y_train=y_train, state=state, lr=lr)

            print('LOSS: {}, EPOCH: {}'.format(loss, e))
            
    
        
model = LSTMSequential()

model.add(EmbeddingLayer(vocab_size=v.size, hidden_dim=20))
model.add(LSTM(units=100, features=20, seq_length=25))
model.add(Dense(v.size, 100))

model.train(X, y, 200)

LOSS: 8.18755887290257, EPOCH: 0
LOSS: 8.089105862218226, EPOCH: 1
LOSS: 8.080343450178777, EPOCH: 2
LOSS: 8.075965808706163, EPOCH: 3
LOSS: 8.071566959447232, EPOCH: 4
LOSS: 8.05840072058782, EPOCH: 5
LOSS: 8.025282971544934, EPOCH: 6


  f = 1 / (1 + np.exp(-x_safe))


LOSS: 7.962477969591778, EPOCH: 7


KeyboardInterrupt: 