### IMPORTS

In [5]:
import numpy as np, re

### DATA

In [6]:
# Read the raw corpus as a list of lines (each line keeps its trailing newline).
# The `with` block guarantees the file handle is closed once the lines are read.
# NOTE(review): the path is absolute and machine-specific -- consider moving it
# to a configurable data directory so the notebook runs elsewhere.
with open(r"C:\Users\12482\Desktop\alice_wonderland.txt", 'r', encoding='utf-8') as corpus_file:
    f = corpus_file.readlines()

### TOKENIZER & EMBEDDING CLASSES

In [7]:
from tokenizer import Vocabulary
from embedding import EmbeddingLayer

### LSTM NETWORK

In [8]:
## build the vocabulary and turn the corpus lines into token sequences
## (the 26 is presumably the window length: 25 input tokens + 1 target -- TODO confirm)
v = Vocabulary()
token_sequences = v.tokenize(f, 26)

## embedding layer; hidden_dim (the embedding width) is a hyper-param
e = EmbeddingLayer(vocab_size=v.size, hidden_dim=20)

## split each sequence: every token but the last is the input, the last is the target
X, y = token_sequences[:, :-1], token_sequences[:, -1]

## embed the inputs and show the resulting shape
lstm_inputs = e.predict(X)
lstm_inputs.shape ## batch_size x seq_length x dimensionality

(2829, 25, 20)

In [13]:
class LSTM:
    """
    State and parameter container for a single LSTM layer built on NumPy.

    Construction only stores the layer sizes and zero-initializes the hidden
    and cell states. The weights and biases are created by `_init_params`,
    which is NOT called from `__init__` and must be invoked explicitly.
    """

    def __init__(self, units, seq_length, vocab_size, features):
        """
        Initializes the LSTM layer

        Args:
            units: int (num of LSTM units in layer)
            seq_length: int (length of the token sequences; sets the number
                of columns of the gate weight matrices)
            vocab_size: int (num of unique tokens in vocab)
            features: int (dimensionality of token embeddings)
        """
        self.hidden_dim = units
        self.dimensionality = features
        self.seq_length = seq_length
        self.vocab_size = vocab_size

        # Initialize hidden state and cell state as zeros, shape (units, features)
        self.h = np.zeros((units, features))
        self.c = np.zeros((units, features))

    def _init_orthogonal(self, param):
        """
        Initializes weight parameters orthogonally.

        Only the shape of `param` is used; its values are ignored. Parameters
        with more than 2 dimensions are treated as a (shape[0], prod(shape[1:]))
        matrix and the result is reshaped back to the original shape.

        Refer to this paper for an explanation of this initialization:
        https://arxiv.org/abs/1312.6120

        Args:
            param: np.ndarray with 2 or more dimensions

        Returns:
            np.ndarray with the same shape as `param`. Viewed as a 2-D matrix,
            its rows are orthonormal when rows < cols, otherwise its columns
            are orthonormal.

        Raises:
            ValueError: if `param` has fewer than 2 dimensions
        """
        if param.ndim < 2:
            raise ValueError("Only parameters with 2 or more dimensions are supported.")

        shape = param.shape
        rows = shape[0]
        # Collapse all trailing dimensions so tensors above 2-D are supported too
        cols = int(np.prod(shape[1:]))

        new_param = np.random.randn(rows, cols)

        # QR needs a tall (or square) matrix to yield min(rows, cols) orthonormal columns
        if rows < cols:
            new_param = new_param.T

        # Compute QR factorization
        q, r = np.linalg.qr(new_param)

        # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
        d = np.diag(r, 0)
        ph = np.sign(d)
        q *= ph

        # Undo the transpose so the result matches the requested orientation
        if rows < cols:
            q = q.T

        return q.reshape(shape)

    def _init_params(self):
        """
        Initializes the weight and biases of the layer

        Weights are initialized according to https://arxiv.org/abs/1312.6120
        (_init_orthogonal); all biases start at zero. Calling this again
        replaces every parameter with a fresh random draw.
        """
        # NOTE(review): gate weights are (hidden_dim, seq_length). The notebook's
        # example cell stacks a (seq_length, features) input on top of `h`
        # (hidden_dim rows); multiplying a gate weight by that stack would need
        # seq_length + hidden_dim columns -- confirm the intended shape before
        # writing the forward pass.
        #
        # Only the shape of the arrays passed to _init_orthogonal is used; the
        # random values drawn here are discarded.

        # Weight matrix (forget gate)
        self.W_f = self._init_orthogonal(np.random.randn(self.hidden_dim, self.seq_length))

        # Bias for forget gate
        self.b_f = np.zeros((self.hidden_dim, 1))

        # Weight matrix (input gate)
        self.W_i = self._init_orthogonal(np.random.randn(self.hidden_dim, self.seq_length))

        # Bias for input gate
        self.b_i = np.zeros((self.hidden_dim, 1))

        # Weight matrix (candidate)
        self.W_g = self._init_orthogonal(np.random.randn(self.hidden_dim, self.seq_length))

        # Bias for candidate
        self.b_g = np.zeros((self.hidden_dim, 1))

        # Weight matrix and bias of the output gate
        self.W_o = self._init_orthogonal(np.random.randn(self.hidden_dim, self.seq_length))
        self.b_o = np.zeros((self.hidden_dim, 1))

        # Weight matrix and bias relating the hidden-state to the output
        self.W_v = self._init_orthogonal(np.random.randn(self.vocab_size, self.hidden_dim))
        self.b_v = np.zeros((self.vocab_size, 1))

In [14]:
## build a 100-unit layer sized from the vocabulary and the embedding layer,
## then create its weights and biases (the constructor does not do this itself)
lstm = LSTM(
    units=100,
    seq_length=v.seq_length,
    vocab_size=v.size,
    features=e.hidden_dim,
)
lstm._init_params()

### EXAMPLES

In [16]:
## take the first embedded sequence and stack it on top of the hidden state
## (np.vstack replaces np.row_stack, a deprecated alias since NumPy 2.0)
batch_input_1 = lstm_inputs[0]
z = np.vstack((batch_input_1, lstm.h))

z.shape

(125, 20)