### IMPORTS

In [81]:
import numpy as np, re

### DATA

In [82]:
# Read the corpus as a list of lines. The context manager closes the file
# handle as soon as reading finishes instead of leaving it open for the
# lifetime of the kernel.
with open(r"C:\Users\12482\Desktop\alice_wonderland.txt", 'r', encoding='utf-8') as corpus_file:
    f = corpus_file.readlines()

### TOKENIZER & EMBEDDING CLASSES

In [83]:
from tokenizer import Vocabulary
from embedding import EmbeddingLayer

### LSTM NETWORK

In [84]:
## vocabulary + tokenized corpus
v = Vocabulary()
token_sequences = v.tokenize(f, 26)

## embedding layer (hidden_dim is a hyper-parameter)
e = EmbeddingLayer(hidden_dim=20, vocab_size=v.size)

## inputs are every column but the last; the target is the last column of each row
X, y = token_sequences[:, :-1], token_sequences[:, -1]

lstm_inputs = e.predict(X)
lstm_inputs.shape  ## batch_size x seq_length x dimensionality

(2829, 25, 20)

In [66]:
class LSTM:
    """
    A single LSTM layer: hyper-parameters, zero-initialised hidden/cell state,
    numerically stable activation helpers and orthogonal weight initialisation.

    The weights are not created by the constructor; call `_init_params()` once
    after instantiating the layer.
    """

    def __init__(self, units, seq_length, vocab_size, features):
        """
        Initializes the LSTM layer

        Args:
            units: int (num of LSTM units in layer)
            seq_length: int (num of tokens per sequence; `_init_params` sizes
                the gate weights with `seq_length - 1` input rows)
            vocab_size: int (num of unique tokens in vocab)
            features: int (dimensionality of token embeddings)
        """
        self.hidden_dim = units
        self.dimensionality = features
        self.seq_length = seq_length
        self.vocab_size = vocab_size

        # Initialize hidden state and cell state as zeros
        self.h = np.zeros((units, features))
        self.c = np.zeros((units, features))

    def _init_orthogonal(self, param):
        """
        Initializes weight parameters orthogonally.

        Only the shape of `param` is used; a fresh random matrix of that shape
        is drawn and orthogonalised.

        Refer to this paper for an explanation of this initialization:
        https://arxiv.org/abs/1312.6120

        Args:
         `param`: 2-D array whose shape the new parameter should have

        Returns:
         array of shape `param.shape` with orthonormal rows (when
         rows < cols) or orthonormal columns (otherwise)

        Raises:
         ValueError: if `param` is not 2-dimensional
        """
        # The shape is unpacked into exactly (rows, cols), so anything other
        # than a matrix cannot be handled.
        if param.ndim != 2:
            raise ValueError("Only 2-dimensional parameters are supported.")

        rows, cols = param.shape

        new_param = np.random.randn(rows, cols)

        # QR needs a tall (or square) matrix to yield `min(rows, cols)`
        # orthonormal columns, so wide matrices are factorised transposed.
        if rows < cols:
            new_param = new_param.T

        # Compute QR factorization
        q, r = np.linalg.qr(new_param)

        # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
        # Use an explicit +/-1 sign so that a zero on the diagonal of R can
        # never wipe out a column of Q (np.sign(0) == 0).
        d = np.diag(r, 0)
        ph = np.where(d < 0, -1.0, 1.0)
        q *= ph

        if rows < cols:
            q = q.T

        return q

    def sigmoid(self, x, derivative=False):
        """
        Computes the element-wise sigmoid activation function for an array x.

        Args:
         `x`: the array where the function is applied
         `derivative`: if set to True will return the derivative instead of the forward pass
        """
        x = np.asarray(x)

        # exp() is only ever taken of a non-positive number, so it cannot
        # overflow for inputs of large magnitude.
        z = np.exp(-np.abs(x))
        f = np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

        if derivative:  # Return the derivative of the function evaluated at x
            return f * (1 - f)
        return f  # Return the forward pass of the function at x

    def tanh(self, x, derivative=False):
        """
        Computes the element-wise tanh activation function for an array x.

        Args:
         `x`: the array where the function is applied
         `derivative`: if set to True will return the derivative instead of the forward pass
        """
        # np.tanh saturates cleanly at +/-1; the explicit exp-based quotient
        # evaluates to inf/inf = nan once |x| is large.
        f = np.tanh(x)

        if derivative:  # Return the derivative of the function evaluated at x
            return 1 - f**2
        return f  # Return the forward pass of the function at x

    def softmax(self, x, axis=None):
        """
        Computes the softmax for an array x.

        Args:
         `x`: the array where the function is applied
         `axis`: axis along which the probabilities are normalised; the
                 default (None) normalises over all elements of `x`

        Returns:
         array with the same shape as `x` whose entries sum to 1 along `axis`
        """
        x = np.asarray(x)
        if x.size == 0:
            # Nothing to normalise; np.max would raise on an empty array.
            return np.exp(x)

        # Softmax is invariant to adding a constant, so shifting by the
        # maximum keeps every exponent <= 0 and prevents overflow.
        shifted = x - np.max(x, axis=axis, keepdims=True)
        exps = np.exp(shifted)

        # Return the forward pass of the function at x
        return exps / np.sum(exps, axis=axis, keepdims=True)

    def _init_params(self):
        """
        Initializes the weight and biases of the layer

        Initialize weights according to https://arxiv.org/abs/1312.6120 (_init_orthogonal)

        Every gate weight has shape (hidden_dim, hidden_dim + seq_length - 1)
        and every gate bias has shape (hidden_dim, 1); the output projection
        is (vocab_size, hidden_dim) with a (vocab_size, 1) bias.
        """
        gate_cols = self.hidden_dim + self.seq_length - 1

        # Weight matrix (forget gate)
        self.W_f = self._init_orthogonal(np.random.randn(self.hidden_dim, gate_cols))

        # Bias for forget gate
        self.b_f = np.zeros((self.hidden_dim, 1))

        # Weight matrix (input gate)
        self.W_i = self._init_orthogonal(np.random.randn(self.hidden_dim, gate_cols))

        # Bias for input gate
        self.b_i = np.zeros((self.hidden_dim, 1))

        # Weight matrix (candidate)
        self.W_g = self._init_orthogonal(np.random.randn(self.hidden_dim, gate_cols))

        # Bias for candidate
        self.b_g = np.zeros((self.hidden_dim, 1))

        # Weight matrix and bias of the output gate
        self.W_o = self._init_orthogonal(np.random.randn(self.hidden_dim, gate_cols))
        self.b_o = np.zeros((self.hidden_dim, 1))

        # Weight matrix and bias relating the hidden-state to the output
        self.W_v = self._init_orthogonal(np.random.randn(self.vocab_size, self.hidden_dim))
        self.b_v = np.zeros((self.vocab_size, 1))

In [67]:
## build a 100-unit layer sized from the vocabulary and the embedding, then create its weights
lstm = LSTM(
    units=100,
    seq_length=v.seq_length,
    vocab_size=v.size,
    features=e.hidden_dim,
)
lstm._init_params()

### FORWARD PASS MINI-BATCH EXAMPLE

In [77]:
# Stack the first sequence's embeddings on top of the hidden state to form the
# matrix fed to every gate. np.vstack is used because np.row_stack is only a
# deprecated alias for it in recent NumPy releases.
batch_input_1 = lstm_inputs[0]
z = np.vstack((batch_input_1, lstm.h))

z.shape

(125, 20)

In [76]:
# Gate weights are created as (hidden_dim, hidden_dim + seq_length - 1), so the
# column count must equal the number of rows of z for np.dot(W_f, z) to work.
lstm.W_f.shape

(100, 125)

In [70]:
# The gate activations get descriptive names so that the forget gate does not
# overwrite `f`, which holds the corpus lines passed to the tokenizer.

# Calculate forget gate
forget_gate = lstm.sigmoid(np.dot(lstm.W_f, z) + lstm.b_f)

# Calculate input gate
input_gate = lstm.sigmoid(np.dot(lstm.W_i, z) + lstm.b_i)

# Calculate candidate
candidate = lstm.tanh(np.dot(lstm.W_g, z) + lstm.b_g)

# Calculate new memory state: keep part of the old cell state, add part of the candidate
new_c = forget_gate * lstm.c + input_gate * candidate

# Calculate output gate
output_gate = lstm.sigmoid(np.dot(lstm.W_o, z) + lstm.b_o)

# Calculate new hidden state
new_h = output_gate * lstm.tanh(new_c)

In [71]:
# Calculate logits
# Stored as `logits` rather than `v`, because `v` is the Vocabulary instance
# that later cells still call (`v.to_index`, `v.to_word`).
logits = np.dot(lstm.W_v, new_h) + lstm.b_v

# Calculate softmax
output = lstm.softmax(logits)

In [96]:
# Sanity check: the first embedded token of the first sequence is compared
# element-wise with the embedding row stored for the word 'alice'.
lstm_inputs[0][0] == e.weights[v.to_index('alice')]

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True])

In [98]:
# NOTE(review): `output` is the softmax of W_v . new_h + b_v, so it appears to
# have shape (vocab_size, features) rather than one row per timestep; if so,
# output[-1] selects the row of the last vocabulary index, not the final
# timestep. Confirm this is the intended prediction vector.
last_hidden = output[-1] # this is our prediction
## output is return_sequences=True (all hidden states)

In [114]:
# Map the first target index and the arg-max of the prediction vector back to
# words through the vocabulary so they can be compared by eye.
target_word = v.to_word(y[0])
predicted_word = v.to_word(np.argmax(last_hidden))

print('TARGET WORD: ', target_word)
print('PREDICTED WORD: ', predicted_word)

TARGET WORD:  into
PREDICTED WORD:  to
