In [1]:
import numpy as np

In [2]:
# Toy dimensions used only to sanity-check the shapes of the output projection.
lstm_hidden, vocab_size, batch_size = 50, 1000, 2829

# Stand-in for the LSTM output, plus a randomly drawn projection matrix and bias.
out = np.random.standard_normal((batch_size, lstm_hidden))
W_v = np.random.standard_normal((lstm_hidden, vocab_size))
b_v = np.random.standard_normal((1, vocab_size))

In [3]:
def softmax(inputs):
    """
    Row-wise softmax of a 2-D array of scores.

    Args:
        inputs: array-like of shape (n_rows, n_classes) holding raw scores.

    Returns:
        Array of the same shape whose rows are non-negative and sum to 1.
    """
    scores = np.asarray(inputs)
    # Subtracting the row maximum leaves the result unchanged mathematically,
    # but keeps np.exp from overflowing to inf (and the ratio from becoming NaN).
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

In [4]:
# Display the shapes of the stand-in LSTM output, projection weights and bias.
out.shape, W_v.shape, b_v.shape

((2829, 50), (50, 1000), (1, 1000))

In [6]:
# Project the stand-in LSTM output onto the vocabulary, then normalise each row.
v = out @ W_v + b_v
y = softmax(v)

# One row of probabilities per row of `out`.
y.shape

(2829, 1000)

In [2]:
class Dense:
    """
    Fully connected layer followed by a row-wise softmax.

    Attributes:
        neurons: number of output units.
        weights: (n_features, neurons) array, created on the first forward pass.
        bias: (1, neurons) array of zeros, created together with the weights.
    """

    def __init__(self, neurons):
        self.neurons = neurons
        # Parameters are created lazily because the input width is only known
        # once forward() receives data.
        self.weights = None
        self.bias = None

    def softmax(self, inputs):
        """
        Row-wise softmax of a 2-D array of scores.

        The row maximum is subtracted before exponentiating so that large
        scores do not overflow to inf and turn the output into NaN.
        """
        shifted = inputs - np.max(inputs, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def forward(self, inputs):
        """
        Applies the affine map and the softmax.

        Args:
            inputs: array of shape (n_rows, n_features).

        Returns:
            Array of shape (n_rows, neurons); every row sums to 1.
        """
        inputs = np.asarray(inputs)
        n_features = inputs.shape[1]

        # Draw the parameters once (or again if the input width changes) rather
        # than on every call, so repeated passes use the same layer.
        if self.weights is None or self.weights.shape[0] != n_features:
            self.weights = np.random.randn(n_features, self.neurons)
            self.bias = np.zeros((1, self.neurons))

        scores = np.dot(inputs, self.weights) + self.bias

        return self.softmax(scores)
    
class EmbeddingLayer:
    """
    Lookup table that maps integer token ids to dense vectors.

    Attributes:
        vocab_size: number of rows in the table (valid ids are 0 .. vocab_size - 1).
        hidden_dim: length of each word vector.
        weights: (vocab_size, hidden_dim) array drawn from a standard normal.
    """

    def __init__(self, vocab_size, hidden_dim):
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.weights = np.random.randn(vocab_size, hidden_dim) ## (vocab_size, hidden_dim)

    def predict(self, array):
        """
        PARAMS:
          array:
           -- integer matrix of batch_size x seq_length

        RETURNS:
          array:
           -- float matrix of batch_size x seq_length x hidden_dim
           -- the word vectors for each word in the tokenized input

        RAISES:
          AssertionError:
           -- if any id lies outside 0 .. vocab_size - 1
        """
        indices = np.asarray(array)

        # Nothing to look up: return an empty result with the trailing vector axis.
        if indices.size == 0:
            return np.empty(indices.shape + (self.hidden_dim,))

        # The table has rows 0 .. vocab_size - 1, so vocab_size itself is already
        # out of range; negative ids would silently wrap around to the table's end.
        assert indices.min() >= 0 and indices.max() < self.vocab_size, (
            "token ids must lie in [0, vocab_size)"
        )

        # Fancy indexing gathers every row in one vectorised step.
        return self.weights[indices]

In [3]:
class LSTM:
    """
    NumPy LSTM layer (forward pass only).

    Parameters are created on the first call to forward() and reused afterwards.
    Every forward() call refills `cache` (gate activations) and `state`
    (cell and hidden state), keyed by the index along axis 0 of the input.
    """

    def __init__(self, units, features):
        """
        Initializes the LSTM layer

        Args:
            Units: int (num of LSTM units in layer)
            features: int (dimensionality of token embeddings)
        """
        self.hidden_dim = units
        self.dimensionality = features
        self.cache = {} ## all intermediate variables (i,f,o,cbar)
        self.state = {} ## hidden & cell state
        self.kernel = None ## set by _init_params on the first forward pass

    def _init_orthogonal(self, param):
        """
        Initializes weight parameters orthogonally.

        Only the shape of `param` is used; its values are discarded and a new
        matrix of the same shape is returned.

        Refer to this paper for an explanation of this initialization:
        https://arxiv.org/abs/1312.6120

        Raises:
            ValueError: if `param` has fewer than 2 dimensions.
        """
        if param.ndim < 2:
            raise ValueError("Only parameters with 2 or more dimensions are supported.")

        rows, cols = param.shape

        new_param = np.random.randn(rows, cols)

        # QR is run on the tall orientation; wide matrices are transposed first
        # and transposed back afterwards.
        if rows < cols:
            new_param = new_param.T

        # Compute QR factorization
        q, r = np.linalg.qr(new_param)

        # Make Q uniform according to https://arxiv.org/pdf/math-ph/0609050.pdf
        d = np.diag(r, 0)
        ph = np.sign(d)
        q *= ph

        if rows < cols:
            q = q.T

        new_param = q

        return new_param

    def sigmoid(self, x, derivative=False):
        """
        Computes the element-wise sigmoid activation function for an array x.

        Args:
         `x`: the array where the function is applied (pre-activation values)
         `derivative`: if set to True will return the derivative instead of the forward pass
        """
        # 1 / (1 + exp(-x)) written as exp(-log(1 + exp(-x))): logaddexp never
        # forms exp(-x) directly, so large negative x cannot overflow.
        f = np.exp(-np.logaddexp(0.0, -np.asarray(x, dtype=float)))

        if derivative: # Return the derivative of the function evaluated at x
            return f * (1 - f)
        else: # Return the forward pass of the function at x
            return f

    def tanh(self, x, derivative=False):
        """
        Computes the element-wise tanh activation function for an array x.

        Args:
         `x`: the array where the function is applied (pre-activation values)
         `derivative`: if set to True will return the derivative instead of the forward pass
        """
        # np.tanh saturates at +/-1 for large |x|; the explicit exp-based ratio
        # evaluates to inf/inf (NaN) there.
        f = np.tanh(x)

        if derivative: # Return the derivative of the function evaluated at x
            return 1-f**2
        else: # Return the forward pass of the function at x
            return f

    def softmax(self, x, axis=None):
        """
        Computes the softmax for an array x.

        Args:
         `x`: the array where the function is applied
         `axis`: axis to normalise over; the default (None) normalises over all
                 elements of `x` together
        """
        x = np.asarray(x)
        # Shift by the maximum so np.exp cannot overflow; the ratio is unchanged.
        shifted = x - np.max(x, axis=axis, keepdims=True)
        exps = np.exp(shifted)

        # Return the forward pass of the function at x
        return exps / np.sum(exps, axis=axis, keepdims=True)

    def _init_params(self):
        """
        Initializes the weight and biases of the layer

            -- Initialize weights according to https://arxiv.org/abs/1312.6120 (_init_orthogonal)
            -- Initialize weights according to https://github.com/keras-team/keras/blob/master/keras/layers/rnn/lstm.py
            -- Assumptions: Batch_First=True (PyTorch) or time_major=False (keras)

        The fused kernels hold the four gates side by side in the order
        i, f, c, o; the per-gate attributes below are slices of them.
        """
        self.kernel = self._init_orthogonal(np.random.randn(self.dimensionality, self.hidden_dim * 4))
        self.recurrent_kernel = self._init_orthogonal(np.random.randn(self.hidden_dim, self.hidden_dim * 4))
        self.bias = np.random.randn(self.hidden_dim * 4, )

        self.kernel_i = self.kernel[:, :self.hidden_dim]
        self.kernel_f = self.kernel[:, self.hidden_dim: self.hidden_dim * 2]
        self.kernel_c = self.kernel[:, self.hidden_dim * 2: self.hidden_dim * 3]
        self.kernel_o = self.kernel[:, self.hidden_dim * 3:]

        self.recurrent_kernel_i = self.recurrent_kernel[:, :self.hidden_dim]
        self.recurrent_kernel_f = self.recurrent_kernel[:, self.hidden_dim: self.hidden_dim * 2]
        self.recurrent_kernel_c = self.recurrent_kernel[:, self.hidden_dim * 2: self.hidden_dim * 3]
        self.recurrent_kernel_o = self.recurrent_kernel[:, self.hidden_dim * 3:]

        self.bias_i = self.bias[:self.hidden_dim]
        self.bias_f = self.bias[self.hidden_dim: self.hidden_dim * 2]
        self.bias_c = self.bias[self.hidden_dim * 2: self.hidden_dim * 3]
        self.bias_o = self.bias[self.hidden_dim * 3:]

    def forward(self, inputs, return_sequences=False):
        """
        Performs one full forward pass through the layer

        Args:
            inputs: 3D array of shape (batch_size, seq_length, dimensionality)
            return_sequences: return the full sequence of hidden states or just the last one (per batch)

        Returns:
            Array of shape (batch_size, seq_length, hidden_dim) when
            return_sequences is true, otherwise (batch_size, hidden_dim).
        """
        # Create the parameters once; redrawing them on every call would make
        # each pass use a different, unrelated layer.
        if getattr(self, "kernel", None) is None:
            self._init_params()

        # Drop entries left over from a previous call with more rows.
        self.cache = {}
        self.state = {}

        h_tm1 = np.zeros((self.hidden_dim,))
        c_tm1 = np.zeros((self.hidden_dim,))

        h_state_out = []

        # NOTE(review): the loop walks axis 0 of `inputs`, so h_tm1 / c_tm1 are
        # carried from one axis-0 entry to the next. With the documented
        # (batch_size, seq_length, dimensionality) layout that is a recurrence
        # across samples rather than across time steps. Left as is because
        # cache/state are keyed by this index -- TODO confirm the intended layout.
        for count, sample in enumerate(inputs):
            x_i = np.dot(sample, self.kernel_i) + self.bias_i
            x_f = np.dot(sample, self.kernel_f) + self.bias_f
            x_c = np.dot(sample, self.kernel_c) + self.bias_c
            x_o = np.dot(sample, self.kernel_o) + self.bias_o

            f = self.sigmoid(x_f + np.dot(h_tm1, self.recurrent_kernel_f))
            i = self.sigmoid(x_i + np.dot(h_tm1, self.recurrent_kernel_i))
            o = self.sigmoid(x_o + np.dot(h_tm1, self.recurrent_kernel_o))
            # The candidate cell value is squashed with tanh (range -1..1); the
            # sigmoid is reserved for the three gates.
            cbar = self.tanh(x_c + np.dot(h_tm1, self.recurrent_kernel_c))
            c = (f * c_tm1) + (i * cbar)
            ht = o * self.tanh(c)

            if return_sequences:
                h_state_out.append(ht)
            else:
                # Keep only the last row of this entry's hidden states.
                h_state_out.append(ht[-1])

            self.state[count] = {'c':c, 'h':ht}
            self.cache[count] = {'i':i, 'f':f, 'o':o, 'cbar':cbar}

            h_tm1 = ht
            c_tm1 = c

        return np.array(h_state_out)

In [4]:
import re


class Vocabulary:
    """
    Builds a word <-> index vocabulary from raw text and turns it into
    fixed-length integer sequences.

    Attributes:
        word2index / index2word: mappings rebuilt by _sort_by_frequency so that
            more frequent words get smaller indices (starting at 0).
        word2count: number of occurrences of every word.
        tokens: every distinct word once, in first-seen order.
        sentences: cleaned, lower-cased input sentences.
        seq_length: window length set by tokenize(); None before that.
    """

    def __init__(self) -> None:
        self.word2index = {}
        self.word2count = {}
        self.index2word = {}
        self.sentences = []
        self.tokens = []
        self.num_words = 0
        self.num_sentences = 0
        self.seq_length = None

    def _add_word(self, word):
        """Registers a new word, or bumps the count of a known one."""
        if word not in self.word2index:
            self.tokens.append(word)
            self.word2count[word] = 1
            self.word2index[word] = self.num_words
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    def _add_sentence(self, sentence):
        """Lower-cases and cleans one sentence, then registers its words."""
        sentence = sentence.lower()
        new = self._clean_sentence(sentence=sentence)
        new = new.replace('\n', '')
        self.sentences.append(new)

        for word in new.split(' '):
            # Consecutive spaces produce empty strings; they are not words.
            if word != '':
                self._add_word(word)

        self.num_sentences += 1

    def pad_sequences(self, sequence, length=None):
        """
        Default: Pad an input sequence to be the same as self.seq_length

        Alternative: Pad an input sequence to the 'length' param

        Keras: Pads input sequences with length of longest sequence

        Sequences that already reach the target length are left untouched
        (they are not truncated).

        Params:
        sequence --> np.array[numpy.array], integer matrix of tokenized words
        length --> int or None, target length; None falls back to self.seq_length

        Returns:
        padded_sequence --> np.array[numpy.array], integer matrix of tokenized words with padding

        Raises:
        ValueError --> no 'length' was given and tokenize() has not set self.seq_length
        """
        # `is None` rather than truthiness, so an explicit length is never
        # silently replaced by the default.
        target = self.seq_length if length is None else length
        if target is None:
            raise ValueError("pass 'length' or call tokenize() first to set seq_length")

        return_arr = []

        for s in sequence:
            new = list(s)
            # Never a negative repeat count: longer rows simply get no padding.
            missing = max(0, target - len(new))
            new.extend([0]*missing)
            return_arr.append(new)

        return np.vstack(return_arr)

    def _sort_by_frequency(self):
        """
        Reassigns indices so that more frequent words get smaller indices.

        Indices start at 0, so the most frequent word shares index 0 with the
        padding value written by pad_sequences.

        Returns:
        self
        """
        by_count = sorted(self.word2count.items(), key=lambda x:x[1], reverse=True)

        self.word2index = {word: rank for rank, (word, _) in enumerate(by_count)}
        self.index2word = {v:k for k,v in self.word2index.items()}

        return self

    def _compile_vocab(self, corpus):
        """
        Creates vocabulary

        Params:
        Corpus --> List[str]

        Sets self.size to the number of distinct words and re-indexes the
        vocabulary by frequency.
        """
        for s in corpus:
            self._add_sentence(s)

        assert len(self.word2count) == len(self.word2index) == len(self.index2word)
        self.size = len(self.word2count)

        self._sort_by_frequency()

    def tokenize(self, corpus, seq_length):
        """
        Creates sequences of tokens

        Params:
        Corpus --> List[str]
        seq_length --> int, number of tokens per sequence

        Returns:
        Token Sequences --> np.array of word indices with seq_length columns;
        shape (0, seq_length) when no full window is available
        """
        self._compile_vocab(corpus)
        self.seq_length = seq_length

        # NOTE(review): self.tokens holds each distinct word once (see _add_word),
        # so the windows slide over the distinct-word list rather than the running
        # text, and the range stops one window short of the end. Left unchanged
        # because the number of rows feeds later steps -- TODO confirm intent.
        self.token_sequences = [
            [self.word2index[word] for word in self.tokens[start:start + seq_length]]
            for start in range(0, self.size - seq_length)
        ]

        if not self.token_sequences:
            # Keep the result two-dimensional so column slicing still works.
            return np.empty((0, seq_length), dtype=int)

        return np.array(self.token_sequences)

    def _clean_sentence(self, sentence):
        """Removes every character that is neither a word character nor whitespace."""
        new_string = re.sub(r'[^\w\s]', '', sentence)
        return new_string

    def to_word(self, index):
        """Returns the word stored under `index` (KeyError if unknown)."""
        return self.index2word[index]

    def to_index(self, word):
        """Returns the index of `word` (KeyError if unknown)."""
        return self.word2index[word]

In [5]:
# TEST RUN

# step 1 -- data
## read the corpus line by line; the context manager closes the file handle
## NOTE(review): absolute, machine-specific path -- point it at your local copy of the text
with open(r"C:\Users\12482\Desktop\opensource\numpy-rnn\data\alice_wonderland.txt", 'r', encoding='utf-8') as corpus_file:
    f = corpus_file.readlines()

# step 2 -- tokenize
## create vocabulary + tokenize
v = Vocabulary()
token_sequences = v.tokenize(f, 26)

# step 3 -- split into x/y
## create X & Y datasets: the first 25 tokens of each row are the input, the last one is the target
X = token_sequences[:,:-1]
y = token_sequences[:,-1]

# step 4 -- embedding layer -- layer 1
## create embedding layer
e = EmbeddingLayer(vocab_size=v.size, hidden_dim=20) ## hidden_dim is a hyper-param
lstm_inputs = e.predict(X)

# step 5 -- lstm layer -- layer 2
lstm = LSTM(100, lstm_inputs.shape[-1])
lstm_out = lstm.forward(lstm_inputs)

# step 6 -- dense layer (softmax) -- layer 3
dense = Dense(v.size)
final = dense.forward(lstm_out)

final.shape

(2829, 2855)

In [7]:
## the sequence

class Sequence:
    """
    Ordered stack of layers; forward() feeds each layer's output into the next.

    Every layer must expose a `forward(data)` method.
    """

    def __init__(self):
        self.sequence = []

    def add(self, layer):
        """Appends `layer` to the end of the stack."""
        # Lists grow with append(); list.add does not exist.
        self.sequence.append(layer)

    def forward(self, data):
        """
        Runs `data` through every layer in insertion order.

        Returns:
            The output of the last layer, or `data` unchanged when the stack
            is empty.
        """
        out = data

        # Thread the running output through the stack so that each layer
        # receives its direct predecessor's result.
        for layer in self.sequence:
            out = layer.forward(out)

        return out

The final shape is batch_size x vocab_size: there is one output row per sample in the batch, and each row is a probability distribution over every word in the vocabulary.

For example, the most likely word predicted for the first sample is:

In [6]:
# Map the highest-probability index of the first output row back to its word.
v.to_word(np.argmax(final[0]))

'trees'

**Test Backward Pass**

In [71]:
## compute the gradient on predictions
# Derive the sample index and the hidden size from the data and the layer instead
# of hard-coding 2828 and 100, so the cell follows the corpus and layer sizes.
last_idx = len(y) - 1
n_hidden = lstm.hidden_dim

# softmax output minus the one-hot target for the last sample
dscores = np.copy(final[last_idx])
dscores[y[last_idx]] -= 1

# unpack cache and state
f, i, cbar, o = (lstm.cache[last_idx][gate] for gate in ('f', 'i', 'cbar', 'o'))

# NOTE(review): h and c keep only the last row of the stored arrays, while
# f, i, cbar and o are kept whole -- confirm this asymmetry is intended.
h, c = lstm.state[last_idx]['h'][-1], lstm.state[last_idx]['c'][-1]

h_prev, c_prev = np.zeros((n_hidden,)), np.zeros((n_hidden,))

dh_next, dc_next = np.zeros((n_hidden,)), np.zeros((n_hidden,))

In [73]:
# Turn the 1-D vectors into column vectors for the matrix products that follow.
dscores = dscores.reshape(dscores.shape[0], 1)
h = h.reshape(h.shape[0], 1)
# reshape(-1, 1) infers the length instead of hard-coding the vocabulary size.
# NOTE(review): only the last row of dense.weights is used here -- confirm intended.
dense_weights = dense.weights[-1].reshape(-1, 1)

In [76]:
# Hidden to output gradient
dWy = dscores @ h.T                         # outer product of the two column vectors
dby = dscores                               # same object as dscores (no copy)
dh = dense_weights.T @ dscores + dh_next    # dh_next broadcasts over the product

In [79]:
# Gradient for o
do = lstm.tanh(c) * dh
# o is already the gate's sigmoid output, so the local derivative is o * (1 - o);
# calling lstm.sigmoid(o, derivative=True) would squash o a second time first.
do = o * (1 - o) * do

In [80]:
# Gradient for c
# Contribution through h = o * tanh(c), plus the gradient handed back from the next step.
dc = dc_next + o * dh * lstm.tanh(c, derivative=True)

In [81]:
# Gradient for f
df = c_prev * dc
# f is already the gate's sigmoid output, so the local derivative is f * (1 - f);
# calling lstm.sigmoid(f, derivative=True) would squash f a second time first.
df = f * (1 - f) * df

In [82]:
# Gradient for i
# In c = f * c_prev + i * cbar the input gate multiplies the candidate cbar,
# so cbar (not the cell state c) is the factor in its gradient.
di = cbar * dc
# i is already the gate's sigmoid output, so the local derivative is i * (1 - i).
di = i * (1 - i) * di

In [83]:
# Gradient for c
# NOTE(review): this rebinds dc, so re-running the cell scales it again and the
# cells below read the rescaled value -- confirm that is intended.
dc = lstm.tanh(c, derivative=True) * (i * dc)

In [86]:
# Embedded input of the last entry along axis 0 of lstm_inputs.
x = lstm_inputs[-1]

In [93]:
# Short aliases for the per-gate input kernels of the LSTM layer.
Wf, Wi, Wo, Wc = lstm.kernel_f, lstm.kernel_i, lstm.kernel_o, lstm.kernel_c

In [92]:
# Gate gradients, just a normal fully connected layer gradient
dWf = x.T @ df      # gradient w.r.t. the forget-gate kernel
dbf = df            # same object as df (no copy)
dXf = df @ Wf.T     # gradient flowing back to the input x

In [94]:
# Same fully connected gradient pattern for the input gate ...
dWi = np.matmul(x.T, di)
dbi = di
dXi = np.matmul(di, Wi.T)

# ... the output gate ...
dWo = np.matmul(x.T, do)
dbo = do
dXo = np.matmul(do, Wo.T)

# ... and the candidate, using the current value of dc.
dWc = np.matmul(x.T, dc)
dbc = dc
dXc = np.matmul(dc, Wc.T)

In [103]:
# As X was used in multiple gates, the gradient must be accumulated here
dX = dXo + dXc + dXi + dXf

# Split the concatenated X, so that we get our gradient of h_old
# The slice width follows the layer's hidden size instead of a hard-coded 100.
# NOTE(review): x above is lstm_inputs[-1] on its own, not x concatenated with h,
# so dX only has as many columns as the embedding -- confirm this split is meaningful.
dh_next_ = dX[:, :lstm.hidden_dim]

# Gradient for c_old in c = hf * c_old + hi * hc
dc_next_ = f * dc

**--------------------------------------------- END BACKPROP HERE ---------------------------------------------**