# Build LSTM From Scratch

1. LSTM Unit
2. LSTM Layer
3. LSTM Network

In [1]:
# import relevant libraries
import numpy as np
import torch
import torch.nn as nn

### Single LSTM Cell

In [2]:
# define sigmoid function
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# define tanh function
def tanh(x):
    return np.tanh(x)

In [3]:
# define a single LSTM Unit
def LSTMUnit(x, h, c, W_hh, W_ih, b):
    """
    Input
    x: np array of shape (batch_size, input_size)
    h: np array of shape (batch_size, hidden_size)
    c: np array of shape (batch_size, hidden_size)
    W_hh: np array of shape (4 * hidden_size, hidden_size)
    W_ih: np array of shape (4 * hidden_size, input_size)
    b: np array of shape (4 * hidden_size)
    """
    # generate the inputs to forget gate, input gate, candidate gate, and output gate
    i, f, g, o = np.split(W_hh@h + W_ih@x + b, 4)
    i, f, g, o = sigmoid(i), sigmoid(f), tanh(g), sigmoid(o)
    c_out = f*c + i*g
    h_out = o * tanh(c_out)
    return h_out, c_out

In [21]:
# define the model
batch_size = 1
input_size = 20
hidden_size = 100
model = nn.LSTMCell(20, 100)

In [22]:
# verify that LSTMUnit is working correctly
x = np.random.randn(1, 20).astype(np.float32)
h0 = np.random.randn(1, 100).astype(np.float32)
c0 = np.random.randn(1, 100).astype(np.float32)

h_, c_ = model(torch.tensor(x)), (torch.tensor(h0), torch.tensor(c0))

h, c = LSTMUnit(x[0], h0[0], c0[0],
                model.weight_hh.detach().numpy(),
                model.weight_ih.detach().numpy(),
                (model.bias_hh + model.bias_ih).detach().numpy())

In [23]:
np.linalg.norm(h_[0].detach().numpy() - h)

2.4048018

### LSTM Layer

In [24]:
model = nn.LSTM(input_size, hidden_size)

In [28]:
# define the model
batch_size = 50 # 50 sequeces
input_size = 20
hidden_size = 100
X = np.random.randn(batch_size, input_size).astype(np.float32)
h0 = np.random.randn(1, hidden_size).astype(np.float32)
c0 = np.random.randn(1, hidden_size).astype(np.float32)

In [29]:
def LSTMLayer(X, h, c, W_hh, W_ih, b):
    H = np.zeros((X.shape[0], h.shape[0]))
    for t in range(X.shape[0]):
        h, c = LSTMUnit(X[t], h, c, W_hh, W_ih, b)
        H[t] = h
    return H, c

In [31]:
H, cn = LSTMLayer(X, h0[0], c0[0],
                model.weight_hh_l0.detach().numpy(),
                model.weight_ih_l0.detach().numpy(),
                (model.bias_hh_l0 + model.bias_ih_l0).detach().numpy())

In [33]:
H_, (hn, cn) = model(torch.tensor(X)[:, None, :],
                     (torch.tensor(h0)[:, None, :],
                      torch.tensor(c0)[:, None, :]))

In [34]:
np.linalg.norm(H - H_[:, 0, :].detach().numpy())

1.4628261130620635e-06

#### Batching

In [35]:
# define a single LSTM Unit
def LSTMUnit(x, h, c, W_hh, W_ih, b):
    """
    Input
    x: np array of shape (batch_size, input_size)
    h: np array of shape (batch_size, hidden_size)
    c: np array of shape (batch_size, hidden_size)
    W_hh: np array of shape (4 * hidden_size, hidden_size)
    W_ih: np array of shape (4 * hidden_size, input_size)
    b: np array of shape (4 * hidden_size)
    """
    # generate the inputs to forget gate, input gate, candidate gate, and output gate
    i, f, g, o = np.split(h @ W_hh + x @ W_ih + b, 4, axis=1) # now first dim is batch dim, second dim is hidden dim
    i, f, g, o = sigmoid(i), sigmoid(f), tanh(g), sigmoid(o)
    c_out = f*c + i*g
    h_out = o * tanh(c_out)
    return h_out, c_out

def LSTMLayer(X, h, c, W_hh, W_ih, b):
    H = np.zeros((X.shape[0], X.shape[1], h.shape[1]))
    for t in range(X.shape[0]):
        h, c = LSTMUnit(X[t], h, c, W_hh, W_ih, b)
        H[t] = h
    return H, c

In [45]:
# define the model
batch_size = 128
sequence_length = 50
input_size = 20
hidden_size = 100
X = np.random.randn(sequence_length, batch_size, input_size).astype(np.float32)
h0 = np.random.randn(1, batch_size, hidden_size).astype(np.float32)
c0 = np.random.randn(1, batch_size, hidden_size).astype(np.float32)

In [46]:
H_, (hn, cn) = model(torch.tensor(X),
                     (torch.tensor(h0),
                      torch.tensor(c0)))

In [47]:
H_.shape

torch.Size([50, 128, 100])

In [48]:
H, cn = LSTMLayer(X, h0[0], c0[0],
                model.weight_hh_l0.detach().numpy().T,
                model.weight_ih_l0.detach().numpy().T,
                (model.bias_hh_l0 + model.bias_ih_l0).detach().numpy())

In [49]:
np.linalg.norm(H-H_.detach().numpy())

9.93722142209323e-06