In [1]:
en_data = [
    "I am learning NLP",
    "This is a sequence to sequence model",
    "How are you"
]

fr_data = [
    "J'apprends le NLP",
    "Ceci est un modèle de séquence à séquence",
    "Comment ça va"
]

special_tokens = ['<PAD>', '<SOS>', '<EOS>', '<UNK>']

def build_vocab(data):
    vocab = dict()
    # Initialize vocabularies with special tokens
    for token in special_tokens:
        vocab[token] = len(vocab)

    for sentence in data:
        for word in sentence.split():
            if word.lower() not in vocab: vocab[word.lower()] = len(vocab)
    
    return vocab

def tokenize(vocab, data):
    tok_data = []

    for sentence in data:
        sentence_tok = []
        sentence_tok.append(vocab['<SOS>'])
        for word in sentence.split():
            if word.lower() in vocab:
                sentence_tok.append(vocab[word.lower()])
            else:
                sentence_tok.append(vocab['<UNK>'])
        sentence_tok.append(vocab['<EOS>'])
        tok_data.append(sentence_tok)
    
    return tok_data

def pad_sentences(data, pad_token):
    max_len = max(len(sentence) for sentence in data)
    padded_data = []
    for sentence in data:
        while len(sentence) < max_len:
            sentence.append(pad_token)
        padded_data.append(sentence)
    return padded_data

en_vocab, fr_vocab = build_vocab(en_data), build_vocab(fr_data)
en_data_tok, fr_data_tok = tokenize(en_vocab, en_data), tokenize(fr_vocab, fr_data)

en_data = pad_sentences(en_data_tok, en_vocab['<PAD>'])
fr_data = pad_sentences(fr_data_tok, fr_vocab['<PAD>'])



In [2]:
from datasets import load_dataset

dataset = load_dataset("opus_books", 'en-fi')

  from .autonotebook import tqdm as notebook_tqdm
Using the latest cached version of the dataset since opus_books couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'en-fi' at /Users/jonirajalaa/.cache/huggingface/datasets/opus_books/en-fi/0.0.0/1f9f6191d0e91a3c539c2595e2fe48fc1420de9b (last modified on Mon May 27 11:01:16 2024).


In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 3645
    })
})

In [4]:
import numpy as np
def dataset_to_numpy(dataset):
    finnish_sentences = []
    english_sentences = []
    
    for example in dataset:
        finnish_sentences.append(example['translation']['fi'])
        english_sentences.append(example['translation']['en'])
    
    return finnish_sentences, english_sentences
finnish_data, english_data = dataset_to_numpy(dataset['train'])


In [5]:
finnish_data

['Source: Project Gutenberg',
 'BASKERVILLEN KOIRA',
 'A. Conan Doyle ENSIMMÄINEN LUKU.',
 'Herra Sherlock Holmes.',
 'Herra Sherlock Holmes, joka tavallisesti nousi hyvin myöhään ylös aamusin, paitsi niissä kylläkin useissa tapauksissa, jolloin hän oli valvonut koko yön, istui aamiaisella.',
 'Minä seisoin matolla tulisijan edessä pitäen kädessäni keppiä, jonka eräs edellisenä iltana luonamme käynyt herra oli unohtanut.',
 'Se oli jokseenkin soma ja tukeva, se oli varustettu sipulinmuotoisella kädensijalla ja näytti oikealta "tuomarin sauvalta." \'M.R.C.S.',
 '[M.R.C.S. Member of the Royal College of Surgeons = kuninkaallisen kirurgi-kollegion jäsen.]',
 "James Mortimerille ystäviltänsä C. C. H:ssa' oli kaiverrettu tuuman-levyiselle, kädensijan alapuolella olevalle hopealevylle, sekä vielä vuosiluku 1884.",
 'Juuri sellaista keppiä käyttävät tavallisesti vanhat perhelääkärit -- se näytti arvokkaalta, vakavalta ja luottamusta herättävältä.',
 '"Kas niin, Watson, mitä johtopäätöksiä tuo

In [6]:
en_vocab, fi_vocab = build_vocab(english_data), build_vocab(finnish_data)
en_data_tok, fi_data_tok = tokenize(en_vocab, english_data), tokenize(fi_vocab, finnish_data)

en_data = pad_sentences(en_data_tok, en_vocab['<PAD>'])
fi_data = pad_sentences(fi_data_tok, fi_vocab['<PAD>'])

In [65]:
from jonigrad.layers import Layer, Parameter, Sigmoid, Tanh
import numpy as np

class LSTM(Layer):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super().__init__()
        
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        # Initialize parameters for the first LSTM layer
        self._params["W_ih_l0"] = Parameter(np.random.randn(4 * hidden_size, input_size).astype(np.float32), True)
        self._params["B_ih_l0"] = Parameter(np.zeros((4 * hidden_size), dtype=np.float32), True)
        self._params["W_hh_l0"] = Parameter(np.random.randn(4 * hidden_size, hidden_size).astype(np.float32), True)
        self._params["B_hh_l0"] = Parameter(np.zeros((4 * hidden_size), dtype=np.float32), True)

        self.inp_gate_sigmoid = Sigmoid()
        self.forg_gate_sigmoid = Sigmoid()
        self.cell_gate_tanh = Tanh()
        self.outp_gate_sigmoid = Sigmoid()

    def forward(self, input_tensor, h0, c0):
        batch_size, seq_length, _ = input_tensor.shape
        h = h0.squeeze(0)
        c = c0.squeeze(0)
        outputs = []
        self.cache = []  # To store intermediate values for backward pass

        for t in range(seq_length):
            x_t = input_tensor[:, t, :]

            # Forget gate
            ft = self.forg_gate_sigmoid(
                np.dot(x_t, self._params["W_ih_l0"].data[self.hidden_size:2 * self.hidden_size, :].T) +
                np.dot(h, self._params["W_hh_l0"].data[self.hidden_size:2 * self.hidden_size, :].T) +
                self._params["B_ih_l0"].data[self.hidden_size:2 * self.hidden_size] +
                self._params["B_hh_l0"].data[self.hidden_size:2 * self.hidden_size]
            )
            
            # Input gate
            it = self.inp_gate_sigmoid(
                np.dot(x_t, self._params["W_ih_l0"].data[:self.hidden_size, :].T) +
                np.dot(h, self._params["W_hh_l0"].data[:self.hidden_size, :].T) +
                self._params["B_ih_l0"].data[:self.hidden_size] +
                self._params["B_hh_l0"].data[:self.hidden_size]
            )

            # Candidate cell state
            c_tilde = self.cell_gate_tanh(
                np.dot(x_t, self._params["W_ih_l0"].data[2 * self.hidden_size:3 * self.hidden_size, :].T) +
                np.dot(h, self._params["W_hh_l0"].data[2 * self.hidden_size:3 * self.hidden_size, :].T) +
                self._params["B_ih_l0"].data[2 * self.hidden_size:3 * self.hidden_size] +
                self._params["B_hh_l0"].data[2 * self.hidden_size:3 * self.hidden_size]
            )

            # Update cell state
            c = ft * c + it * c_tilde

            # Output gate
            ot = self.outp_gate_sigmoid(
                np.dot(x_t, self._params["W_ih_l0"].data[3 * self.hidden_size:, :].T) +
                np.dot(h, self._params["W_hh_l0"].data[3 * self.hidden_size:, :].T) +
                self._params["B_ih_l0"].data[3 * self.hidden_size:] +
                self._params["B_hh_l0"].data[3 * self.hidden_size:]
            )

            # Update hidden state
            h = ot * self.cell_gate_tanh(c)
            outputs.append(h)
            self.cache.append((x_t, h, c, ft, it, c_tilde, ot))

        outputs = np.stack(outputs, axis=1)  # Shape should be (batch_size, seq_length, hidden_size)
        return outputs, h[np.newaxis, :, :], c[np.newaxis, :, :]

    # def backward(self, grad_output, grad_hn, grad_cn):
    #     grad_output = grad_output
    #     grad_hn = grad_hn.squeeze(0)
    #     grad_cn = grad_cn.squeeze(0)

    #     dW_ih_l0 = np.zeros_like(self._params["W_ih_l0"].data)
    #     dB_ih_l0 = np.zeros_like(self._params["B_ih_l0"].data)
    #     dW_hh_l0 = np.zeros_like(self._params["W_hh_l0"].data)
    #     dB_hh_l0 = np.zeros_like(self._params["B_hh_l0"].data)

    #     dh_next = grad_hn
    #     dc_next = grad_cn

    #     for t in reversed(range(len(self.cache))):
    #         x_t, h, c, ft, it, c_tilde, ot = self.cache[t]

    #         # Calculate gradients for the output gate
    #         do = grad_output[:, t, :] + dh_next * self.cell_gate_tanh(c)
    #         do = do * ot * (1 - ot)
            
    #         # Calculate gradients for the cell state
    #         dc = (grad_output[:, t, :] * ot * (1 - np.tanh(c) ** 2)) + dc_next + (dh_next * ot * (1 - np.tanh(c) ** 2))

    #         # Calculate gradients for the input gate
    #         di = dc * c_tilde
    #         di = di * it * (1 - it)

    #         # Calculate gradients for the forget gate
    #         df = dc * c
    #         df = df * ft * (1 - ft)

    #         # Calculate gradients for the candidate cell state
    #         dc_tilde = dc * it
    #         dc_tilde = dc_tilde * (1 - c_tilde ** 2)

    #         dW_ih_l0[:self.hidden_size] += np.dot(di.T, x_t)
    #         dW_ih_l0[self.hidden_size:2*self.hidden_size] += np.dot(df.T, x_t)
    #         dW_ih_l0[2*self.hidden_size:3*self.hidden_size] += np.dot(dc_tilde.T, x_t)
    #         dW_ih_l0[3*self.hidden_size:] += np.dot(do.T, x_t)

    #         dB_ih_l0[:self.hidden_size] += np.sum(di, axis=0)
    #         dB_ih_l0[self.hidden_size:2*self.hidden_size] += np.sum(df, axis=0)
    #         dB_ih_l0[2*self.hidden_size:3*self.hidden_size] += np.sum(dc_tilde, axis=0)
    #         dB_ih_l0[3*self.hidden_size:] += np.sum(do, axis=0)

    #         dW_hh_l0[:self.hidden_size] += np.dot(di.T, h)
    #         dW_hh_l0[self.hidden_size:2*self.hidden_size] += np.dot(df.T, h)
    #         dW_hh_l0[2*self.hidden_size:3*self.hidden_size] += np.dot(dc_tilde.T, h)
    #         dW_hh_l0[3*self.hidden_size:] += np.dot(do.T, h)

    #         dB_hh_l0[:self.hidden_size] += np.sum(di, axis=0)
    #         dB_hh_l0[self.hidden_size:2*self.hidden_size] += np.sum(df, axis=0)
    #         dB_hh_l0[2*self.hidden_size:3*self.hidden_size] += np.sum(dc_tilde, axis=0)
    #         dB_hh_l0[3*self.hidden_size:] += np.sum(do, axis=0)

    #         # Propagate gradients to the previous time step
    #         dh_next = np.dot(di, self._params["W_hh_l0"].data[:self.hidden_size].T) + \
    #                   np.dot(df, self._params["W_hh_l0"].data[self.hidden_size:2*self.hidden_size].T) + \
    #                   np.dot(dc_tilde, self._params["W_hh_l0"].data[2*self.hidden_size:3*self.hidden_size].T) + \
    #                   np.dot(do, self._params["W_hh_l0"].data[3*self.hidden_size:].T)

    #         dc_next = dc * ft

    #     self._params["W_ih_l0"].grad = dW_ih_l0
    #     self._params["B_ih_l0"].grad = dB_ih_l0
    #     self._params["W_hh_l0"].grad = dW_hh_l0
    #     self._params["B_hh_l0"].grad = dB_hh_l0

    def backward(self, grad_output, grad_hn, grad_cn):
        grad_output = grad_output
        grad_hn = grad_hn.squeeze(0)
        grad_cn = grad_cn.squeeze(0)

        dW_ih_l0 = np.zeros_like(self._params["W_ih_l0"].data)
        dB_ih_l0 = np.zeros_like(self._params["B_ih_l0"].data)
        dW_hh_l0 = np.zeros_like(self._params["W_hh_l0"].data)
        dB_hh_l0 = np.zeros_like(self._params["B_hh_l0"].data)

        dh_next = grad_hn
        dc_next = grad_cn
        dL_dx = np.zeros((grad_output.shape[0], grad_output.shape[1], self.input_size))

        for t in reversed(range(len(self.cache))):
            x_t, h, c, ft, it, c_tilde, ot = self.cache[t]

            # Calculate gradients for the output gate
            do = (grad_output[:, t, :] + dh_next * self.cell_gate_tanh(c)) * ot * (1 - ot)
            
            # Calculate gradients for the cell state
            dc = (grad_output[:, t, :] * ot * (1 - np.tanh(c) ** 2)) + dc_next + (dh_next * ot * (1 - np.tanh(c) ** 2))

            # Calculate gradients for the input gate
            di = dc * c_tilde * it * (1 - it)

            # Calculate gradients for the forget gate
            df = dc * c * ft * (1 - ft)

            # Calculate gradients for the candidate cell state
            dc_tilde = dc * it * (1 - c_tilde ** 2)

            dW_ih_l0[:self.hidden_size] += np.dot(di.T, x_t)
            dW_ih_l0[self.hidden_size:2*self.hidden_size] += np.dot(df.T, x_t)
            dW_ih_l0[2*self.hidden_size:3*self.hidden_size] += np.dot(dc_tilde.T, x_t)
            dW_ih_l0[3*self.hidden_size:] += np.dot(do.T, x_t)

            dB_ih_l0[:self.hidden_size] += np.sum(di, axis=0)
            dB_ih_l0[self.hidden_size:2*self.hidden_size] += np.sum(df, axis=0)
            dB_ih_l0[2*self.hidden_size:3*self.hidden_size] += np.sum(dc_tilde, axis=0)
            dB_ih_l0[3*self.hidden_size:] += np.sum(do, axis=0)

            dW_hh_l0[:self.hidden_size] += np.dot(di.T, h)
            dW_hh_l0[self.hidden_size:2*self.hidden_size] += np.dot(df.T, h)
            dW_hh_l0[2*self.hidden_size:3*self.hidden_size] += np.dot(dc_tilde.T, h)
            dW_hh_l0[3*self.hidden_size:] += np.dot(do.T, h)

            dB_hh_l0[:self.hidden_size] += np.sum(di, axis=0)
            dB_hh_l0[self.hidden_size:2*self.hidden_size] += np.sum(df, axis=0)
            dB_hh_l0[2*self.hidden_size:3*self.hidden_size] += np.sum(dc_tilde, axis=0)
            dB_hh_l0[3*self.hidden_size:] += np.sum(do, axis=0)

            # Calculate gradient with respect to input x_t
            dx_t = np.dot(di, self._params["W_ih_l0"].data[:self.hidden_size]) + \
                np.dot(df, self._params["W_ih_l0"].data[self.hidden_size:2*self.hidden_size]) + \
                np.dot(dc_tilde, self._params["W_ih_l0"].data[2*self.hidden_size:3*self.hidden_size]) + \
                np.dot(do, self._params["W_ih_l0"].data[3*self.hidden_size:])
            dL_dx[:, t, :] = dx_t

            # Propagate gradients to the previous time step
            dh_next = np.dot(di, self._params["W_hh_l0"].data[:self.hidden_size].T) + \
                    np.dot(df, self._params["W_hh_l0"].data[self.hidden_size:2*self.hidden_size].T) + \
                    np.dot(dc_tilde, self._params["W_hh_l0"].data[2*self.hidden_size:3*self.hidden_size].T) + \
                    np.dot(do, self._params["W_hh_l0"].data[3*self.hidden_size:].T)

            dc_next = dc * ft

        self._params["W_ih_l0"].grad = dW_ih_l0
        self._params["B_ih_l0"].grad = dB_ih_l0
        self._params["W_hh_l0"].grad = dW_hh_l0
        self._params["B_hh_l0"].grad = dB_hh_l0

        return dL_dx, dh_next[np.newaxis, :, :], dc_next[np.newaxis, :, :]


In [66]:
import torch
import torch.nn as nn

def copy_weights_from_pytorch_to_custom(pytorch_lstm, custom_lstm):
    with torch.no_grad():
        custom_lstm._params["W_ih_l0"].data = pytorch_lstm.weight_ih_l0.detach().numpy().astype(np.float32)
        custom_lstm._params["B_ih_l0"].data = pytorch_lstm.bias_ih_l0.detach().numpy().astype(np.float32)
        custom_lstm._params["W_hh_l0"].data = pytorch_lstm.weight_hh_l0.detach().numpy().astype(np.float32)
        custom_lstm._params["B_hh_l0"].data = pytorch_lstm.bias_hh_l0.detach().numpy().astype(np.float32)

# Sample parameters and inputs
input_size = 10
hidden_size = 20
seq_length = 5
batch_size = 3

# Sample input tensor
input_tensor = torch.randn(batch_size, seq_length, input_size)

# Initialize PyTorch LSTM
pytorch_lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
h0 = torch.randn(1, batch_size, hidden_size)  # Initial hidden state
c0 = torch.randn(1, batch_size, hidden_size)  # Initial cell state

custom_lstm = LSTM(input_size, hidden_size)

# Copy weights from PyTorch LSTM to custom LSTM
copy_weights_from_pytorch_to_custom(pytorch_lstm, custom_lstm)

# Forward pass through PyTorch LSTM
pytorch_output, (pytorch_hn, pytorch_cn) = pytorch_lstm(input_tensor, (h0, c0))

# Forward pass through custom LSTM
hand_built_output, hand_built_hn, hand_built_cn = custom_lstm.forward(input_tensor.detach().numpy(), h0.detach().numpy(), c0.detach().numpy())

print(pytorch_output.shape, hand_built_output.shape)
print(pytorch_hn.shape, hand_built_hn.shape)
print(pytorch_cn.shape, hand_built_cn.shape)

# Verify the outputs
print("Output comparison:", torch.allclose(pytorch_output, torch.tensor(hand_built_output).reshape(pytorch_output.shape), atol=1e-6))
print("Hidden state comparison:", torch.allclose(pytorch_hn, torch.tensor(hand_built_hn), atol=1e-6))
print("Cell state comparison:", torch.allclose(pytorch_cn, torch.tensor(hand_built_cn), atol=1e-6))


torch.Size([3, 5, 20]) (3, 5, 20)
torch.Size([1, 3, 20]) (1, 3, 20)
torch.Size([1, 3, 20]) (1, 3, 20)
Output comparison: True
Hidden state comparison: True
Cell state comparison: True


In [82]:
# Sample parameters and inputs
input_size = 10
hidden_size = 20
seq_length = 5
batch_size = 3

x_torch = torch.randn(batch_size, seq_length, input_size)
x = x_torch.detach().numpy()
x_torch.requires_grad_(True)
h0 = torch.randn(1, batch_size, hidden_size)  # Initial hidden state
c0 = torch.randn(1, batch_size, hidden_size)  # Initial cell state

# Initialize PyTorch LSTM and custom LSTM
pytorch_lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
custom_lstm = LSTM(input_size, hidden_size)

# Copy weights from PyTorch LSTM to custom LSTM
def copy_weights_from_pytorch_to_custom(pytorch_lstm, custom_lstm):
    custom_lstm._params["W_ih_l0"].data = pytorch_lstm.weight_ih_l0.detach().numpy().astype(np.float32)
    custom_lstm._params["B_ih_l0"].data = pytorch_lstm.bias_ih_l0.detach().numpy().astype(np.float32)
    custom_lstm._params["W_hh_l0"].data = pytorch_lstm.weight_hh_l0.detach().numpy().astype(np.float32)
    custom_lstm._params["B_hh_l0"].data = pytorch_lstm.bias_hh_l0.detach().numpy().astype(np.float32)

copy_weights_from_pytorch_to_custom(pytorch_lstm, custom_lstm)

# Forward pass through PyTorch LSTM
torch_output, (pytorch_hn, pytorch_cn) = pytorch_lstm(x_torch, (h0, c0))

# Forward pass through custom LSTM
custom_output, hand_built_hn, hand_built_cn = custom_lstm.forward(x, h0.detach().numpy(), c0.detach().numpy())

# Create random gradient for backward pass
grad_output = np.random.randn(*custom_output.shape).astype(np.float32)
grad_output_torch = torch.from_numpy(grad_output).float()

# Initialize zero gradients for hn and cn
grad_hn = torch.zeros_like(pytorch_hn)
grad_cn = torch.zeros_like(pytorch_cn)

# Backward pass through custom layer
custom_lstm.zero_grad()
custom_grad_input, dh_next, dc_next = custom_lstm.backward(grad_output, grad_hn.detach().numpy(), grad_cn.detach().numpy())

# Backward pass through PyTorch layer
pytorch_lstm.zero_grad()
torch_output.backward(grad_output_torch)
torch_grad_input = x_torch.grad.numpy()

# Compare the gradients
print(np.allclose(custom_grad_input, torch_grad_input, atol=1e-6), "Backward pass gradients do not match!")
print("Weight gradients comparison for W_ih_l0:", np.allclose(pytorch_lstm.weight_ih_l0.grad.numpy(), custom_lstm._params["W_ih_l0"].grad, atol=1e-6))
print("Bias gradients comparison for B_ih_l0:", np.allclose(pytorch_lstm.bias_ih_l0.grad.numpy(), custom_lstm._params["B_ih_l0"].grad, atol=1e-6))
print("Weight gradients comparison for W_hh_l0:", np.allclose(pytorch_lstm.weight_hh_l0.grad.numpy(), custom_lstm._params["W_hh_l0"].grad, atol=1e-6))
print("Bias gradients comparison for B_hh_l0:", np.allclose(pytorch_lstm.bias_hh_l0.grad.numpy(), custom_lstm._params["B_hh_l0"].grad, atol=1e-6))


False Backward pass gradients do not match!
Weight gradients comparison for W_ih_l0: False
Bias gradients comparison for B_ih_l0: False
Weight gradients comparison for W_hh_l0: False
Bias gradients comparison for B_hh_l0: False
