In [None]:
# adapted and modified from https://www.kaggle.com/code/fareselmenshawii/lstm-from-scratch/notebook

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

In [2]:
class DataGenerator:

    def __init__(self, path: str, sequence_length: int):
        with open(path) as f:
            # Read the contents of the file
            self.data = f.read()

        # Find all unique characters in the text
        chars = list(set(self.data))

        # Create dictionaries to map characters to indices and vice versa
        self.char_to_idx = {ch: i for (i, ch) in enumerate(chars)}
        self.idx_to_char = {i: ch for (i, ch) in enumerate(chars)}

        # Store the size of the text data and the size of the vocabulary
        self.data_size = len(self.data)
        self.vocab_size = len(chars)

        # Initialize the pointer that will be used to generate sequences
        self.pointer = 0

        # Store the desired sequence length
        self.sequence_length = sequence_length


    def next_batch(self, batch_size):
        input_start = self.pointer
        input_end = self.pointer + batch_size*self.sequence_length

        # Get the input sequence as a list of integers
        inputs = [self.char_to_idx[ch] for ch in self.data[input_start:input_end]]

        # One-hot encode the input sequence
        inputs_one_hot = torch.zeros((len(inputs), self.vocab_size))
        inputs_one_hot[torch.arange(len(inputs)), inputs] = 1
        inputs_one_hot = inputs_one_hot.reshape(batch_size, self.sequence_length, self.vocab_size)
        # Get the target sequence as a list of integers
        targets = torch.tensor([self.char_to_idx[ch] for ch in self.data[input_start + 1:input_end + 1]])
        targets = targets.reshape(batch_size, self.sequence_length)
        # Update the pointer
        self.pointer += 4*self.sequence_length

        # Reset the pointer if the next batch would exceed the length of the text data
        if self.pointer + batch_size*self.sequence_length + 1 >= self.data_size:
            self.pointer = 0

        return inputs_one_hot, targets
    

In [3]:
class LSTMModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super(LSTMModel, self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, h0=None, c0=None):
        if h0 is None or c0 is None:
            h0 = torch.zeros(self.layer_dim, x.shape[0], self.hidden_dim)
            c0 = torch.zeros(self.layer_dim, x.shape[0], self.hidden_dim)
        
        out, (hn, cn) = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        # out = self.fc(out)
        return out, hn, cn

In [None]:
sequence_length = 3
data_generator = DataGenerator('names.txt', sequence_length)
print(data_generator.vocab_size)
model = LSTMModel(input_dim=data_generator.vocab_size, hidden_dim=500, layer_dim=1, output_dim=data_generator.vocab_size)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

27


In [6]:
num_epochs = 4

for epoch in range(num_epochs):
    h0, c0 = None, None
    data_generator.pointer = 1
    i = 0
    while(data_generator.pointer != 0):
        model.train()
        optimizer.zero_grad()

        inputs, targets = data_generator.next_batch(4)
        # print(inputs.shape)

        
        outputs, h0, c0 = model(inputs, h0, c0)
        loss = criterion(outputs, targets[:,-1].long())
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5)
        optimizer.step()

        h0 = h0.detach()
        c0 = c0.detach()

        if(i % 10000 == 0):
            print(f'Till Now, Loss: {loss.item():.4f}')
        i+=1

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Till Now, Loss: 1.6612
Till Now, Loss: 0.4144
Epoch [1/4], Loss: 2.1683
Till Now, Loss: 1.1700
Till Now, Loss: 0.4021
Epoch [2/4], Loss: 2.5600
Till Now, Loss: 1.0081
Till Now, Loss: 0.4960
Epoch [3/4], Loss: 2.3885
Till Now, Loss: 1.5144
Till Now, Loss: 0.6955
Epoch [4/4], Loss: 1.9129


In [7]:
def predict(vocab_size, data_generator, hidden_dim, start, sequence_length, n):
    # initialize input sequence
    x = torch.zeros((1, sequence_length, vocab_size))
    zero_add = torch.zeros((1, 1, vocab_size))
    chars = [ch for ch in start]
    idxes = []

    # for i in range(len(chars)):
    #     idx = data_generator.char_to_idx[chars[i]]
    #     x[0, sequence_length-1, idx] = 1
    #     x = torch.cat((x[:, 1:, :], zero_add), 1)
    #     idxes.append(idx)

    # Initialize input with the provided start sequence
    for i, char in enumerate(chars):
        idx = data_generator.char_to_idx[char]
        x[0, sequence_length - len(chars) + i, idx] = 1  # Fix index shifting
        idxes.append(idx)
            
    h0 = torch.zeros(1, x.shape[0], hidden_dim)
    c0 = torch.zeros(1, x.shape[0], hidden_dim)
    # generate new sequence of characters
    for _ in range(n):
        predicted, h0, c0 = model(x, h0, c0)
        # Sample from the distribution
        predicted = F.softmax(predicted, dim=-1)
        # print(predicted)
        idx = torch.multinomial(predicted.squeeze(), num_samples=1).item()
        x = torch.cat((x[:, 1:, :], zero_add), 1)
        x[0, sequence_length-1, idx] = 1
        # x = torch.zeros((vocab_size, 1))
        # x[idx] = 1
        idxes.append(idx)
        
    txt = ''.join(data_generator.idx_to_char[i] for i in idxes)
    # print(repr(txt))  # Shows the exact characters in the string

    txt = txt.replace('\n'," | ")
    return txt

In [8]:
model.eval()
predict(data_generator.vocab_size, data_generator, 500, "joshu", 4, 100)

'joshuab | aujlan | alak | obuice | marm | maziah | naveen | zakian | zabi | zehbe | noil | mus | alashannod | michi | mons | phel | paari'