RNN From Scratch

Adapted from: https://jaketae.github.io/study/pytorch-rnn/

In [None]:
import os
import random
from string import ascii_letters

import torch
from unidecode import unidecode

In [None]:
# Seed every random number generator this notebook relies on so that a
# "Restart & Run All" reproduces the same results:
#   * torch  -> weight initialisation and the random initial hidden state
#   * random -> the per-epoch shuffling of the training set
SEED = 42
_ = torch.manual_seed(SEED)
random.seed(SEED)

# Preferred compute device: the GPU when one is available, the CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Peek at the dataset

In [None]:
# Directory holding the name files; the part of each file name before the
# first "." is used as the language name. The location can be overridden
# through the NAMES_DATA_DIR environment variable so the notebook is not tied
# to one machine; the original local path is kept as the default.
data_dir = os.environ.get(
    "NAMES_DATA_DIR",
    "D:\\Academic\\Code\\dl-playground\\data\\torchdata\\names",
)

# Map language name -> class label, stored as a 1-element long tensor.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# give every language the same label id on every run and every machine.
lang2label = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long)
    for i, file_name in enumerate(sorted(os.listdir(data_dir)))
}
print(lang2label)

In [None]:
# Number of distinct languages (one per entry in lang2label); this is later
# passed to the model as its output size.
num_lang = len(lang2label)
print(num_lang)

Data preprocessing

The vocabulary has 59 characters: the 52 characters in `ascii_letters` plus 7 punctuation symbols.

In [None]:
# Character vocabulary: the ASCII letters followed by seven extra symbols
# (space, period, comma, colon, semicolon, dash, apostrophe). A character's
# value in this mapping is its position in that combined string.
char2idx = dict(
    (symbol, position)
    for position, symbol in enumerate(ascii_letters + " .,:;-'")
)
num_char = len(char2idx)
print(num_char)

- Represent each character of a name as a one-hot tensor over the 59 possible characters.
- RNN layers in PyTorch expect input of shape (`seq_len`, `batch_size`, `input_size`).

In [None]:
def name2tensor(name):
    """One-hot encode a name, one character per step.

    Args:
        name: String whose characters must all be keys of ``char2idx``.

    Returns:
        Float tensor of shape ``(len(name), 1, num_char)`` laid out as
        (seq_len, batch_size, input_size); row ``i`` holds a single 1 at the
        vocabulary index of ``name[i]``.

    Raises:
        KeyError: If ``name`` contains a character missing from ``char2idx``.
    """
    one_hot = torch.zeros(len(name), 1, num_char)
    position = 0
    while position < len(name):
        column = char2idx[name[position]]
        one_hot[position, 0, column] = 1
        position += 1
    return one_hot

In [None]:
# Sanity check of the encoding: "abc" should print a (3, 1, num_char) tensor
# with a single 1 per row, in columns 0, 1 and 2 respectively.
print(name2tensor("abc"))

Create Dataset

In [None]:
# Parallel lists: tensor_names[k] is the one-hot encoded k-th name and
# target_langs[k] is the label tensor of the language it belongs to.
tensor_names = []
target_langs = []
num_skipped = 0  # names dropped because they were empty or not encodable

# Sorted so the sample order does not depend on the file system's listing order.
for file_name in sorted(os.listdir(data_dir)):
    lang = file_name.split(".")[0]
    label = lang2label.get(lang)
    if label is None:
        # Best effort, as before: a file without a known label is ignored,
        # but now it is reported instead of being silently swallowed.
        print(f"Skipping {file_name}: no label for language {lang!r}")
        continue

    # Explicit encoding: without it open() falls back to the platform default
    # (e.g. cp1252 on Windows), which would mangle accented names before
    # unidecode() gets to transliterate them.
    # NOTE(review): assumes the data files are UTF-8 encoded - confirm.
    with open(os.path.join(data_dir, file_name), "r", encoding="utf-8") as f:
        names = [unidecode(line.rstrip()) for line in f]

    for name in names:
        if not name:
            # A blank line would become a zero-length sequence, for which the
            # training/evaluation loops can never produce an output.
            num_skipped += 1
            continue
        try:
            encoded = name2tensor(name)
        except KeyError:
            # The name contains a character outside the char2idx vocabulary.
            num_skipped += 1
            continue
        tensor_names.append(encoded)
        target_langs.append(label)

print(f"Loaded {len(tensor_names)} names, skipped {num_skipped}")

Split dataset

In [None]:
from sklearn.model_selection import train_test_split

# Split sample *indices* 90/10 and stratify by language so both splits keep
# the same class proportions.
#   * stratify receives plain int class ids rather than a list of 1-element
#     tensors, so it does not depend on how tensors are coerced to arrays.
#   * random_state pins the split; torch.manual_seed() does not influence
#     scikit-learn's shuffling, so without it every run gets a different split.
train_idx, test_idx = train_test_split(
    range(len(target_langs)),
    test_size=0.1,
    shuffle=True,
    stratify=[label.item() for label in target_langs],
    random_state=42,
)

# Each dataset is a list of (one-hot name tensor, label tensor) pairs.
train_dataset = [
    (tensor_names[i], target_langs[i]) for i in train_idx
]
test_dataset = [
    (tensor_names[i], target_langs[i]) for i in test_idx
]

In [None]:
# Sizes of the two splits; the split above was requested with test_size=0.1,
# so the second number should be roughly a tenth of the total.
print(len(train_dataset))
print(len(test_dataset))

RNN Model from scratch

In [None]:
import torch.nn as nn
import torch.nn.functional as F

In [None]:
class MyRNN(nn.Module):
    """Minimal recurrent cell built from two linear layers.

    At every step the current input is concatenated with the previous hidden
    state; one linear layer maps that to the next hidden state and a second
    one maps it to the output scores.

    Args:
        in_size: Number of features of a single input step.
        hidden_size: Number of features of the hidden state.
        out_size: Number of output scores (classes).
    """

    def __init__(self, in_size, hidden_size, out_size):
        super().__init__()
        self.hidden_size = hidden_size
        # Both layers consume the concatenation [input, hidden].
        self.in2hidden = nn.Linear(in_size + hidden_size, hidden_size)
        self.in2out = nn.Linear(in_size + hidden_size, out_size)

    def forward(self, input, hidden):
        """Run one time step.

        Args:
            input: Tensor of shape ``(batch, in_size)`` for the current step.
            hidden: Tensor of shape ``(batch, hidden_size)`` from the previous
                step (or from :meth:`init_hidden`).

        Returns:
            Tuple ``(output, hidden)``: ``output`` has shape
            ``(batch, out_size)`` and holds raw, un-normalised scores;
            ``hidden`` has shape ``(batch, hidden_size)`` with values in
            (0, 1) because it passes through a sigmoid.
        """
        combined = torch.cat((input, hidden), dim=1)
        hidden = torch.sigmoid(self.in2hidden(combined))
        # The output is computed from the *previous* hidden state (via
        # `combined`), not from the freshly updated one.
        output = self.in2out(combined)
        return output, hidden

    def init_hidden(self, batch_size=1):
        """Create a fresh, randomly initialised hidden state.

        The state is drawn with Kaiming-uniform initialisation, so two calls
        return different values unless the torch RNG is re-seeded.

        Args:
            batch_size: Number of rows of the state; the default of 1 matches
                the original single-sample behaviour.

        Returns:
            Tensor of shape ``(batch_size, hidden_size)`` on the same device
            and with the same dtype as the model's weights, so it can be
            concatenated with inputs after the model was moved with ``.to()``.
        """
        reference = self.in2hidden.weight
        state = torch.empty(
            batch_size,
            self.hidden_size,
            device=reference.device,
            dtype=reference.dtype,
        )
        return nn.init.kaiming_uniform_(state)


Build and train model

In [None]:
# Hyper-parameters.
hidden_size = 256
learning_rate = 0.001

# Loss over the raw class scores, the network itself (one input feature per
# vocabulary character, one output score per language) and its optimizer.
criterion = nn.CrossEntropyLoss()
model = MyRNN(in_size=num_char, hidden_size=hidden_size, out_size=num_lang)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

Train routine

In [None]:
num_epochs = 5
print_interval = 3000  # report the loss every this many samples

# Make the mode explicit: the prediction/evaluation cells switch the model to
# eval mode, and this cell may be re-run after them.
model.train()

for epoch in range(num_epochs):
    # New sample order every epoch (shuffles the list in place).
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        if name.size(0) == 0:
            # A zero-length sequence never enters the character loop, so
            # `output` would be undefined (first sample) or left over from the
            # previous sample; skip it instead of training on a stale value.
            continue

        hidden_state = model.init_hidden()  # fresh hidden state for every sample
        # Feed the name one character at a time; only the output produced
        # after the last character is used for the loss.
        for char in name:
            output, hidden_state = model(char, hidden_state)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        # Clip the gradient norm to 1 to keep updates bounded.
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        if (i + 1) % print_interval == 0:
            # Note: this is the loss of the current sample only, not an average.
            print(f"Epoch {epoch + 1}/{num_epochs}, Step {i + 1}/{len(train_dataset)}, Loss: {loss.item():.4f}")


Test the model

In [None]:
num_correct = 0
num_samples = 0  # number of test samples that were actually evaluated

model.eval()  # set model to evaluation mode

with torch.no_grad():  # no gradients needed for evaluation
    for name, label in test_dataset:
        if name.size(0) == 0:
            # A zero-length sequence produces no output; without this guard
            # the prediction of the previous sample would be scored again.
            continue

        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        # Predicted class = index of the highest score after the last character.
        _, pred_label = torch.max(output, dim=1)
        num_correct += int(pred_label == label)
        num_samples += 1

# Guard against an empty test set instead of raising ZeroDivisionError.
accuracy = num_correct / num_samples if num_samples else float("nan")
print(f"Test Accuracy: {accuracy:.4f}")

Take a look at concrete samples

In [None]:
# Inverse of lang2label: integer class id -> language name.
label2lang = {label.item(): lang for lang, label in lang2label.items()}

def myrnn_predict(name):
    """Predict the language of a single name with the trained model.

    The name is transliterated with ``unidecode`` first, exactly like the
    training data, so accented input is mapped onto the same vocabulary the
    model was trained on.

    Args:
        name: The name to classify.

    Returns:
        The predicted language name (a key of ``lang2label``).

    Raises:
        ValueError: If the name is empty after transliteration.
        KeyError: If the transliterated name still contains a character that
            is not in ``char2idx``.
    """
    encoded_name = name2tensor(unidecode(name))
    if encoded_name.size(0) == 0:
        # With no characters the loop below would never produce an output.
        raise ValueError("name must contain at least one character")

    # Remember the current mode and restore it afterwards, rather than
    # unconditionally forcing the model back into training mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            hidden_state = model.init_hidden()
            for char in encoded_name:
                output, hidden_state = model(char, hidden_state)
            _, pred_label = torch.max(output, dim=1)
    finally:
        model.train(was_training)
    return label2lang[pred_label.item()]

Check a few samples

In [None]:
# Spot check: print the language the model predicts for this name.
print(myrnn_predict("Xia"))

In [None]:
# Spot check: print the language the model predicts for this name.
print(myrnn_predict("Angelo"))

In [None]:
# Spot check: print the language the model predicts for this name.
print(myrnn_predict("Hitler"))

In [None]:
# Spot check: print the language the model predicts for this name.
print(myrnn_predict("Tsubasa"))

In [None]:
# Spot check: print the language the model predicts for this name.
print(myrnn_predict("Eusebio"))