In [1]:
import torch
import string
import unicodedata
import os
from torch.utils.data import Dataset
from torch import nn
import random
import numpy as np

DEVICE = torch.device('cpu')
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')

torch.set_default_device(DEVICE)

PATH_DATA_DIR = './data/names'

In [2]:
# Disclosure: some of these functions, and the dataset itself, are copied from:
# - https://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial 

allowed_characters = string.ascii_letters + " .,;'"
n_letters = len(allowed_characters)

# From https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in allowed_characters
    )

def letterToIndex(letter):
    return allowed_characters.find(letter)

def lineToTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letterToIndex(letter)] = 1
    return tensor

print('Allowed characters:' + allowed_characters)
print('Example unicodeToAscii: ' + unicodeToAscii('Órgão'))

Allowed characters:abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,;'
Example unicodeToAscii: Orgao


In [3]:
class NamesDataset(Dataset):
    def __init__(self, path):
        self.data = [] # name as tensor
        self.labels = []
        self.values = [] # store name as string
        self.labels_tensor = []
        self.unique_labels = []
    
        for filename in os.listdir(path):
            fn = filename.replace('.txt', '')
            self.unique_labels.append(fn)
            with open(f'{path}/{filename}', 'r') as f:
                names = [l.replace('\n', '') for l in f.readlines()]
                for n in names:
                    self.data.append(lineToTensor(n))
                    self.labels.append(fn)
                    self.values.append(n)

        # tensors for labels
        # format (batch_size) we only need index, no need to one hot encode
        for i in range(len(self.labels)):
            temp_tensor = torch.tensor([self.unique_labels.index(self.labels[i])], dtype=torch.long)
            self.labels_tensor.append(temp_tensor)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i], self.labels_tensor[i], self.values[i], self.labels[i]

ds = NamesDataset(PATH_DATA_DIR)

train_dataset, test_dataset = torch.utils.data.random_split(ds, [0.8, 0.2],
                                                           generator=torch.Generator(device=DEVICE).manual_seed(2024))

In [4]:
all_classes = train_dataset.dataset.unique_labels
all_classes

['Russian',
 'Vietnamese',
 'Arabic',
 'Portuguese',
 'Italian',
 'Scottish',
 'Chinese',
 'German',
 'English',
 'Czech',
 'Dutch',
 'Spanish',
 'French',
 'Greek',
 'Korean',
 'Japanese',
 'Polish',
 'Irish']

In [5]:
train_dataset.dataset[12000]

(tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0.]],
 
         [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0.,

In [6]:
class CharRNN(nn.Module):
    def __init__(self, input_size, hidden_size, out_size):
        super(CharRNN, self).__init__()

        self.rnn = nn.RNN(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, out_size)
        self.softmax = nn.LogSoftmax(dim=1) # applies log(softmax(x)) directly without computing probabilities as an intermediary

    def forward(self, x):
        rnn_out, hidden = self.rnn(x) # rnn_out are the previous hidden states, hidden is the current hidden state
        out = self.h2o(hidden[0]) # we are doing a single hidden state vector RNN
        out = self.softmax(out)

        return out

n_hidden = 128
rnn = CharRNN(n_letters, n_hidden, len(train_dataset.dataset.unique_labels))
print(rnn)

CharRNN(
  (rnn): RNN(57, 128)
  (h2o): Linear(in_features=128, out_features=18, bias=True)
  (softmax): LogSoftmax(dim=1)
)


In [7]:
# example prediction
# https://pytorch.org/docs/stable/generated/torch.nn.RNN.html see this for details on matrix sizes and our format
print(train_dataset.dataset[0][0].shape)
out = rnn.forward(train_dataset.dataset[0][0])
print(f'Output is {out}')
print(f'Selected class is {all_classes[out[0].argmax()]}')

torch.Size([6, 1, 57])
Output is tensor([[-2.8261, -2.8964, -2.8662, -2.9859, -2.7617, -2.9067, -2.8312, -2.8575,
         -2.8678, -3.0262, -2.8581, -2.9051, -2.9183, -2.8371, -2.9090, -2.8713,
         -3.0227, -2.9179]], device='cuda:0', grad_fn=<LogSoftmaxBackward0>)
Selected class is Italian


In [8]:
# Training
EPOCHS = 25
BATCH_SIZE = 64

criterion = nn.NLLLoss()
rnn.train()
optimizer = torch.optim.SGD(rnn.parameters(), lr=0.1)

loss_log = []
for i in range(EPOCHS):
    rnn.zero_grad()

    # indexes for batches
    batches = list(range(len(train_dataset)))
    random.shuffle(batches)
    batches = np.array_split(batches, len(batches) //BATCH_SIZE )

    current_loss = 0
    for idx, batch in enumerate(batches):
        batch_loss = 0
        for sample_ind in batch:
            data, label, _, _ = train_dataset[sample_ind]

            out = rnn.forward(data)
            loss = criterion(out, label)
            batch_loss += loss

        batch_loss.backward() # compute gradients
        nn.utils.clip_grad_norm_(rnn.parameters(), 3) # prevent exploding gradients by clipping
        optimizer.step()
        optimizer.zero_grad()

        current_loss += batch_loss.item() / len(batch)

    print(f'On epoch {i} current loss is {current_loss/len(batches)}')
    loss_log.append(current_loss/len(batches))


On epoch 0 current loss is 1.464627379501783
On epoch 1 current loss is 1.1307479199776282
On epoch 2 current loss is 1.0356515202705678
On epoch 3 current loss is 0.9645232928826258
On epoch 4 current loss is 0.9155496785677396
On epoch 5 current loss is 0.8657287864648379
On epoch 6 current loss is 0.8325115723646604
On epoch 7 current loss is 0.7885450555801392
On epoch 8 current loss is 0.7678859983169116
On epoch 9 current loss is 0.7379098573868091
On epoch 10 current loss is 0.7098076523248967
On epoch 11 current loss is 0.6828851538786521
On epoch 12 current loss is 0.6654920316861226
On epoch 13 current loss is 0.6441865317133757
On epoch 14 current loss is 0.620182626774678
On epoch 15 current loss is 0.5986385559458
On epoch 16 current loss is 0.5814857034719908
On epoch 17 current loss is 0.5630672317779981
On epoch 18 current loss is 0.5418883153539438
On epoch 19 current loss is 0.5247832979578239
On epoch 20 current loss is 0.5086421897429686
On epoch 21 current loss is 

In [19]:
# evaluate performance
rnn.eval()

# TODO: accuracy by label
correct = 0
for s in test_dataset:
    data, label_tensor, value, label_text = s

    out = rnn.forward(data)
    predict = all_classes[out[0].argmax()]
    print(f'Prediction: {predict} => Label is {label_text}')

    if predict == label_text: correct += 1

print(f'Accuracy over all classes: {correct/len(test_dataset):.2f}')
    

Prediction: Russian => Label is Russian
Prediction: English => Label is English
Prediction: Russian => Label is Russian
Prediction: Greek => Label is Greek
Prediction: English => Label is English
Prediction: Arabic => Label is Arabic
Prediction: Greek => Label is Italian
Prediction: Russian => Label is Russian
Prediction: Russian => Label is Russian
Prediction: Russian => Label is Russian
Prediction: Russian => Label is Russian
Prediction: Japanese => Label is Japanese
Prediction: Russian => Label is Russian
Prediction: English => Label is English
Prediction: Russian => Label is Russian
Prediction: English => Label is Russian
Prediction: Russian => Label is Russian
Prediction: English => Label is English
Prediction: Russian => Label is Russian
Prediction: Russian => Label is Russian
Prediction: German => Label is English
Prediction: Russian => Label is Russian
Prediction: Chinese => Label is Chinese
Prediction: Russian => Label is Russian
Prediction: Arabic => Label is Arabic
Predictio