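A rough tinygrad port of Jake Tae's PyTorch RNN tutorial (https://jaketae.github.io/study/pytorch-rnn/): a character-level RNN that classifies names by language.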

In [1]:
import os
import random
from io import BytesIO
from pathlib import Path
from zipfile import ZipFile
from string import ascii_letters

import requests
import numpy as np
from unidecode import unidecode
from tinygrad.tensor import Tensor
from tinygrad.nn import Linear, optim
from extra.training import sparse_categorical_crossentropy
from sklearn.model_selection import train_test_split

ops_triton not available No module named 'pycuda'


In [2]:
# Backend selection flags; presumably they pick tinygrad's GPU (OpenCL) backend
# and OpenCL device index 1 -- confirm against the installed tinygrad version.
# NOTE(review): these are set after tinygrad was imported in the first cell; if
# tinygrad reads them at import time they have no effect here -- confirm.
os.environ["GPU"] = "1"
os.environ["CL_DEVICE"] = "1"

# Seed every RNG the notebook can reach. Python's `random` drives the per-epoch
# shuffle; NumPy's global generator is what scikit-learn falls back to when no
# `random_state` is given, so it must be seeded too for reproducible runs.
random.seed(1337)
np.random.seed(1337)

In [3]:
# Directory the downloaded archive is extracted into: <parent of cwd>/data/names.
names_dir = Path("..").resolve().joinpath("data", "names")
# Directory that is expected to hold the per-language name files; presumably
# the archive unpacks into its own data/names/ subfolder, hence the repetition.
data_dir = names_dir.joinpath("data", "names")

In [4]:
# Download and unpack the dataset once; later runs reuse the extracted files.
if not data_dir.is_dir():
    names_dir.mkdir(parents=True, exist_ok=True)
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(
        "https://download.pytorch.org/tutorial/data.zip", timeout=60
    )
    # Unlike a bare `assert`, this still fires under `python -O` and the raised
    # HTTPError carries the status code and URL.
    response.raise_for_status()
    # The archive is unpacked straight from memory; nothing is written to disk
    # except the extracted files themselves.
    zip_file = BytesIO(response.content)
    with ZipFile(zip_file, "r") as zip_ref:
        zip_ref.extractall(names_dir)

In [5]:
# Map each language (the file name without its extension) to an integer class id.
# `iterdir()` yields entries in arbitrary, filesystem-dependent order, so the
# entries are sorted to give every machine and every run the same label ids.
lang2label = {
    file_path.stem: label
    for label, file_path in enumerate(sorted(data_dir.iterdir()))
}
num_langs = len(lang2label)

In [6]:
# Vocabulary: every ASCII letter followed by a handful of punctuation marks,
# each mapped to its position in that sequence (used as the one-hot index).
char2idx = dict(
    (letter, position)
    for position, letter in enumerate(ascii_letters + " .,:;-'")
)
num_letters = len(char2idx)

In [7]:
def name2array(name: str) -> np.ndarray:
    """One-hot encode ``name`` one character at a time.

    Args:
        name: Text whose characters must all be keys of ``char2idx``.

    Returns:
        A float32 NumPy array of shape ``(len(name), 1, num_letters)``; row ``i``
        holds a single 1 at the vocabulary index of ``name[i]``. An empty
        ``name`` yields an array with zero rows.

    Raises:
        KeyError: If ``name`` contains a character that is not in ``char2idx``.
    """
    # Explicit integer dtype so that an empty name still produces a valid
    # (empty) index array instead of NumPy's float default.
    indices = np.array([char2idx[char] for char in name], dtype=np.intp)
    # float32 halves the memory footprint compared to NumPy's float64 default.
    array = np.zeros((len(name), 1, num_letters), dtype=np.float32)
    # Set all the "hot" entries in one vectorised assignment.
    array[np.arange(len(name)), 0, indices] = 1
    return array

In [8]:
# Parallel lists: one-hot encoded names and their integer language labels.
input_names, target_langs = [], []

# Sorted so the sample order (and therefore any index-based split) does not
# depend on the filesystem's arbitrary directory ordering.
for file_path in sorted(data_dir.iterdir()):
    # The name lists contain accented characters; decode them explicitly as
    # UTF-8 instead of relying on the platform's default encoding.
    with file_path.open("r", encoding="utf-8") as file:
        for line in file:
            # Transliterate to plain ASCII so names fit the small vocabulary.
            name = unidecode(line.rstrip())
            # Skip blank lines (a zero-length sequence cannot be classified)
            # and names that still contain out-of-vocabulary characters.
            if not name or any(letter not in char2idx for letter in name):
                continue
            input_names.append(name2array(name))
            target_langs.append(lang2label[file_path.stem])

In [9]:
# Stratified 90/10 split over sample indices. A fixed `random_state` (the same
# seed given to the `random` module) makes the split identical on every run,
# including when this cell alone is re-executed.
train_idx, test_idx = train_test_split(
    range(len(target_langs)),
    test_size=0.1,
    shuffle=True,
    stratify=target_langs,
    random_state=1337,
)
# Each sample is an (input tensor, label array) pair.
train_dataset = [(Tensor(input_names[i]), np.array(target_langs[i])) for i in train_idx]
test_dataset = [(Tensor(input_names[i]), np.array(target_langs[i])) for i in test_idx]

In [10]:
class RecurrentNet:
    """Minimal recurrent cell with a classification head.

    At every step the current input is concatenated with the previous hidden
    state; two linear layers turn that combined vector into the next hidden
    state and into the per-class output.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        """Create the two linear layers.

        Args:
            input_size: Width of one input step.
            hidden_size: Width of the hidden state.
            output_size: Number of output classes.
        """
        self.hidden_size = hidden_size
        # Both layers consume the concatenation [input, previous hidden state].
        self.in2hidden = Linear(input_size + hidden_size, hidden_size)
        self.in2output = Linear(input_size + hidden_size, output_size)

    def __call__(self, x: Tensor, hidden_state: Tensor) -> "tuple[Tensor, Tensor]":
        """Run one time step.

        Args:
            x: Input for this step; presumably shaped ``(1, input_size)`` so it
                can be concatenated with the hidden state along ``dim=1``.
            hidden_state: Hidden state from the previous step (or from
                ``init_hidden`` for the first step).

        Returns:
            ``(output, hidden)``: the log-softmax of the output layer and the
            sigmoid-activated next hidden state.
        """
        combined = Tensor.cat(x, hidden_state, dim=1)
        next_hidden = self.in2hidden(combined).sigmoid()
        output = self.in2output(combined).logsoftmax()
        return output, next_hidden

    def init_hidden(self) -> Tensor:
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return Tensor.zeros(1, self.hidden_size)

In [11]:
# One input unit per vocabulary character, 256 hidden units, one output per language.
model = RecurrentNet(
    input_size=num_letters,
    hidden_size=256,
    output_size=num_langs,
)
# Loss function applied to the model output and the integer class label.
criterion = sparse_categorical_crossentropy
# Adam over every parameter tinygrad discovers on the model.
optimizer = optim.Adam(optim.get_parameters(model), lr=1e-3)

In [12]:
Tensor.training = True
num_epochs = 2
plot_interval = 3000  # report the loss every `plot_interval` samples

for epoch in range(num_epochs):
    # New sample order every epoch; the dataset list is shuffled in place.
    random.shuffle(train_dataset)
    for i, (name, label) in enumerate(train_dataset):
        hidden_state = model.init_hidden()
        # Reset per sample: a zero-length sequence would otherwise leave
        # `output` unbound (first sample) or silently reuse the previous
        # sample's output with this sample's label.
        output = None
        # Feed the name one character at a time; only the output after the
        # final character is used for the loss.
        for char in name:
            output, hidden_state = model(char, hidden_state)
        if output is None:
            continue
        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if (i + 1) % plot_interval == 0:
            print(f"epoch {epoch+1}/{num_epochs} step {i+1}/{len(train_dataset)} loss {loss.data[0]:2f}")

epoch 1/2 step 3000/18063 loss 0.130866
epoch 1/2 step 6000/18063 loss 0.010060
epoch 1/2 step 9000/18063 loss 0.038996
epoch 1/2 step 12000/18063 loss 0.003655
epoch 1/2 step 15000/18063 loss 0.000900
epoch 1/2 step 18000/18063 loss 0.039689
epoch 2/2 step 3000/18063 loss 1.132380
epoch 2/2 step 6000/18063 loss 1.772914
epoch 2/2 step 9000/18063 loss 1.132850
epoch 2/2 step 12000/18063 loss 2.128294
epoch 2/2 step 15000/18063 loss 0.699941
epoch 2/2 step 18000/18063 loss 0.229142


In [13]:
# Leave training mode before measuring accuracy on the held-out samples.
Tensor.training = False
hits = []

for name, label in test_dataset:
    hidden_state = model.init_hidden()
    # Only the output produced after the last character is scored.
    for char in name:
        output, hidden_state = model(char, hidden_state)
    pred = output.cpu().numpy().argmax()
    hits.append(pred == label)

# Count the samples whose predicted class equals the true label.
num_correct = 0 + np.sum(hits)

f"test set accuracy is {num_correct / len(test_dataset) * 100:.4f}%"

'test set accuracy is 71.4499%'

In [14]:
# Inverse of lang2label: integer class id -> language name.
label2lang = dict(zip(lang2label.values(), lang2label.keys()))

def predict(name: str) -> str:
    """Return the language the trained model assigns to ``name``.

    The name is transliterated with ``unidecode`` first, mirroring how the
    training names were preprocessed, so accented input is accepted.

    Args:
        name: The name to classify.

    Returns:
        The language name looked up in ``label2lang`` for the highest-scoring
        class.

    Raises:
        ValueError: If the name is empty after transliteration; there is no
            character to feed to the model.
        KeyError: If the transliterated name still contains a character that
            ``name2array`` cannot encode.
    """
    normalized = unidecode(name)
    if not normalized:
        raise ValueError("name must contain at least one character")
    tensor_name = Tensor(name2array(normalized))
    hidden_state = model.init_hidden()
    # Feed the characters in order; the output after the last one is the verdict.
    for char in tensor_name:
        output, hidden_state = model(char, hidden_state)
    # Plain int so the dictionary lookup does not depend on NumPy scalar hashing.
    pred = int(output.cpu().numpy().argmax())
    return label2lang[pred]

In [15]:
# Spot-check the trained classifier on a single name.
predict("Mike")

'English'

In [16]:
# Spot-check the trained classifier on a second name.
predict("Qin")

'Chinese'

In [17]:
# Spot-check the trained classifier on a third name.
predict("Slaveya")

'Russian'