In [1]:
import pandas as pd

# Read the (name, country) table and pull both columns out as plain lists.
df = pd.read_csv('data/name_country.csv')
text_data = df['Name'].to_list()
label_data = df['Country'].to_list()

# Distinct countries in sorted order, and how many there are.
country_list = sorted(set(label_data))
country_count = len(country_list)

# Group the names by country: {country: [name, name, ...]}.
# Names keep their file order, and duplicates are kept.
data_dict = {}
for country, name in zip(label_data, text_data):
    data_dict.setdefault(country, []).append(name)

print(data_dict)

{'Czech': ['abl', 'adsit', 'ajdrna', 'alt', 'antonowitsch', 'antonowitz', 'bacon', 'ballalatak', 'ballaltick', 'bartonova', 'bastl', 'baroch', 'benesch', 'betlach', 'biganska', 'bilek', 'blahut', 'blazek', 'blazek', 'blazejovsky', 'blecha', 'bleskan', 'blober', 'bock', 'bohac', 'bohunovsky', 'bolcar', 'borovka', 'borovski', 'borowski', 'borovsky', 'brabbery', 'brezovjak', 'brousil', 'bruckner', 'buchta', 'cablikova', 'camfrlova', 'cap', 'cerda', 'cermak', 'chermak', 'cermak', 'cernochova', 'cernohous', 'cerny', 'cerney', 'cerny', 'cerv', 'cervenka', 'chalupka', 'charlott', 'chemlik', 'chicken', 'chilar', 'chromy', 'cihak', 'clineburg', 'klineberg', 'cober', 'colling', 'cvacek', 'czabal', 'damell', 'demall', 'dehmel', 'dana', 'dejmal', 'dempko', 'demko', 'dinko', 'divoky', 'dolejsi', 'dolezal', 'doljs', 'dopita', 'drassal', 'driml', 'duyava', 'dvorak', 'dziadik', 'egr', 'entler', 'faltysek', 'faltejsek', 'fencl', 'fenyo', 'fillipova', 'finfera', 'finferovy', 'finke', 'fojtikova', 'fremu

In [2]:
# Every distinct character that appears in any name, in sorted order.
unique_chars = sorted({char for name in df['Name'] for char in name})

# The alphabet as a single string: a character's position in it is its index.
all_letters = ''.join(unique_chars)
print(all_letters)

 ',abcdefghijklmnopqrstuvwxyz


In [None]:
import torch

# Size of the character alphabet: the number of characters in all_letters.
n_letters = len(all_letters)

def nameToTensor(name, letters=None):
    """Encode a name as a stack of one-hot rows, one row per character.

    Args:
        name: The string to encode.
        letters: Optional alphabet string. A character's column index is its
            first position in this string. When omitted, the notebook-level
            ``all_letters`` is used, so existing one-argument calls behave as
            before.

    Returns:
        A tensor of shape ``(len(name), len(letters))`` in torch's default
        dtype, holding a single 1 per row and zeros elsewhere. An empty name
        yields a ``(0, len(letters))`` tensor.

    Raises:
        AssertionError: If ``name`` contains a character that is not present
            in the alphabet.
    """
    if letters is None:
        # Looked up at call time so the function follows the current alphabet.
        letters = all_letters

    # The width comes from the alphabet actually used for the lookup, so the
    # two can never disagree.
    tensor = torch.zeros(len(name), len(letters))
    for char_idx, char in enumerate(name):
        letter_idx = letters.find(char)
        assert letter_idx != -1, f"char is {name}, {char}"
        tensor[char_idx, letter_idx] = 1
    return tensor

In [None]:
from myRNN import *

# Second positional argument passed to MyRNN below
# (presumably the hidden-state width — confirm against myRNN).
n_hidden = 32
# MyRNN is not defined in this notebook; it is expected to come from the
# `from myRNN import *` line above. It is constructed with the alphabet size,
# n_hidden and the number of distinct countries, in that order.
rnn_model = MyRNN(n_letters, n_hidden, country_count)

In [None]:
import random
from torch.optim import Adam, SGD

# Train rnn_model on randomly drawn (name, country) pairs, one pair per step,
# and print the mean loss and accuracy of each reporting window.

# CrossEntropyLoss is reached through the explicitly imported `torch` package;
# this notebook never imports a bare `nn` name itself.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = Adam(rnn_model.parameters(), lr=0.001)  # Adam 1.2647557258605
iter_count = 100000
report_every = 5000  # number of training steps summarised by each printed line
crnt_loss = 0.
correct_predictions = 0

rnn_model.train()
for iter_idx in range(iter_count):
    # Pick a country uniformly at random, then a name from that country.
    # Drawing the index directly avoids rebuilding the key list and scanning
    # country_list with .index() on every step.
    country_idx = random.randrange(len(country_list))
    random_country = country_list[country_idx]
    random_name = random.choice(data_dict[random_country])

    name_tensor = nameToTensor(random_name)
    country_tensor = torch.tensor([country_idx], dtype=torch.long)
    hidden = rnn_model.get_hidden()
    rnn_model.zero_grad()

    # Feed the name one character at a time, carrying the hidden state forward.
    # Only the output after the last character is used for the loss.
    for char_tensor in name_tensor:
        output, hidden = rnn_model(char_tensor[None, :], hidden)

    loss = loss_fn(output, country_tensor)
    loss.backward()
    optimizer.step()

    crnt_loss += loss.item()
    predicted_index = torch.argmax(output, 1)
    correct_predictions += (predicted_index == country_tensor).sum().item()

    # Report after every `report_every` completed steps. Counting completed
    # steps (iter_idx + 1) makes every window hold exactly `report_every`
    # samples and ensures the final window is reported as well.
    completed = iter_idx + 1
    if completed % report_every == 0:
        average_loss = crnt_loss / report_every
        accuracy = 100 * correct_predictions / report_every
        print(f'Iter idx {completed}, Loss: {average_loss:.4f}, Accuracy: {accuracy:.2f}%')
        crnt_loss = 0.
        correct_predictions = 0

Iter idx 5000, Loss: 2.1843, Accuracy: 32.30%
Iter idx 10000, Loss: 1.6822, Accuracy: 44.54%
Iter idx 15000, Loss: 1.5392, Accuracy: 49.22%
Iter idx 20000, Loss: 1.4653, Accuracy: 50.68%
Iter idx 25000, Loss: 1.4045, Accuracy: 53.66%
Iter idx 30000, Loss: 1.3747, Accuracy: 55.50%
Iter idx 35000, Loss: 1.3133, Accuracy: 56.92%
Iter idx 40000, Loss: 1.2724, Accuracy: 58.60%
Iter idx 45000, Loss: 1.2223, Accuracy: 59.24%
Iter idx 50000, Loss: 1.2224, Accuracy: 60.14%
Iter idx 55000, Loss: 1.1717, Accuracy: 61.46%
Iter idx 60000, Loss: 1.1636, Accuracy: 61.84%
Iter idx 65000, Loss: 1.1677, Accuracy: 62.06%
Iter idx 70000, Loss: 1.0940, Accuracy: 64.38%
Iter idx 75000, Loss: 1.1083, Accuracy: 63.06%
Iter idx 80000, Loss: 1.1129, Accuracy: 63.20%
Iter idx 85000, Loss: 1.0822, Accuracy: 65.00%
Iter idx 90000, Loss: 1.0693, Accuracy: 64.96%
Iter idx 95000, Loss: 1.0172, Accuracy: 66.20%


In [None]:
# Classify a single hand-picked name with the trained model.
test_name = 'jinping'
test_name_tensor = nameToTensor(test_name)

rnn_model.eval()
hidden = rnn_model.get_hidden()
# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    # Feed the name character by character; the output after the last
    # character is the prediction.
    for char_idx in range(len(test_name)):
        char_tensor = test_name_tensor[char_idx]
        output, hidden = rnn_model(char_tensor[None, :], hidden)
predicted_index = torch.argmax(output, 1).item()
print(country_list[predicted_index])

Chinese
