In [None]:
import string
import os
import random
import numpy as np
from unidecode import unidecode
from torch.utils.data import Dataset, DataLoader, BatchSampler
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.dataset import random_split
from collections import Counter

In [None]:
# --- Configuration ---------------------------------------------------------
# Directory whose entries are read as one names file per country label.
NAMES_PATH = 'data/NameCountryPytorch/names'
# Alphabet used for one-hot encoding: a-z followed by A-Z.
ASCII = string.ascii_letters
ASCII_COUNT = len(ASCII)
# os.listdir returns entries in arbitrary order; sort so that the integer label
# assigned to each country is stable across runs and machines.
COUNTRIES = sorted(os.listdir(NAMES_PATH))
BATCH_SIZE = 64

# Seed every RNG in use: Python's `random` and torch (weight initialisation and
# DataLoader shuffling draw from the torch generator, not from `random`).
SEED = 1711
random.seed(SEED)
torch.manual_seed(SEED)

def char_onehot(c):
    """One-hot encode a single ASCII letter.

    Args:
        c: The character to encode.

    Returns:
        A list of ASCII_COUNT floats with 1.0 at the position of `c` within
        ASCII and 0.0 elsewhere, or an empty list when `c` is not exactly one
        ASCII letter.
    """
    # `c in ASCII` is a substring test: '' and runs such as 'ab' would pass it
    # and be encoded as if they were 'a', so insist on exactly one character.
    if len(c) != 1:
        return []

    # A single find() replaces the separate membership test and index() scan.
    position = ASCII.find(c)
    if position < 0:
        return []

    result = [0.0] * ASCII_COUNT
    result[position] = 1.0
    return result

In [None]:
def ascii_name_onehot(name):
    """Encode a name as a list of one-hot vectors, one per encodable character.

    Characters for which `char_onehot` returns an empty list are dropped, so
    the result may be shorter than `name` (and is empty when nothing in the
    name can be encoded).
    """
    encoded_chars = (char_onehot(ch) for ch in name)
    return [vector for vector in encoded_chars if vector]

In [None]:
class NameDataset(Dataset):
    """Four parallel, index-aligned columns describing a collection of names.

    Row `i` is made of the original name, its transliterated form, its encoded
    tensor and its country label.
    """

    def __init__(self, name_original, name_unicode, name_tensor, label):
        # Row i of every column refers to the same name.
        self.name_original = name_original  # names as read from the source
        self.name_unicode = name_unicode    # transliterated names
        self.name_tensor = name_tensor      # encoded names
        self.label = label                  # country labels

    def __len__(self):
        """Number of rows, taken from the label column."""
        return len(self.label)

    def __getitem__(self, idx):
        """Return a 4-tuple (original, unicode, tensor, label).

        With a list of indices each element of the tuple is a list of the
        selected rows; with any other index each element is that single row.
        """
        columns = (
            self.name_original,
            self.name_unicode,
            self.name_tensor,
            self.label,
        )

        if not isinstance(idx, list):
            return tuple(column[idx] for column in columns)

        return tuple([column[i] for i in idx] for column in columns)

In [None]:
class NameRNN(nn.Module):
    """Single-layer tanh RNN followed by a linear classifier.

    Args:
        input_dim: Size of each per-character input vector.
        output_dim: Number of classes to score.
        hidden_dim: Size of the recurrent hidden state.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()
        self.rnn = nn.RNN(input_size=input_dim, hidden_size=hidden_dim, num_layers=1, nonlinearity='tanh', bias=True, batch_first=True)
        self.fcc = nn.Linear(in_features=hidden_dim, out_features=output_dim, bias=True)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, output_dim).

        `x` is expected as (batch, seq_len, input_dim) because the RNN is
        built with batch_first=True.
        """
        rnn_out, _ = self.rnn(x)
        # Keep only the last time step: (batch, hidden_dim). For this
        # single-layer, unidirectional RNN that is the final hidden state, so
        # the former per-call debug comparison against h_n (which forced a
        # tensor -> Python sync and printed to stdout) is not needed.
        last_step = rnn_out[:, -1, :]
        rnn_logits = self.fcc(last_step)
        return self.softmax(rnn_logits)

In [None]:
class NameGRU(nn.Module):
    """Single-layer GRU followed by a linear classifier.

    Args:
        input_dim: Size of each per-character input vector.
        output_dim: Number of classes to score.
        hidden_dim: Size of the recurrent hidden state.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()
        self.gru = nn.GRU(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bias=True,
            batch_first=True,
        )
        self.fcc = nn.Linear(in_features=hidden_dim, out_features=output_dim, bias=True)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, output_dim).

        `x` is expected as (batch, seq_len, input_dim) because the GRU is
        built with batch_first=True.
        """
        hidden_states, _ = self.gru(x)
        # Only the last time step feeds the classifier: (batch, hidden_dim).
        final_step = hidden_states[:, -1, :]
        return self.softmax(self.fcc(final_step))

In [None]:
class NameLSTM(nn.Module):
    """Single-layer LSTM followed by a linear classifier.

    Args:
        input_dim: Size of each per-character input vector.
        output_dim: Number of classes to score.
        hidden_dim: Size of the recurrent hidden state.
    """

    def __init__(self, input_dim, output_dim, hidden_dim=256):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=1,
            bias=True,
            batch_first=True,
        )
        self.fcc = nn.Linear(in_features=hidden_dim, out_features=output_dim, bias=True)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Return per-class log-probabilities of shape (batch, output_dim).

        `x` is expected as (batch, seq_len, input_dim) because the LSTM is
        built with batch_first=True.
        """
        hidden_states, _ = self.lstm(x)
        # Only the last time step feeds the classifier: (batch, hidden_dim).
        final_step = hidden_states[:, -1, :]
        return self.softmax(self.fcc(final_step))

In [None]:
def get_dataset_dict(path):
    """Load every names file under `path`, bucketed by encoded name length.

    Names are grouped by the number of one-hot vectors they encode to, so each
    bucket can be stacked into one tensor without padding.

    Args:
        path: Directory whose entries are names files, one name per line. Each
            file name must be present in COUNTRIES; its index there is the
            label.

    Returns:
        A dict mapping encoded length to a NameDataset holding the original
        names, the transliterated names, a float tensor of one-hot encodings
        and a long tensor of labels.

    Raises:
        ValueError: If an entry of `path` is not listed in COUNTRIES.
    """
    # encoded length -> [original names, transliterated names, one-hots, labels]
    buckets = {}

    for filename in os.listdir(path):
        # The label is the same for every line of the file; look it up once
        # instead of scanning COUNTRIES for each name.
        country_label = COUNTRIES.index(filename)
        with open(os.path.join(path, filename), encoding='utf-8') as f:
            for line in f:
                name = line.strip()
                nameunicode = unidecode(name.replace(" ", ""))
                name_onehot = ascii_name_onehot(nameunicode)
                if not name_onehot:
                    # Nothing encodable (e.g. a blank line): skip it.
                    continue

                bucket = buckets.setdefault(len(name_onehot), [[], [], [], []])
                bucket[0].append(name)
                bucket[1].append(nameunicode)
                bucket[2].append(name_onehot)
                bucket[3].append(country_label)

    return {
        length: NameDataset(
            originals,
            transliterated,
            torch.tensor(onehots, dtype=torch.float),
            torch.tensor(labels, dtype=torch.long),
        )
        for length, (originals, transliterated, onehots, labels) in buckets.items()
    }
# TODO: hold out a test split, e.g. train_set, test_set = torch.utils.data.random_split(dataset, [0.85, 0.15])

In [None]:
# Load the names bucketed by encoded length, then wrap each bucket in its own
# shuffling DataLoader so every batch holds sequences of a single length.
dataset_dict = get_dataset_dict(NAMES_PATH)
dataloader_dict = {
    name_length: DataLoader(bucket, batch_size=BATCH_SIZE, shuffle=True)
    for name_length, bucket in dataset_dict.items()
}

In [None]:
def train(model, epochs=200, lr=1e-3, log_every=50, dataloaders=None):
    """Train `model` with Adam on batches drawn from a dict of dataloaders.

    Args:
        model: Module mapping a batch of encoded names to per-class scores.
        epochs: Number of passes over all dataloaders.
        lr: Adam learning rate.
        log_every: Print the epoch loss every `log_every` epochs; a falsy
            value disables printing.
        dataloaders: Mapping whose values yield
            (original, unicode, onehot, label) batches. Defaults to the
            notebook-level `dataloader_dict`.

    Returns:
        A list with one entry per epoch: the sum over dataloaders of the mean
        batch loss of that dataloader.
    """
    if dataloaders is None:
        dataloaders = dataloader_dict

    # make_prediction() leaves the model in eval mode; switch back explicitly
    # so repeated train/evaluate cycles behave the same as the first one.
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    history = []

    for epoch in range(epochs):
        epoch_loss = 0.0
        for dataloader in dataloaders.values():
            dl_loss = 0.0
            batch_count = 0
            for _, _, onehot_name, country_label in dataloader:
                x_out = model(onehot_name)
                # cross_entropy applies log_softmax itself; on inputs that are
                # already log-probabilities that step is a no-op, so this is
                # correct for both raw logits and LogSoftmax outputs.
                loss = F.cross_entropy(x_out, country_label, reduction='mean')

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                dl_loss += loss.item()
                batch_count += 1
            # Guard against a dataloader that yields no batches.
            if batch_count:
                epoch_loss += dl_loss / batch_count

        history.append(epoch_loss)
        if log_every and epoch % log_every == 0:
            print(f"Epoch={epoch}: Loss={epoch_loss}")

    return history

In [None]:
def make_prediction(model, name):
    """Predict the country of a single name.

    Puts `model` in eval mode (it is left that way), encodes `name` and
    returns the COUNTRIES entry with the highest score.

    Args:
        model: Module mapping a (1, seq_len, ASCII_COUNT) float tensor to
            per-class scores.
        name: The name to classify.

    Returns:
        The predicted entry of COUNTRIES.

    Raises:
        ValueError: If `name` contains no character that can be encoded.
    """
    model.eval()
    name_onehot = ascii_name_onehot(unidecode(name))
    if not name_onehot:
        # An empty encoding would otherwise surface as an opaque shape error
        # from inside the recurrent layer.
        raise ValueError(f"Cannot encode name {name!r}: it contains no ASCII letters")

    # Same dtype as the training tensors; unsqueeze builds a batch of one.
    batch = torch.tensor(name_onehot, dtype=torch.float).unsqueeze(0)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(batch)
    predicted_idx = output.argmax(dim=1)
    return COUNTRIES[predicted_idx.item()]

In [None]:
def eval(model):
    """Print, per names file under NAMES_PATH, how often the file is predicted.

    For each file, every name that can be encoded is classified with
    `make_prediction`; the printed percentage is the share of those names whose
    prediction equals the file name. Names that encode to nothing (for example
    blank lines) are skipped, as they are when the datasets are built.

    Note: this function shadows the built-in `eval`, and it reads the same
    NAMES_PATH files that the notebook loads for training, so the figures are
    accuracy on training data.

    Args:
        model: The trained model to evaluate.
    """
    for file_name in os.listdir(NAMES_PATH):
        total = 0
        correct = 0
        with open(os.path.join(NAMES_PATH, file_name), encoding='utf-8') as f:
            for line in f:
                name = line.strip().replace(" ", "")
                # Skip names the model cannot be fed instead of crashing on them.
                if not ascii_name_onehot(unidecode(name)):
                    continue
                total += 1
                if make_prediction(model, name) == file_name:
                    correct += 1
        # An empty file would otherwise raise ZeroDivisionError.
        acc = correct / total * 100 if total else 0.0
        print(f"{file_name}: {acc:.2f}%")

In [None]:
# Vanilla RNN: build, fit, then print the per-file accuracy.
rnn_model = NameRNN(ASCII_COUNT, len(COUNTRIES))
train(rnn_model)
eval(rnn_model)

In [None]:
# GRU: build, fit, then print the per-file accuracy.
gru_model = NameGRU(ASCII_COUNT, len(COUNTRIES))
train(gru_model)
eval(gru_model)

In [None]:
# LSTM: build, fit, then print the per-file accuracy.
lstm_model = NameLSTM(ASCII_COUNT, len(COUNTRIES))
train(lstm_model)
eval(lstm_model)