In [1]:
import os
import random
from string import ascii_letters

import torch
from torch import nn
import torch.nn.functional as F
from unidecode import unidecode

In [2]:
# Get data
# !curl -O https://download.pytorch.org/tutorial/data.zip; unzip data.zip

In [3]:
# Set torch variables
_ = torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [19]:
# Assign int label to each language name
data_dir = "./data/names"
LANGUAGE_TO_LABEL = {
    file_name.split(".")[0]: torch.tensor([i], dtype=torch.long) for i, file_name in enumerate(os.listdir(data_dir))
}
NUM_LANGUAGES = len(LANGUAGE_TO_LABEL)
LANGUAGE_TO_LABEL

{'Czech': tensor([0]),
 'German': tensor([1]),
 'Arabic': tensor([2]),
 'Japanese': tensor([3]),
 'Chinese': tensor([4]),
 'Vietnamese': tensor([5]),
 'Russian': tensor([6]),
 'French': tensor([7]),
 'Irish': tensor([8]),
 'English': tensor([9]),
 'Spanish': tensor([10]),
 'Greek': tensor([11]),
 'Italian': tensor([12]),
 'Portuguese': tensor([13]),
 'Scottish': tensor([14]),
 'Dutch': tensor([15]),
 'Korean': tensor([16]),
 'Polish': tensor([17])}

In [5]:
# Assign label to each character in vocabulary
CHAR_TO_IDX = {letter: i for i, letter in enumerate(ascii_letters + " .,:;-'")}
NUM_LETTERS = len(CHAR_TO_IDX)
NUM_LETTERS

59

In [6]:
def name_to_tensor(name, char_vocab_size=NUM_LETTERS, char_to_int_mapping=CHAR_TO_IDX):
    """Function to embed a name as a matrix, where each character in the name is a one-hot tensor
    Each matrix also has a batch dimension of 1"""
    emb = torch.zeros(len(name), 1, char_vocab_size)
    for i, char in enumerate(name):
        emb[i][0][char_to_int_mapping[char]] = 1
    return emb

In [7]:
names_tensor = []
target_langs = []

for file in os.listdir(data_dir):
    with open(os.path.join(data_dir, file)) as f:
        language = file.split('.')[0]
        names = [unidecode(line.rstrip()) for line in f]
        for name in names:
            try:
                names_tensor.append(name_to_tensor(name))
                target_langs.append(LANGUAGE_TO_LABEL[language])
            except KeyError:
                pass

In [16]:
from sklearn.model_selection import train_test_split


# Split data into train and test sets
train_idx, test_idx = train_test_split(
    range(len(target_langs)),
    test_size=0.1,
    shuffle=True,
    stratify=target_langs
)

# Get train dataset
train_dataset = [(names_tensor[i], target_langs[i]) for i in train_idx]

# Get test dataset
test_dataset = [(names_tensor[i], target_langs[i]) for i in test_idx]

print(f"Train dataset: {len(train_dataset)}")
print(f"Test dataset: {len(test_dataset)}")

Train dataset: 18063
Test dataset: 2007


In [18]:
# Build a simple RNN model:
# Takes in a single character & produces a prediction and a hidden state
class SimpleRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleRNN, self).__init__()
        self.hidden_size = hidden_size
        self.in_to_hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.in_to_output = nn.Linear(input_size + hidden_size, output_size)
    
    def forward(self, x, hidden_state):
        combined = torch.cat((x, hidden_state), 1)
        hidden = torch.sigmoid(self.in_to_hidden(combined))
        output = self.in_to_output(combined)
        return output, hidden
    
    def init_hidden(self):
        return nn.init.kaiming_uniform_(torch.empty(1, self.hidden_size))

In [20]:
hidden_size = 256
learning_rate = 0.001

# Instantiate model
model = SimpleRNN(NUM_LETTERS, hidden_size, NUM_LANGUAGES)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [27]:
num_epochs = 2
print_interval = 3000

# Training loop
for epoch in range(num_epochs):
    # Shuffle data
    random.shuffle(train_dataset)
    
    # Each batch contains just one name
    for i, (name, label) in enumerate(train_dataset):
        # Reinit hidden state for each name
        hidden_state = model.init_hidden()

        # Zero gradients
        optimizer.zero_grad()

        # Get prediction for name
        for char in name:
            output, hidden_state = model(char, hidden_state)
        
        # Compute loss
        loss = criterion(output, label)
        loss.backward()
        # Clip gradients
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/2], Step [3000/18063], Loss: 2.2671
Epoch [1/2], Step [6000/18063], Loss: 0.0082
Epoch [1/2], Step [9000/18063], Loss: 1.0068
Epoch [1/2], Step [12000/18063], Loss: 0.0156
Epoch [1/2], Step [15000/18063], Loss: 0.0025
Epoch [1/2], Step [18000/18063], Loss: 0.0220
Epoch [2/2], Step [3000/18063], Loss: 0.0000
Epoch [2/2], Step [6000/18063], Loss: 0.0006
Epoch [2/2], Step [9000/18063], Loss: 0.0001
Epoch [2/2], Step [12000/18063], Loss: 0.0000
Epoch [2/2], Step [15000/18063], Loss: 0.0000
Epoch [2/2], Step [18000/18063], Loss: 0.0000


In [32]:
# Compute accuracy of model
num_correct = 0
num_samples = len(test_dataset)

model.eval()
with torch.no_grad():
    for name, label in test_dataset:
        hidden_state = model.init_hidden()
        for char in name:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)
        num_correct += int(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 78.6248%


In [34]:
LABEL_TO_LANGUAGE = {label.item(): lang for lang, label in LANGUAGE_TO_LABEL.items()}

def simple_rnn_predict(name):
    model.eval()
    name_tensor = name_to_tensor(name)
    with torch.no_grad():
        hidden_state = model.init_hidden()
        for char in name_tensor:
            output, hidden_state = model(char, hidden_state)
        _, pred = torch.max(output, dim=1)    
    return LABEL_TO_LANGUAGE[pred.item()]

In [46]:
print("Jeffrey:", simple_rnn_predict("Jeffrey"))
print("Qin:", simple_rnn_predict("Qin"))
print("Slaveya:", simple_rnn_predict("Slaveya"))
print("Michael:", simple_rnn_predict("Michael"))
print("Vladimir:", simple_rnn_predict("Vladimir"))
print("Stavrios:", simple_rnn_predict("Stavrios"))

Jeffrey: English
Qin: Chinese
Slaveya: Russian
Michael: German
Vladimir: Russian
Stavrios: Greek


In [47]:
class GRUModel(nn.Module):
    def __init__(self, num_layers, hidden_size):
        super(GRUModel, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.gru = nn.GRU(
            input_size=NUM_LETTERS, 
            hidden_size=hidden_size, 
            num_layers=num_layers,
        )
        self.fc = nn.Linear(hidden_size, NUM_LANGUAGES)
    
    def forward(self, x):
        hidden_state = self.init_hidden()
        output, hidden_state = self.gru(x, hidden_state)
        output = self.fc(output[-1])
        return output
    
    def init_hidden(self):
        return torch.zeros(self.num_layers, 1, self.hidden_size).to(device)

In [48]:
# Instantiate GRU model using same training hyperparameters
model = GRUModel(num_layers=2, hidden_size=hidden_size)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [49]:
# Training loop
for epoch in range(num_epochs):
    # Shuffle data
    random.shuffle(train_dataset)
    
    for i, (name, label) in enumerate(train_dataset):
        # Zero gradients
        optimizer.zero_grad()

        # Get predictions
        output = model(name)

        # Compute loss
        loss = criterion(output, label)
        loss.backward()
        # Clip gradients
        nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()
         
        if (i + 1) % print_interval == 0:
            print(
                f"Epoch [{epoch + 1}/{num_epochs}], "
                f"Step [{i + 1}/{len(train_dataset)}], "
                f"Loss: {loss.item():.4f}"
            )

Epoch [1/2], Step [3000/18063], Loss: 0.1305
Epoch [1/2], Step [6000/18063], Loss: 0.0002
Epoch [1/2], Step [9000/18063], Loss: 3.5073
Epoch [1/2], Step [12000/18063], Loss: 6.0814
Epoch [1/2], Step [15000/18063], Loss: 0.0925
Epoch [1/2], Step [18000/18063], Loss: 0.0021
Epoch [2/2], Step [3000/18063], Loss: 0.2745
Epoch [2/2], Step [6000/18063], Loss: 0.0004
Epoch [2/2], Step [9000/18063], Loss: 0.0005
Epoch [2/2], Step [12000/18063], Loss: 0.0001
Epoch [2/2], Step [15000/18063], Loss: 0.0003
Epoch [2/2], Step [18000/18063], Loss: 0.8954


In [50]:
num_correct = 0

model.eval()
with torch.no_grad():
    for name, label in test_dataset:
        output = model(name)
        _, pred = torch.max(output, dim=1)
        num_correct += int(pred == label)

print(f"Accuracy: {num_correct / num_samples * 100:.4f}%")

Accuracy: 79.7708%


In [51]:
def gru_predict(name):
    model.eval()
    tensor_name = name_to_tensor(name)
    with torch.no_grad():
        output = model(tensor_name)
        _, pred = torch.max(output, dim=1)
    model.train()
    return LABEL_TO_LANGUAGE[pred.item()]

In [52]:
print("Jeffrey:", gru_predict("Jeffrey"))
print("Qin:", gru_predict("Qin"))
print("Slaveya:", gru_predict("Slaveya"))
print("Michael:", gru_predict("Michael"))
print("Vladimir:", gru_predict("Vladimir"))
print("Stavrios:", gru_predict("Stavrios"))

Jeffrey: English
Qin: Chinese
Slaveya: Italian
Michael: Irish
Vladimir: Russian
Stavrios: Greek
