# **NATIONALITY PREDICTION**

The goal of this notebook is to create a model that can predict nationalities from name strings.

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pack_padded_sequence
import country_converter as coco
from data_util import NameNationalityData, NameNationalityDataStream

# Pick the compute backend: CUDA if available, otherwise Apple MPS, otherwise CPU.
device: str = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

# Seed the random number generators so weight initialisation and shuffling
# are repeatable after "Restart Kernel -> Run All".
SEED: int = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

MAXIMUM_NAME_LENGTH: int = 50 # maximum number of characters
BATCH_SIZE: int = 1024 # number of training examples per batch
N_EVAL: int = 100 # evaluate loss every n batches
N_TRAINING_STEPS: int = 10000 # number of training steps

# read country codes (one code per line)
# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('./data/.country_codes', 'r', encoding='utf-8') as f:
    COUNTRY_CODES: list = f.read().splitlines()

# read vocabulary (all unique characters used in the dataset)
# Names contain non-ASCII characters, so the platform default encoding must
# not be relied on here either.
with open('./data/.vocabulary', 'r', encoding='utf-8') as f:
    VOCABULARY: str = f.read()

# generate country code mappings: each country code is mapped to its value in
# the chosen country_converter classification
target_class: str = 'UNregion' # see country_converter documentation on PyPI for available classes
COUNTRY_MAPPING: dict = {cc: coco.convert(names=cc, to=target_class) for cc in COUNTRY_CODES}

### **IMPORT DATA**

- train.csv gets streamed in chunks
- val.csv will be loaded into memory as a whole
- name strings will be encoded as integer tensors where index i maps to the i-th character in the vocabulary
- zero will be used as padding index, names longer than max_name_length will be truncated
- the tensors will have a shape of (batch_size, max_name_length)
- the dataset also generates a tensor of shape (batch_size) that holds the sequence length (number of characters) of the current name
- countries will be converted to one-hot-encoded tensors of shape (batch_size, n_countries+1) where n_countries is the number of output classes in the COUNTRY_MAPPING dictionary

In [None]:
# Training set: read from disk as a stream, 100 batches' worth of rows per
# chunk, and served to the training loop in batches of BATCH_SIZE examples.
train_data = NameNationalityDataStream(
    data_file='./data/train.csv',
    chunksize=BATCH_SIZE * 100,
    maximum_name_length=MAXIMUM_NAME_LENGTH,
    vocabulary=VOCABULARY,
    country_codes=COUNTRY_CODES,
    country_mapping=COUNTRY_MAPPING,
)
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE)

In [None]:
# Validation set: loaded into memory as a whole.
val_data = NameNationalityData(
    data_file='./data/val.csv',
    maximum_name_length=MAXIMUM_NAME_LENGTH,
    vocabulary=VOCABULARY,
    country_codes=COUNTRY_CODES,
    country_mapping=COUNTRY_MAPPING
)

# One validation batch covers as many rows as N_EVAL training batches.
# Because drop_last=True discards incomplete batches, a validation set smaller
# than that would yield no batches at all, so the batch size is capped at the
# dataset size (and kept >= 1 so the DataLoader accepts it).
val_batch_size: int = max(1, min(N_EVAL * BATCH_SIZE, len(val_data)))
val_dataloader = DataLoader(val_data, batch_size=val_batch_size, drop_last=True, shuffle=True)

### **MODELING**

- Create simple model using character embeddings, rnn layers and a dense layer
- embedding layer maps input tensor of shape (batch_size, max_name_length) to embedding tensor of shape (batch_size, max_name_length, embedding_dim)
- the embedding tensor and sequence_lengths tensor will be used to [pack a padded batch](https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html), which enables variable length inputs
- the packed sequence will be passed to the rnn layer 
- the hidden state of the last rnn layer will be passed through a dense layer to create an output of shape (batch_size, n_countries+1), where n_countries is the number of output classes in the COUNTRY_MAPPING dictionary

In [45]:
class RNN_Nationality_Predictor(nn.Module):
    """
    A PyTorch-based RNN model for predicting nationality from a name.

    This model embeds input characters and processes them using a recurrent layer,
    which can be instantiated as a vanilla RNN, GRU, or LSTM. It leverages sequence
    packing to efficiently handle variable-length inputs, and uses the final hidden state
    from the RNN to produce class logits through a dense layer.

    The embedding size is derived from the module-level ``VOCABULARY`` and the
    number of output classes from the module-level ``COUNTRY_MAPPING``; both
    must be defined before the model is instantiated.

    Parameters
    ----------
    architecture : str
        The type of RNN to use. Must be one of: 'RNN', 'GRU', or 'LSTM'.
    embedding_dim : int
        The dimension of the embedding space for input characters.
    hidden_size : int
        The number of features in the hidden state of the RNN.
    num_rnn_layers : int
        The number of recurrent layers (stacked) in the RNN.
    dropout : float
        Dropout probability applied between RNN layers. It has no effect when
        ``num_rnn_layers`` is 1, because there is no "between layers" then.

    Attributes
    ----------
    embed : nn.Embedding
        The embedding layer that converts input indices to dense vectors.
        Index 0 is the padding index.
    rnn : nn.Module
        The recurrent layer (RNN, GRU, or LSTM) that processes the embedded sequence.
    dense : nn.Linear
        A linear layer that maps the final hidden state to output logits corresponding
        to the target nationality classes.

    Raises
    ------
    NameError
        If ``architecture`` is not one of the supported values.
    """
    def __init__(self, architecture, embedding_dim, hidden_size, num_rnn_layers, dropout):
        super().__init__()

        # Resolve the recurrent layer type first so that an unsupported
        # architecture fails before any parameters are allocated.
        if architecture == 'RNN':
            rnn_constructor = nn.RNN
        elif architecture == 'GRU':
            rnn_constructor = nn.GRU
        elif architecture == 'LSTM':
            rnn_constructor = nn.LSTM
        else:
            raise NameError("architecture must be 'RNN', 'GRU' or 'LSTM'")

        # hyperparameters
        self.architecture = architecture
        self.embedding_dim = embedding_dim
        self.hidden_size = hidden_size
        self.num_rnn_layers = num_rnn_layers
        self.dropout = dropout

        # embedding layer (+1 because index 0 is reserved for padding)
        self.embed = nn.Embedding(
            num_embeddings=len(VOCABULARY)+1,
            embedding_dim=self.embedding_dim,
            padding_idx=0
        )

        # rnn layers
        # PyTorch applies dropout after every recurrent layer except the last,
        # so with a single layer a non-zero value does nothing and only emits
        # a warning; pass 0.0 in that case.
        inter_layer_dropout = self.dropout if self.num_rnn_layers > 1 else 0.0
        self.rnn = rnn_constructor(
            input_size=self.embedding_dim,
            hidden_size=self.hidden_size,
            num_layers=self.num_rnn_layers,
            dropout=inter_layer_dropout,
            batch_first=True,
        )

        # dense layer (one output per distinct mapped class, plus one extra)
        self.dense = nn.Linear(
            in_features=self.hidden_size,
            out_features=len(set(COUNTRY_MAPPING.values()))+1,
        )

    def forward(self, X, lengths):
        """
        Compute class logits for a batch of encoded names.

        Parameters
        ----------
        X : torch.Tensor
            Integer tensor of character indices, one row per name, padded
            with zeros (batch first).
        lengths : torch.Tensor or list of int
            The number of valid (non-padding) characters in each row of ``X``.

        Returns
        -------
        torch.Tensor
            Logits with one row per name and one column per output class.
        """
        embeddings = self.embed(X)

        # pack_padded_sequence requires the lengths to live on the CPU when
        # they are given as a tensor, regardless of where X is.
        if torch.is_tensor(lengths):
            lengths = lengths.cpu()

        # Pack the padded batch
        packed = pack_padded_sequence(
            embeddings,
            lengths=lengths,
            batch_first=True,
            enforce_sorted=False
        )
        if isinstance(self.rnn, nn.LSTM):
            _, (hidden, _) = self.rnn(packed) # output and cell state ignored
        else:
            _, hidden = self.rnn(packed) # output ignored

        # hidden[-1] is the final hidden state of the topmost recurrent layer
        logits = self.dense(hidden[-1])
        return logits

In [46]:
# Build a 3-layer LSTM classifier and move its parameters to the selected device.
model = RNN_Nationality_Predictor(
    architecture='LSTM',
    embedding_dim=64,
    hidden_size=128,
    num_rnn_layers=3,
    dropout=0.3,
).to(device)
model.train()

# Loss takes raw logits; optimizer updates every model parameter.
criterion = F.binary_cross_entropy_with_logits
optimizer = optim.AdamW(params=model.parameters(), lr=3e-4)

In [None]:
def train(
        n_training_steps: int,
        n_eval: int
    ) -> tuple:
    """
    Train the global ``model`` for a fixed number of optimisation steps.

    Batches are drawn from the global ``train_dataloader``; once it is
    exhausted it is iterated again until ``n_training_steps`` updates have
    been made. Every ``n_eval`` steps the loss on one batch drawn from the
    global ``val_dataloader`` is computed and a progress line is printed.

    Relies on the globals ``model``, ``criterion``, ``optimizer``, ``device``,
    ``train_dataloader`` and ``val_dataloader``.

    Parameters
    ----------
    n_training_steps : int
        Number of optimiser updates to perform.
    n_eval : int
        Evaluate on the validation data every ``n_eval`` training batches.

    Returns
    -------
    tuple
        ``(batch_number, losses, val_losses)``: the number of training
        batches processed, the training loss of every batch, and the
        validation loss of every evaluation.

    Raises
    ------
    RuntimeError
        If ``train_dataloader`` yields no batches (training could never finish).
    """
    # Counts completed steps. Starting at 0 makes the first batch number 1, so
    # exactly n_training_steps updates run and the first report really covers
    # n_eval batches.
    batch_number: int = 0
    losses: list = []
    val_losses: list = []

    while True:
        batches_in_pass: int = 0
        for X, y, sequence_lengths in train_dataloader:
            batches_in_pass += 1
            batch_number += 1

            # --- training step ---
            model.train() # evaluation below switches to eval mode
            X, y = X.to(device), y.to(device)
            logits = model(X, sequence_lengths) # lengths are passed as delivered by the loader
            loss = criterion(logits, y)
            losses.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # --- periodic evaluation ---
            if batch_number % n_eval == 0:
                model.eval()
                with torch.no_grad():
                    # A fresh iterator is created each time, so this takes the
                    # first batch the validation loader produces.
                    X_val, y_val, val_sequence_lengths = next(iter(val_dataloader))
                    X_val, y_val = X_val.to(device), y_val.to(device)
                    val_logits = model(X_val, val_sequence_lengths)
                    val_loss = criterion(val_logits, y_val).item()
                val_losses.append(val_loss)
                print(f'batch {batch_number} --- mean training loss over last {n_eval} batches: {np.mean(losses[-n_eval:]):.5f} --- validation loss: {val_loss:.5f}')

            if batch_number >= n_training_steps:
                return batch_number, losses, val_losses

        # Without this guard an empty loader would make the outer loop spin forever.
        if batches_in_pass == 0:
            raise RuntimeError("train_dataloader yielded no batches")
            
# Run training with the configured step budget and evaluation interval.
batch_number, losses, val_losses = train(
    n_training_steps=N_TRAINING_STEPS,
    n_eval=N_EVAL,
)

In [None]:
# Test names for all 19 regions
# Keys are the expected class labels; they are compared for exact string
# equality with the labels returned by train_data._decode_country.
# NOTE(review): confirm the spelling/casing of these labels matches the values
# in COUNTRY_MAPPING (e.g. "South-Eastern Asia", "Oceania") - TODO verify.
test_names = {
    # Africa
    "Northern Africa": "Abdel Fattah el-Sisi", # Egypt
    "Middle Africa": "João Lourenço", # Angola
    "Western Africa": "Bola Ahmed Tinubu", # Nigeria
    "Eastern Africa": "Taye Atske Selassie", # Ethiopia
    "Southern Africa": "Cyril Ramaphosa", # South Africa

    # Asia
    "Central Asia": "Qassym-Schomart Kemeluly Toqajew", # Kazakhstan
    "Eastern Asia": "Xi Jinping", # China
    "South-Eastern Asia": "Prabowo Subianto", # Indonesia
    "Southern Asia": "Droupadi Murmu", # India
    "Western Asia": "Recep Tayyip Erdoğan", # Turkey

    # Europe
    "Northern Europe": "Ulf Kristersson", # Sweden
    "Western Europe": "Olaf Scholz", # Germany
    "Southern Europe": "Giorgia Meloni", # Italy
    "Eastern Europe": "Andrzej Sebastian Duda", # Poland

    # Americas
    "Northern America": "Donald Trump", # United States
    "Central America": "Andrés Manuel López Obrador", # Mexico
    "Caribbean": "Andrew Holness", # Jamaica
    "South America": "Luiz Inácio Lula da Silva", # Brazil

    # Oceania
    "Oceania": "Anthony Albanese" # Australia
}

# run test on test names
model.eval()
tensor, length = train_data._encode_name(list(test_names.values()))
tensor = tensor.to(device)
# Inference only: no gradients are needed, so skip building the autograd graph.
with torch.no_grad():
    logits = model(tensor, length)
countries_list = train_data._decode_country(logits)
# Predictions keyed by name (the names above are all distinct).
preds = dict(zip(test_names.values(), countries_list))

# define column widths
name_width = 40
actual_width = 20
predicted_width = 20
correct_width = 10

# print output header
header = f"{'Name':<{name_width}} {'Actual Class':<{actual_width}} {'Predicted Class':<{predicted_width}} {'Correct?':<{correct_width}}"
print(header)
# The rule spans the full header, including the spaces between columns.
print("-" * len(header))

# loop through test names and format outputs
total = 0
correct_count = 0
for actual_class, name in test_names.items():
    predicted_class = preds.get(name, "N/A")
    is_correct = predicted_class == actual_class
    correct_str = "Yes" if is_correct else "No"
    if is_correct:
        correct_count += 1
    total += 1
    row = f"{name:<{name_width}} {actual_class:<{actual_width}} {predicted_class:<{predicted_width}} {correct_str:<{correct_width}}"
    print(row)
accuracy = (correct_count / total) * 100
print(f'\nAccuracy: {accuracy:.2f}%')