# **NATIONALITY PREDICTION**

The goal of this notebook is to create a model that can predict nationalities from name strings.

In [1]:
import os
import pycountry
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
from unidecode import unidecode

# Pick the compute backend in order of preference: CUDA GPU, then Apple's
# Metal backend (MPS), and finally the CPU as the universal fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


### **IMPORT DATA**

- Import data from selected (todo: all) CSV files
- Use n (todo: all) samples from imported dataframe
- Concatenate dataframes, generate 'name' and 'nationality' columns
- Create vocabularies for inputs and outputs:
  - The input vocabulary maps characters to integer indices and vice versa
  - The output vocabulary maps nationalities to integer indices and vice versa

In [7]:
# Total number of names to draw; the dataset class splits this budget evenly
# across the selected countries.
n_samples = 70_000

# ISO 3166-1 alpha-2 codes of the countries to load (one CSV file per code).
country_codes = [
    'MX',
    'RU',
    'NO',
    'IT',
    'HK',
    'AE',
    'GB',
]
# TODO: load every country that has a CSV file instead of a hand-picked subset:
#country_codes = [c.split('.')[0] for c in os.listdir('./data/')]

In [96]:
class NameNationalityData(Dataset):
    """Name/nationality dataset.

    Reads one CSV file per country code, samples an equal number of rows from
    each, joins forename and surname into a single ``name`` string and exposes
    the result as padded character-index tensors (inputs) and one-hot encoded
    country labels (targets).

    Index 0 is reserved as the padding index in both vocabularies, so real
    characters and real labels are numbered from 1.
    """

    def __init__(self, n_samples, country_codes, random_state=None, data_dir='./data/'):
        """
        Loads, samples and encodes the data.

        Parameters
        ----------
        n_samples : int
            Total number of records to use. Each country contributes
            ``n_samples // len(country_codes)`` records.
        country_codes : list of str
            Country codes; a file ``<data_dir>/<code>.csv`` is read for each.
        random_state : int or None, optional
            Forwarded to ``DataFrame.sample`` so that the drawn subset can be
            made reproducible. ``None`` (default) keeps the sampling unseeded.
        data_dir : str, optional
            Directory containing the per-country CSV files.

        Raises
        ------
        ValueError
            Raised by pandas if a file has fewer usable rows than requested.
        """
        self.padding_index = 0
        samples_per_country = n_samples // len(country_codes)
        dfs = []

        for alpha2 in country_codes:
            tmp = pd.read_csv(
                os.path.join(data_dir, alpha2 + '.csv'),
                index_col=False,
                header=None,
                names=['forename', 'surname', 'gender', 'alpha2'],
                dtype={'forename':'string', 'surname':'string', 'gender':'string', 'alpha2':'string'}
            )
            l = len(tmp)
            # gender is not used downstream, so missing gender values are tolerated
            tmp = tmp.dropna(subset=['forename', 'surname', 'alpha2'])
            print(f'imported file: "{alpha2}.csv". number of records: {l}. dropped {l-len(tmp)} records because of missing values.')
            tmp = tmp.sample(n=samples_per_country, random_state=random_state)
            dfs.append(tmp)

        # concatenate country dataframes
        df = pd.concat(dfs).reset_index(drop=True)
        print(f'final dataset has {len(df)} records.')

        # construct name and nationality, drop unnecessary columns
        df['name'] = df['forename'] + ' ' + df['surname']
        df = df[['name', 'alpha2']]
        # plain Python int so it can be used directly as a tensor dimension
        self.maximum_name_length = int(df['name'].str.len().max())

        # generate input and output vocabularies
        (self.ctoi_input,
         self.itoc_input,
         self.input_vocabulary_length) = self._generate_input_vocabulary(
             df['name'].to_list()
        )
        (self.ctoi_output,
         self.itoc_output,
         self.output_vocabulary_length) = self._generate_output_vocabulary(
             df['alpha2'].unique()
        )

        # encode inputs as padded index tensors
        (self.X,
         self.sequence_lengths) = self._encode_input(
             df['name'].to_list()
        )
        # encode outputs as one-hot vectors
        self.y = self._encode_output(
            df['alpha2'].to_list()
        )


    def _generate_input_vocabulary(self, names_list):
        """
        Builds a vocabulary of unique characters from the provided list of names.
        Each character is assigned an integer index, starting at 1 so that 0 can be
        used as a padding index. This method also prints out the generated vocabulary.

        Parameters
        ----------
        names_list : list of str
            A list of names from which the character vocabulary is constructed.

        Returns
        -------
        tuple of (dict, dict, int)
            - ctoi_input : dict
                Mapping from character to integer index.
            - itoc_input : dict
                Mapping from integer index back to character.
            - input_vocabulary_length : int
                The number of unique characters in the input vocabulary
                (the padding index is not counted).
        """
        input_vocabulary = sorted(set(''.join(names_list)))
        input_vocabulary_length = len(input_vocabulary)
        print(f"Input vocabulary of length {len(input_vocabulary)}:\n{''.join(input_vocabulary)}")
        ctoi_input = {c:i for i, c in enumerate(input_vocabulary, 1)} # start enumeration at 1 because 0 is padding index
        itoc_input = {i:c for c, i in ctoi_input.items()}
        return ctoi_input, itoc_input, input_vocabulary_length

    def _generate_output_vocabulary(self, alpha2_list):
        """
        Builds a sorted vocabulary of unique output labels (country codes), then
        creates mappings from label to integer index (starting at 1 so 0 stays
        reserved, mirroring the input vocabulary) and from integer index back to
        the label. This method also prints out the generated vocabulary.

        Parameters
        ----------
        alpha2_list : iterable of str
            Country codes to include in the output vocabulary. Duplicates are
            collapsed, so passing a full label column is safe.

        Returns
        -------
        tuple of (dict, dict, int)
            - ctoi_output : dict
                Mapping from country code string to integer index.
            - itoc_output : dict
                Mapping from integer index back to the country code string.
            - output_vocabulary_length : int
                The number of unique output labels in the vocabulary
                (the reserved index 0 is not counted).
        """
        # de-duplicate first: repeated labels would otherwise inflate the class
        # count and leave gaps in the index mapping
        output_vocabulary = sorted(set(alpha2_list))
        output_vocabulary_length = len(output_vocabulary)
        print(f'Output vocabulary of length {len(output_vocabulary)}:\n{output_vocabulary}')
        ctoi_output = {c:i for i, c in enumerate(output_vocabulary, 1)} # start enumeration at 1 because 0 is reserved
        itoc_output = {i:c for c, i in ctoi_output.items()}
        return ctoi_output, itoc_output, output_vocabulary_length

    def _encode_input(self, seq):
        """
        Encodes a single string or a list of strings into integer indices based on
        `self.ctoi_input`, replacing unmapped characters with `self.padding_index`.
        Each encoded sequence is padded (or truncated) to `self.maximum_name_length`.

        Parameters
        ----------
        seq : str or list of str
            The input string(s) to be converted.

        Returns
        -------
        (torch.Tensor, torch.Tensor)
            - padded_tensors : torch.Tensor
                An int32 batch-first tensor of shape
                (batch_size, self.maximum_name_length) containing the padded
                integer-encoded sequences.
            - sequence_lengths : torch.Tensor
                An int32 tensor of shape (batch_size,) with the number of valid
                positions in each row, i.e. the original length capped at
                `self.maximum_name_length`.
        """
        assert isinstance(seq, (str, list)), "Input must be string or list of strings"
        if isinstance(seq, str):
            seq = [seq] # wrap single string into a list of a single string
        encoded_input = []
        for s in seq:
            assert isinstance(s, str), "Each element in the list must be a string"
            # Convert each character to an index, defaulting to padding_index if not found
            encoded_input.append([self.ctoi_input.get(char, self.padding_index) for char in s])
        # Report the length that is actually stored: a truncated sequence must not
        # claim more steps than the padded tensor holds, otherwise packing the
        # batch downstream fails.
        sequence_lengths = torch.tensor(
            [min(len(encoding), self.maximum_name_length) for encoding in encoded_input],
            dtype=torch.int32
        )
        # create tensor pre-filled with the padding index
        batch_size = len(encoded_input)
        padded_tensors = torch.full(
            (batch_size, self.maximum_name_length),
            self.padding_index,
            dtype=torch.int32
        )
        # fill tensor with actual data
        for i, encoding in enumerate(encoded_input):
            # truncate if the sequence is longer than maximum_name_length
            max_len = min(len(encoding), self.maximum_name_length)
            padded_tensors[i, :max_len] = torch.tensor(encoding[:max_len], dtype=torch.int32)
        return padded_tensors, sequence_lengths

    def _decode_input(self, seq_tensor):
        """
        Decodes a 1D or 2D tensor of integer indices into strings using the
        `self.itoc_input` mapping. Indices without a mapping (including the
        padding index) are dropped from the result.

        - If `seq_tensor` is 1D (shape: [N]), it decodes a single sequence.
        - If `seq_tensor` is 2D (shape: [B, N]), it decodes one sequence per row.

        Parameters
        ----------
        seq_tensor : torch.Tensor
            A 1D or 2D tensor of integer indices.

        Returns
        -------
        str or list of str
            - If `seq_tensor` is 1D, the decoded string.
            - If `seq_tensor` is 2D, a list with one decoded string per row.

        Raises
        ------
        TypeError
            If `seq_tensor` is not a torch.Tensor.
        ValueError
            If `seq_tensor` is neither 1D nor 2D.
        """
        if not isinstance(seq_tensor, torch.Tensor):
            raise TypeError("seq_tensor must be a torch.Tensor of integer indices.")
        if seq_tensor.dim() == 1:
            return ''.join([self.itoc_input.get(int(idx), '') for idx in seq_tensor])
        elif seq_tensor.dim() == 2:
            return [
                ''.join([self.itoc_input.get(int(idx), '') for idx in row])
                for row in seq_tensor
            ]
        else:
            raise ValueError("seq_tensor must be a 1D or 2D tensor of integer indices.")

    def _encode_output(self, country_code):
        """
        Encodes a single country code string or a list of strings into integer indices
        using `self.ctoi_output`, then converts them to a one-hot representation. Any
        code not found in `self.ctoi_output` is mapped to `self.padding_index`.

        Parameters
        ----------
        country_code : str or list of str
            The country code(s) to encode.

        Returns
        -------
        torch.Tensor
            A one-hot encoded tensor (dtype=torch.float64) of shape
            (num_items, self.output_vocabulary_length + 1),
            where 'num_items' is 1 if `country_code` is a single string,
            or len(country_code) if it is a list.
        """
        assert isinstance(country_code, (str, list)), 'Input must be string or list of strings'
        if isinstance(country_code, str):
            country_code = [country_code]
        encoded_output = []
        for c in country_code:
            assert isinstance(c, str), 'Input must be string or list of strings'
            encoded_output.append(self.ctoi_output.get(c, self.padding_index))
        index_tensors = torch.tensor(encoded_output, dtype=torch.int64)
        # +1 because index 0 is reserved and real labels start at 1
        encoded_tensors = F.one_hot(index_tensors, num_classes=self.output_vocabulary_length+1).to(torch.float64)
        return encoded_tensors

    def _decode_output(self, country_code_tensor):
        """
        Decodes one-hot (or score) vectors into country code strings by taking the
        argmax of each vector and looking it up in `self.itoc_output`. If the
        resulting index is not in the mapping, 'Unknown' is returned for it.

        Parameters
        ----------
        country_code_tensor : torch.Tensor
            A 1D tensor holding a single vector of per-class values, or a 2D
            tensor with one such vector per row.

        Returns
        -------
        list of str
            A one-element list if `country_code_tensor` is 1D, otherwise a list
            with one decoded label per row.

        Raises
        ------
        TypeError
            If `country_code_tensor` is not a torch.Tensor.
        ValueError
            If `country_code_tensor` is neither 1D nor 2D.
        """
        if not isinstance(country_code_tensor, torch.Tensor):
            raise TypeError("country_code_tensor must be a torch.Tensor of integer indices.")
        if country_code_tensor.dim() == 1:
            return [self.itoc_output.get(torch.argmax(country_code_tensor).item(), 'Unknown')]
        elif country_code_tensor.dim() == 2:
            return [
                self.itoc_output.get(torch.argmax(encoding).item(), 'Unknown')
                for encoding in country_code_tensor
            ]
        else:
            raise ValueError("country_code_tensor must be a 1D or 2D tensor of integer indices.")

    def __len__(self):
        """Returns the number of encoded names in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Returns the tuple (padded input indices, one-hot label, sequence length) for `idx`."""
        X = self.X[idx]
        y = self.y[idx]
        sequence_length = self.sequence_lengths[idx]
        return (X, y, sequence_length)


In [97]:
dataset = NameNationalityData(n_samples=n_samples, country_codes=country_codes)

# Seed the split so the same records land in the training and validation sets
# on every run; otherwise validation losses are not comparable between runs.
split_generator = torch.Generator().manual_seed(42)
train_set, val_set = torch.utils.data.random_split(
    dataset, [0.8, 0.2], generator=split_generator
)

train_dataloader = DataLoader(train_set, batch_size=1024, shuffle=True, drop_last=True)
# The whole validation set is served as one batch, so shuffling or dropping a
# trailing partial batch has no effect and is left off.
val_dataloader = DataLoader(val_set, batch_size=len(val_set), shuffle=False)

# Vocabulary sizes (excluding the reserved index 0), kept as module-level names.
input_vocabulary_length = dataset.input_vocabulary_length
output_vocabulary_length = dataset.output_vocabulary_length

imported file: "MX.csv". number of records: 13330219. dropped 43260 records because of missing values.
imported file: "RU.csv". number of records: 9992686. dropped 17069 records because of missing values.
imported file: "NO.csv". number of records: 475782. dropped 1850 records because of missing values.
imported file: "IT.csv". number of records: 35554357. dropped 69126 records because of missing values.
imported file: "HK.csv". number of records: 2846829. dropped 293650 records because of missing values.
imported file: "AE.csv". number of records: 6792773. dropped 54115 records because of missing values.
imported file: "GB.csv". number of records: 11519228. dropped 24981 records because of missing values.
final dataset has 70000 records.
Input vocabulary of length 730:
 -.ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyzÀÁÂÃÄÅÈÉÍÎÏÑÓÖØÜÞàáâãäåæçèéêëìíîïðñòóôõöøùúûüýþāăąĆćčďĐđēėęĞğĥĨĩīįİıĶĹĺļľŁłńņňőŕřŚśŞşŠšŤťũŪūųŸŹźżŽžǎȘțɞʚʬ̃ΒΕΜΝαειλνοπστυχύЄІАБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЫЭЮЯабвгдежзий

### **MODELING**

- Create a simple model using character embeddings, RNN layers and a dense layer (todo: dropout, weight initialization)
- Find the best initial learning rate
- Get a baseline cross-entropy loss
- Get a model to overfit sample data
- Train/validation split

In [5]:
class RNN_Nationality_Predictor(nn.Module):
    """Character-level RNN classifier that maps a name to nationality scores.

    Architecture: 32-dimensional character embedding (index 0 = padding) ->
    3-layer RNN with hidden size 64 and dropout 0.3 between layers -> linear
    layer producing one score per output class.
    """

    def __init__(self, input_vocabulary_length=None, output_vocabulary_length=None):
        """
        Parameters
        ----------
        input_vocabulary_length : int or None, optional
            Number of distinct input characters, excluding the padding index.
            Defaults to `dataset.input_vocabulary_length` of the notebook-level
            `dataset` object.
        output_vocabulary_length : int or None, optional
            Number of distinct output labels, excluding the reserved index 0.
            Defaults to `dataset.output_vocabulary_length`.
        """
        super().__init__()
        # Fall back to the notebook-level dataset only when no size is given, so
        # the model can also be built without that global being defined.
        if input_vocabulary_length is None:
            input_vocabulary_length = dataset.input_vocabulary_length
        if output_vocabulary_length is None:
            output_vocabulary_length = dataset.output_vocabulary_length

        self.embed = nn.Embedding(
            num_embeddings=input_vocabulary_length+1,  # +1 for the padding index 0
            embedding_dim=32,
            padding_idx=0
        )
        self.rnn = nn.RNN(
            input_size=32,
            hidden_size=64,
            num_layers=3,
            batch_first=True,
            dropout=0.3
        )
        self.dense = nn.Linear(
            in_features=64,
            out_features=output_vocabulary_length+1,  # +1 for the reserved index 0
        )

    def forward(self, X, lengths):
        """
        Parameters
        ----------
        X : torch.Tensor
            Batch-first integer tensor of padded character indices.
        lengths : torch.Tensor or list of int
            Number of valid (non-padding) positions in each row of `X`.

        Returns
        -------
        (torch.Tensor, torch.Tensor)
            - logits : unnormalised class scores, one row per input sequence.
            - output : softmax of `logits` over the class dimension, so each
              row sums to 1.
        """
        embeddings = self.embed(X)

        # pack_padded_sequence needs the lengths on the CPU even when the
        # inputs live on an accelerator
        if isinstance(lengths, torch.Tensor):
            lengths = lengths.cpu()

        # Pack the padded batch so the RNN skips the padding positions
        packed = pack_padded_sequence(
            embeddings,
            lengths=lengths,
            batch_first=True,
            enforce_sorted=False
        )
        _, hidden = self.rnn(packed)
        # hidden[-1] is the final hidden state of the last RNN layer
        logits = self.dense(hidden[-1])
        # normalise over classes (last dim), not over the batch (dim 0)
        output = F.softmax(logits, dim=-1)
        return logits, output

In [6]:
# Build the network and move its parameters to the selected device.
model = RNN_Nationality_Predictor()
model = model.to(device)

# Loss: binary cross-entropy on raw logits, applied to the one-hot targets.
criterion = F.binary_cross_entropy_with_logits

# AdamW over all model parameters with a fixed learning rate.
learning_rate = 3e-4
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

In [7]:
# Train for a fixed number of epochs, reporting the running training loss and
# the loss on the full validation set every `eval_every_n_batches` batches.
losses = []
eval_every_n_batches = 100
for epoch in range(10):
    print('-'*25 + f' EPOCH {epoch+1} ' + '-'*25)
    for batch, (X, y, sequence_lengths) in enumerate(train_dataloader):
        model.train()
        # Cast targets to float32 before the transfer: the dataset stores them
        # as float64, which the MPS backend cannot hold, and the logits are
        # float32 anyway.
        X, y = X.to(device), y.float().to(device)
        # sequence_lengths deliberately stays on the CPU for sequence packing
        logits, _ = model(X, sequence_lengths)
        loss = criterion(logits, y)
        losses.append(loss.item())
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % eval_every_n_batches == 0:
            model.eval()
            # No gradients are needed for evaluation; skipping the autograd
            # graph saves memory on the single, full-size validation batch.
            with torch.no_grad():
                # separate names so the training batch is not overwritten
                X_val, y_val, val_sequence_lengths = next(iter(val_dataloader))
                X_val, y_val = X_val.to(device), y_val.float().to(device)
                val_logits, _ = model(X_val, val_sequence_lengths)
                val_loss = criterion(val_logits, y_val).item()
            print(f'mean training loss over last {eval_every_n_batches} batches: {np.mean(losses[-eval_every_n_batches:]):.5f} --- validation loss: {val_loss:.5f}')

------------------------- EPOCH 1 -------------------------
mean training loss over last 100 batches: 0.69696 --- validation loss: 0.68780
mean training loss over last 100 batches: 0.43860 --- validation loss: 0.36159
mean training loss over last 100 batches: 0.34971 --- validation loss: 0.33065
mean training loss over last 100 batches: 0.31528 --- validation loss: 0.29598
mean training loss over last 100 batches: 0.28495 --- validation loss: 0.26570
mean training loss over last 100 batches: 0.25834 --- validation loss: 0.24234
------------------------- EPOCH 2 -------------------------
mean training loss over last 100 batches: 0.24852 --- validation loss: 0.23426
mean training loss over last 100 batches: 0.23344 --- validation loss: 0.22133
mean training loss over last 100 batches: 0.22166 --- validation loss: 0.21089
mean training loss over last 100 batches: 0.21147 --- validation loss: 0.20166
mean training loss over last 100 batches: 0.20350 --- validation loss: 0.19482
mean traini