In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm_notebook

from argparse import Namespace
import os
import collections
from collections import Counter

import string

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
##### 'TRAIN' a new model or 'LOAD' an existing model 
get_model = 'TRAIN'

# @@@@@ 0. Settings

In [3]:
args = Namespace(
    # Training or loading
    get_model   = get_model,
    
    # Data and Path information
    input_path  = os.getcwd(),
    output_path = os.getcwd()+'/OUTPUT/',
    
    # Training hyperparameters
    learning_rate   = 0.001,
    batch_size      = 128,
    hidden_dim      = 300,
    num_channels    = 256,
    device          = 'cpu',
    num_epochs      = 100,
    dropout_p       = 0.1,
    early_stopping_criteria = 5
)

# @@@@@ 1. Data Preparation
## The data preparation part is to perform a text-to-vectorized-minibatch pipeline: converting text inputs to vectorized minibatches.
- ### Step 1: Creating a Vocabulary - mapping each token (characters in the context of surnames) in the surname data to a numerical version of itself.
- ### Step 2: Vectorization - going from a text dataset to a vector. The Vectorizer turns different surnames to vectors of integers with the same length.
- ### Step 3: Group the vectorized data points into batches.

## 1.1 - Read Data
### **Train partition**: a dataset to derive the model parameters
### **Valid partition**: a dataset for selecting among hyperparameters and making modeling decisions
### **Test partition**: a dataset for final evaluation and reporting

In [4]:
df_all = pd.read_csv('surnames_with_splits.csv')
df_all.sample(5)

Unnamed: 0,nationality,nationality_index,split,surname
10370,Russian,13,test,Vertegel
6393,Greek,4,test,Dasios
2437,Dutch,2,test,Richard
7538,Japanese,7,train,Shimada
5851,German,9,train,Schröder


In [5]:
Crosstab = pd.crosstab(df_all['nationality'], df_all['split'])
Crosstab['total'] = Crosstab.sum(axis=1)
Crosstab.loc['Total'] = Crosstab.sum(axis=0)
Crosstab

split,test,train,val,total
nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Arabic,241,1122,240,1603
Chinese,33,154,33,220
Czech,63,289,62,414
Dutch,36,165,35,236
English,447,2080,445,2972
French,35,160,34,229
German,87,403,86,576
Greek,24,109,23,156
Irish,28,128,27,183
Italian,90,420,90,600


## 1.2 - The Vocabulary class
### [A walkthrough of codes](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Surname_Nationality/class_Vocabulary.ipynb)
### - Creating a mapping between the tokens and integers, in terms of dictionaries. To make this mapping reversible, create two dictionaries, one is from-token-to-index, one is from-index-to-token. Then encapsulate this mapping (bijection) into a Vocabulary class.
### - By using the UNK token, we can handle tokens at test time that were never seen in training.
### - Restricting infrequent tokens from the Vocabularies with a pre-specified parameter cut_off. This is essential in limiting the memory used by the Vocabulary class. 
### - Expected behaviors:
#### (1) add_token(): to add new tokens to the Vocabulary
#### (2) lookup_token(): to retrieve the index for a token
#### (3) lookup_index(): to retrieve the token corresponding to a specific index.
### - The Vocabulary objects will be used in the Vectorization step (discussed next). 

In [6]:
class Vocabulary(object):
    """Class to process text and extract vocabulary for mapping"""
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk   = add_unk
        self._unk_token = unk_token      
        self.unk_index  = -999
        ### the unk_token, i.e, "<UNK>" is the first added token if add_unk=True
        ### self.unk_index is changed from -999 to 0
        if add_unk:
            self.unk_index = self.add_token(unk_token) 

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
   
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            ### .get(): return self.unk_index if the key "token" does not exist. 
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    
    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def __len__(self):
        return len(self._token_to_idx)
    

## 1.3 - Vectorization
### [A walkthrough of codes](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Surname_Nationality/class_Vectorizer_matrix_of_one_hots.ipynb)
### - The class method **from_dataframe()** is used to instantiate a SurnameVectorizer object from a dataframe.
### - The SurnameVectorizer encapsulates the nationality vocabulary (nationality_vocab) and the surname vocabulary (surname_vocab).
### - The surname_vocab will be used as the reference for one-hot representation. 
### - The class method **vectorize()** is the core functionality of the Vectorizer. It takes as an argument a string representing a surname and returns a matrix of one-hots representation for the surname. The size of the matrix is (represents the number of tokens in surname_vocab, the length of the longest surname). 

In [7]:
class SurnameVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, surname_vocab, nationality_vocab,max_surname_length):
        """
        Args:
            surname_vocab (Vocabulary): maps characters to integers
            nationality_vocab (Vocabulary): maps nationalities to integers
            max_surname_length (int): the length of the longest surname
        """
        self.surname_vocab       = surname_vocab
        self.nationality_vocab   = nationality_vocab
        self._max_surname_length = max_surname_length
         
    @classmethod
    def from_dataframe(cls, surname_df):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            surname_df (pandas.DataFrame): the surnames dataset
        Returns:
            an instance of the SurnameVectorizer
        """
        surname_vocab      = Vocabulary(add_unk=True, unk_token="@")
        nationality_vocab  = Vocabulary(add_unk=False)
        # initialzed value of max_surname_length, updated in the for loop below 
        max_surname_length = 0
        
        ########## Add tokens to surname_vocab and nationality_vocab
        for index, row in surname_df.iterrows():
            # update max_surname_length
            max_surname_length = max(max_surname_length, len(row.surname))
            
            # Add tokens(characters) to surname_vocab
            for letter in row.surname:
                surname_vocab.add_token(letter)
            # Add tokens(words) to nationality_vocab
            nationality_vocab.add_token(row.nationality)

        return cls(surname_vocab, nationality_vocab, max_surname_length)

    ### This is the key functionality of the Vectorizer.
    ### It takes as an argument a string representing a surname, 
    ### and the length of the longest surname, and returns a vectorized 
    ### representation of the surname.
    def vectorize(self, surname):
        """
        Create a matrix of one-hots representation for the surname
        The number of rows: the longest 
        
        Args:
            surname (str): the surname 
        Returns:
            one_hot_matrix (np.ndarray): a matrix of one-hot vectors
        """
        ### Create a matrix with size (len(self.surname_vocab),self._max_surname_length)
        ### len(self.surname_vocab) represents the number of tokens in surname_vocab
        ### self._max_surname_length represents the length of the longest surname.

        ### Run lookup_token() for each character in the surname sequentially, return an index
        ### Assign the corresponding element in the matrix to 1.
        one_hot_matrix_size = (len(self.surname_vocab), self._max_surname_length)
        one_hot_matrix      = np.zeros(one_hot_matrix_size, dtype=np.float32)
                               
        for position_index, character in enumerate(surname):
            character_index = self.surname_vocab.lookup_token(character)
            one_hot_matrix[character_index][position_index] = 1
        return one_hot_matrix

## 1.4 - Batches
### [A walkthrough of codes](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Surname_Nationality/batch_generator.ipynb)

### - Group the vectorized data points into batches. 
### - The grouping is conducted throught the built in class **[DataLoader](https://pytorch.org/docs/stable/data.html)** in PyTorch. 
### - The class SurnameDataset inherits from the [**Dataset**](https://pytorch.org/vision/0.16/datasets.html) class. Instances of the derived class **SurnameDataset** can then be used with data loading tools like **DataLoader()** for efficient batch loading during model training.
### - The methods \_\_len\_\_(), and \_\_getitem\_\_() are defined in class **SurnameDataset** - these magic functions are expected by the **DataLoader()**. An object equipped with \_\_len\_\_() can be passed to the len() Python build-in function. For objects equipped with \_\_getitem\_\_() we can use the standard subscript for indexing tuples and lists to access individual items. 
### - The **DataLoader()** function utilizes the return results of the \_\_getitem\_\_() method in the dataset to construct batches of data. In each iteration, **DataLoader()** calls the \_\_getitem\_\_() method of the dataset to retrieve a sample, and then combines these samples into a batch. 
### - In **DataLoader()**, the \_\_getitem\_\_() method uses an index generated by the **Sampler** object. The **Sampler** is responsible for determining the indices of samples in each batch. This index may be generated sequentially or randomly, depending on the setting of the shuffle parameter.
### - Define a batch generator function that wraps the DataLoader and switch the data between the CPU and the GPU.

In [8]:
class SurnameDataset(Dataset):
    def __init__(self,surname_df,vectorizer):
        self.surname_df  = surname_df
        self._vectorizer = vectorizer
        
        self.train_df    = self.surname_df[self.surname_df.split=='train']
        self.train_size  = len(self.train_df)

        self.val_df      = self.surname_df[self.surname_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df     = self.surname_df[self.surname_df.split=='test']
        self.test_size   = len(self.test_df)
        
        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val'  : (self.val_df, self.validation_size),
                             'test' : (self.test_df, self.test_size)}
        self.set_split('train')
        
        # Class weights - Assign a weight to each class that inversely proportional to its frequency.
        class_counts = surname_df.nationality.value_counts().to_dict()
        def sort_key(item):
            return self._vectorizer.nationality_vocab.lookup_token(item[0])
        sorted_counts = sorted(class_counts.items(), key=sort_key)
        frequencies = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(frequencies, dtype=torch.float32)
        
    @classmethod
    def load_csv_and_make_vectorizer(cls,surname_csv):
        """Load dataset and make a new vectorizer from scratch
        Args:
            surname_csv (str): location of the dataset
        Returns:
            an instance of SurnameDataset
        """
        surname_df = pd.read_csv(surname_csv)
        ### make vectorizer using training dataset
        train_surname_df = surname_df[surname_df.split=='train']
        new_vectorizer  = SurnameVectorizer.from_dataframe(train_surname_df)
        return cls(surname_df,new_vectorizer)
    
    @classmethod
    def load_df_and_make_vectorizer(cls,surname_df):
        """Load dataset and make a new vectorizer from scratch
        Args:
            surname_df: dataset
        Returns:
            an instance of SurnameDataset
        """
        ### make vectorizer using training dataset
        train_surname_df = surname_df[surname_df.split=='train']
        new_vectorizer  = SurnameVectorizer.from_dataframe(train_surname_df)
        return cls(surname_df,new_vectorizer)
    
    def set_split(self, split="train"):
        """ selects the splits in the dataset using a column in the dataframe 
        Args:
            split (str): one of "train", "val", or "test"
        """
        self._target_split = split
        ### when split = 'train', _target_df means the training set
        self._target_df, self._target_size = self._lookup_dict[split]
        
    def __len__(self):
        ### _target_size is defined in set_split() 
        return self._target_size        
        
    def __getitem__(self, index):
        """the primary entry point method for PyTorch datasets
        
        Args:
            index (int): the index to the data point 
        Returns:
            a dictionary holding the data point's features (x_data) and label (y_target)
        """
        
        row = self._target_df.iloc[index]

        surname_matrix = \
            self._vectorizer.vectorize(row.surname)

        nationality_index = \
            self._vectorizer.nationality_vocab.lookup_token(row.nationality)

        return {'x_data': surname_matrix,
                'y_target': nationality_index}

    def get_vectorizer(self):
        """ returns the vectorizer """
        return self._vectorizer
    
    def get_num_batches(self, batch_size):
        """Given a batch size, return the number of batches in the dataset
        Args:
            batch_size (int)
        Returns:
            number of batches in the dataset
        """
        return len(self) // batch_size  

### Generator Function

In [9]:
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device='cpu'):
    """
    A generator function which wraps the PyTorch DataLoader. It will 
      ensure each tensor is on the write device location.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

### An example of one data batch

In [10]:
df_sample      = df_all.sample(100,random_state=100)
dataset_sample = SurnameDataset.load_df_and_make_vectorizer(df_sample)
batch_size     = 10
shuffle        = True
drop_last      = True
dataloader     = DataLoader(dataset=dataset_sample, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)
one_batch = next(iter(dataloader))
print('x in one batch')
print(one_batch['x_data'])
print('size of x_data:', one_batch['x_data'].shape)
print('-' * 60)
print('y in one batch')
print(one_batch['y_target'])
print('size of y_data:', one_batch['y_target'].shape)

x in one batch
tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
       

# @@@@@ 2. Model / Optimizer / Loss
## 2.1 - The model (CNN) and activate function (Exponential Linear Unit, ELU)
### - The **SurnameClassifier** inherits from PyTorch’s **Module** and creates CNN classifier. The model is created using [nn.Sequential()](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html).
### - Some details about convolutional layers is discussed [in this study](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Surname_Nationality/convolutional_layer.ipynb). 
### - In PyTorch, the **nn.Module** class implements the **\_\_call\_\_** method, enabling model instances to be invoked like functions. Calling an instance from nn.Module with a set of arguments ends up calling a method named forward with the same argument. The forward function executes the forward computation, while **\_\_call\_\_** does other important chores before and after calling forward. In general, the correct way to call the module as a function is to use **Classifier(input)**, rather than **Classifier.forward(input)**, although they will produce the same outputs (silient errors, since there are steps not called properly if just using **forward(...)** directly).
### - The **forward()** method allows for the softmax function (working as the nonlinear activation function) to be optionally applied. Here the default is do not apply the sigmoid function. 
### - ELU (Exponential Linear Unit) is the activation function.  [nn.ELU()](https://pytorch.org/docs/stable/generated/torch.nn.ELU.html) is used to apply the ELU function.  ELU is a nonlinearity similar to the ReLU, but rather than clipping values below 0, it exponentiates them. ELU has been shown to be a promising nonlinearity to use between convolutional layers (Clevert et al., 2015).

In [11]:
class SurnameClassifier(nn.Module):
    """  2-layer Multilayer Perceptron """
    def __init__(self, initial_num_channels, num_classes, num_channels, dropout_prob):
        """
        Args:
            initial_num_channels (int): size of the incoming feature vector
                                        (used in the first convnet layer)
                                        the length of surname vocabulary
                                        
            num_classes (int) : size of the output prediction vector 
                                (used in the final fc layer)
                                the number of nationality classes
                                
            num_channels (int): constant channel size to use throughout network 
                                (used in the convnet layers as output channels)
                                in this example, the num_channels for each of the convolutions
                                is set to be the same value. This is not necessary. We can
                                alternatively chosen a different number of channels for each 
                                convolution operation separately. Doing so would entail optimizing 
                                more hyperparameters. 
                                
            dropout_prob(float): apply dropout layer if dropout_prob>0
                                
        """
        super(SurnameClassifier, self).__init__()
        
        ### Define the Convolutional Neural Network
        self.convnet = nn.Sequential(
            ### conv1
            nn.Conv1d(in_channels  = initial_num_channels, 
                      out_channels = num_channels, 
                      kernel_size  = 3),
            ### act1 - ELU has been shown to be a promising nonlinearity to use between
            ###        convolutional layers (Clevert et al., 2015)
            nn.ELU(),
            ### conv2
            nn.Conv1d(in_channels  = num_channels, 
                      out_channels = num_channels,
                      kernel_size  = 3, 
                      stride       = 2),
            ### act2
            nn.ELU(),
            ### conv3
            nn.Conv1d(in_channels  = num_channels, 
                      out_channels = num_channels, 
                      kernel_size  = 3, 
                      stride       = 2),
            ### act3
            nn.ELU(),
            ### conv4
            nn.Conv1d(in_channels  = num_channels, 
                      out_channels = num_channels, 
                      kernel_size  = 3),
            nn.ELU()
        )
        
        self.dropout = nn.Dropout(p=args.dropout_p)
        self.fc = nn.Linear(num_channels, num_classes)

    def forward(self, x_in, apply_softmax=False, apply_dropout=False):
        """The forward pass of the classifier
        Args:
            x_in (torch.Tensor): an input data tensor
            x_in.shape should be (batch, initial_num_channels, max_surname_length)
            apply_softmax (bool): a flag for the softmax activation should be false if 
                                  used with the cross-entropy losses
        
        Returns: the resulting tensor. tensor.shape should be (batch, num_classes).
        """
        ### .squeeze(dim=2): Transforming the shape of a tensor  
        ### from (batch_size,num_channels,1) to (batch_size,num_channels)
        features = self.convnet(x_in).squeeze(dim=2)
        
        features = self.dropout(features)
       
        prediction_vector = self.fc(features)

        if apply_softmax:
            prediction_vector = F.softmax(prediction_vector, dim=1)

        return prediction_vector

### Define the model 
### - Data: df_all
### - Training hyperparameters (batch_size, device): defined in args

In [12]:
dataset    = SurnameDataset.load_df_and_make_vectorizer(df_all)
dataloader = DataLoader(dataset = dataset,
                        batch_size = 128,
                        shuffle = True,
                        drop_last = True)
vectorizer = dataset.get_vectorizer()

In [13]:
classifier = SurnameClassifier(initial_num_channels = len(vectorizer.surname_vocab),
                               num_classes  = len(vectorizer.nationality_vocab),
                               num_channels = 10,
                               dropout_prob = 0)
classifier

SurnameClassifier(
  (convnet): Sequential(
    (0): Conv1d(77, 10, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(10, 10, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
    (4): Conv1d(10, 10, kernel_size=(3,), stride=(2,))
    (5): ELU(alpha=1.0)
    (6): Conv1d(10, 10, kernel_size=(3,), stride=(1,))
    (7): ELU(alpha=1.0)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=10, out_features=18, bias=True)
)

In [14]:
one_batch = next(iter(dataloader))
print("shape of input:", one_batch['x_data'].shape)
print('-'*80)
print("Outout (using initialized parameters)")
print(classifier(one_batch['x_data'],
                 apply_softmax=False,
                 apply_dropout=False).max(dim=1))

shape of input: torch.Size([128, 77, 17])
--------------------------------------------------------------------------------
Outout (using initialized parameters)
torch.return_types.max(
values=tensor([0.3129, 0.3121, 0.3042, 0.3046, 0.3628, 0.3173, 0.3156, 0.2802, 0.3414,
        0.3142, 0.2945, 0.3087, 0.2813, 0.3512, 0.3170, 0.2985, 0.3248, 0.3113,
        0.3190, 0.2929, 0.3171, 0.3131, 0.3192, 0.2970, 0.3157, 0.3173, 0.3192,
        0.3127, 0.3643, 0.3122, 0.2948, 0.2951, 0.3092, 0.2975, 0.2659, 0.2655,
        0.2754, 0.3257, 0.3175, 0.3086, 0.3597, 0.3155, 0.2895, 0.3000, 0.2887,
        0.3195, 0.3648, 0.2909, 0.3146, 0.3058, 0.3145, 0.3150, 0.3490, 0.2862,
        0.3265, 0.3171, 0.3052, 0.2960, 0.3094, 0.3551, 0.3153, 0.3000, 0.2872,
        0.3143, 0.3134, 0.3148, 0.3312, 0.3220, 0.2982, 0.3158, 0.3122, 0.3672,
        0.3134, 0.2822, 0.3085, 0.2933, 0.2649, 0.2833, 0.2981, 0.3221, 0.2803,
        0.3162, 0.2897, 0.2968, 0.3097, 0.3203, 0.3147, 0.3086, 0.3205, 0.3238,
        

## 2.2 - The loss function (Binary Cross-Entropy)
### - The loss - "how far off" the model predictions are from the target.
### - The gradient of the loss function - a signal for “how much” the parameters should change (according to "how much" each parameter contributed to the loss function).
### - As mentioned, the loss function should be appropriate for the activation function.

In [15]:
loss_weighted_func   = nn.CrossEntropyLoss(dataset.class_weights)
loss_unweighted_func = nn.CrossEntropyLoss()

## 2.3 - The optimizer and scheduler
### The initialized state of the classifier
### Using [torch.nn.Module.parameters](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.parameters)
#### These are the tensors that the optimizer will get. After calling **model.backward()** the parameters are populated with their grad, and the optimizer then updates their values accordingly during the **optimizer.step()** call.
#### The requires_grad = True argument is telling PyTorch to track the entire family tree of tensors resulting from operations on *parameters*.

In [16]:
classifier

SurnameClassifier(
  (convnet): Sequential(
    (0): Conv1d(77, 10, kernel_size=(3,), stride=(1,))
    (1): ELU(alpha=1.0)
    (2): Conv1d(10, 10, kernel_size=(3,), stride=(2,))
    (3): ELU(alpha=1.0)
    (4): Conv1d(10, 10, kernel_size=(3,), stride=(2,))
    (5): ELU(alpha=1.0)
    (6): Conv1d(10, 10, kernel_size=(3,), stride=(1,))
    (7): ELU(alpha=1.0)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (fc): Linear(in_features=10, out_features=18, bias=True)
)

In [17]:
n_param = len(list(classifier.parameters()))
print(f"classifier.parameters() includes {n_param} sets of parameters.")

print("1 - Parameters for the convolutional layers: conv1.weight")
print("shape:", list(classifier.parameters())[0].shape)
print("-"*60)

print("2 - Parameters for the convolutional layers: conv1.bias")
print("shape:", list(classifier.parameters())[1].shape)
print("-"*60)

print("3 - Parameters for the convolutional layers: conv2.weight")
print("shape:", list(classifier.parameters())[2].shape)
print("-"*60)

print("4 - Parameters for the convolutional layers: conv2.bias")
print("shape:", list(classifier.parameters())[3].shape)
print("-"*60)

print("5 - Parameters for the convolutional layers: conv3.weight")
print("shape:", list(classifier.parameters())[4].shape)
print("-"*60)

print("6 - Parameters for the convolutional layers: conv3.bias")
print("shape:", list(classifier.parameters())[5].shape)
print("-"*60)

print("7 - Parameters for the convolutional layers: conv4.weight")
print("shape:", list(classifier.parameters())[6].shape)
print("-"*60)

print("8 - Parameters for the convolutional layers: conv4.bias")
print("shape:", list(classifier.parameters())[7].shape)
print("-"*60)

print("9 - Parameters for the convolutional layers: fc.weight")
print("shape:", list(classifier.parameters())[8].shape)
print("-"*60)

print("10 - Parameters for the convolutional layers: fc.bias")
print("shape:", list(classifier.parameters())[9].shape)
print("-"*60)

classifier.parameters() includes 10 sets of parameters.
1 - Parameters for the convolutional layers: conv1.weight
shape: torch.Size([10, 77, 3])
------------------------------------------------------------
2 - Parameters for the convolutional layers: conv1.bias
shape: torch.Size([10])
------------------------------------------------------------
3 - Parameters for the convolutional layers: conv2.weight
shape: torch.Size([10, 10, 3])
------------------------------------------------------------
4 - Parameters for the convolutional layers: conv2.bias
shape: torch.Size([10])
------------------------------------------------------------
5 - Parameters for the convolutional layers: conv3.weight
shape: torch.Size([10, 10, 3])
------------------------------------------------------------
6 - Parameters for the convolutional layers: conv3.bias
shape: torch.Size([10])
------------------------------------------------------------
7 - Parameters for the convolutional layers: conv4.weight
shape: torch.

### Define the [Adam](https://pytorch.org/docs/stable/generated/torch.optim.Adam.html) optimizer 
#### - the learning rate is set adaptively
#### - it is not sensitive to the scaling of the parameters

In [18]:
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)

### Apply a scheduler for adjusting learning rate
### - [torch.optim.lr_scheduler](https://pytorch.org/docs/stable/optim.html) provides several methods to adjust the learning rate based on the number of epochs.
### - [torch.optim.lr_scheduler.ReduceLROnPlateau](https://pytorch.org/docs/stable/generated/torch.optim.lr_scheduler.ReduceLROnPlateau.html#torch.optim.lr_scheduler.ReduceLROnPlateau) allows dynamic learning rate reducing based on some validation measurements.
### - This scheduler reads a metrics quantity and if no improvement is seen for a ‘patience’ number of epochs, the learning rate is reduced.
### - Parameters:
1. **mode(str)**: In "min" mode, lr will be reduced when the quantity monitored has stopped decreasing; in "max" mode it will be reduced when the quantity monitored has stopped increasing. Default: 0.1.
2. **factor(float)**: Factor by which the learning rate will be reduced. new_lr = lr * factor. Default: 0.1.
3. **patience (int)**: Number of epochs with no improvement after which learning rate will be reduced. For example, if patience = 2, then we will ignore the first 2 epochs with no improvement, and will only decrease the LR after the 3rd epoch if the loss still hasn’t improved then. Default: 10.
4. **threshold (float)** – Threshold for measuring the new optimum, to only focus on significant changes. Default: 1e-4.

### - A **scheduler.step(val_loss)** method is called at the end of each epoch to execute the update of the learning rate. The parameters “val_loss” represents the loss (or other monitoring metric) computed for the model on the validation set. This loss value is typically used by the scheduler to assess the model's performance on the validation set and update the learning rate accordingly.
![image.png](attachment:image.png)

In [19]:
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer = optimizer,
                                                 mode      = 'min', 
                                                 factor    = 0.5,
                                                 patience  = 1)

# @@@@@ 3. Training Routine
## 3.1 - Helper function: tracking the training state

In [20]:
def init_train_state(args, save_model_name):
    train_state_dict = {'stop_early'    : False,
                        'early_stopping_step'     : 0,
                        'early_stopping_best_val' : 1e8,
                        'learning_rate' : args.learning_rate,
                        'epoch_index'   : 0,
                        'train_loss'    : [],
                        'train_acc'     : [],
                        'val_loss'      : [],
                        'val_acc'       : [],
                        'test_loss'     : -1,
                        'test_acc'      : -1,
                        'model_filename': save_model_name
                       }
    return train_state_dict

def update_train_state(args, model, train_state):
    """
    Handle the training state updates.
     - Early Stopping: Prevent overfitting.
     - Model Checkpoint: Model is saved if the model is better
    
    Args:
        args:  arguments
        model: model to train
        train_state: a dictionary representing the training state values
    
    Returns:
        a new train_state
    """

    # Save the first model
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # Save model if performance improved
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # If loss increased (not a better model)
        if loss_t >= train_state['early_stopping_best_val']:
            # Update early_stopping_step
            train_state['early_stopping_step'] += 1
        # If loss decreased
        else:
            # Save the best model and update the early_stopping_best_val
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])
                train_state['early_stopping_best_val'] = loss_t

            # Reset early stopping step
            train_state['early_stopping_step'] = 0

        # Stop early ?
        # In the main training loop, if train_state['stop_early']: break
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

## 3.2 - Helper function: compute accurary rate

In [21]:
def compute_accuracy(y_pred, y_target, device):
    y_target = y_target.to(device)

    ##### tensor.max(dim=1): the results include two output tensors (max, max_indices)
    _, y_pred_indices = y_pred.to(device).max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [22]:
##### An example
one_batch = next(iter(dataloader))
classifier(one_batch['x_data']).max(dim=1)

torch.return_types.max(
values=tensor([0.3199, 0.3153, 0.2973, 0.3092, 0.3125, 0.2728, 0.3022, 0.3157, 0.2793,
        0.2648, 0.3156, 0.3065, 0.3092, 0.3153, 0.2909, 0.3193, 0.3091, 0.3087,
        0.3567, 0.3199, 0.3124, 0.3168, 0.3056, 0.3016, 0.2991, 0.3109, 0.3093,
        0.3141, 0.2791, 0.3163, 0.3107, 0.3032, 0.2791, 0.3100, 0.2668, 0.3125,
        0.3542, 0.3173, 0.2792, 0.3084, 0.2815, 0.3137, 0.3091, 0.3091, 0.3190,
        0.3123, 0.3007, 0.3173, 0.3158, 0.2789, 0.3133, 0.3130, 0.3075, 0.3166,
        0.2764, 0.3174, 0.3016, 0.2907, 0.2960, 0.3540, 0.2922, 0.3134, 0.3094,
        0.3140, 0.3177, 0.3129, 0.3080, 0.3600, 0.3095, 0.3582, 0.3239, 0.2573,
        0.3193, 0.3094, 0.2864, 0.3155, 0.3132, 0.3583, 0.2844, 0.3129, 0.3027,
        0.3074, 0.2373, 0.3423, 0.3186, 0.3069, 0.3199, 0.3120, 0.3592, 0.2849,
        0.2954, 0.3046, 0.3225, 0.2909, 0.3101, 0.3092, 0.3162, 0.3164, 0.3061,
        0.3092, 0.3162, 0.2876, 0.2989, 0.3136, 0.2798, 0.3132, 0.3222, 0.3086,
        0

In [23]:
outputs = classifier(one_batch['x_data'])
_, pred = outputs.max(dim=1)  
targets = one_batch['y_target']
print('targets:', targets)
print('-'*80)
print('pred:',pred)
print('-'*80)
print('accurary rate:',compute_accuracy(outputs,targets,device='cpu'))

targets: tensor([14,  2,  5,  2, 14, 14,  4,  4, 10,  9, 10,  2,  4,  4,  1,  8, 14,  4,
         2,  8, 14,  0, 10,  0,  4,  4,  4,  4,  0, 16,  0, 16, 14,  4,  4,  0,
        14, 14, 14,  6,  4,  9,  0,  0, 10, 14,  0,  8,  4,  0,  4,  9, 14,  8,
        10,  6,  2,  1,  6,  0,  4, 14,  9, 14, 14, 14,  4,  4,  1, 16, 14,  2,
         6,  0,  3,  4, 10,  6, 10,  0,  4,  6,  0,  7,  0, 14,  4, 16,  4, 14,
        14,  4, 14,  0, 14,  4,  4,  0,  4,  4, 16,  4, 14,  4,  8,  4,  2,  8,
         2, 10, 14,  4,  6,  9,  8,  4,  9,  5,  4,  0, 14,  1,  1, 14,  0,  4,
        14,  6])
--------------------------------------------------------------------------------
pred: tensor([ 8,  5,  8,  8, 16, 16,  8,  8,  8,  5,  8,  8,  8,  8,  8, 16,  8,  8,
         8,  8,  8,  8,  8,  5,  8, 16,  8,  8, 16,  8,  8,  8,  8,  8,  5,  8,
         8,  8,  5, 16, 16,  8,  8,  8,  5,  8,  8,  8,  8,  8,  8,  8, 16,  8,
         8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  5,  8,  5,  8,  8,  8,  8,  8,
       

## 3.3 - Training loop
### - The training loop updates the model parameters so that it improves over time.
### - The training loop is composed of two loops: an inner loop over batches in the dataset, and an outer loop, which repeats the inner loop a number of times.
### - The inner loop (batch), losses are computed for each batch, and the optimizer is used to update the model parameters.

In [24]:
def print_train_state(train_state):
    print('Train Loss:',round(train_state['train_loss'][-1],5))
    print('Train Accuracy:',round(train_state['train_acc'][-1],5))
    print('Valid Loss:',round(train_state['val_loss'][-1],5))
    print('Valid Accuracy:',round(train_state['val_acc'][-1],5))
    print('early_stopping_best_val:',round(train_state['early_stopping_best_val'],5))
    print('early_stopping_step:',train_state['early_stopping_step'])
    print('stop_early:',train_state['stop_early'])

In [25]:
args

Namespace(get_model='TRAIN', input_path='/Users/home/JH/Machine Learning/Surnames', output_path='/Users/home/JH/Machine Learning/Surnames/OUTPUT/', learning_rate=0.001, batch_size=128, hidden_dim=300, num_channels=256, device='cpu', num_epochs=100, dropout_p=0.1, early_stopping_criteria=5)

In [26]:
def TrainingClassifier(classifier,args,dataset,use_weight,use_dropout):
    
    optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                     mode='min', factor=0.5,
                                                     patience=1)
    
    if use_weight:
        loss_func       = nn.CrossEntropyLoss(dataset.class_weights)
        if use_dropout:
            save_model_name = args.output_path+'/cnn_weighted_dropout_model.pth'
        else: 
            save_model_name = args.output_path+'/cnn_weighted_nodropout_model.pth'
    else:
        loss_func       = nn.CrossEntropyLoss()
        if use_dropout:
            save_model_name = args.output_path+'/cnn_unweighted_dropout_model.pth'
        else: 
            save_model_name = args.output_path+'/cnn_unweighted_nodropout_model.pth'
        

    if args.get_model == 'TRAIN':
        ##### Get an initialized train_state
        train_state = init_train_state(args,save_model_name)

        for epoch_index in range(args.num_epochs):
            print('-'*60)
            print(f'Epoch {epoch_index}...')
            train_state['epoch_index'] = epoch_index

            ##################################################
            #####     Iterate over training dataset      #####
            ##################################################
            print('Training Iteration...')

            ##### Create a batch_generator using training data
            dataset.set_split('train')
            batch_generator = generate_batches(dataset,
                                               batch_size = args.batch_size,
                                               device     = args.device)

            ##### running_loss and running_acc are equivalent to the moving averages of loss and accuracy.
            ##### when the loop ends, a moving average is just an average. 
            ##### In each epoch loop, they are reset to zero before the batch loop.
            running_loss = 0.0
            running_acc  = 0.0

            ##### Indicate that the model is in “training mode” 
            # makes the model parameters mutable 
            # and enables regularization mechanisms like dropout
            classifier.train()

            for batch_index, batch_dict in enumerate(batch_generator):
                # the training routine is these 5 steps:
                # --------------------------------------
                ##### STEP 1. zero the gradients
                # Inside each batch iteration, the optimizer’s gradients are first reset
                # Calling backward (step 4 below) will ACCUMULATE gradients, so if the backward()
                # is called earlier, the new gradient is accumulated on top of the one computed 
                # in previous iterations, which leads ao an incorrect value for the gradient.
                # Therefore, use this zero_ method to reset the gradients.
                optimizer.zero_grad()

                ##### STEP 2. compute the output
                y_pred = classifier(x_in=batch_dict['x_data'])

                ##### STEP 3. compute the loss
                loss   = loss_func(y_pred, batch_dict['y_target'])
                loss_t = loss.item()
                # update the moving average of loss, batch by batch
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                ##### STEP 4. use loss to produce gradients (gradients are propagated to each parameter)
                # Calling backward() and the gradients at each leaf is ACCUMULATED, not stored.
                # Note that the loss is the loss in train split. There is no valid_loss.backward()
                # because we don't want to train the model on the validation data. 
                loss.backward()

                ##### STEP 5. use optimizer to update parameters
                # the optimizer uses the propagated gradients to perform parameter updates
                # The value of classifier.parameters(), i.e., params is automatically updated in this step. 
                # In specific, the optimizer looks into params.grad and updates params, by substracting 
                # learning_rate * grad from it. 
                optimizer.step()

                ##### Tracking the accuracy
                # compute the accuracy
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'],args.device)
                # update the moving average of acc, batch by batch
                running_acc += (acc_t - running_acc) / (batch_index + 1)

            ##### After this inner loop (training) ends
            # Append the running_loss and running_acc to train_state
            # (the average of loss and acc in all the batches in the current epoch)
            train_state['train_loss'].append(running_loss)
            train_state['train_acc'].append(running_acc)

            ##################################################
            #####     Iterate over validation dataset    #####
            ##################################################
            print('Validation Iteration...')

            ##### Create a batch_generator using validation data
            dataset.set_split('val')    
            batch_generator = generate_batches(dataset, 
                                               batch_size = args.batch_size, 
                                               device     = args.device)
            ##### Create new running loss, and running accuracy
            running_loss = 0.0
            running_acc  = 0.0

            ##### Indicate that the model is in “evaluation mode”
            # makes the model parameters immutable 
            # disables dropout
            # disables computation of the loss and propagation of gradients back to the parameters
            classifier.eval()

            for batch_index, batch_dict in enumerate(batch_generator):
                ##### compute the output
                y_pred = classifier(x_in=batch_dict['x_data'].float())

                ##### STEP 3. compute the loss
                loss   = loss_func(y_pred, batch_dict['y_target'])
                loss_t = loss.item()
                running_loss += (loss_t - running_loss) / (batch_index + 1)

                ##### compute the accuracy
                acc_t = compute_accuracy(y_pred, batch_dict['y_target'],args.device)
                running_acc += (acc_t - running_acc) / (batch_index + 1)

            ##### After this inner loop (validation) ends
            # Append the running_loss and running_acc to train_state
            # (the average of loss and acc in all the batches in the current epoch)
            train_state['val_loss'].append(running_loss)
            train_state['val_acc'].append(running_acc)

            ##### Update the status of Early Stopping and Model Checkpoint
            # Input: the current classifier and current train_state (end of the current epoch loop)
            # Update three items in train_state
            # 1."early_stopping_step" +=1 or reset to 0 (comparing val_loss and early_stopping_best_val)
            # 2."early_stopping_best_val" update to the current val_loss if it is the best model
            # 3."stop_early" if early_stopping_step reaches early_stopping_criteria. If True, break all loops below. 
            # Save a new model if the current model has early_stopping_best_val
            train_state = update_train_state(args = args, model = classifier,
                                             train_state = train_state)

            scheduler.step(train_state['val_loss'][-1])
            print('Current lr:', optimizer.param_groups[0]['lr'])

            print_train_state(train_state)

            if train_state['stop_early']:
                break

    elif args.get_model == 'LOAD':
        print('Please load the model.')
        
    

### Model 1 - Use weight, no dropout layer

In [27]:
dataset    = SurnameDataset.load_df_and_make_vectorizer(df_all)
classifier = SurnameClassifier(initial_num_channels = len(vectorizer.surname_vocab),
                               num_classes          = len(vectorizer.nationality_vocab),
                               num_channels         = args.num_channels,
                               dropout_prob         = 0)
TrainingClassifier(classifier  = classifier,
                   args        = args,
                   dataset     = dataset,
                   use_weight  = True,
                   use_dropout = False)

------------------------------------------------------------
Epoch 0...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 2.63226
Train Accuracy: 17.05729
Valid Loss: 2.25687
Valid Accuracy: 26.23698
early_stopping_best_val: 100000000.0
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 1...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 1.95257
Train Accuracy: 37.57812
Valid Loss: 2.03483
Valid Accuracy: 40.6901
early_stopping_best_val: 2.03483
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 2...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 1.68467
Train Accuracy: 41.99219
Valid Loss: 1.91348
Valid Accuracy: 40.625
early_stopping_best_val: 1.91348
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 3...
Training Iteration...
Valid

### Model 2 - No weight, no dropout layer

In [28]:
dataset    = SurnameDataset.load_df_and_make_vectorizer(df_all)
classifier = SurnameClassifier(initial_num_channels = len(vectorizer.surname_vocab),
                               num_classes          = len(vectorizer.nationality_vocab),
                               num_channels         = args.num_channels,
                               dropout_prob         = 0)
TrainingClassifier(classifier  = classifier,
                   args        = args,
                   dataset     = dataset,
                   use_weight  = False,
                   use_dropout = False)

------------------------------------------------------------
Epoch 0...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 1.98284
Train Accuracy: 40.49479
Valid Loss: 1.6269
Valid Accuracy: 54.88281
early_stopping_best_val: 100000000.0
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 1...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 1.42789
Train Accuracy: 58.82812
Valid Loss: 1.38884
Valid Accuracy: 59.375
early_stopping_best_val: 1.38884
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 2...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 1.25304
Train Accuracy: 63.46354
Valid Loss: 1.28807
Valid Accuracy: 61.06771
early_stopping_best_val: 1.28807
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 3...
Training Iteration...
Valid

### Model 3 - Use weight, use dropout layer

In [29]:
dataset    = SurnameDataset.load_df_and_make_vectorizer(df_all)
classifier = SurnameClassifier(initial_num_channels = len(vectorizer.surname_vocab),
                               num_classes          = len(vectorizer.nationality_vocab),
                               num_channels         = args.num_channels,
                               dropout_prob         = args.dropout_p)
TrainingClassifier(classifier  = classifier,
                   args        = args,
                   dataset     = dataset,
                   use_weight  = True,
                   use_dropout = True)

------------------------------------------------------------
Epoch 0...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 2.59456
Train Accuracy: 20.0
Valid Loss: 2.1953
Valid Accuracy: 28.0599
early_stopping_best_val: 100000000.0
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 1...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 1.93544
Train Accuracy: 35.70313
Valid Loss: 1.98677
Valid Accuracy: 36.65365
early_stopping_best_val: 1.98677
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 2...
Training Iteration...
Validation Iteration...
Current lr: 0.001
Train Loss: 1.70394
Train Accuracy: 41.61458
Valid Loss: 1.9236
Valid Accuracy: 38.28125
early_stopping_best_val: 1.9236
early_stopping_step: 0
stop_early: False
------------------------------------------------------------
Epoch 3...
Training Iteration...
Validation

# @@@@@ 4. Evaluation
## 4.1 - Evaluation on Test Data
### A. Weighted Model

In [30]:
classifier_weighted = SurnameClassifier(initial_num_channels = len(vectorizer.surname_vocab),
                                        num_classes          = len(vectorizer.nationality_vocab),
                                        num_channels         = args.num_channels,
                                        dropout_prob         = args.dropout_p)

filename   = args.output_path+'/cnn_weighted_nodropout_model.pth'
classifier_weighted.load_state_dict(torch.load(filename))
classifier_weighted = classifier_weighted.to(args.device)

loss_func  = nn.CrossEntropyLoss(dataset.class_weights)

##### Create a batch_generator using test data
# The test set should be run as little as possible
# Avoid make a new model decision based on the evaluation on test data
# Otherwise the model might be biased toward the test data, and the test data will 
# become meaningless as an measure of truly held-out data.
dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)

##### Create new running loss, and running accuracy
running_loss = 0.0
running_acc = 0.0

##### Indicate that the model is in “evaluation mode”
classifier_weighted.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier_weighted(batch_dict['x_data'])

    # compute the loss
    loss   = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'],args.device)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

print("Test loss: {:.5f}".format(running_loss))
print("Test Accuracy: {:.5f}".format(running_acc))

Test loss: 1.79241
Test Accuracy: 53.77604


### B. Un-weighted Model

In [31]:
classifier_unweighted = SurnameClassifier(initial_num_channels = len(vectorizer.surname_vocab),
                                          num_classes          = len(vectorizer.nationality_vocab),
                                          num_channels         = args.num_channels,
                                          dropout_prob         = args.dropout_p)
filename   = args.output_path+'/cnn_unweighted_nodropout_model.pth'
classifier_unweighted.load_state_dict(torch.load(filename))
classifier_unweighted = classifier_unweighted.to(args.device)

loss_func  = nn.CrossEntropyLoss()

##### Create a batch_generator using test data
# The test set should be run as little as possible
# Avoid make a new model decision based on the evaluation on test data
# Otherwise the model might be biased toward the test data, and the test data will 
# become meaningless as an measure of truly held-out data.
dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)

##### Create new running loss, and running accuracy
running_loss = 0.0
running_acc = 0.0

##### Indicate that the model is in “evaluation mode”
classifier_unweighted.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # compute the output
    y_pred = classifier_unweighted(batch_dict['x_data'])

    # compute the loss
    loss   = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # compute the accuracy
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'],args.device)
    running_acc += (acc_t - running_acc) / (batch_index + 1)

print("Test loss: {:.5f}".format(running_loss))
print("Test Accuracy: {:.5f}".format(running_acc))

Test loss: 0.97421
Test Accuracy: 72.20052


## 4.2 Generation a prediction for a given surname

In [32]:
surnames = {
    'Smith': 'English',
    'Yamamoto': 'Japanese',
    'Dubois': 'French',
    'Rossi': 'Italian',
    'Zhang':  'Chinese',
    'Petrov':  'Russian' ,
    'Kowalski': 'Polish',
    'Park': 'Korean'
 }
surnames

{'Smith': 'English',
 'Yamamoto': 'Japanese',
 'Dubois': 'French',
 'Rossi': 'Italian',
 'Zhang': 'Chinese',
 'Petrov': 'Russian',
 'Kowalski': 'Polish',
 'Park': 'Korean'}

In [33]:
def predict_surname(surname,
                    classifier,
                    device, 
                    vectorizer,
                    top_k):
    """Predict the nationality of a surname
    
    Args:
        surname (str): the text of the surname
        classifier (SurnameClassifier): the trained model
        device: device
        vectorizer (SurnameVectorizer): the corresponding vectorizer
        top_k: specify the k in “top-k”
    """
    
    classifier         = classifier.to(device)
    vectorized_surname = torch.tensor(vectorizer.vectorize(surname)).unsqueeze(dim=0)
    output             = classifier(vectorized_surname, apply_softmax=True)
    
    probability_value, index = torch.topk(output, k=top_k)
    
    ### Predicted label 
    index = index.detach().numpy()[0]
    
    ### Predicted probability 
    probability_value = probability_value.detach().numpy()[0]
    
    results = []
    for p, i in zip(probability_value, index):
        nationality = vectorizer.nationality_vocab.lookup_index(i)
        results.append({'nationality': nationality, 
                        'probability': p})
    
    return results

In [34]:
surname    = 'Schmidt'
classifier = classifier_unweighted
device     = 'cpu'
vectorizer = dataset.get_vectorizer()
input      = torch.tensor(vectorizer.vectorize(surname)).unsqueeze(dim=0)
output     = classifier(input, apply_softmax=True)
print('Input:', input)
print('Shape:', input.shape)
print('-'*80)
print('Output', output)
print('Shape:', output.shape)

Input: tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])
Shape: torch.Size([1, 77, 17])
--------------------------------------------------------------------------------
Output tensor([[7.3557e-11, 1.5791e-07, 3.3766e-03, 5.9650e-03, 2.7705e-01, 3.3040e-03,
         6.8944e-01, 5.9915e-08, 2.0650e-04, 5.9280e-07, 7.3073e-09, 5.1065e-07,
         3.0034e-05, 1.4814e-06, 2.2151e-03, 1.8353e-02, 4.0501e-05, 1.0901e-05]],
       grad_fn=<SoftmaxBackward0>)
Shape: torch.Size([1, 18])


In [35]:
surname    = 'Schmidt'
classifier = classifier_unweighted
device     = 'cpu'
vectorizer = dataset.get_vectorizer()
predict_surname(surname,
                classifier,
                device, 
                vectorizer,5)

[{'nationality': 'German', 'probability': 0.6894437},
 {'nationality': 'English', 'probability': 0.27705163},
 {'nationality': 'Scottish', 'probability': 0.01835314},
 {'nationality': 'Dutch', 'probability': 0.0059650303},
 {'nationality': 'Czech', 'probability': 0.0033765961}]

In [36]:
surname    = 'Schmidt'
classifier = classifier_unweighted
device     = 'cpu'
vectorizer = dataset.get_vectorizer()
predict_surname(surname,
                classifier,
                device, 
                vectorizer,1)

[{'nationality': 'German', 'probability': 0.6894437}]

### A. Weighted Model

In [37]:
classifier = classifier_weighted
device     = 'cpu'
vectorizer = dataset.get_vectorizer()
for s,n in surnames.items():
    print('surname:', s)
    print('nationality:', n)
    print('predicted(top k)')
    print(predict_surname(s,classifier,
                          device,vectorizer,5))
    print('-'*60)

surname: Smith
nationality: English
predicted(top k)
[{'nationality': 'Scottish', 'probability': 0.611422}, {'nationality': 'German', 'probability': 0.13715632}, {'nationality': 'English', 'probability': 0.13082278}, {'nationality': 'Dutch', 'probability': 0.08253884}, {'nationality': 'Czech', 'probability': 0.01834927}]
------------------------------------------------------------
surname: Yamamoto
nationality: Japanese
predicted(top k)
[{'nationality': 'Japanese', 'probability': 0.9988399}, {'nationality': 'Greek', 'probability': 0.00055738416}, {'nationality': 'Czech', 'probability': 0.00028685236}, {'nationality': 'Russian', 'probability': 0.0001600626}, {'nationality': 'Italian', 'probability': 6.7969726e-05}]
------------------------------------------------------------
surname: Dubois
nationality: French
predicted(top k)
[{'nationality': 'Portuguese', 'probability': 0.47490096}, {'nationality': 'French', 'probability': 0.21429545}, {'nationality': 'Arabic', 'probability': 0.173225

### B. Un-weighted Model

In [38]:
classifier = classifier_unweighted
device     = 'cpu'
vectorizer = dataset.get_vectorizer()
for s,n in surnames.items():
    print('surname:', s)
    print('nationality:', n)
    print('predicted(top k)')
    print(predict_surname(s,classifier,
                          device,vectorizer,5))
    print('-'*60)

surname: Smith
nationality: English
predicted(top k)
[{'nationality': 'English', 'probability': 0.64895386}, {'nationality': 'German', 'probability': 0.110422485}, {'nationality': 'Russian', 'probability': 0.10835821}, {'nationality': 'Scottish', 'probability': 0.049528986}, {'nationality': 'Czech', 'probability': 0.031344082}]
------------------------------------------------------------
surname: Yamamoto
nationality: Japanese
predicted(top k)
[{'nationality': 'Japanese', 'probability': 0.9981171}, {'nationality': 'Russian', 'probability': 0.0014242005}, {'nationality': 'Greek', 'probability': 0.00040092773}, {'nationality': 'Czech', 'probability': 3.1969957e-05}, {'nationality': 'Italian', 'probability': 1.0447679e-05}]
------------------------------------------------------------
surname: Dubois
nationality: French
predicted(top k)
[{'nationality': 'English', 'probability': 0.5398026}, {'nationality': 'French', 'probability': 0.15025407}, {'nationality': 'Portuguese', 'probability': 0