The following imports are needed:

In [11]:
from argparse import Namespace
from collections import Counter
import json
import os
import string

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm_notebook

# Basic MLP Toy Model
***

In [2]:
class MultilayerPerceptron(nn.Module):
    """Two Linear layers with a ReLU between them and an optional softmax on top."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """
        Build the two Linear layers.

        Args:
            input_dim (int) - width of each input vector
            hidden_dim (int) - width produced by the first Linear layer
            output_dim (int) - width produced by the second Linear layer
        """
        super().__init__()
        # fc1 is created before fc2 so parameter initialisation order is stable.
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x_in, apply_softmax=False):
        """
        Run the forward pass.

        Args:
            x_in (torch.Tensor) - input batch of shape (batch, input_dim)
            apply_softmax (bool) - when True, softmax is applied over dim=1.
                Leave it False when the result is fed to a cross-entropy loss.
        Returns:
            torch.Tensor of shape (batch, output_dim)
        """
        hidden = F.relu(self.fc1(x_in))
        scores = self.fc2(hidden)

        if not apply_softmax:
            return scores
        return F.softmax(scores, dim=1)

Here is an example of how to instantiate an MLP:

In [3]:
# Toy dimensions used to exercise the MLP.
batch_size = 2    # rows pushed through the model at once
input_dim = 3     # original features per row
hidden_dim = 100  # nodes in the first hidden layer
output_dim = 4    # numbers produced per row

In [4]:
# Build the model from the toy dimensions and print its layer summary.
mlp = MultilayerPerceptron(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    output_dim=output_dim,
)
print(mlp)

MultilayerPerceptron(
  (fc1): Linear(in_features=3, out_features=100, bias=True)
  (fc2): Linear(in_features=100, out_features=4, bias=True)
)


Now we can sanity-check the model by passing in some random inputs and confirming that the output dimensions are what we expect.

In [7]:
def describe(x):
    """
    Print the type string, shape and values of a tensor.

    Args:
        x (torch.Tensor) - object exposing .type() and .shape
    """
    print(f"Type: {x.type()}")
    print(f"Shape/size: {x.shape}")
    print(f"Values: \n{x}")
    
# Random batch shaped (batch_size, input_dim).
x_input = torch.rand((batch_size, input_dim))
describe(x_input)

# apply_softmax defaults to False, so these are the raw fc2 outputs.
y_output = mlp(x_input)
describe(y_output)

Type: torch.FloatTensor
Shape/size: torch.Size([2, 3])
Values: 
tensor([[0.0797, 0.8508, 0.2240],
        [0.5168, 0.9134, 0.5197]])
Type: torch.FloatTensor
Shape/size: torch.Size([2, 4])
Values: 
tensor([[ 0.0599,  0.0319, -0.2909, -0.0025],
        [ 0.1854, -0.0154, -0.4302,  0.0374]], grad_fn=<AddmmBackward>)


If we want, we can convert each of the output vectors (each row) into a vector of probabilities by enabling the softmax activation function:

In [8]:
# Same inputs as before, but with apply_softmax=True the model applies
# softmax over dim=1 before returning.
y_output_softmax = mlp(x_input, apply_softmax=True)
describe(y_output_softmax)

Type: torch.FloatTensor
Shape/size: torch.Size([2, 4])
Values: 
tensor([[0.2766, 0.2689, 0.1947, 0.2598],
        [0.3105, 0.2540, 0.1678, 0.2678]], grad_fn=<SoftmaxBackward>)


We can check to make sure that each of these 2 rows sums to one:

In [10]:
# The softmax output carries a grad_fn, so detach it before converting to
# NumPy; then sum along axis 1 to get one total per row.
y_output_softmax.detach().numpy().sum(axis=1)

array([0.9999999, 1.       ], dtype=float32)

# Classification Objective
***
Our objective in this project is to predict the nationality associated with a given last name. 

The original dataset contains 10000 surnames from 18 different nationalities. This dataset is imbalanced: some nationalities are represented far more heavily than others.

In [None]:
class SurnameDataset(Dataset):
    def __init__(self, surname_df, vectorizer):
        """
        Args:
            surname_df (pandas.DataFrame) - the dataset
            vectorizer (SurnameVectorizer) - vectorizer instantiated from dataset
        """
        self.surname_df = surname_df
        self._vectorizer = vectorizer
        
        self.train_df = self.surname_df[self.surname_df['split']=="train"]
        self.train_size = len(self.train_df)
        
        self.val_df = self.surname_df[self.surname_df['split']=='val']
        self.validation_size = len(self.val_df)
        
        self.test_df = self.surname_df[self.surname_df['split']=='test']
        self.test_size = len(self.test_df)
        
        self._lookup_dict = {"train": (self.train_df, self.train_size),\
                            "val": (self.val_df, self.validation_size),\
                            "test": (self.test_df, self.test_size)}
        self.set_split("train")
        
        counts_by_class = surname_df['nationality'].value_counts().to_dict()
        
        def sort_key(item):
            return self._vectorizer['nationality_vocab'].lookup_token(item[0])
        
        sorted_counts = sorted(counts_by_class.items(), key=sort_key)
        counts = [count for _, count in sorted_counts]
        self.class_weights = 1.0 / torch.tensor(counts, dtype=torch.float32)
        
    @classmethod
    def load_dataset_and_make_vectorizer(cls, surname_csv):
        """
        Read the CSV and build a new vectorizer from its training rows.

        Args:
            surname_csv (str) - location of dataset
        Returns:
            SurnameDataset instance
        """
        full_df = pd.read_csv(surname_csv)
        # Only rows whose 'split' is 'train' are handed to the vectorizer.
        is_train = full_df['split'] == 'train'
        vectorizer = SurnameVectorizer.from_dataframe(full_df[is_train])
        return cls(full_df, vectorizer)
    
    @classmethod
    def load_dataset_and_load_vectorizer(cls, surname_csv, vectorizer_filepath):
        """
        Read the CSV and pair it with a previously saved vectorizer.
        Use this when the vectorizer has been cached for re-use.

        Args:
            surname_csv (str) - location of dataset
            vectorizer_filepath (str) - location of saved vectorizer
        Returns:
            SurnameDataset instance
        """
        # Arguments evaluate left to right: the CSV is read first, then the
        # vectorizer file is loaded.
        return cls(
            pd.read_csv(surname_csv),
            cls.load_vectorizer_only(vectorizer_filepath),
        )
    
    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """
        Rebuild a vectorizer from its JSON file.

        Args:
            vectorizer_filepath (str) - location of serialized vectorizer
        Returns:
            SurnameVectorizer instance
        """
        with open(vectorizer_filepath) as fp:
            contents = json.load(fp)
        return SurnameVectorizer.from_serializable(contents)
        
    
        
    def __getitem__(self, index):
        row = self._target_df.iloc[index]
        surname_vector = self._vectorizer.vectorize(row.surname)
        nationality_index = self._vectorizer.nationality_vocab.lookup_token(row.nationality)
        return {"x_surname": surname_vector, "y_nationality": nationality_index}
    