In [1]:
import numpy as np

class Serializer:
    """Turns raw names and country labels into one-hot NumPy arrays.

    A name is first normalised to "<first name> <last name>" (lowercase,
    disallowed characters, personal titles and single-letter tokens removed)
    and then encoded with one one-hot row per character.  A label is encoded
    as a single one-hot vector.
    """

    def __init__(self, possible_labels):
        """Build the character and label lookup tables.

        Args:
            possible_labels: sequence of every label that can be serialized.
                The position of a label in the sequence is its one-hot index.
        """
        # The position of a character in this list is its one-hot index.
        # '$' and '+' are deliberately not part of the alphabet.
        self.allowed_chars_in_filtered_name = list(
            ' abcdefghijklmnopqrstuvwxyzáãäçèéëïôöü-'
        )

        # Same alphabet as a set, for fast membership tests while filtering.
        self.allowed_chars_in_name = set(self.allowed_chars_in_filtered_name)

        # Tokens that are dropped from a name (compared after lowercasing and
        # after punctuation has been removed, so "Mrs." matches 'mrs').
        self.personal_titles = {
            'dr', 'esq', 'hon', 'jr',
            'mr', 'mrs', 'ms', 'messrs',
            'mmes', 'msgr', 'prof', 'rev',
            'rt', 'sr', 'st',
        }

        # Map each allowed character to its index in the alphabet list.
        self.allowed_chars_in_filtered_names_to_index = {
            char: index
            for index, char in enumerate(self.allowed_chars_in_filtered_name)
        }

        # Map label to index, and index to label.
        self.label_to_index = {}
        self.index_to_label = {}
        for index, label in enumerate(possible_labels):
            self.label_to_index[label] = index
            self.index_to_label[index] = label

        self.input_dimensions = len(self.allowed_chars_in_filtered_name)
        self.target_dimensions = len(possible_labels)

    def serialize_examples_and_labels(self, examples, labels):
        """Serialize parallel lists of names and labels.

        Names rejected by the filter (see ``_filter_chars_``) are skipped
        together with their label.

        Args:
            examples: sequence of raw name strings.
            labels: sequence of labels; ``labels[i]`` belongs to ``examples[i]``.

        Returns:
            A pair ``(serialized_examples, serialized_labels)``:
            ``serialized_examples`` is a 1D object array whose elements are
            ``num_chars x input_dimensions`` matrices (one per kept name), and
            ``serialized_labels`` is the array of the matching one-hot labels.

        Raises:
            Exception: if the two sequences differ in length, or if a label is
                unknown (raised by ``serialize_label``).
        """
        if len(examples) != len(labels):
            raise Exception('Number of examples does not match number of labels!')

        serialized_examples = []
        serialized_labels = []

        for example, label in zip(examples, labels):
            serialized_example = self.serialize_example(example)
            serialized_label = self.serialize_label(label)

            if serialized_example is not None and serialized_label is not None:
                serialized_examples.append(serialized_example)
                serialized_labels.append(serialized_label)

        print('serialized', len(serialized_examples), 'examples')
        print('serialized', len(serialized_labels), 'labels')

        # Names have different lengths, so the matrices cannot be stacked into
        # one rectangular array.  Asking NumPy to infer the shape of such a
        # ragged list is an error in recent NumPy versions, so the object
        # array is created explicitly and filled element by element.
        packed_examples = np.empty(len(serialized_examples), dtype=object)
        for position, matrix in enumerate(serialized_examples):
            packed_examples[position] = matrix

        return packed_examples, np.array(serialized_labels)

    def serialize_label(self, label):
        """Convert a label into a one-hot vector.

        For example, if ``self.label_to_index`` is
        ``{'US': 0, 'Canada': 1, 'Mexico': 2, 'Europe': 3}`` and the label is
        ``'Mexico'``, the result is ``[0, 0, 1, 0]``.  The length of the vector
        is ``self.target_dimensions``.

        Raises:
            Exception: if the label is not in ``self.label_to_index``.
        """
        if label not in self.label_to_index:
            raise Exception('The label', label, 'does not exist in', self.label_to_index)

        expected_val = np.zeros(self.target_dimensions)
        expected_val[self.label_to_index[label]] = 1
        return expected_val

    def serialize_example(self, example):
        """Convert a raw name into a matrix with one one-hot row per character.

        Given an example that filters to ``'abc'`` the rows would be one-hot at
        the alphabet positions of 'a', 'b' and 'c' respectively.

        Returns:
            A ``num_chars x input_dimensions`` array, or ``None`` when the name
            is rejected by ``_filter_chars_``.

        Raises:
            Exception: if the filtered name contains a character outside the
                alphabet.
        """
        filtered_name = self._filter_chars_(example)
        if filtered_name is None:
            return None

        char_to_index = self.allowed_chars_in_filtered_names_to_index
        name_matrix = np.zeros((len(filtered_name), self.input_dimensions))

        for position, letter in enumerate(filtered_name):
            if letter not in char_to_index:
                raise Exception("Illegal character in name:", letter)
            name_matrix[position, char_to_index[letter]] = 1

        return name_matrix

    def _filter_chars_(self, example):
        """Normalise a raw name to ``"<first name> <last name>"``.

        Steps, in order:
          1. lowercase                      (Mrs. John Smith -> mrs. john smith)
          2. drop characters outside the alphabet  (-> mrs john smith)
          3. collapse whitespace and drop personal titles  (-> john smith)
          4. reject if the first or last remaining token has one letter or less
          5. drop remaining single-letter tokens   (john n smith -> john smith)
          6. reject unless at least two tokens remain

        Returns:
            The first and last token joined by one space, or ``None`` when the
            name is rejected.
        """
        lowered = example.lower()
        kept_chars = ''.join(
            char for char in lowered if char in self.allowed_chars_in_name
        )

        # split() without arguments also removes duplicated/leading/trailing spaces.
        tokens = [
            token for token in kept_chars.split()
            if token not in self.personal_titles
        ]

        # Reject those with no tokens left.
        if not tokens:
            return None

        # Reject those whose first or last name is only one letter.
        if len(tokens[0]) <= 1 or len(tokens[-1]) <= 1:
            return None

        # Remove single-letter tokens such as middle initials.
        tokens = [token for token in tokens if len(token) > 1]

        # Needs to contain at least the first and last name.
        if len(tokens) < 2:
            return None

        # Keep only the first and last name.
        return tokens[0] + ' ' + tokens[-1]


In [2]:
import numpy as np

class ActivationFunctions:
    """Namespace of activation functions and their derivatives."""

    @staticmethod
    def sigmoid(x):
        """Return the logistic sigmoid of x, element-wise."""
        denominator = 1 + np.exp(-x)
        return 1 / denominator

    @staticmethod
    def sigmoid_derivative_given_sigmoid_val(sigmoid_value):
        """Return the sigmoid derivative, given the already-computed sigmoid output."""
        complement = 1 - sigmoid_value
        return sigmoid_value * complement

    @staticmethod
    def tanh(x):
        """Return the hyperbolic tangent of x, element-wise."""
        return np.tanh(x)

    @staticmethod
    def tanh_derivative_given_tanh_val(tanh_value):
        """Return the tanh derivative, given the already-computed tanh output."""
        squared = tanh_value ** 2
        return 1.0 - squared

    @staticmethod
    def softmax(x):
        """Return the softmax of x, normalised along axis 0.

        The maximum of x is subtracted before exponentiating so that large
        inputs do not overflow.
        """
        shifted = x - np.max(x)
        exponentials = np.exp(shifted)
        normaliser = np.sum(exponentials, axis=0)
        return exponentials / normaliser

    @staticmethod
    def softmax_derivative(val):
        """Return the Jacobian of softmax evaluated at val: diag(s) - s s^T."""
        column = ActivationFunctions.softmax(val).reshape(-1, 1)
        outer_product = np.dot(column, column.T)
        return np.diagflat(column) - outer_product

class LossFunctions:
    """Namespace of loss functions."""

    @staticmethod
    def cross_entropy(hypothesis, expected_result, epsilon=1e-12):
        """Return the cross-entropy between a prediction and its target.

        epsilon is added to the prediction before taking the logarithm so that
        a predicted probability of exactly zero does not produce log(0).
        """
        log_likelihood = np.log(hypothesis + epsilon)
        weighted = np.multiply(expected_result, log_likelihood)
        return -np.sum(weighted)

In [3]:
import copy
import numpy as np
import random 
from sklearn.utils import shuffle
from ml_utils import ActivationFunctions, LossFunctions
import time
from serializer import Serializer

class NamesToNationalityClassifier:
    """Character-level recurrent classifier that maps a name to a country label.

    The network reads a name one character at a time.  At every character it
    combines the one-hot character (through W1) with the previous hidden state
    (through W0), applies tanh, and produces a softmax distribution over the
    labels (through W2).  The distribution after the last character is the
    prediction.  Training uses per-example gradient descent with momentum and
    L2 regularization.
    """

    def __init__(self, possible_labels, alpha=0.0001, hidden_dimensions=500, l2_lambda = 0.02, momentum=0.9, num_epoche=30):
        """Create an untrained classifier.

        Args:
            possible_labels: sequence of all labels the model can predict.
            alpha: learning rate.
            hidden_dimensions: size of the hidden state.
            l2_lambda: L2 regularization strength.
            momentum: momentum coefficient applied to the previous update.
            num_epoche: number of passes over the training data in ``train``.
        """
        self.serializer = Serializer(possible_labels)

        self.alpha = alpha
        self.input_dimensions = self.serializer.input_dimensions
        self.hidden_dimensions = hidden_dimensions
        self.output_dimensions = self.serializer.target_dimensions
        # 0.7 means 70% of the dataset is used for training and 30% for validation.
        self.training_to_validation_ratio = 0.7

        # Weight initialization: Gaussian scaled by sqrt(1 / fan_in)
        # (the original author calls this Xavier initialization).
        # Reference: https://medium.com/usf-msds/deep-learning-best-practices-1-weight-initialization-14e5c0295b94
        # W0: hidden -> hidden, W1: (bias + input) -> hidden, W2: (bias + hidden) -> output.
        self.weight_init_type = 'X1'
        self.W0 = np.random.randn(self.hidden_dimensions, self.hidden_dimensions) * np.sqrt(1 / self.hidden_dimensions)
        self.W1 = np.random.randn(self.hidden_dimensions, self.input_dimensions + 1) * np.sqrt(1 / (self.input_dimensions + 1))
        self.W2 = np.random.randn(self.output_dimensions, self.hidden_dimensions + 1) * np.sqrt(1 / (self.hidden_dimensions + 1))

        # Momentum and regularization
        self.l2_lambda = l2_lambda
        self.momentum = momentum
        self.W0_velocity = np.zeros((self.hidden_dimensions, self.hidden_dimensions))
        self.W1_velocity = np.zeros((self.hidden_dimensions, self.input_dimensions + 1))
        self.W2_velocity = np.zeros((self.output_dimensions, self.hidden_dimensions + 1))

        # Constant bias inputs that are prepended to the input / hidden vectors.
        self.layer_1_bias = 1
        self.layer_2_bias = 1

        self.num_epoche = num_epoche

        self.serialized_training_examples = []
        self.serialized_training_labels = []
        self.serialized_testing_examples = []
        self.serialized_testing_labels = []

    def add_training_examples(self, examples, labels):
        """Serialize the data and split it into training and validation sets.

        The first ``training_to_validation_ratio`` share of the serialized
        data becomes the training set and the rest the validation set.  Any
        previously stored data is replaced.
        """
        serialized_examples, serialized_labels = self.serializer.serialize_examples_and_labels(examples, labels)
        num_training_data = int(len(serialized_examples) * self.training_to_validation_ratio)

        self.serialized_training_examples = serialized_examples[:num_training_data]
        self.serialized_training_labels = serialized_labels[:num_training_data]
        self.serialized_testing_examples = serialized_examples[num_training_data:]
        self.serialized_testing_labels = serialized_labels[num_training_data:]

    def train(self):
        """Train the model on the stored training data.

        Returns:
            A dictionary with one array of length ``num_epoche`` per key:
            {
                'epoche_to_train_avg_error': the train avg error per epoche,
                'epoche_to_test_avg_error': the test avg error per epoche,
                'epoche_to_train_accuracy': the train accuracy per epoche,
                'epoche_to_test_accuracy': the test accuracy per epoche
            }

        Raises:
            ValueError: if there are no training examples (the per-epoche
                averages would otherwise divide by zero).
        """
        num_training_examples = len(self.serialized_training_examples)
        if num_training_examples == 0:
            raise ValueError('There are no training examples; call add_training_examples() first')

        print("Training...")
        print(self)

        epoche_to_train_avg_error = np.zeros((self.num_epoche, ))
        epoche_to_test_avg_error = np.zeros((self.num_epoche, ))
        epoche_to_train_accuracy = np.zeros((self.num_epoche, ))
        epoche_to_test_accuracy = np.zeros((self.num_epoche, ))

        for epoche in range(self.num_epoche):
            train_total_error = 0
            train_num_correct = 0

            # Reshuffle the data (examples and labels stay aligned).
            self.serialized_training_examples, self.serialized_training_labels = shuffle(
                self.serialized_training_examples, self.serialized_training_labels)

            for i in range(num_training_examples):
                # A "num_char" x "self.input_dimensions" matrix
                example = self.serialized_training_examples[i]

                # A 1D array with "self.output_dimensions" elements
                label = self.serialized_training_labels[i]

                forward_propagation_results = self.__perform_forward_propagation__(example, label)
                letter_pos_to_hypothesis = forward_propagation_results['letter_pos_to_hypothesis']
                letter_pos_to_loss = forward_propagation_results['letter_pos_to_loss']

                # The error sums the loss of every character position; the
                # accuracy only looks at the hypothesis after the last character.
                train_total_error += np.sum(letter_pos_to_loss)
                if self.__is_hypothesis_correct__(letter_pos_to_hypothesis[-1], label):
                    train_num_correct += 1

                self.__perform_back_propagation__(example, label, forward_propagation_results)

            epoche_to_train_avg_error[epoche] = train_total_error / num_training_examples
            epoche_to_train_accuracy[epoche] = train_num_correct / num_training_examples

            test_avg_error, test_accuracy, test_runnable_ratio = self.__validate__()
            epoche_to_test_accuracy[epoche] = test_accuracy
            epoche_to_test_avg_error[epoche] = test_avg_error

            print(epoche, epoche_to_train_avg_error[epoche], epoche_to_test_avg_error[epoche], epoche_to_train_accuracy[epoche], epoche_to_test_accuracy[epoche], test_runnable_ratio, time.time())

        return {
            'epoche_to_train_avg_error': epoche_to_train_avg_error,
            'epoche_to_test_avg_error': epoche_to_test_avg_error,
            'epoche_to_train_accuracy': epoche_to_train_accuracy,
            'epoche_to_test_accuracy': epoche_to_test_accuracy
        }

    def train_example(self, example, label):
        """Run one training step on a single raw example.

        The example is a name (like "Bob Smith") and its label is a country
        name (ex: "Canada").

        Raises:
            ValueError: if the serializer rejects the name.
            Exception: if the label is unknown (raised by the serializer).
        """
        serialized_example = self.serializer.serialize_example(example)
        if serialized_example is None:
            raise ValueError('The name "{}" cannot be serialized'.format(example))
        serialized_label = self.serializer.serialize_label(label)

        forward_propagation_results = self.__perform_forward_propagation__(serialized_example, serialized_label)
        self.__perform_back_propagation__(serialized_example, serialized_label, forward_propagation_results)

    def __validate__(self):
        """Evaluate the model on the stored validation data.

        Returns:
            ``(avg_cost, accuracy, runnable_examples_ratio)``.  Examples with
            no characters are skipped; the ratio is the share that was
            evaluated.  When nothing could be evaluated the average cost and
            the accuracy are NaN (they are undefined) and the ratio is 0.0.
        """
        total_cost = 0
        num_correct = 0
        num_examples_ran = 0
        num_examples = len(self.serialized_testing_examples)

        for i in range(num_examples):
            # A "num_char" x "self.input_dimensions" matrix
            example = self.serialized_testing_examples[i]

            # A 1D array with "self.output_dimensions" elements
            label = self.serialized_testing_labels[i]

            forward_propagation_results = self.__perform_forward_propagation__(example, label)
            letter_pos_to_loss = forward_propagation_results['letter_pos_to_loss']
            letter_pos_to_hypothesis = forward_propagation_results['letter_pos_to_hypothesis']

            if len(letter_pos_to_hypothesis) == 0:
                continue

            if self.__is_hypothesis_correct__(letter_pos_to_hypothesis[-1], label):
                num_correct += 1

            total_cost += np.sum(letter_pos_to_loss)
            num_examples_ran += 1

        # Without this guard an empty validation set raises ZeroDivisionError.
        if num_examples_ran == 0:
            return float('nan'), float('nan'), 0.0

        avg_cost = total_cost / num_examples_ran
        accuracy = num_correct / num_examples_ran
        runnable_examples_ratio = num_examples_ran / num_examples

        return avg_cost, accuracy, runnable_examples_ratio

    def __is_hypothesis_correct__(self, hypothesis, label):
        """Return whether the most likely class of the hypothesis matches the label."""
        return np.argmax(hypothesis, axis=0) == np.argmax(label, axis=0)

    def __perform_forward_propagation__(self, serialized_example, serialized_label):
        """Run the network over every character of one example.

        Args:
            serialized_example: 2D matrix of size num_char x self.input_dimensions.
            serialized_label: 1D array of size self.output_dimensions.

        Returns:
            A dictionary with:
            - 'letter_pos_to_loss': the loss at each character position
            - 'letter_pos_to_hidden_state': the hidden state fed INTO each
              position (num_char + 1 rows; row 0 is all zeros, row j + 1 is the
              tanh output of position j)
            - 'letter_pos_to_layer_2_values': the tanh output at each position
            - 'letter_pos_to_hypothesis': the softmax output at each position
        """
        num_chars = len(serialized_example)

        # Row 0 stays all zeros: it is the hidden state before the first letter.
        letter_pos_to_h0 = np.zeros((num_chars + 1, self.hidden_dimensions))
        letter_pos_to_h1 = np.zeros((num_chars, self.hidden_dimensions))
        letter_pos_to_h2 = np.zeros((num_chars, self.output_dimensions))
        letter_pos_to_loss = np.zeros((num_chars, ))

        for j in range(num_chars):
            # Prepend the bias input, giving "self.input_dimensions + 1" elements.
            X_with_bias = np.r_[[self.layer_1_bias], serialized_example[j]]
            h0 = letter_pos_to_h0[j]

            y1 = np.dot(self.W1, X_with_bias) + np.dot(self.W0, h0)
            h1 = ActivationFunctions.tanh(y1)

            # Prepend the bias input, giving "self.hidden_dimensions + 1" elements.
            h1_with_bias = np.r_[[self.layer_2_bias], h1]

            y2 = np.dot(self.W2, h1_with_bias)
            h2 = ActivationFunctions.softmax(y2)

            letter_pos_to_h1[j] = h1
            letter_pos_to_h2[j] = h2
            letter_pos_to_h0[j + 1] = h1

            letter_pos_to_loss[j] = LossFunctions.cross_entropy(h2, serialized_label)

        return {
            'letter_pos_to_loss': letter_pos_to_loss,
            'letter_pos_to_hidden_state': letter_pos_to_h0,
            'letter_pos_to_layer_2_values': letter_pos_to_h1,
            'letter_pos_to_hypothesis': letter_pos_to_h2
        }

    def __perform_back_propagation__(self, serialized_example, serialized_label, forward_propagation_results):
        """Update the weights from one example.

        It requires the results from self.__perform_forward_propagation__() on
        the same example.  The example and the label must be serialized.

        NOTE(review): the gradient at each position only flows through that
        position's own output; nothing is propagated back through the hidden
        state to earlier positions (no backpropagation through time).  This
        matches the original behaviour and is kept as is -- confirm it is
        intended.
        """
        letter_pos_to_h0 = forward_propagation_results['letter_pos_to_hidden_state']
        letter_pos_to_h1 = forward_propagation_results['letter_pos_to_layer_2_values']
        letter_pos_to_h2 = forward_propagation_results['letter_pos_to_hypothesis']

        # The loss gradients w.r.t W0, W1, W2, summed over all positions.
        dL_dW0 = np.zeros((self.hidden_dimensions, self.hidden_dimensions))
        dL_dW1 = np.zeros((self.hidden_dimensions, self.input_dimensions + 1))
        dL_dW2 = np.zeros((self.output_dimensions, self.hidden_dimensions + 1))

        num_chars = len(serialized_example)

        for j in range(num_chars - 1, -1, -1):
            X_with_bias = np.r_[[self.layer_1_bias], serialized_example[j]]

            # 1D arrays with "self.hidden_dimensions" elements
            h0 = letter_pos_to_h0[j]
            h1 = letter_pos_to_h1[j]

            # 1D array with "self.hidden_dimensions + 1" elements
            h1_with_bias = np.r_[[self.layer_2_bias], h1]

            # 1D array with "self.output_dimensions" elements
            h2 = letter_pos_to_h2[j]

            # Gradient of the cross entropy w.r.t. the softmax input.
            dL_dY2 = h2 - serialized_label

            # Gradient w.r.t. (bias + hidden); the leading bias entry is
            # dropped because the constant bias input has no tanh behind it.
            dL_dH1 = np.dot(dL_dY2, self.W2)[1:]
            dL_dY1 = np.multiply(dL_dH1, ActivationFunctions.tanh_derivative_given_tanh_val(h1))

            dL_dW0 += np.outer(dL_dY1, h0)
            dL_dW1 += np.outer(dL_dY1, X_with_bias)
            dL_dW2 += np.outer(dL_dY2, h1_with_bias)

        # Add regularization
        dL_dW0 += self.l2_lambda * self.W0
        dL_dW1 += self.l2_lambda * self.W1
        dL_dW2 += self.l2_lambda * self.W2

        # Blend the new gradient step with the previous update (momentum).
        self.W0_velocity = (self.momentum * self.W0_velocity) + (self.alpha * dL_dW0)
        self.W1_velocity = (self.momentum * self.W1_velocity) + (self.alpha * dL_dW1)
        self.W2_velocity = (self.momentum * self.W2_velocity) + (self.alpha * dL_dW2)

        # Update weights
        self.W0 -= self.W0_velocity
        self.W1 -= self.W1_velocity
        self.W2 -= self.W2_velocity

    def predict(self, name):
        """Predict the label distribution for a raw name.

        Returns:
            A list of ``(probability, label)`` tuples for every label, sorted
            from most to least likely.

        Raises:
            ValueError: if the serializer rejects the name.
            Exception: if no hypothesis was produced.
        """
        example = self.serializer.serialize_example(name)
        if example is None:
            raise ValueError('The name "{}" cannot be serialized'.format(name))

        # The label only influences the reported loss, which is ignored here.
        label = np.zeros((self.output_dimensions, ))

        forward_propagation_results = self.__perform_forward_propagation__(example, label)
        letter_pos_to_hypothesis = forward_propagation_results['letter_pos_to_hypothesis']

        if len(letter_pos_to_hypothesis) == 0:
            raise Exception('Hypothesis cannot be obtained')

        # The hypothesis is already a softmax output; applying softmax again
        # would flatten the probabilities.
        hypothesis = letter_pos_to_hypothesis[-1]

        formatted_hypothesis = [
            (hypothesis[k], self.serializer.index_to_label[k])
            for k in range(self.output_dimensions)
        ]
        formatted_hypothesis.sort(reverse=True)

        return formatted_hypothesis

    def save_model(self, filename):
        """Save the three weight matrices to a compressed .npz file."""
        np.savez_compressed(filename,
            layer_1_weights=self.W1,
            layer_2_weights=self.W2,
            hidden_state_weights=self.W0)

    def load_model_from_file(self, filename):
        """Load the three weight matrices written by ``save_model``."""
        # The context manager closes the underlying file once the arrays are read.
        with np.load(filename) as data:
            self.W1 = data['layer_1_weights']
            self.W2 = data['layer_2_weights']
            self.W0 = data['hidden_state_weights']

    def __str__(self):
        """Return a one-line summary of the hyper-parameters."""
        description = "RNN with learning rate: {}, momentum: {}, L2 reg. rate: {}, Weight Init. Type: {}, Num. Epoche: {}"
        return description.format(self.alpha,
                                  self.momentum,
                                  self.l2_lambda,
                                  self.weight_init_type,
                                  self.num_epoche)


In [None]:
import numpy as np
from names_to_nationality_classifier import NamesToNationalityClassifier
from collections import OrderedDict

# Make matplotlib not interactive
import matplotlib as mpl
mpl.use('Agg')

import matplotlib.pyplot as plt 

def get_countries(filepath='data/countries.csv'):
    '''
        Obtains a map from country ID to (country name, nationality).
        For example,
        {
            5998: ("United Kingdom", "British"),
            5978: ("China", "Chinese"),
            ...
        }

        Each line of the file is expected to hold exactly three
        comma-separated fields: id, country name, nationality.  Lines with a
        different number of fields are skipped.  A three-field line whose id
        is not an integer raises ValueError.
    '''
    country_id_to_country_name = {}
    print('Countries Filepath:', filepath)

    with open(filepath) as countries_file_reader:
        for line in countries_file_reader:
            # Drop the line terminator first; otherwise it stays attached to
            # the last field and every nationality ends with "\n".
            tokenized_line = line.rstrip('\r\n').split(',')
            if len(tokenized_line) != 3:
                continue

            country_id, country_name, nationality = tokenized_line
            country_id_to_country_name[int(country_id)] = (country_name, nationality)

    return country_id_to_country_name

def get_records(max_records_per_country=float("inf"), filepath='data/records.csv'):
    '''
        Obtains the records from the CSV file into a list.
        For example,
        [
            ("Bob Smith", 5998),
            ("Xi Jinping", 5978),
            ...
        ]

        Each line of the file is expected to hold exactly three
        comma-separated fields; the second is the name and the third the
        integer country ID (the first is ignored).  Lines with a different
        number of fields are skipped.

        The records are shuffled before the per-country cap is applied, so
        which records of an over-represented country are kept is random.

        max_records_per_country: the maximum number of records kept per country ID.
        filepath: the location of the records CSV file.
    '''
    # We first put all the records from the file
    raw_records = []
    with open(filepath) as reader:
        for line in reader:
            tokenized_line = line.split(',')
            if len(tokenized_line) != 3:
                continue

            # int() tolerates the trailing line terminator on the last field.
            raw_records.append((tokenized_line[1], int(tokenized_line[2])))

    # Shuffle the raw records to remove the potential ordering in the file
    np.random.shuffle(raw_records)

    # We then add the records to our dataset ensuring that it meets the count.
    # The cap is checked for every record, including the first one of a
    # country, so a cap below 1 keeps nothing.
    records = []
    country_id_to_num_records = {}
    for record in raw_records:
        country_of_birth_id = record[1]
        num_records_so_far = country_id_to_num_records.get(country_of_birth_id, 0)

        if num_records_so_far < max_records_per_country:
            records.append(record)
            country_id_to_num_records[country_of_birth_id] = num_records_so_far + 1

    return records

def get_dataset():
    '''
        Builds the dataset used for training.

        It returns three values, in this order:
        1.  A sorted list of all possible labels (country names)
            For example,
            [
                "China",
                "United Kingdom",
                ...
            ]

        2.  A list of examples
            For example,
            [
                "Bob Smith",
                "Xi Jinping",
                ...
            ]

        3.  A list of labels where label[i] is the label for example[i]
            For example,
            [
                "United Kingdom",
                "China",
                ...
            ]

        Records whose country ID is not in the countries file are dropped.

        Other country files that can be passed to get_countries():
        - data/countries-without-usa-or-canada.csv
        - data/china-korea-japan-vietnam-countries.csv
        - data/european-countries.csv

        Note: For data/china-korea-japan-vietnam-countries.csv, use the following hyper-params:
        - Momentum = 0.9
        - L2 = 0.0001
        - Learning Rate = 0.0001
        - Hidden Dimensions: 200
        - Epoche: 50

        Note: For data/countries-without-usa-or-canada.csv, use the following hyper-params:
        - Momentum = 0.9
        - L2 = 0
        - Learning Rate = 0.0001
        - Hidden Dimensions: 500
        - Epoche: 20
    '''
    country_lookup = get_countries(filepath='data/countries.csv')

    # Only the country name (first tuple element) is used as a label.
    countries = sorted(names[0] for names in country_lookup.values())

    # Swap each known country ID for its country name; drop unknown IDs.
    named_records = []
    for person_name, country_id in get_records(max_records_per_country=5000):
        if country_id in country_lookup:
            named_records.append((person_name, country_lookup[country_id][0]))

    # Shuffle the records
    np.random.shuffle(named_records)

    # Splits the records into two parallel lists
    examples = []
    labels = []
    for person_name, country_name in named_records:
        examples.append(person_name)
        labels.append(country_name)

    return countries, examples, labels

def main():
    '''
        The main method.

        Trains a classifier on the dataset, saves a plot of the error and the
        accuracy per epoche to the working directory, and saves the trained
        weights under data/.
    '''
    countries, examples, labels = get_dataset()
    plt.ioff()

    classifier = NamesToNationalityClassifier(countries,
                                            alpha=0.0001,
                                            hidden_dimensions=500,
                                            momentum=0.9,
                                            num_epoche=20,
                                            l2_lambda=0)

    classifier.add_training_examples(examples, labels)
    performance = classifier.train()

    epoches = list(range(classifier.num_epoche))

    # Plot the performance
    fig, (errors_plt, accuracy_plt) = plt.subplots(2)

    # Hidden dimensions and number of epoche are whole numbers, so they are
    # not forced into a 5-decimal float format.
    plt_title_format = "Performance for Learning Rate: {:.5f}, Hidden Dim: {}, \nL2_lambda: {:.5f}, Momentum: {:.5f}, Num Epoche: {}"
    fig_title = plt_title_format.format(classifier.alpha,
                                        classifier.hidden_dimensions,
                                        classifier.l2_lambda,
                                        classifier.momentum,
                                        classifier.num_epoche)
    fig.suptitle(fig_title, fontsize=10)

    errors_plt.set_title('Errors vs Epoche', fontsize=10)
    errors_plt.plot(epoches, performance['epoche_to_train_avg_error'], label='Train Avg. Error')
    errors_plt.plot(epoches, performance['epoche_to_test_avg_error'], label='Test Avg. Error')
    errors_plt.legend()
    errors_plt.set_xlabel('Epoche')
    errors_plt.set_ylabel('Error')

    accuracy_plt.set_title('Accuracy vs Epoche', fontsize=10)
    accuracy_plt.plot(epoches, performance['epoche_to_train_accuracy'], label='Train Accuracy')
    accuracy_plt.plot(epoches, performance['epoche_to_test_accuracy'], label='Test Accuracy')
    accuracy_plt.legend()
    accuracy_plt.set_xlabel('Epoche')
    accuracy_plt.set_ylabel('Accuracy')

    plt.subplots_adjust(top=0.85)
    plt.subplots_adjust(hspace=0.5)

    # The plot and the weights share one tag built from the hyper-parameters.
    # Dots are replaced so that the tag contains no extra file extensions.
    run_tag = 'L{}-H-{}-R-{}-M-{}-E-{}'.format(classifier.weight_init_type,
                                               str(classifier.hidden_dimensions).replace('.', '_'),
                                               str(classifier.alpha).replace('.', '_'),
                                               str(classifier.momentum).replace('.', '_'),
                                               str(classifier.num_epoche).replace('.', '_'))

    # Save the plot
    plt.savefig(run_tag + '-plots.png')
    plt.close(fig)

    # Save the data, and only report success after the file has been written.
    model_path = 'data/' + run_tag + '-data'
    classifier.save_model(model_path)
    print('Saved model to', model_path + '.npz')

    # To make predictions with a saved model:
    #     classifier.load_model_from_file(model_path + '.npz')
    #     print(classifier.predict('Emilio Kartono'))

# Only start training when this file is executed directly (script or notebook
# cell), not when it is imported for its helper functions.
if __name__ == "__main__":
    main()


Countries Filepath: data/countries-without-usa-or-canada.csv
