In [1]:
import pandas as pd
import collections
from collections import Counter
import string
import numpy as np

In [2]:
# Load the surnames dataset; the path is relative to the notebook's working directory.
df_all = pd.read_csv('surnames_with_splits.csv')

In [3]:
# Report the frame's dimensions, then preview its first five rows.
print("shape of the data: ", df_all.shape)
print("-" * 60)
print(df_all.head(n=5))

shape of the data:  (10980, 4)
------------------------------------------------------------
  nationality  nationality_index  split   surname
0      Arabic                 15  train     Totah
1      Arabic                 15  train    Abboud
2      Arabic                 15  train  Fakhoury
3      Arabic                 15  train     Srour
4      Arabic                 15  train    Sayegh


In [4]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and integer indices.

    Tokens receive indices in the order they are first added. When `add_unk`
    is True the UNK token is added during construction, and `lookup_token`
    falls back to its index for tokens that were never added.
    """
    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices;
                it is copied, so the caller's dict is never modified
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        ### Copy the incoming map: add_token() mutates self._token_to_idx, and
        ### aliasing the argument would leak those insertions back to the caller.
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk   = add_unk
        self._unk_token = unk_token
        ### -999 is a sentinel meaning "no UNK token"; lookup_token() checks for >= 0.
        self.unk_index  = -999
        ### If add_unk=True the UNK token is added right away. Starting from an
        ### empty map it is the first token, so unk_index goes from -999 to 0.
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
                (the existing index if the token was already present)
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]
        ### A new token takes the next free slot, i.e. the current vocabulary size.
        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is absent and no UNK token was added
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            ### .get(): return self.unk_index if the key "token" does not exist.
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### %s (not %d) so that a non-numeric index still produces the
            ### documented KeyError instead of a TypeError from string formatting.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Number of tokens in the Vocabulary (the UNK token included, if added)."""
        return len(self._token_to_idx)

# 1. SurnameVectorizer class

In [5]:
class SurnameVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, surname_vocab, nationality_vocab):
        """
        Args:
            surname_vocab (Vocabulary): maps characters to integers
            nationality_vocab (Vocabulary): maps nationalities to integers
        """
        self.surname_vocab = surname_vocab
        self.nationality_vocab = nationality_vocab

    @classmethod
    def from_dataframe(cls, surname_df):
        """Instantiate the vectorizer from the dataset dataframe

        Args:
            surname_df (pandas.DataFrame): the surnames dataset; its
                `surname` and `nationality` columns are read
        Returns:
            an instance of the SurnameVectorizer
        """
        ### The surname vocabulary gets "@" as its UNK token;
        ### the nationality vocabulary has no UNK token.
        surname_vocab     = Vocabulary(add_unk=True, unk_token="@")
        nationality_vocab = Vocabulary(add_unk=False)

        ### Iterate the two columns directly instead of DataFrame.iterrows():
        ### iterrows() builds a Series per row only to read two fields.
        ### The two vocabularies are independent, so each still receives its
        ### tokens in the same first-seen order as a row-by-row pass.
        for surname in surname_df["surname"]:
            # Add tokens(characters) to surname_vocab
            for letter in surname:
                surname_vocab.add_token(letter)
        for nationality in surname_df["nationality"]:
            # Add tokens(words) to nationality_vocab
            nationality_vocab.add_token(nationality)

        return cls(surname_vocab, nationality_vocab)

    ### This is the key functionality of the Vectorizer.
    ### It takes as an argument a string representing a surname,
    ### and returns a vectorized representation of the surname.
    def vectorize(self, surname):
        """
        Create a collapsed one-hot representation vector for the surname
        Limitations of the one-hot method:
        1 - Sparseness, n_unique_characters in a surname << n_characters in the vocabulary
        2 - Discarding the order (and the repetition) of the characters

        Args:
            surname (str): the surname
        Returns:
            one_hot (np.ndarray): the collapsed one-hot encoding, a float32
                vector with one element per token in surname_vocab
        """
        ### Create an array where each element corresponds to each character in the vocabulary
        one_hot = np.zeros(len(self.surname_vocab), dtype=np.float32)
        ### Run lookup_token() for each character in the surname sequentially, return an index
        ### Assign the corresponding element in the array to 1
        ### (a repeated character sets the same element again, so it stays 1).
        for token in surname:
            one_hot[self.surname_vocab.lookup_token(token)] = 1
        return one_hot


# 2. Instantiate the SurnameVectorizer from a sample of the data

### First, draw a static sample (fixed random seed) from the entire dataset

In [6]:
# Draw 100 rows at random; the fixed random_state makes the draw repeatable.
df_sample = df_all.sample(n=100, random_state=100)

In [7]:
# Preview the first five sampled rows.
df_sample.head(n=5)

Unnamed: 0,nationality,nationality_index,split,surname
5362,English,12,test,Hepples
900,Arabic,15,train,Sarkis
10686,Spanish,6,train,Garza
8902,Russian,13,train,Zhurikhin
1296,Arabic,15,val,Aswad


In [8]:
# Count the sampled rows per nationality (table rows) and dataset split (table columns).
pd.crosstab(index=df_sample["nationality"], columns=df_sample["split"])

split,test,train,val
nationality,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Arabic,3,12,2
Chinese,1,2,0
Czech,1,2,0
English,4,18,1
French,0,2,0
German,1,4,1
Irish,0,3,0
Italian,2,5,1
Japanese,2,4,3
Korean,0,1,0


In [9]:
# Build the character and nationality vocabularies from the sampled rows.
vectorizer = SurnameVectorizer.from_dataframe(surname_df=df_sample)

### A vectorizer has two vocabularies (stored as attributes): one for surnames and one for nationalities

In [10]:
# Show the instance attributes of the vectorizer as a dict.
vars(vectorizer)

{'surname_vocab': <__main__.Vocabulary at 0x7f7fe4a77970>,
 'nationality_vocab': <__main__.Vocabulary at 0x7f7fe4a77f10>}

In [11]:
# Print both directions of the nationality mapping, a separator,
# and the number of tokens in the surname (character) vocabulary.
# A single print with sep="\n" emits each item on its own line.
print(
    'nationality_vocab',
    vectorizer.nationality_vocab._token_to_idx,
    vectorizer.nationality_vocab._idx_to_token,
    '-'*60,
    'surname_vocab',
    f"Includes {len(vectorizer.surname_vocab)} tokens",
    sep="\n",
)

nationality_vocab
{'English': 0, 'Arabic': 1, 'Spanish': 2, 'Russian': 3, 'Japanese': 4, 'Chinese': 5, 'German': 6, 'Italian': 7, 'Irish': 8, 'Polish': 9, 'Korean': 10, 'French': 11, 'Czech': 12}
{0: 'English', 1: 'Arabic', 2: 'Spanish', 3: 'Russian', 4: 'Japanese', 5: 'Chinese', 6: 'German', 7: 'Italian', 8: 'Irish', 9: 'Polish', 10: 'Korean', 11: 'French', 12: 'Czech'}
------------------------------------------------------------
surname_vocab
Includes 49 tokens


# 3. Methods

### (classmethod) from_dataframe(surname_df): Instantiate the vectorizer from the dataset dataframe.
1. First instantiate two Vocabularies based on the input data "surname_df". [See a walkthrough of Vocabulary class here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Surname_Nationality/class_Vocabulary.ipynb).
2. Use the surname_vocab and nationality_vocab as inputs to instantiate a vectorizer.

### vectorize(surname): It takes as an argument a string representing a surname, and returns a vectorized representation of the surname. This is the key functionality of the Vectorizer.

In [12]:
# Surname used to demonstrate vectorize().
example_surname = "Onizuka"

In [13]:
# Rebuild the vectorizer from the sample, encode the example surname,
# and print the index-to-character map next to the resulting vector.
vectorizer = SurnameVectorizer.from_dataframe(surname_df=df_sample)
one_hot = vectorizer.vectorize(surname=example_surname)
print('Surname_vocab:', vectorizer.surname_vocab._idx_to_token)
print('-' * 100)
print('One-hot representation:', one_hot)


Surname_vocab: {0: '@', 1: 'H', 2: 'e', 3: 'p', 4: 'l', 5: 's', 6: 'S', 7: 'a', 8: 'r', 9: 'k', 10: 'i', 11: 'G', 12: 'z', 13: 'Z', 14: 'h', 15: 'u', 16: 'n', 17: 'A', 18: 'w', 19: 'd', 20: 't', 21: 'c', 22: 'L', 23: 'b', 24: 'o', 25: 'Y', 26: 'v', 27: 'g', 28: 'y', 29: 'm', 30: 'E', 31: 'V', 32: 'D', 33: 'N', 34: 'T', 35: 'C', 36: 'J', 37: 'K', 38: 'M', 39: 'W', 40: 'j', 41: 'B', 42: 'P', 43: 'q', 44: 'F', 45: 'è', 46: 'O', 47: 'f', 48: 'R'}
----------------------------------------------------------------------------------------------------
One-hot representation: [0. 0. 0. 0. 0. 0. 0. 1. 0. 1. 1. 0. 1. 0. 0. 1. 1. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.
 0.]
