In [1]:
import pandas as pd
import collections
from collections import Counter
import string
import numpy as np
import pandas as pd

In [2]:
# Load the surnames dataset from a CSV file in the current working directory.
# The preview printed in the next cell shows the columns nationality,
# nationality_index, split and surname.
df_all = pd.read_csv("surnames_with_splits.csv")

In [3]:
# Quick sanity check of the loaded frame: its dimensions and ten sampled rows.
print("shape of the data: ", df_all.shape)
print('-'*60)
# A fixed random_state makes the preview reproducible: without it every
# Restart & Run All prints a different set of rows.
print(df_all.sample(10, random_state=0))

shape of the data:  (10980, 4)
------------------------------------------------------------
     nationality  nationality_index  split     surname
9297     Russian                 13  train     Iskakov
1261      Arabic                 15    val      Dagher
7711    Japanese                  7  train      Narato
5767      German                  9  train       Reier
5222     English                 12   test      Gately
8798     Russian                 13  train  Jachmenkov
1708     Chinese                  3  train         Wei
2057       Czech                  5  train   Kucharova
6550       Irish                  1    val       Flann
2212       Czech                  5   test         Opp


In [4]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    A token that is not yet known is assigned the current size of the
    vocabulary as its index, so a vocabulary that starts empty numbers its
    tokens 0, 1, 2, ... in the order in which they are first added.
    """

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices.
                The dict is stored as-is (not copied), so later calls to
                `add_token` extend the caller's dict in place.
        """
        if token_to_idx is None:
            token_to_idx = {}

        self._token_to_idx = token_to_idx

        # Reverse mapping, derived once from the forward mapping and then kept
        # in sync by add_token.
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Adding a token that is already present does not change the mappings;
        its existing index is returned.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # The next free index is the current number of known tokens.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is not in the Vocabulary
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            # %s (not %d) so that a non-numeric index still produces the
            # documented KeyError instead of a TypeError raised while the
            # message is being formatted. The one-element tuple keeps a tuple
            # index from being unpacked as several format arguments.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Return the number of tokens in the Vocabulary."""
        return len(self._token_to_idx)


class SequenceVocabulary(Vocabulary):
    """Vocabulary that also registers four special sequence tokens.

    The mask, unknown, begin-of-sequence and end-of-sequence tokens are added
    (in that order) when the object is constructed, and their indices are
    exposed as `mask_index`, `unk_index`, `begin_seq_index` and
    `end_seq_index`.
    """

    def __init__(self,
                 token_to_idx=None,
                 unk_token="<UNK>",
                 mask_token="<MASK>",
                 begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            unk_token (str): token standing in for unknown tokens
            mask_token (str): token whose index is exposed as `mask_index`
            begin_seq_token (str): token marking the start of a sequence
            end_seq_token (str): token marking the end of a sequence
        """
        # The parent constructor defines ._token_to_idx and ._idx_to_token.
        super().__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        # Registration order matters: when no pre-existing map is passed the
        # four special tokens receive the indices 0, 1, 2 and 3 respectively.
        self.mask_index = self.add_token(mask_token)
        self.unk_index = self.add_token(unk_token)
        self.begin_seq_index = self.add_token(begin_seq_token)
        self.end_seq_index = self.add_token(end_seq_token)

    # Overrides Vocabulary.lookup_token
    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            The UNK fallback is only used while `unk_index` is >= 0;
            otherwise a missing token raises KeyError.
        """
        if not self.unk_index >= 0:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

# 1. SurnameVectorizer class

In [14]:
class SurnameVectorizer(object):
    """ The Vectorizer which coordinates the Vocabularies and puts them to use"""
    def __init__(self, surname_vocab, nationality_vocab):
        """
        Args:
            surname_vocab (Vocabulary): maps characters to integers
            nationality_vocab (Vocabulary): maps nationalities to integers
        """
        self.surname_vocab       = surname_vocab
        self.nationality_vocab   = nationality_vocab

    @classmethod
    def from_dataframe(cls, surname_df):
        """Instantiate the vectorizer from the dataset dataframe

        Args:
            surname_df (pandas.DataFrame): the surnames dataset; its
                `surname` and `nationality` columns are read
        Returns:
            an instance of the SurnameVectorizer
        """
        surname_vocab      = SequenceVocabulary()
        nationality_vocab  = Vocabulary()

        # Walk the two needed columns in parallel. This visits the rows in the
        # same order as iterrows() but avoids building a Series for every row.
        for surname, nationality in zip(surname_df["surname"],
                                        surname_df["nationality"]):
            # Every character of the surname is a token of surname_vocab
            for letter in surname:
                surname_vocab.add_token(letter)
            # The whole nationality string is a token of nationality_vocab
            nationality_vocab.add_token(nationality)

        return cls(surname_vocab, nationality_vocab)

    def vectorize(self, surname, vector_length=-1):
        """Turn a surname into a pair of index vectors shifted by one position.

        Args:
            surname (str): the string of characters
            vector_length (int): an argument for forcing the length of index vector;
                a negative value (the default) means "exactly as long as needed",
                i.e. len(surname) + 1
        Returns:
            a tuple (from_vector, to_vector) of np.int64 arrays of length
            vector_length: from_vector holds the begin-of-sequence index
            followed by the character indices, to_vector holds the character
            indices followed by the end-of-sequence index; both are padded on
            the right with the mask index
        Raises:
            ValueError: if a non-negative vector_length is smaller than
                len(surname) + 1
        """
        # begin-of-sequence index, then one index per character, then the
        # end-of-sequence index
        indices = [self.surname_vocab.begin_seq_index]
        indices.extend(self.surname_vocab.lookup_token(token)
                       for token in surname)
        indices.append(self.surname_vocab.end_seq_index)

        # Each output drops one element of `indices` (the last or the first).
        required_length = len(indices) - 1
        if vector_length < 0:
            vector_length = required_length
        elif vector_length < required_length:
            # Fail with a clear message instead of NumPy's broadcast error.
            raise ValueError(
                "vector_length (%s) is too small for surname %r: "
                "at least %s positions are needed"
                % (vector_length, surname, required_length))

        mask_index = self.surname_vocab.mask_index

        # from_vector: everything except the last index, right-padded with the
        # mask index
        from_vector = np.full(vector_length, mask_index, dtype=np.int64)
        from_vector[:required_length] = indices[:-1]

        # to_vector: everything except the first index, right-padded with the
        # mask index
        to_vector = np.full(vector_length, mask_index, dtype=np.int64)
        to_vector[:required_length] = indices[1:]

        return from_vector, to_vector

# 2. Instantiate a SurnameVectorizer from the dataset (all splits)

In [15]:
# Work on a copy of the whole frame, so df_all itself stays available unchanged.
df_sample = df_all.copy()
# Last expression of the cell: displays the first five rows.
df_sample.head()

Unnamed: 0,nationality,nationality_index,split,surname
0,Arabic,15,train,Totah
1,Arabic,15,train,Abboud
2,Arabic,15,train,Fakhoury
3,Arabic,15,train,Srour
4,Arabic,15,train,Sayegh


In [16]:
### Instantiate a vectorizer
### (from_dataframe builds the surname and nationality vocabularies from the rows of df_sample)
vectorizer_sample = SurnameVectorizer.from_dataframe(df_sample)

### A vectorizer has two vocabularies (attributes): one for surnames (characters), one for nationalities

In [17]:
# Show the instance attributes of the vectorizer as a dict.
vars(vectorizer_sample)

{'surname_vocab': <__main__.SequenceVocabulary at 0x7fccbcc2dac0>,
 'nationality_vocab': <__main__.Vocabulary at 0x7fccbcc2dc40>}

In [18]:
# Report the size of the surname vocabulary, then dump its
# token -> index pairs; each item is printed on its own line.
print(
    f"surname_vocab includes {len(vectorizer_sample.surname_vocab)} tokens",
    "-"*100,
    "_token_to_idx:",
    list(vectorizer_sample.surname_vocab._token_to_idx.items()),
    sep="\n",
)

surname_vocab includes 88 tokens
----------------------------------------------------------------------------------------------------
_token_to_idx:
[('<MASK>', 0), ('<UNK>', 1), ('<BEGIN>', 2), ('<END>', 3), ('T', 4), ('o', 5), ('t', 6), ('a', 7), ('h', 8), ('A', 9), ('b', 10), ('u', 11), ('d', 12), ('F', 13), ('k', 14), ('r', 15), ('y', 16), ('S', 17), ('e', 18), ('g', 19), ('C', 20), ('m', 21), ('H', 22), ('i', 23), ('K', 24), ('n', 25), ('W', 26), ('s', 27), ('f', 28), ('G', 29), ('M', 30), ('l', 31), ('B', 32), ('z', 33), ('N', 34), ('I', 35), ('w', 36), ('D', 37), ('Q', 38), ('j', 39), ('E', 40), ('R', 41), ('Z', 42), ('c', 43), ('Y', 44), ('J', 45), ('L', 46), ('O', 47), ('-', 48), ('P', 49), ('X', 50), ('p', 51), (':', 52), ('v', 53), ('U', 54), ('1', 55), ('V', 56), ('x', 57), ('/', 58), ('q', 59), ('é', 60), ('É', 61), ("'", 62), ('ç', 63), ('ê', 64), ('ß', 65), ('ö', 66), ('ä', 67), ('ü', 68), ('ú', 69), ('à', 70), ('ò', 71), ('è', 72), ('ó', 73), ('ù', 74), ('ì', 75), ('Ś',

# 3. Methods

### (classmethod) from_dataframe(surname_df): Instantiate the vectorizer from the dataset dataframe.
1. First instantiate a Vocabulary for nationalities and a SequenceVocabulary for surnames, and fill them from the rows of the input dataframe (here loaded from "surnames_with_splits.csv").
2. Use the surname_vocab and nationality_vocab as the inputs to instantiate a vectorizer.

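### vectorize(surname, vector_length=-1): It takes as an argument a string representing a surname, and returns a vectorized representation of the surname as two index vectors: `from_vector` (the BEGIN index followed by the character indices) and `to_vector` (the character indices followed by the END index), both padded on the right with the MASK index up to `vector_length`. This is the key functionality of the Vectorizer.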

In [19]:
##### Initializing SurnameVectorizer
vectorizer     = SurnameVectorizer.from_dataframe(df_sample)
print('Surname_vocab:',vectorizer.surname_vocab._idx_to_token)

Surname_vocab: {0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>', 4: 'T', 5: 'o', 6: 't', 7: 'a', 8: 'h', 9: 'A', 10: 'b', 11: 'u', 12: 'd', 13: 'F', 14: 'k', 15: 'r', 16: 'y', 17: 'S', 18: 'e', 19: 'g', 20: 'C', 21: 'm', 22: 'H', 23: 'i', 24: 'K', 25: 'n', 26: 'W', 27: 's', 28: 'f', 29: 'G', 30: 'M', 31: 'l', 32: 'B', 33: 'z', 34: 'N', 35: 'I', 36: 'w', 37: 'D', 38: 'Q', 39: 'j', 40: 'E', 41: 'R', 42: 'Z', 43: 'c', 44: 'Y', 45: 'J', 46: 'L', 47: 'O', 48: '-', 49: 'P', 50: 'X', 51: 'p', 52: ':', 53: 'v', 54: 'U', 55: '1', 56: 'V', 57: 'x', 58: '/', 59: 'q', 60: 'é', 61: 'É', 62: "'", 63: 'ç', 64: 'ê', 65: 'ß', 66: 'ö', 67: 'ä', 68: 'ü', 69: 'ú', 70: 'à', 71: 'ò', 72: 'è', 73: 'ó', 74: 'ù', 75: 'ì', 76: 'Ś', 77: 'ą', 78: 'ń', 79: 'á', 80: 'ż', 81: 'Ż', 82: 'ł', 83: 'õ', 84: 'ã', 85: 'í', 86: 'ñ', 87: 'Á'}


In [22]:
# Vectorize a surname with the default vector_length (-1): both vectors then
# have length len(surname) + 1 and contain no padding.
example_surname = "Nakamura"
# vectorize returns the pair (from_vector, to_vector)
example_vector = vectorizer.vectorize(example_surname)
print('Example surname', example_surname)
print('-'*100)
print('from_vector:', example_vector[0])
print('to_vector:', example_vector[1])

Example surname Nakamura
----------------------------------------------------------------------------------------------------
from_vector: [ 2 34  7 14  7 21 11 15  7]
to_vector: [34  7 14  7 21 11 15  7  3]


In [24]:
# Same demonstration with a different surname; the repeated letter "l" maps to
# the same index twice.
example_surname = "Miller"
# vectorize returns the pair (from_vector, to_vector)
example_vector = vectorizer.vectorize(example_surname)
print('Example surname', example_surname)
print('-'*100)
print('from_vector:', example_vector[0])
print('to_vector:', example_vector[1])

Example surname Miller
----------------------------------------------------------------------------------------------------
from_vector: [ 2 30 23 31 31 18 15]
to_vector: [30 23 31 31 18 15  3]


In [25]:
# A short surname with the default vector_length: the vectors are only
# len(surname) + 1 = 3 entries long.
example_surname = "Li"
# vectorize returns the pair (from_vector, to_vector)
example_vector = vectorizer.vectorize(example_surname)
print('Example surname', example_surname)
print('-'*100)
print('from_vector:', example_vector[0])
print('to_vector:', example_vector[1])

Example surname Li
----------------------------------------------------------------------------------------------------
from_vector: [ 2 46 23]
to_vector: [46 23  3]


In [26]:
# The same surname with an explicit vector_length of 15: both vectors are
# padded on the right with the mask index up to that length.
example_surname = "Li"
# vectorize returns the pair (from_vector, to_vector)
example_vector = vectorizer.vectorize(example_surname,15)
print('Example surname', example_surname)
print('-'*100)
print('from_vector:', example_vector[0])
print('to_vector:', example_vector[1])

Example surname Li
----------------------------------------------------------------------------------------------------
from_vector: [ 2 46 23  0  0  0  0  0  0  0  0  0  0  0  0]
to_vector: [46 23  3  0  0  0  0  0  0  0  0  0  0  0  0]
