In [3]:
import pandas as pd
import collections
from collections import Counter
import string

In [4]:
# Load the surname dataset. The file name carries no directory, so it is resolved
# relative to the notebook's working directory.
df_all = pd.read_csv("surnames_with_splits.csv")

In [6]:
print("shape of the data: ", df_all.shape)
print('-'*60)
# NOTE: sample() is called without random_state, so the ten rows shown change on every run.
print(df_all.sample(10))

shape of the data:  (10980, 4)
------------------------------------------------------------
      nationality  nationality_index  split     surname
2462        Dutch                  2   test      Klerks
10225     Russian                 13    val  Halymbadja
2382        Dutch                  2  train  Meeuwissen
8089       Polish                 14  train      Smolák
1050       Arabic                 15  train      Morcos
10018     Russian                 13    val    Timashov
1192       Arabic                 15    val      Morcos
5763       German                  9  train       Acker
5123      English                 12   test       Eltis
10669     Spanish                  6  train     Montero


In [7]:
### Define Vocabulary class

class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    ``_token_to_idx`` maps token -> index and ``_idx_to_token`` maps
    index -> token; ``add_token`` keeps the two dicts in sync.
    """

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices.
                The mapping is copied, so the caller's dict is never modified.
        """
        # Copy instead of aliasing: add_token() writes into this dict, and
        # mutating an object the caller still owns is a surprising side effect.
        self._token_to_idx = {} if token_to_idx is None else dict(token_to_idx)

        # Reverse lookup table derived from the forward one.
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            # Already known: report the existing index, change nothing.
            return self._token_to_idx[token]

        # The next index is the current vocabulary size.
        # NOTE: this only yields an unused index when the existing indices
        # are exactly 0..len-1; a sparse pre-existing map could collide.
        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is not in the Vocabulary
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            # An f-string formats any value; "%d" raised TypeError for a
            # non-numeric index and so hid the documented KeyError.
            raise KeyError(f"the index ({index}) is not in the Vocabulary")
        return self._idx_to_token[index]

    def __len__(self):
        """Return the number of tokens stored in the Vocabulary."""
        return len(self._token_to_idx)

# 1. SequenceVocabulary class

In [8]:
class SequenceVocabulary(Vocabulary):
    """Vocabulary that registers four special sequence tokens on creation.

    The mask, unknown, begin-of-sequence and end-of-sequence tokens are added
    in that order by ``__init__``, and their indices are exposed as
    ``mask_index``, ``unk_index``, ``begin_seq_index`` and ``end_seq_index``.
    """

    def __init__(self,
                 token_to_idx=None,
                 unk_token="<UNK>",
                 mask_token="<MASK>",
                 begin_seq_token="<BEGIN>",
                 end_seq_token="<END>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            unk_token (str): token standing in for unknown tokens
            mask_token (str): token used for masking
            begin_seq_token (str): token marking the start of a sequence
            end_seq_token (str): token marking the end of a sequence
        """
        # The parent constructor creates ._token_to_idx and ._idx_to_token.
        super().__init__(token_to_idx)

        self._mask_token = mask_token
        self._unk_token = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token = end_seq_token

        # Registration order decides the indices: starting from an empty
        # vocabulary the four calls below return 0, 1, 2 and 3.
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = self.add_token(self._unk_token)
        self.begin_seq_index = self.add_token(self._begin_seq_token)
        self.end_seq_index = self.add_token(self._end_seq_token)

    # Overrides Vocabulary.lookup_token()
    def lookup_token(self, token):
        """Return the index of ``token``, or the UNK index if it is absent.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            The UNK fallback is only used while `unk_index` is >= 0;
            otherwise a missing token raises KeyError.
        """
        known = self._token_to_idx
        if self.unk_index >= 0:
            return known.get(token, self.unk_index)
        return known[token]

# 2. Instantiate the SequenceVocabulary from the training data

## (1) The vocabulary for the nationality - nationality_vocab
### The corpus of nationality

In [16]:
# Distinct nationality labels (unique() keeps them in order of first appearance)
print('corpus')
print(df_all.nationality.unique())
print('-'*80)
# Rows per nationality (value_counts() sorts by count, largest first, by default)
print('counts')
print(df_all.nationality.value_counts())

corpus
['Arabic' 'Chinese' 'Czech' 'Dutch' 'English' 'French' 'German' 'Greek'
 'Irish' 'Italian' 'Japanese' 'Korean' 'Polish' 'Portuguese' 'Russian'
 'Scottish' 'Spanish' 'Vietnamese']
--------------------------------------------------------------------------------
counts
nationality
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: count, dtype: int64


### Initializing nationality_vocab.

In [18]:
# A plain Vocabulary adds no special tokens, so both mappings start out empty.
nationality_vocab = Vocabulary()
vars(nationality_vocab)

{'_token_to_idx': {}, '_idx_to_token': {}}

### Add the tokens that appear in the nationality column to nationality_vocab.
### There are 18 tokens in the corpus of nationality. 

In [27]:
# set() drops the repeated labels; sorted() then puts them in a deterministic order
sorted(set(df_all.nationality))

['Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese']

In [29]:
# Build the label vocabulary from the sorted set of distinct nationalities,
# so indices follow the sorted order.
nationality_vocab = Vocabulary()
unique_nationalities = sorted(set(df_all.nationality))
for nationality in unique_nationalities:
    nationality_vocab.add_token(nationality)
vars(nationality_vocab)

{'_token_to_idx': {'Arabic': 0,
  'Chinese': 1,
  'Czech': 2,
  'Dutch': 3,
  'English': 4,
  'French': 5,
  'German': 6,
  'Greek': 7,
  'Irish': 8,
  'Italian': 9,
  'Japanese': 10,
  'Korean': 11,
  'Polish': 12,
  'Portuguese': 13,
  'Russian': 14,
  'Scottish': 15,
  'Spanish': 16,
  'Vietnamese': 17},
 '_idx_to_token': {0: 'Arabic',
  1: 'Chinese',
  2: 'Czech',
  3: 'Dutch',
  4: 'English',
  5: 'French',
  6: 'German',
  7: 'Greek',
  8: 'Irish',
  9: 'Italian',
  10: 'Japanese',
  11: 'Korean',
  12: 'Polish',
  13: 'Portuguese',
  14: 'Russian',
  15: 'Scottish',
  16: 'Spanish',
  17: 'Vietnamese'}}

### Another way to add tokens to nationality_vocab

In [31]:
# Feed every row's nationality to add_token(); repeated labels simply return their
# existing index, so indices follow the order of first appearance in the data.
# The column is iterated directly: df_all.iterrows() would build a whole Series
# for each row just to read this one field.
nationality_vocab = Vocabulary()
for nationality in df_all.nationality:
    nationality_vocab.add_token(nationality)
vars(nationality_vocab)

{'_token_to_idx': {'Arabic': 0,
  'Chinese': 1,
  'Czech': 2,
  'Dutch': 3,
  'English': 4,
  'French': 5,
  'German': 6,
  'Greek': 7,
  'Irish': 8,
  'Italian': 9,
  'Japanese': 10,
  'Korean': 11,
  'Polish': 12,
  'Portuguese': 13,
  'Russian': 14,
  'Scottish': 15,
  'Spanish': 16,
  'Vietnamese': 17},
 '_idx_to_token': {0: 'Arabic',
  1: 'Chinese',
  2: 'Czech',
  3: 'Dutch',
  4: 'English',
  5: 'French',
  6: 'German',
  7: 'Greek',
  8: 'Irish',
  9: 'Italian',
  10: 'Japanese',
  11: 'Korean',
  12: 'Polish',
  13: 'Portuguese',
  14: 'Russian',
  15: 'Scottish',
  16: 'Spanish',
  17: 'Vietnamese'}}

### (2) The vocabulary for the surnames
### The corpus - the difference between nationality and surname is that in the corpus of nationality each word is treated as a token, whereas in the corpus of surname each character is treated as a token.
### - Tokens in "nationality_vocab": 'Arabic', 'Chinese', 'Czech', 'Dutch', 'English', ...
### - Tokens in "surname_vocab": 'T', 'a', 't', 'V', ...

In [32]:
# Peek at the surname column; the individual characters of these strings become the tokens of surname_vocab
df_all.surname

0           Totah
1          Abboud
2        Fakhoury
3           Srour
4          Sayegh
           ...   
10975        Dinh
10976       Phung
10977       Quang
10978          Vu
10979          Ha
Name: surname, Length: 10980, dtype: object

### Initializing surname_vocab.

In [33]:
# Unlike Vocabulary, SequenceVocabulary registers its four special tokens in __init__,
# so a fresh instance already occupies indices 0-3.
surname_vocab = SequenceVocabulary()
vars(surname_vocab)

{'_token_to_idx': {'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3},
 '_idx_to_token': {0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [35]:
# Register every character of every surname as a token. The column is iterated
# directly: df_all.iterrows() would build a whole Series per row only to read
# this one field.
for surname in df_all.surname:
    for letter in surname:
        surname_vocab.add_token(letter)
print(f'The number of tokens is {len(surname_vocab)}')


The number of tokens is 88


# 3. Attributes

### ._token_to_idx: a dict mapping each token added to the SequenceVocabulary to its index (inherited from Vocabulary)

In [39]:
print("Print out 20 tokens in the vocabulary")
# dicts keep insertion order: the four special tokens come first, then characters in the order they were added
list(surname_vocab._token_to_idx.items())[:20]

Print out 20 tokens in the vocabulary


[('<MASK>', 0),
 ('<UNK>', 1),
 ('<BEGIN>', 2),
 ('<END>', 3),
 ('T', 4),
 ('o', 5),
 ('t', 6),
 ('a', 7),
 ('h', 8),
 ('A', 9),
 ('b', 10),
 ('u', 11),
 ('d', 12),
 ('F', 13),
 ('k', 14),
 ('r', 15),
 ('y', 16),
 ('S', 17),
 ('e', 18),
 ('g', 19)]

In [40]:
tokens  = ['a','b','c','A','B','C']
mapping = surname_vocab._token_to_idx
print("Print a few elements in surname_vocab._token_to_idx")
for token in tokens:
    # Fall back to the UNK index rather than 0: index 0 belongs to <MASK>, so a
    # character missing from the mapping would otherwise be reported as the mask.
    print(f'The index for "{token}" is {mapping.get(token, surname_vocab.unk_index)}')

Print a few elements in surname_vocab._token_to_idx
The index for "a" is 7
The index for "b" is 10
The index for "c" is 43
The index for "A" is 9
The index for "B" is 32
The index for "C" is 20


### ._idx_to_token: a dict mapping each index to the token added to the SequenceVocabulary (inherited from Vocabulary)

In [41]:
print("Print out 20 tokens in the vocabulary")
# Reverse view of the same entries: index -> token, in the order the tokens were added
list(surname_vocab._idx_to_token.items())[:20]

Print out 20 tokens in the vocabulary


[(0, '<MASK>'),
 (1, '<UNK>'),
 (2, '<BEGIN>'),
 (3, '<END>'),
 (4, 'T'),
 (5, 'o'),
 (6, 't'),
 (7, 'a'),
 (8, 'h'),
 (9, 'A'),
 (10, 'b'),
 (11, 'u'),
 (12, 'd'),
 (13, 'F'),
 (14, 'k'),
 (15, 'r'),
 (16, 'y'),
 (17, 'S'),
 (18, 'e'),
 (19, 'g')]

In [42]:
indices = [0, 2, 6, 100]
mapping = surname_vocab._idx_to_token
# The label names the object actually being inspected (surname_vocab).
print("Print a few elements in surname_vocab._idx_to_token")
for idx in indices:
    # 100 is not a key of the mapping, so .get() returns its default (0) for it.
    print(f'The token for index={idx} is {mapping.get(idx,0)}')

Print a few elements in title_vocab._idx_to_token
The token for index=0 is <MASK>
The token for index=2 is <BEGIN>
The token for index=6 is t
The token for index=100 is 0


# 4. Methods

### add_token(token): Update mapping dicts based on the token

In [43]:
# Start from a fresh vocabulary so the effect of each add_token() call is easy to see
example_vocab = SequenceVocabulary()
vars(example_vocab)

{'_token_to_idx': {'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3},
 '_idx_to_token': {0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [49]:
# A token is any hashable item: here the whole string 'apple' is a single token
new_token = 'apple'
# add_token() returns the token's index; the return value is not needed here
example_vocab.add_token(new_token)
print(f"Add one token {new_token}")
print('-'*60)
vars(example_vocab)


Add one token apple
------------------------------------------------------------


{'_token_to_idx': {'<MASK>': 0,
  '<UNK>': 1,
  '<BEGIN>': 2,
  '<END>': 3,
  'apple': 4},
 '_idx_to_token': {0: '<MASK>',
  1: '<UNK>',
  2: '<BEGIN>',
  3: '<END>',
  4: 'apple'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [50]:
new_token = 'm'
# A token not seen before is given the next index, i.e. the current vocabulary size
example_vocab.add_token(new_token)
print(f"Add one token {new_token}")
print('-'*60)
vars(example_vocab)

Add one token m
------------------------------------------------------------


{'_token_to_idx': {'<MASK>': 0,
  '<UNK>': 1,
  '<BEGIN>': 2,
  '<END>': 3,
  'apple': 4,
  'm': 5},
 '_idx_to_token': {0: '<MASK>',
  1: '<UNK>',
  2: '<BEGIN>',
  3: '<END>',
  4: 'apple',
  5: 'm'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

### lookup_token(token): Retrieve the index associated with the token or the UNK index if token isn't present.

In [53]:
# Re-create example_vocab so it holds only the four special tokens again
example_vocab = SequenceVocabulary()
vars(example_vocab)

{'_token_to_idx': {'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3},
 '_idx_to_token': {0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [54]:
# Register a handful of word-level tokens, confirming each one as it is added
tokens_to_add = ['apple','banana','peach','orange','coconut']
for fruit in tokens_to_add:
    example_vocab.add_token(fruit)
    print(f'{fruit} added')
vars(example_vocab)

apple added
banana added
peach added
orange added
coconut added


{'_token_to_idx': {'<MASK>': 0,
  '<UNK>': 1,
  '<BEGIN>': 2,
  '<END>': 3,
  'apple': 4,
  'banana': 5,
  'peach': 6,
  'orange': 7,
  'coconut': 8},
 '_idx_to_token': {0: '<MASK>',
  1: '<UNK>',
  2: '<BEGIN>',
  3: '<END>',
  4: 'apple',
  5: 'banana',
  6: 'peach',
  7: 'orange',
  8: 'coconut'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [55]:
# 'orange' is in the vocabulary; 'rice' is not, so it resolves to the UNK index
tokens_list = ['orange','rice']
for token in tokens_list:
    token_index = example_vocab.lookup_token(token)
    print(f"The index for {token} is {token_index}")

The index for orange is 7
The index for rice is 1


In [56]:
### Equivalent code: call the method on the class and pass the instance explicitly as `self`
for token in tokens_list:
    token_index = SequenceVocabulary.lookup_token(example_vocab, token)
    print(f"The index for {token} is {token_index}")

The index for orange is 7
The index for rice is 1


### lookup_index(index): Return the token associated with the index

In [57]:
# Translate a couple of indices back into their tokens
indices_list = [1,4]
for idx in indices_list:
    token = example_vocab.lookup_index(idx)
    print(f"The token with index={idx} is {token}")

The token with index=1 is <UNK>
The token with index=4 is apple


In [58]:
### Equivalent code: lookup_index is defined on Vocabulary, so it can be called on that class with the instance as `self`
for idx in indices_list:
    token = Vocabulary.lookup_index(example_vocab, idx)
    print(f"The token with index={idx} is {token}")

The token with index=1 is <UNK>
The token with index=4 is apple


### \_\_len\_\_(): Return the length of _token_to_idx (i.e., the number of tokens in the vocabulary)

In [59]:
# Fresh vocabulary: four special tokens plus the four tokens added below
example_vocab = SequenceVocabulary()
tokens_to_add = ['token1','token2','token3','token4']
for name in tokens_to_add:
    example_vocab.add_token(name)
example_vocab._idx_to_token

{0: '<MASK>',
 1: '<UNK>',
 2: '<BEGIN>',
 3: '<END>',
 4: 'token1',
 5: 'token2',
 6: 'token3',
 7: 'token4'}

In [60]:
# len() calls Vocabulary.__len__, which counts the entries of _token_to_idx (special tokens included)
len(example_vocab)

8