In [1]:
import pandas as pd
import collections
from collections import Counter
import string

In [2]:
# Load the surname dataset into a DataFrame. The path is relative, so the CSV
# is resolved against the notebook's current working directory.
df_all = pd.read_csv('surnames_with_splits.csv')

In [3]:
# Sanity check of the loaded frame: its dimensions, a divider, then the first rows.
frame_shape = df_all.shape
print("shape of the data: ", frame_shape)
print('-' * 60)
print(df_all.head())

shape of the data:  (10980, 4)
------------------------------------------------------------
  nationality  nationality_index  split   surname
0      Arabic                 15  train     Totah
1      Arabic                 15  train    Abboud
2      Arabic                 15  train  Fakhoury
3      Arabic                 15  train     Srour
4      Arabic                 15  train    Sayegh


# 1. Vocabulary class

In [4]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Tokens receive an index the first time they are added. Optionally an
    "unknown" token is registered at construction time so that looking up a
    token that was never added returns the unknown token's index instead of
    raising.
    """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        ### Work on a private copy: add_token() writes into this dict, and the
        ### caller's mapping must not change behind their back.
        self._token_to_idx = {} if token_to_idx is None else dict(token_to_idx)
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk   = add_unk
        self._unk_token = unk_token
        ### -999 is the "no UNK token" sentinel; lookup_token() treats any
        ### negative unk_index as "UNK functionality disabled".
        self.unk_index  = -999
        ### With an empty starting map the UNK token is the first token added,
        ### so unk_index becomes 0.
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Adding a token that is already present is a no-op that returns the
        existing index.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]

        index = len(self._token_to_idx)
        ### A pre-existing map may not be numbered 0..n-1; skip indices that
        ### are already taken so an existing reverse entry is never overwritten.
        while index in self._idx_to_token:
            index += 1
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is absent and no UNK token was added
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            ### .get(): fall back to unk_index when the token is unknown.
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### %s with a 1-tuple formats any hashable index; %d would raise
            ### TypeError for a non-numeric index instead of the KeyError
            ### promised above.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Return the number of tokens stored (the UNK token included)."""
        return len(self._token_to_idx)
    

# 2. Instantiate the Vocabulary from the training data

## (1) The vocabulary for the nationality - nationality_vocab

### The corpus of nationality

In [5]:
# Show the distinct nationality labels first, then the number of rows per label.
nationality_column = df_all['nationality']
print('corpus')
print(nationality_column.unique())
print('-' * 80)
print('counts')
print(nationality_column.value_counts())

corpus
['Arabic' 'Chinese' 'Czech' 'Dutch' 'English' 'French' 'German' 'Greek'
 'Irish' 'Italian' 'Japanese' 'Korean' 'Polish' 'Portuguese' 'Russian'
 'Scottish' 'Spanish' 'Vietnamese']
--------------------------------------------------------------------------------
counts
English       2972
Russian       2373
Arabic        1603
Japanese       775
Italian        600
German         576
Czech          414
Spanish        258
Dutch          236
French         229
Chinese        220
Irish          183
Greek          156
Polish         120
Korean          77
Scottish        75
Vietnamese      58
Portuguese      55
Name: nationality, dtype: int64


### Initializing nationality_vocab.
### The unk_token, i.e., "<UNK>" (unknown word), is the first token added when add_unk=True.
### After the initialization, only one token is stored in the object - "<UNK>" - and its index in nationality_vocab is 0 (unk_index is changed from -999 to 0).

In [6]:
# Fresh vocabulary: with add_unk=True the constructor registers the UNK token.
nationality_vocab = Vocabulary(add_unk=True)
# vars() displays the instance attributes, i.e. both mapping dicts and the UNK settings.
vars(nationality_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### Add the tokens that appear in the nationality column to nationality_vocab.
### There are 18 tokens in the corpus of nationality. 

In [7]:
# Deduplicate the nationality column with set(), then sort the distinct labels.
sorted(set(df_all.nationality))

['Arabic',
 'Chinese',
 'Czech',
 'Dutch',
 'English',
 'French',
 'German',
 'Greek',
 'Irish',
 'Italian',
 'Japanese',
 'Korean',
 'Polish',
 'Portuguese',
 'Russian',
 'Scottish',
 'Spanish',
 'Vietnamese']

In [8]:
# Rebuild the vocabulary from scratch and register each distinct nationality
# in sorted order, so the indices after the UNK token follow that order.
nationality_vocab = Vocabulary(add_unk=True)
distinct_nationalities = sorted(set(df_all.nationality))
for nationality in distinct_nationalities:
    nationality_vocab.add_token(nationality)
vars(nationality_vocab)

{'_token_to_idx': {'<UNK>': 0,
  'Arabic': 1,
  'Chinese': 2,
  'Czech': 3,
  'Dutch': 4,
  'English': 5,
  'French': 6,
  'German': 7,
  'Greek': 8,
  'Irish': 9,
  'Italian': 10,
  'Japanese': 11,
  'Korean': 12,
  'Polish': 13,
  'Portuguese': 14,
  'Russian': 15,
  'Scottish': 16,
  'Spanish': 17,
  'Vietnamese': 18},
 '_idx_to_token': {0: '<UNK>',
  1: 'Arabic',
  2: 'Chinese',
  3: 'Czech',
  4: 'Dutch',
  5: 'English',
  6: 'French',
  7: 'German',
  8: 'Greek',
  9: 'Irish',
  10: 'Italian',
  11: 'Japanese',
  12: 'Korean',
  13: 'Polish',
  14: 'Portuguese',
  15: 'Russian',
  16: 'Scottish',
  17: 'Spanish',
  18: 'Vietnamese'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### Another way to add tokens to nationality_vocab

In [9]:
# Alternative construction: feed every row's nationality to add_token() and
# rely on it ignoring tokens it has already seen. Indices therefore follow the
# order of first appearance in the data.
nationality_vocab = Vocabulary(add_unk=True)
# Iterate the column itself; iterrows() would build a whole Series per row
# just to read this one field.
for nationality in df_all.nationality:
    nationality_vocab.add_token(nationality)
vars(nationality_vocab)

{'_token_to_idx': {'<UNK>': 0,
  'Arabic': 1,
  'Chinese': 2,
  'Czech': 3,
  'Dutch': 4,
  'English': 5,
  'French': 6,
  'German': 7,
  'Greek': 8,
  'Irish': 9,
  'Italian': 10,
  'Japanese': 11,
  'Korean': 12,
  'Polish': 13,
  'Portuguese': 14,
  'Russian': 15,
  'Scottish': 16,
  'Spanish': 17,
  'Vietnamese': 18},
 '_idx_to_token': {0: '<UNK>',
  1: 'Arabic',
  2: 'Chinese',
  3: 'Czech',
  4: 'Dutch',
  5: 'English',
  6: 'French',
  7: 'German',
  8: 'Greek',
  9: 'Irish',
  10: 'Italian',
  11: 'Japanese',
  12: 'Korean',
  13: 'Polish',
  14: 'Portuguese',
  15: 'Russian',
  16: 'Scottish',
  17: 'Spanish',
  18: 'Vietnamese'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

## (2) The vocabulary for the surnames - surname_vocab

### The corpus - the difference between nationality and surname is that in the corpus of nationality, each word is treated as a token, whereas in the corpus of surname, each character is treated as a token.
### - Tokens in "nationality_vocab": 'Arabic', 'Chinese', 'Czech', 'Dutch', 'English', ...
### - Tokens in "surname_vocab": 'T', 'a', 't', 'V', ...

In [10]:
# Preview the surname column; each surname is later split into characters.
df_all.surname

0           Totah
1          Abboud
2        Fakhoury
3           Srour
4          Sayegh
           ...   
10975        Dinh
10976       Phung
10977       Quang
10978          Vu
10979          Ha
Name: surname, Length: 10980, dtype: object

### Initializing surname_vocab.

In [11]:
# Start the character-level vocabulary with only the UNK token registered.
surname_vocab = Vocabulary(add_unk=True)
# Display the instance attributes to confirm the initial state.
vars(surname_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [12]:
# Every character of every surname is a token. Iterate the column directly;
# iterrows() would build a whole Series per row just to read this one field.
for surname in df_all.surname:
    for letter in surname:
        surname_vocab.add_token(letter)
# NOTE(review): the message below ends mid-sentence; it is kept unchanged so
# that the recorded cell output still matches. The count itself is the cell's
# displayed value on the next line.
print('The number of tokens in ')
len(surname_vocab)

The number of tokens in 


85

# 3. Attributes

### ._token_to_idx: a mapping from each token added to the Vocabulary to its index

In [13]:
# Dicts keep insertion order, so the first 20 items are the first 20 tokens added.
print("Print out 20 tokens in the vocabulary")
token_index_pairs = list(surname_vocab._token_to_idx.items())
token_index_pairs[:20]

Print out 20 tokens in the vocabulary


[('<UNK>', 0),
 ('T', 1),
 ('o', 2),
 ('t', 3),
 ('a', 4),
 ('h', 5),
 ('A', 6),
 ('b', 7),
 ('u', 8),
 ('d', 9),
 ('F', 10),
 ('k', 11),
 ('r', 12),
 ('y', 13),
 ('S', 14),
 ('e', 15),
 ('g', 16),
 ('C', 17),
 ('m', 18),
 ('H', 19)]

In [14]:
# Look up a handful of characters directly in the token -> index dict;
# .get() falls back to 0 for a character that was never added.
tokens = ['a', 'b', 'c', 'A', 'B', 'C']
mapping = surname_vocab._token_to_idx
print("Print a few elements in surname_vocab._token_to_idx")
for token in tokens:
    token_index = mapping.get(token, 0)
    print(f'The index for "{token}" is {token_index}')

Print a few elements in surname_vocab._token_to_idx
The index for "a" is 4
The index for "b" is 7
The index for "c" is 40
The index for "A" is 6
The index for "B" is 29
The index for "C" is 17


### ._idx_to_token: a mapping from each index to the token added to the Vocabulary

In [15]:
# Same preview as for the forward mapping, this time from the index -> token dict.
print("Print out 20 tokens in the vocabulary")
index_token_pairs = list(surname_vocab._idx_to_token.items())
index_token_pairs[:20]

Print out 20 tokens in the vocabulary


[(0, '<UNK>'),
 (1, 'T'),
 (2, 'o'),
 (3, 't'),
 (4, 'a'),
 (5, 'h'),
 (6, 'A'),
 (7, 'b'),
 (8, 'u'),
 (9, 'd'),
 (10, 'F'),
 (11, 'k'),
 (12, 'r'),
 (13, 'y'),
 (14, 'S'),
 (15, 'e'),
 (16, 'g'),
 (17, 'C'),
 (18, 'm'),
 (19, 'H')]

In [16]:
# Look up a few indices directly in the index -> token dict. An index that is
# not a key makes .get() return its default, the integer 0.
indices = [0, 2, 6, 100]
mapping = surname_vocab._idx_to_token
print("Print a few elements in surname_vocab._idx_to_token")
for idx in indices:
    found = mapping.get(idx, 0)
    print(f'The token for index={idx} is {found}')

Print a few elements in surname_vocab._idx_to_token
The token for index=0 is <UNK>
The token for index=2 is o
The token for index=6 is A
The token for index=100 is 0


# 4. Methods

### add_token(token): Update mapping dicts based on the token

In [17]:
# Scratch vocabulary for the add_token() demonstration; holds only the UNK token.
example_vocab = Vocabulary(add_unk=True)
vars(example_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [18]:
# Register one new token, then display the vocabulary's state afterwards.
new_token = 'apple'
example_vocab.add_token(new_token)

print(f"Add one token {new_token}")
print('-' * 60)
vars(example_vocab)

Add one token apple
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1},
 '_idx_to_token': {0: '<UNK>', 1: 'apple'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [19]:
# Register a second token; it receives the next free index.
new_token = 'banana'
example_vocab.add_token(new_token)

print(f"Add one token {new_token}")
print('-' * 60)
vars(example_vocab)

Add one token banana
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1, 'banana': 2},
 '_idx_to_token': {0: '<UNK>', 1: 'apple', 2: 'banana'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### lookup_token(token): Retrieve the index associated with the token or the UNK index if token isn't present.

In [20]:
# Reset the scratch vocabulary for the lookup_token() demonstration.
example_vocab = Vocabulary(add_unk=True)
vars(example_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [21]:
# Populate the scratch vocabulary, reporting each token as it is registered.
tokens_to_add = ['apple', 'banana', 'peach', 'orange', 'coconut']
for fruit in tokens_to_add:
    example_vocab.add_token(fruit)
    print(fruit + ' added')
vars(example_vocab)

apple added
banana added
peach added
orange added
coconut added


{'_token_to_idx': {'<UNK>': 0,
  'apple': 1,
  'banana': 2,
  'peach': 3,
  'orange': 4,
  'coconut': 5},
 '_idx_to_token': {0: '<UNK>',
  1: 'apple',
  2: 'banana',
  3: 'peach',
  4: 'orange',
  5: 'coconut'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [22]:
# 'orange' was added above; 'rice' was not, so it resolves to the UNK index.
tokens_list = ['orange', 'rice']
for token in tokens_list:
    token_index = example_vocab.lookup_token(token)
    print(f"The index for {token} is {token_index}")

The index for orange is 4
The index for rice is 0


In [23]:
### Equivalent code: call the method through the class, passing the instance explicitly
for token in tokens_list:
    token_index = Vocabulary.lookup_token(example_vocab, token)
    print(f"The index for {token} is {token_index}")

The index for orange is 4
The index for rice is 0


### lookup_index(index): Return the token associated with the index

In [24]:
# Translate a couple of known indices back into their tokens.
indices_list = [1, 4]
for idx in indices_list:
    token = example_vocab.lookup_index(idx)
    print(f"The token with index={idx} is {token}")

The token with index=1 is apple
The token with index=4 is orange


In [25]:
### Equivalent code: call the method through the class, passing the instance explicitly
for idx in indices_list:
    token = Vocabulary.lookup_index(example_vocab, idx)
    print(f"The token with index={idx} is {token}")

The token with index=1 is apple
The token with index=4 is orange


### \_\_len\_\_(): Return the length of \_token\_to\_idx (i.e., the number of tokens in the vocabulary)

In [26]:
# Build a small vocabulary (UNK plus four tokens) to demonstrate len().
example_vocab = Vocabulary(add_unk=True)
tokens_to_add = ['token1', 'token2', 'token3', 'token4']
for token in tokens_to_add:
    example_vocab.add_token(token)
example_vocab._idx_to_token

{0: '<UNK>', 1: 'token1', 2: 'token2', 3: 'token3', 4: 'token4'}

In [27]:
# len() dispatches to Vocabulary.__len__, which counts the stored tokens (UNK included).
len(example_vocab)

5