In [1]:
import pandas as pd
import collections
from collections import Counter
import string

In [2]:
### Read the Frankenstein context/target/split table from the CSV file in the working directory.
df_all = pd.read_csv(filepath_or_buffer='frankenstein_with_splits.csv')

In [3]:
### Sanity check: report the frame's dimensions, then a divider and the first rows.
print("shape of the data: ", df_all.shape)
print('-'*60, df_all.head(), sep='\n')

shape of the data:  (90698, 3)
------------------------------------------------------------
                                  context        target  split
0                                , or the  frankenstein  train
1              frankenstein or the modern             ,  train
2    frankenstein , the modern prometheus            or  train
3  frankenstein , or modern prometheus by           the  train
4             , or the prometheus by mary        modern  train


# 1. Vocabulary class

In [4]:
class Vocabulary(object):
    """Bidirectional mapping between tokens and integer indices.

    A token that is not yet known receives the index `len(vocabulary)` at the
    moment it is added. When `add_unk` is True the UNK token is registered
    during construction, and `lookup_token` returns its index for any token
    that is not present.
    """
    def __init__(self, token_to_idx=None, 
                 mask_token="<MASK>", add_unk=True, 
                 unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices; it is
                copied, so the caller's dict is never modified by this object
            mask_token (str): accepted for interface compatibility only; this
                class neither stores it nor adds it to the Vocabulary
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        ### Take a private copy: add_token() writes into this dict, and the
        ### caller's mapping must not change behind their back.
        self._token_to_idx = dict(token_to_idx) if token_to_idx is not None else {}
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}

        self._add_unk   = add_unk
        self._unk_token = unk_token
        ### -999 is the sentinel for "no UNK token registered"
        self.unk_index  = -999
        ### With an empty starting map the UNK token is the first one added,
        ### so unk_index becomes 0; with a pre-filled map it is whatever
        ### index add_token() returns.
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token (the existing
                index if the token was already present)
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]

        ### New token: the next free index is the current vocabulary size
        index = len(self._token_to_idx)
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is absent and no UNK token was registered
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            ### .get(): fall back to self.unk_index when the token is unknown
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### %s (not %d) so a non-integer index still yields the documented
            ### KeyError instead of a TypeError from string formatting.
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Return the number of tokens stored in the Vocabulary."""
        return len(self._token_to_idx)

# 2. Instantiate the Vocabulary from the training data

### Initializing cbow_vocab.
### The unk_token, i.e., "<UNK>" (unknown word), is the first token added if add_unk=True.
### After the initialization, there is only one token stored in the object - <UNK> - and the index of this token in cbow_vocab is 0 (unk_index changes from -999 to 0).

In [5]:
### Fresh vocabulary, then show its instance attributes.
cbow_vocab = Vocabulary(add_unk=True)
cbow_vocab.__dict__

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### Add the tokens that appear in the `context` and `target` columns to cbow_vocab.

### (There is one additional optional step for creating the vocabulary: counting the tokens that appear in the "context" and "target" columns, and ONLY adding frequent tokens (those that appear more than a pre-specified number of times) to the Vocabulary, while treating infrequent tokens as UNK. See an example with cutoff = 25 [here](https://github.com/houzhj/Machine_Learning/blob/main/ipynb/Yelp_Reviews/yelp_perceptron.ipynb).)

In [6]:
### Start from a new Vocabulary so re-running this cell gives the same result.
cbow_vocab = Vocabulary()
### Walk the two columns in lockstep instead of df_all.iterrows(): iterrows
### builds a Series for every row and its index value was never used here.
for context, target in zip(df_all["context"], df_all["target"]):
    ### Context tokens first, then the target, so indices follow the same
    ### order of first appearance as before.
    for token in context.split(' '):
        cbow_vocab.add_token(token)
    cbow_vocab.add_token(target)

In [7]:
### len(cbow_vocab) is the size of cbow_vocab._token_to_idx (see Vocabulary.__len__)
print(f"{len(cbow_vocab)} tokesn added into cbow_vocab")

7269 tokesn added into cbow_vocab


In [8]:
### Show the first six rows, whose tokens are compared with the vocabulary below.
df_all.iloc[:6]

Unnamed: 0,context,target,split
0,", or the",frankenstein,train
1,frankenstein or the modern,",",train
2,"frankenstein , the modern prometheus",or,train
3,"frankenstein , or modern prometheus by",the,train
4,", or the prometheus by mary",modern,train
5,or the modern by mary wollstonecraft,prometheus,train


In [9]:
### The first ten (token, index) pairs, in insertion order.
print('Tokens added to cbow_vocab based on the first 6 rows:')
[pair for pair in cbow_vocab._token_to_idx.items()][:10]

Tokens added to cbow_vocab based on the first 6 rows:


[('<UNK>', 0),
 (',', 1),
 ('or', 2),
 ('the', 3),
 ('frankenstein', 4),
 ('modern', 5),
 ('prometheus', 6),
 ('by', 7),
 ('mary', 8),
 ('wollstonecraft', 9)]

# 3. Attributes

### ._token_to_idx: a mapping from each token added to the Vocabulary to its index

In [10]:
### The first twenty (token, index) pairs of the token -> index dict.
print("Print out 20 tokens in the vocabulary")
[pair for pair in cbow_vocab._token_to_idx.items()][:20]

Print out 20 tokens in the vocabulary


[('<UNK>', 0),
 (',', 1),
 ('or', 2),
 ('the', 3),
 ('frankenstein', 4),
 ('modern', 5),
 ('prometheus', 6),
 ('by', 7),
 ('mary', 8),
 ('wollstonecraft', 9),
 ('godwin', 10),
 ('shelley', 11),
 ('letter', 12),
 ('st', 13),
 ('.', 14),
 ('petersburgh', 15),
 ('dec', 16),
 ('th', 17),
 ('to', 18),
 ('mrs', 19)]

In [11]:
### Probe the token -> index dict directly; 0 is used as the fallback
### for a token that is not a key of the dict.
tokens  = ['one','ten','hundred','thousand','million']
mapping = cbow_vocab._token_to_idx
print("Print a few elements in cbow_vocab._token_to_idx")
for word in tokens:
    print(f'The index for "{word}" is {mapping.get(word,0)}')

Print a few elements in cbow_vocab._token_to_idx
The index for "one" is 357
The index for "ten" is 1709
The index for "hundred" is 1129
The index for "thousand" is 184
The index for "million" is 0


### ._idx_to_token: a mapping from each index to the token added to the Vocabulary

In [12]:
### The first twenty (index, token) pairs of the index -> token dict.
print("Print out 20 tokens in the vocabulary")
[pair for pair in cbow_vocab._idx_to_token.items()][:20]

Print out 20 tokens in the vocabulary


[(0, '<UNK>'),
 (1, ','),
 (2, 'or'),
 (3, 'the'),
 (4, 'frankenstein'),
 (5, 'modern'),
 (6, 'prometheus'),
 (7, 'by'),
 (8, 'mary'),
 (9, 'wollstonecraft'),
 (10, 'godwin'),
 (11, 'shelley'),
 (12, 'letter'),
 (13, 'st'),
 (14, '.'),
 (15, 'petersburgh'),
 (16, 'dec'),
 (17, 'th'),
 (18, 'to'),
 (19, 'mrs')]

In [13]:
### Probe the index -> token dict directly; 0 is the fallback for a missing index.
### Note: the printed label says "review_vocab", but the dict comes from cbow_vocab.
indices  = [0,2,6,100]
mapping = cbow_vocab._idx_to_token
print("Print a few elements in review_vocab._idx_to_token")
for idx in indices:
    print(f'The token for index={idx} is {mapping.get(idx,0)}')

Print a few elements in review_vocab._idx_to_token
The token for index=0 is <UNK>
The token for index=2 is or
The token for index=6 is prometheus
The token for index=100 is promise


# 4. Methods

### add_token(token): Update mapping dicts based on the token

In [14]:
### Start the add_token demo from a fresh vocabulary and show its attributes.
example_vocab = Vocabulary(add_unk=True)
example_vocab.__dict__

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [15]:
### Add one token, then show how both mapping dicts changed.
new_token = 'apple'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}", '-'*60, sep='\n')
example_vocab.__dict__

Add one token apple
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1},
 '_idx_to_token': {0: '<UNK>', 1: 'apple'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [16]:
### Add a second token; it receives the next index.
new_token = 'banana'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}", '-'*60, sep='\n')
example_vocab.__dict__

Add one token banana
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1, 'banana': 2},
 '_idx_to_token': {0: '<UNK>', 1: 'apple', 2: 'banana'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### lookup_token(token): Retrieve the index associated with the token or the UNK index if token isn't present.

In [17]:
### Reset example_vocab for the lookup_token demo and show its attributes.
example_vocab = Vocabulary(add_unk=True)
example_vocab.__dict__

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [18]:
### Register five tokens one by one, confirming each, then show the final state.
tokens_to_add = ['apple','banana','peach','orange','coconut']
for fruit in tokens_to_add:
    example_vocab.add_token(fruit)
    print(f"{fruit} added")
example_vocab.__dict__

apple added
banana added
peach added
orange added
coconut added


{'_token_to_idx': {'<UNK>': 0,
  'apple': 1,
  'banana': 2,
  'peach': 3,
  'orange': 4,
  'coconut': 5},
 '_idx_to_token': {0: '<UNK>',
  1: 'apple',
  2: 'banana',
  3: 'peach',
  4: 'orange',
  5: 'coconut'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [19]:
### 'orange' was added above; 'rice' was not, so lookup_token falls back to the UNK index.
tokens_list = ['orange','rice']
for word in tokens_list:
    print(f"The index for {word} is {example_vocab.lookup_token(word)}")

The index for orange is 4
The index for rice is 0


In [20]:
### Equivalent code: call the method through the class, passing the instance explicitly
for word in tokens_list:
    print(f"The index for {word} is {Vocabulary.lookup_token(example_vocab,word)}")

The index for orange is 4
The index for rice is 0


### lookup_index(index): Return the token associated with the index

In [21]:
### Reverse lookup: map indices back to their tokens.
indices_list = [1,4]
for idx in indices_list:
    print(f"The token with index={idx} is {example_vocab.lookup_index(idx)}")

The token with index=1 is apple
The token with index=4 is orange


In [22]:
### Equivalent code: call the method through the class, passing the instance explicitly
for idx in indices_list:
    print(f"The token with index={idx} is {Vocabulary.lookup_index(example_vocab,idx)}")

The token with index=1 is apple
The token with index=4 is orange


### \_\_len\_\_(): Return the length of _token_to_idx (i.e, the number of tokens in the vocabulary)

In [23]:
### Build a small vocabulary (UNK plus four tokens) for the len() demo.
example_vocab = Vocabulary(add_unk=True)
tokens_to_add = ['token1','token2','token3','token4']
for name in tokens_to_add:
    example_vocab.add_token(name)
example_vocab._idx_to_token

{0: '<UNK>', 1: 'token1', 2: 'token2', 3: 'token3', 4: 'token4'}

In [24]:
### len() dispatches to Vocabulary.__len__, which returns the size of _token_to_idx
len(example_vocab)

5