In [1]:
import pandas as pd
import collections
from collections import Counter
import string

In [2]:
### Read the reviews CSV (path is relative to the current working directory) into a DataFrame.
df_all = pd.read_csv("reviews_with_splits_lite.csv")

In [3]:
### Quick look at the data: its dimensions, a separator line, then the first rows.
preview = df_all.head()
print("shape of the data: ", df_all.shape)
print('-' * 60, preview, sep="\n")

shape of the data:  (56000, 3)
------------------------------------------------------------
     rating                                             review  split
0  negative  terrible place to work for i just heard a stor...  train
1  negative   hours , minutes total time for an extremely s...  train
2  negative  my less than stellar review is for service . w...  train
3  negative  i m granting one star because there s no way t...  train
4  negative  the food here is mediocre at best . i went aft...  train


# 1. Vocabulary class

In [4]:
class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    Tokens receive their indices in the order they are added. When `add_unk`
    is True the UNK token is registered during construction, and
    `lookup_token` returns its index for any token that is not present.
    """

    def __init__(self, token_to_idx=None, add_unk=True, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices;
                it is copied, so the caller's dict is never modified
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        ### copy the mapping: add_token() writes into it, and the caller's
        ### dict must not silently gain entries (e.g. the UNK token)
        self._token_to_idx = {} if token_to_idx is None else dict(token_to_idx)
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

        self._add_unk = add_unk
        self._unk_token = unk_token
        ### -999 is the sentinel meaning "no UNK token"; lookup_token() tests it
        self.unk_index = -999
        ### with an empty starting map the UNK token is the first token added,
        ### so unk_index changes from -999 to 0
        if add_unk:
            self.unk_index = self.add_token(unk_token)

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Adding a token that is already present is a no-op that returns the
        existing index.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            return self._token_to_idx[token]

        index = len(self._token_to_idx)
        ### a pre-existing map may use non-contiguous indices, in which case
        ### len() can land on an index that is already taken; skip forward so
        ### an existing entry of _idx_to_token is never overwritten
        while index in self._idx_to_token:
            index += 1
        ### keep both directions of the mapping in sync
        self._token_to_idx[token] = index
        self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token
          or the UNK index if token isn't present.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is absent and no UNK token was added
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary)
              for the UNK functionality
        """
        if self.unk_index >= 0:
            ### .get(): return self.unk_index if the key "token" does not exist.
            return self._token_to_idx.get(token, self.unk_index)
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            ### %s (not %d) so that a non-numeric index still produces the
            ### documented KeyError instead of a TypeError from formatting
            raise KeyError("the index (%s) is not in the Vocabulary" % (index,))
        return self._idx_to_token[index]

    def __len__(self):
        """Return the number of tokens in the Vocabulary (UNK included, if added)."""
        return len(self._token_to_idx)

# 2. Instantiate the Vocabulary from the data (note: the cells below use every row of `df_all`, not only the rows whose `split` is "train")

## (1) The vocabulary for the ratings - rating_vocab

### The corpus of ratings - apparently, the vocabulary for the ratings is ['positive','negative']

In [5]:
### Display the `rating` column; pandas abbreviates the middle rows of a long Series.
df_all.rating

0        negative
1        negative
2        negative
3        negative
4        negative
           ...   
55995    positive
55996    positive
55997    positive
55998    positive
55999    positive
Name: rating, Length: 56000, dtype: object

### Initializing rating_vocab.
### The unk_token, i.e., "<UNK>" (unknown word), is the first token added if add_unk=True.
### After the initialization, there is only one token stored in the object - UNK, and the index of this token in rating_vocab is 0 (changed from -999 to 0). 

In [6]:
### Create a vocabulary that contains only the UNK token, then show its attributes.
rating_vocab = Vocabulary(add_unk=True)
vars(rating_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### Add the tokens that appear in the ratings to rating_vocab.
### Actually there are only two tokens in the corpus of ratings. 

In [7]:
### Distinct values of the `rating` column, in sorted order.
sorted(set(df_all.rating))

['negative', 'positive']

In [8]:
### Start again from a vocabulary holding only <UNK>, then register each
### distinct rating in sorted order so the assigned indices are reproducible.
rating_vocab = Vocabulary(add_unk=True)
distinct_ratings = set(df_all.rating)
for label in sorted(distinct_ratings):
    rating_vocab.add_token(label)
vars(rating_vocab)

{'_token_to_idx': {'<UNK>': 0, 'negative': 1, 'positive': 2},
 '_idx_to_token': {0: '<UNK>', 1: 'negative', 2: 'positive'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

## (2) The vocabulary for the reviews

### The corpus - the difference between reviews and ratings is that the corpus for the reviews includes many more words (tokens) than that for the ratings.
### So there is one additional step for creating the vocabulary for reviews - counting the tokens that appear in the reviews, adding the frequent tokens (those that appear more than a pre-specified number of times) to review_vocab, and treating infrequent tokens as UNK.

In [9]:
### Display the `review` column; pandas abbreviates long strings and the middle rows.
df_all.review

0        terrible place to work for i just heard a stor...
1         hours , minutes total time for an extremely s...
2        my less than stellar review is for service . w...
3        i m granting one star because there s no way t...
4        the food here is mediocre at best . i went aft...
                               ...                        
55995    great food . wonderful , friendly service . i ...
55996    charlotte should be the new standard for moder...
55997    get the encore sandwich ! ! make sure to get i...
55998    i m a pretty big ice cream gelato fan . pretty...
55999    where else can you find all the parts and piec...
Name: review, Length: 56000, dtype: object

### Initializing review_vocab.

In [10]:
### Create the review vocabulary with only the UNK token in it, then show its attributes.
review_vocab = Vocabulary(add_unk=True)
vars(review_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [11]:
### Count how often each space-separated token occurs across all reviews.
### NOTE: string.punctuation is a str, so `in` is a substring test: it drops
### single punctuation characters and also the empty tokens that
### split(" ") yields for consecutive spaces.
word_counts = Counter()
for review_text in df_all.review:
    word_counts.update(
        token
        for token in review_text.split(" ")
        if token not in string.punctuation
    )

In [12]:
### Counter.most_common(20) returns (word, count) pairs ordered from most to least frequent.
print("The 20 most frequent words")
word_counts.most_common(20)

The 20 most frequent words


[('the', 339990),
 ('and', 231288),
 ('i', 225732),
 ('to', 185031),
 ('a', 179478),
 ('was', 117940),
 ('it', 106288),
 ('of', 103076),
 ('for', 83622),
 ('in', 82762),
 ('is', 82250),
 ('n', 75090),
 ('that', 73885),
 ('my', 70406),
 ('they', 61324),
 ('this', 58229),
 ('you', 57180),
 ('with', 53761),
 ('t', 52863),
 ('but', 52479)]

### Only tokens that occur more than 1000 times (a pre-specified cut-off) will be added to the vocabulary.

In [13]:
### Register every word counted strictly more than `cut_off` times, in the
### order the words were first seen by the counter.
cut_off = 1000
frequent_words = [token for token, freq in word_counts.items() if freq > cut_off]
for token in frequent_words:
    review_vocab.add_token(token)

In [14]:
### len(review_vocab) is the number of entries in the vocabulary, <UNK> included.
vocab_size = len(review_vocab)
print(f"When cut_off = {cut_off}, {vocab_size} tokens added into review_vocab")

When cut_off = 1000, 776 tokens added into review_vocab


### If cut_off = 10000, fewer tokens are added to review_vocab.

In [15]:
### Rebuild the vocabulary from scratch with a stricter threshold; the
### reported size counts every entry of the vocabulary, <UNK> included.
cut_off = 10000
review_vocab = Vocabulary(add_unk=True)
for token, freq in word_counts.items():
    if freq <= cut_off:
        continue
    review_vocab.add_token(token)
print(f"When cut_off = {cut_off}, {len(review_vocab)} tokens added into review_vocab")

When cut_off = 10000, 104 tokens added into review_vocab


# 3. Attributes

### ._token_to_idx: a mapping from each token added to the Vocabulary to its index

In [16]:
### Show the first 20 (token, index) pairs, in the order the tokens were added.
print("Print out 20 tokens in the vocabulary")
list(review_vocab._token_to_idx.items())[:20]

Print out 20 tokens in the vocabulary


[('<UNK>', 0),
 ('place', 1),
 ('to', 2),
 ('for', 3),
 ('i', 4),
 ('just', 5),
 ('a', 6),
 ('of', 7),
 ('them', 8),
 ('over', 9),
 ('her', 10),
 ('in', 11),
 ('there', 12),
 ('she', 13),
 ('t', 14),
 ('said', 15),
 ('which', 16),
 ('and', 17),
 ('they', 18),
 ('the', 19)]

In [17]:
### Look a few tokens up directly in the token -> index dict. A token that is
### missing falls back to the vocabulary's own unk_index rather than a
### hard-coded 0, so the cell stays correct if the UNK index ever differs.
tokens = ['place', 'and', 'follow', 'good']
mapping = review_vocab._token_to_idx
print("Print a few elements in review_vocab._token_to_idx")
for token in tokens:
    print(f'The index for "{token}" is {mapping.get(token, review_vocab.unk_index)}')

Print a few elements in review_vocab._token_to_idx
The index for "place" is 1
The index for "and" is 17
The index for "follow" is 0
The index for "good" is 78


### ._idx_to_token: a mapping from each index to the token added to the Vocabulary

In [18]:
### Show the first 20 (index, token) pairs, in the order the tokens were added.
print("Print out 20 tokens in the vocabulary")
list(review_vocab._idx_to_token.items())[:20]

Print out 20 tokens in the vocabulary


[(0, '<UNK>'),
 (1, 'place'),
 (2, 'to'),
 (3, 'for'),
 (4, 'i'),
 (5, 'just'),
 (6, 'a'),
 (7, 'of'),
 (8, 'them'),
 (9, 'over'),
 (10, 'her'),
 (11, 'in'),
 (12, 'there'),
 (13, 'she'),
 (14, 't'),
 (15, 'said'),
 (16, 'which'),
 (17, 'and'),
 (18, 'they'),
 (19, 'the')]

In [19]:
### Look a few indices up directly in the index -> token dict. An index that
### is missing yields None; the previous fallback of 0 would have printed an
### integer as if it were a token.
indices = [0, 2, 6, 100]
mapping = review_vocab._idx_to_token
print("Print a few elements in review_vocab._idx_to_token")
for idx in indices:
    print(f'The token for index={idx} is {mapping.get(idx)}')

Print a few elements in review_vocab._idx_to_token
The token for index=0 is <UNK>
The token for index=2 is to
The token for index=6 is a
The token for index=100 is always


# 4. Methods

### add_token(token): Update mapping dicts based on the token

In [20]:
### Fresh vocabulary for the add_token demonstration; it holds only the UNK token.
example_vocab = Vocabulary(add_unk=True)
vars(example_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [21]:
### Register one token, then show the vocabulary's attributes afterwards.
new_token = 'apple'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}", '-' * 60, sep="\n")
vars(example_vocab)

Add one token apple
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1},
 '_idx_to_token': {0: '<UNK>', 1: 'apple'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [22]:
### Register a second token; it receives the next index after the existing entries.
new_token = 'banana'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}", '-' * 60, sep="\n")
vars(example_vocab)

Add one token banana
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1, 'banana': 2},
 '_idx_to_token': {0: '<UNK>', 1: 'apple', 2: 'banana'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### lookup_token(token): Retrieve the index associated with the token or the UNK index if token isn't present.

In [23]:
### Start over with a vocabulary that holds only the UNK token for the lookup demonstration.
example_vocab = Vocabulary(add_unk=True)
vars(example_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [24]:
### Add several tokens one by one, reporting each, then show the resulting mappings.
tokens_to_add = ['apple', 'banana', 'peach', 'orange', 'coconut']
for fruit in tokens_to_add:
    example_vocab.add_token(fruit)
    print(f"{fruit} added")
vars(example_vocab)

apple added
banana added
peach added
orange added
coconut added


{'_token_to_idx': {'<UNK>': 0,
  'apple': 1,
  'banana': 2,
  'peach': 3,
  'orange': 4,
  'coconut': 5},
 '_idx_to_token': {0: '<UNK>',
  1: 'apple',
  2: 'banana',
  3: 'peach',
  4: 'orange',
  5: 'coconut'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [25]:
### 'orange' was added to the vocabulary; 'rice' was not, so lookup_token
### returns the UNK index for it.
tokens_list = ['orange', 'rice']
for query in tokens_list:
    token_index = example_vocab.lookup_token(query)
    print(f"The index for {query} is {token_index}")

The index for orange is 4
The index for rice is 0


In [26]:
### Equivalent call: invoke the method through the class and pass the instance explicitly as `self`.
for i in tokens_list:
    print(f"The index for {i} is {Vocabulary.lookup_token(example_vocab,i)}")

The index for orange is 4
The index for rice is 0


### lookup_index(index): Return the token associated with the index

In [27]:
### Map a couple of indices back to their tokens with lookup_index.
indices_list = [1, 4]
for position in indices_list:
    found_token = example_vocab.lookup_index(position)
    print(f"The token with index={position} is {found_token}")

The token with index=1 is apple
The token with index=4 is orange


In [28]:
### Equivalent call: invoke the method through the class and pass the instance explicitly as `self`.
for i in indices_list:
    print(f"The token with index={i} is {Vocabulary.lookup_index(example_vocab,i)}")

The token with index=1 is apple
The token with index=4 is orange


### \_\_len\_\_(): Return the length of _token_to_idx (i.e, the number of tokens in the vocabulary)

In [29]:
### Build a small vocabulary (UNK plus four tokens) for the __len__ demonstration
### and display its index -> token mapping.
tokens_to_add = ['token1', 'token2', 'token3', 'token4']
example_vocab = Vocabulary(add_unk=True)
for name in tokens_to_add:
    example_vocab.add_token(name)
example_vocab._idx_to_token

{0: '<UNK>', 1: 'token1', 2: 'token2', 3: 'token3', 4: 'token4'}

In [30]:
### len() dispatches to Vocabulary.__len__, which counts the entries of _token_to_idx (UNK included).
len(example_vocab)

5