In [1]:
import pandas as pd
import collections
from collections import Counter
import string

In [2]:
df_all = pd.read_csv("news_with_splits.csv")

In [3]:
print("shape of the data: ", df_all.shape)
print('-'*60)
print(df_all.head())

shape of the data:  (120000, 3)
------------------------------------------------------------
   category  split                                 title
0  Business  train    Jobs, tax cuts key issues for Bush
1  Business  train  Jarden Buying Mr. Coffee #39;s Maker
2  Business  train     Retail sales show festive fervour
3  Business  train   Intervoice's Customers Come Calling
4  Business  train     Boeing Expects Air Force Contract


# 1. Vocabulary class

In [4]:
class Vocabulary(object):

    def __init__(self, token_to_idx=None, add_unk=False, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk   = add_unk
        self._unk_token = unk_token      
        self.unk_index  = -999
        ### the unk_token, i.e, "<UNK>" is the first added token if add_unk=True
        ### self.unk_index is changed from -999 to 0
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            ### add a new element to _token_to_idx
            self._token_to_idx[token] = index
            ### add a new element to _idx_to_token
            self._idx_to_token[index] = token
        return index
   
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            ### .get(): return self.unk_index if the key "token" does not exist. 
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    
    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def __len__(self):
        return len(self._token_to_idx)

# 2. Instantiate the Vocabulary from the training data

## (1) The vocabulary for the categories - category_vocab

### The corpus of categories - apparently, the vocabulary for the categories include four words ['Business','Sci/Tech','Sports','World']

In [5]:
df_all['category'].value_counts().reset_index().\
rename(columns={'index':'category','category':'freq'})

Unnamed: 0,category,freq
0,Business,30000
1,Sci/Tech,30000
2,Sports,30000
3,World,30000


### Initializing category_vocab.
### The unk_token, i.e,  "UNK" (unknown word), is the first added token if add_unk=True. 
### After the initialization, there is only one token stored in the object - UNK, and the index of this token in rating_vocab is 0 (changed from 999 to 0). 

In [6]:
category_vocab = Vocabulary(add_unk=True)
vars(category_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### Use  add_unk=False. No tokens are stored in _token_to_idx or _idx_to_token.

In [7]:
category_vocab = Vocabulary(add_unk=False)
vars(category_vocab)

{'_token_to_idx': {},
 '_idx_to_token': {},
 '_add_unk': False,
 '_unk_token': '<UNK>',
 'unk_index': -999}

### Add tokens appear in the categories to category_vocab. 
### Actually there are only four tokens in the corpus of categories. 

In [8]:
sorted(set(df_all.category))

['Business', 'Sci/Tech', 'Sports', 'World']

In [9]:
category_vocab = Vocabulary(add_unk=True)
for category in sorted(set(df_all.category)):
    category_vocab.add_token(category)
vars(category_vocab)

{'_token_to_idx': {'<UNK>': 0,
  'Business': 1,
  'Sci/Tech': 2,
  'Sports': 3,
  'World': 4},
 '_idx_to_token': {0: '<UNK>',
  1: 'Business',
  2: 'Sci/Tech',
  3: 'Sports',
  4: 'World'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

# 3. Attributes

### ._token_to_idx: a mapping of index and token added to the Vocabulary

In [10]:
print("Print out the tokens in the vocabulary")
list(category_vocab._token_to_idx.items())

Print out the tokens in the vocabulary


[('<UNK>', 0), ('Business', 1), ('Sci/Tech', 2), ('Sports', 3), ('World', 4)]

In [11]:
tokens  = ['Business','business','Politics','Entertainment']
mapping = category_vocab._token_to_idx
print("Print a few elements in category_vocab._token_to_idx")
for i in tokens:
    print(f'The index for "{i}" is {mapping.get(i,0)}')

Print a few elements in category_vocab._token_to_idx
The index for "Business" is 1
The index for "business" is 0
The index for "Politics" is 0
The index for "Entertainment" is 0


### ._idx_to_token: a mapping of index and token added to the Vocabulary

In [12]:
print("Print out the tokens in the vocabulary")
list(category_vocab._idx_to_token.items())

Print out the tokens in the vocabulary


[(0, '<UNK>'), (1, 'Business'), (2, 'Sci/Tech'), (3, 'Sports'), (4, 'World')]

In [13]:
indices  = [0,2,6,100]
mapping = category_vocab._idx_to_token
print("Print a few elements in review_vocab._idx_to_token")
for i in indices:
    print(f'The token for index={i} is {mapping.get(i,0)}')

Print a few elements in review_vocab._idx_to_token
The token for index=0 is <UNK>
The token for index=2 is Sci/Tech
The token for index=6 is 0
The token for index=100 is 0


# 4. Methods

### add_token(token): Update mapping dicts based on the token

In [14]:
example_vocab = Vocabulary(add_unk=True)
vars(example_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [15]:
new_token = 'apple'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}")
print('-'*60)
vars(example_vocab)

Add one token apple
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1},
 '_idx_to_token': {0: '<UNK>', 1: 'apple'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [16]:
new_token = 'banana'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}")
print('-'*60)
vars(example_vocab)

Add one token banana
------------------------------------------------------------


{'_token_to_idx': {'<UNK>': 0, 'apple': 1, 'banana': 2},
 '_idx_to_token': {0: '<UNK>', 1: 'apple', 2: 'banana'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

### lookup_token(token): Retrieve the index associated with the token or the UNK index if token isn't present.

In [17]:
example_vocab = Vocabulary(add_unk=True)
vars(example_vocab)

{'_token_to_idx': {'<UNK>': 0},
 '_idx_to_token': {0: '<UNK>'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [18]:
tokens_to_add = ['apple','banana','peach','orange','coconut']
for i in tokens_to_add:
    example_vocab.add_token(i)
    print(i + ' added')
vars(example_vocab)

apple added
banana added
peach added
orange added
coconut added


{'_token_to_idx': {'<UNK>': 0,
  'apple': 1,
  'banana': 2,
  'peach': 3,
  'orange': 4,
  'coconut': 5},
 '_idx_to_token': {0: '<UNK>',
  1: 'apple',
  2: 'banana',
  3: 'peach',
  4: 'orange',
  5: 'coconut'},
 '_add_unk': True,
 '_unk_token': '<UNK>',
 'unk_index': 0}

In [19]:
tokens_list = ['orange','rice']
for i in tokens_list:
    print(f"The index for {i} is {example_vocab.lookup_token(i)}")

The index for orange is 4
The index for rice is 0


In [20]:
### Equivalent codes
for i in tokens_list:
    print(f"The index for {i} is {Vocabulary.lookup_token(example_vocab,i)}")

The index for orange is 4
The index for rice is 0


### lookup_index(index): Return the token associated with the index

In [21]:
indices_list = [1,4]
for i in indices_list:
    print(f"The token with index={i} is {example_vocab.lookup_index(i)}")

The token with index=1 is apple
The token with index=4 is orange


In [22]:
### Equivalent codes
for i in indices_list:
    print(f"The token with index={i} is {Vocabulary.lookup_index(example_vocab,i)}")

The token with index=1 is apple
The token with index=4 is orange


### \_\_len\_\_(): Return the length of _token_to_idx (i.e, the number of tokens in the vocabulary)

In [23]:
example_vocab = Vocabulary(add_unk=True)
tokens_to_add = ['token1','token2','token3','token4']
for i in tokens_to_add:
    example_vocab.add_token(i)
example_vocab._idx_to_token

{0: '<UNK>', 1: 'token1', 2: 'token2', 3: 'token3', 4: 'token4'}

In [24]:
len(example_vocab)

5