In [1]:
import pandas as pd
import collections
from collections import Counter
import string

In [2]:
# Load the news-title dataset; the path is relative to the working directory.
news_csv_path = "news_with_splits.csv"
df_all = pd.read_csv(news_csv_path)

In [3]:
# Report the frame's dimensions, then a divider followed by the first rows.
print("shape of the data: ", df_all.shape)
print('-'*60, df_all.head(), sep='\n')

shape of the data:  (120000, 3)
------------------------------------------------------------
   category  split                                 title
0  Business  train    Jobs, tax cuts key issues for Bush
1  Business  train  Jarden Buying Mr. Coffee #39;s Maker
2  Business  train     Retail sales show festive fervour
3  Business  train   Intervoice's Customers Come Calling
4  Business  train     Boeing Expects Air Force Contract


In [4]:
### Define Vocabulary class

class Vocabulary(object):
    """Two-way mapping between tokens and integer indices.

    A token added for the first time receives the current size of the
    mapping as its index; adding a known token returns its existing index.
    """

    def __init__(self, token_to_idx=None):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices.
                The dict is stored by reference (not copied), so tokens
                added later also appear in the caller's dict.
        """
        if token_to_idx is None:
            token_to_idx = {}

        self._token_to_idx = token_to_idx

        # Reverse map, derived once from whatever was passed in.
        self._idx_to_token = {idx: token
                              for token, idx in self._token_to_idx.items()}

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            # The next free index is the current number of tokens.
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index

    def lookup_token(self, token):
        """Retrieve the index associated with the token

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Raises:
            KeyError: if the token is not in the Vocabulary
        """
        return self._token_to_idx[token]

    def lookup_index(self, index):
        """Return the token associated with the index

        Args:
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            # An f-string is used instead of "%d" so that a non-numeric
            # index still produces the documented KeyError rather than a
            # TypeError raised while formatting the message.
            raise KeyError(f"the index ({index}) is not in the Vocabulary")
        return self._idx_to_token[index]

    def __len__(self):
        """Return the number of tokens currently in the Vocabulary."""
        return len(self._token_to_idx)

# 1. SequenceVocabulary class

In [5]:
class SequenceVocabulary(Vocabulary):
    """Vocabulary that also registers mask, unknown, begin and end tokens.

    Looking up a token that is not in the vocabulary yields the index of
    the unknown token instead of raising.
    """

    def __init__(self, 
                 token_to_idx    = None, 
                 unk_token       = "<UNK>",
                 mask_token      = "<MASK>", 
                 begin_seq_token = "<BEGIN>",
                 end_seq_token   = "<END>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            unk_token (str): token standing in for out-of-vocabulary words
            mask_token (str): the mask token
            begin_seq_token (str): token marking the start of a sequence
            end_seq_token (str): token marking the end of a sequence
        """
        # The parent constructor creates ._token_to_idx and ._idx_to_token.
        super().__init__(token_to_idx)

        # Keep the special-token strings on the instance.
        self._mask_token      = mask_token
        self._unk_token       = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token   = end_seq_token

        # Register the special tokens in a fixed order. Starting from an
        # empty mapping they receive the indices 0, 1, 2 and 3.
        self.mask_index       = self.add_token(self._mask_token)
        self.unk_index        = self.add_token(self._unk_token)
        self.begin_seq_index  = self.add_token(self._begin_seq_token)
        self.end_seq_index    = self.add_token(self._end_seq_token)

    def lookup_token(self, token):
        """Return the index of `token`, or the UNK index when it is unknown.

        Overrides Vocabulary.lookup_token.

        Args:
            token (str): the token to look up
        Returns:
            index (int): the index corresponding to the token
        Notes:
            The UNK fallback is used only while `unk_index` is >= 0;
            otherwise a missing token raises KeyError.
        """
        if self.unk_index < 0:
            return self._token_to_idx[token]
        return self._token_to_idx.get(token, self.unk_index)

# 2. Instantiate the SequenceVocabulary from the training data

## (1) The SequenceVocabulary for the titles - title_vocab

### There is one additional step when creating the vocabulary for titles: count the tokens that appear in the titles, add the frequent tokens (those that appear more than a pre-specified number of times) to title_vocab, and treat the infrequent tokens as UNK.

In [6]:
# Peek at the raw title column.
df_all["title"]

0                        Jobs, tax cuts key issues for Bush
1                      Jarden Buying Mr. Coffee #39;s Maker
2                         Retail sales show festive fervour
3                       Intervoice's Customers Come Calling
4                         Boeing Expects Air Force Contract
                                ...                        
119995            Genesis Space Capsule Crashes Into Desert
119996             U.S.: Too Early to Tell Iraq Unit's Fate
119997                   AFGHAN OPIUM GROWING UP TWO THIRDS
119998    At least one Saudi policeman killed in clashes...
119999                   U.S. Forces Claim Most of Fallujah
Name: title, Length: 120000, dtype: object

### Initializing title_vocab.

In [7]:
# A freshly constructed vocabulary; display its instance attributes.
title_vocab = SequenceVocabulary()
title_vocab.__dict__

{'_token_to_idx': {'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3},
 '_idx_to_token': {0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [8]:
# Tally every space-delimited word across the titles.
# `word not in string.punctuation` is a substring test against the
# punctuation string, so it also drops the empty strings produced by
# consecutive spaces.
# NOTE(review): this iterates over all rows of df_all, not only the rows
# whose split is 'train', although the section heading says the vocabulary
# comes from the training data - confirm which is intended.
word_counts = Counter(
    word
    for title in df_all.title
    for word in title.split(" ")
    if word not in string.punctuation
)

In [9]:
# Headline the number of distinct words, then show the 20 highest counts.
print(f"The 20 most frequent words (out of {len(word_counts)} in total)")
most_frequent_words = word_counts.most_common(20)
most_frequent_words

The 20 most frequent words (out of 71747 in total)


[('to', 22691),
 ('in', 16690),
 ('for', 11625),
 ('on', 8915),
 ('of', 8716),
 ('(AP)', 7692),
 ('#39;s', 6048),
 ('the', 4950),
 ('(Reuters)', 4231),
 ('a', 3728),
 ('US', 3702),
 ('at', 3653),
 ('#39;', 3155),
 ('with', 3034),
 ('as', 3006),
 ('and', 2878),
 ('New', 2566),
 ('&lt;b&gt;...&lt;/b&gt;', 2559),
 ('Microsoft', 2100),
 ('Iraq', 2057)]

### Only the tokens that occur more than 100 times (the pre-specified cut-off) will be added to the vocabulary.

In [10]:
# Rebuild the vocabulary, keeping only words counted more than `cut_off` times.
# The size printed below includes the special tokens registered on construction.
cut_off = 100
title_vocab = SequenceVocabulary()
for word in (w for w, n in word_counts.items() if n > cut_off):
    title_vocab.add_token(word)
print(f"When cut_off = {cut_off}, {len(title_vocab._token_to_idx)} tokesn added into title_vocab.")

When cut_off = 100, 1288 tokesn added into title_vocab.


### If cut_off = 1000, there are fewer tokens added to the title_vocab.

In [11]:
# Same construction with a stricter threshold: only words counted more than
# `cut_off` times are kept. The printed size includes the special tokens.
cut_off = 1000
title_vocab = SequenceVocabulary()
for word in (w for w, n in word_counts.items() if n > cut_off):
    title_vocab.add_token(word)
print(f"When cut_off = {cut_off}, {len(title_vocab._token_to_idx)} tokesn added into title_vocab.")

When cut_off = 1000, 45 tokesn added into title_vocab.


### If cut_off = 10000, only a few tokens are added to the title_vocab.

In [12]:
# Very high threshold: only words counted more than `cut_off` times are kept.
# The printed size includes the special tokens registered on construction.
cut_off = 10000
title_vocab = SequenceVocabulary()
for word in (w for w, n in word_counts.items() if n > cut_off):
    title_vocab.add_token(word)
print(f"When cut_off = {cut_off}, {len(title_vocab._token_to_idx)} tokesn added into title_vocab.")

When cut_off = 10000, 7 tokesn added into title_vocab.


### In the following, use cut_off = 50.

In [13]:
# Build the title_vocab used from here on: words counted more than `cut_off`
# times are kept. The printed size includes the special tokens.
cut_off = 50
title_vocab = SequenceVocabulary()
for word in (w for w, n in word_counts.items() if n > cut_off):
    title_vocab.add_token(word)
print(f"When cut_off = {cut_off}, {len(title_vocab._token_to_idx)} tokesn added into title_vocab.")

When cut_off = 50, 2642 tokesn added into title_vocab.


# 3. Attributes

### ._token_to_idx: a mapping from each token added to the SequenceVocabulary to its index (inherited from Vocabulary)

In [14]:
# Materialise the (token, index) pairs and show the first 20 of them.
print("Print out 20 tokens in the vocabulary")
token_index_pairs = list(title_vocab._token_to_idx.items())
token_index_pairs[:20]

Print out 20 tokens in the vocabulary


[('<MASK>', 0),
 ('<UNK>', 1),
 ('<BEGIN>', 2),
 ('<END>', 3),
 ('tax', 4),
 ('cuts', 5),
 ('key', 6),
 ('issues', 7),
 ('for', 8),
 ('Bush', 9),
 ('Buying', 10),
 ('#39;s', 11),
 ('Maker', 12),
 ('Retail', 13),
 ('sales', 14),
 ('show', 15),
 ('Customers', 16),
 ('Boeing', 17),
 ('Expects', 18),
 ('Air', 19)]

In [15]:
# Query the raw token -> index dict directly.
# A token missing from the dict prints the fallback 0 passed to .get();
# 0 is also the <MASK> index, whereas lookup_token() would return the UNK index.
tokens  = ['place','and','follow','good']
mapping = title_vocab._token_to_idx
print("Print a few elements in title_vocab._token_to_idx")
print(*(f'The index for "{i}" is {mapping.get(i,0)}' for i in tokens), sep='\n')

Print a few elements in title_vocab._token_to_idx
The index for "place" is 1455
The index for "and" is 444
The index for "follow" is 0
The index for "good" is 198


### ._idx_to_token: a mapping from each index to the token it represents in the SequenceVocabulary (inherited from Vocabulary)

In [16]:
# Materialise the (index, token) pairs and show the first 20 of them.
print("Print out 20 tokens in the vocabulary")
index_token_pairs = list(title_vocab._idx_to_token.items())
index_token_pairs[:20]

Print out 20 tokens in the vocabulary


[(0, '<MASK>'),
 (1, '<UNK>'),
 (2, '<BEGIN>'),
 (3, '<END>'),
 (4, 'tax'),
 (5, 'cuts'),
 (6, 'key'),
 (7, 'issues'),
 (8, 'for'),
 (9, 'Bush'),
 (10, 'Buying'),
 (11, '#39;s'),
 (12, 'Maker'),
 (13, 'Retail'),
 (14, 'sales'),
 (15, 'show'),
 (16, 'Customers'),
 (17, 'Boeing'),
 (18, 'Expects'),
 (19, 'Air')]

In [17]:
# Query the raw index -> token dict directly.
# An index missing from the dict would print the fallback 0 passed to .get().
indices  = [0,2,6,100]
mapping = title_vocab._idx_to_token
print("Print a few elements in title_vocab._idx_to_token")
print(*(f'The token for index={i} is {mapping.get(i,0)}' for i in indices), sep='\n')

Print a few elements in title_vocab._idx_to_token
The token for index=0 is <MASK>
The token for index=2 is <BEGIN>
The token for index=6 is key
The token for index=100 is business


# 4. Methods

### add_token(token): Update mapping dicts based on the token

In [18]:
# Start the add_token demo from a fresh vocabulary and display its attributes.
example_vocab = SequenceVocabulary()
example_vocab.__dict__

{'_token_to_idx': {'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3},
 '_idx_to_token': {0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [19]:
# Register one token, announce it, then display the updated attributes.
new_token = 'apple'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}", '-'*60, sep='\n')
example_vocab.__dict__

Add one token apple
------------------------------------------------------------


{'_token_to_idx': {'<MASK>': 0,
  '<UNK>': 1,
  '<BEGIN>': 2,
  '<END>': 3,
  'apple': 4},
 '_idx_to_token': {0: '<MASK>',
  1: '<UNK>',
  2: '<BEGIN>',
  3: '<END>',
  4: 'apple'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [20]:
# Register a second token; it takes the next free index.
new_token = 'banana'
example_vocab.add_token(new_token)
print(f"Add one token {new_token}", '-'*60, sep='\n')
example_vocab.__dict__

Add one token banana
------------------------------------------------------------


{'_token_to_idx': {'<MASK>': 0,
  '<UNK>': 1,
  '<BEGIN>': 2,
  '<END>': 3,
  'apple': 4,
  'banana': 5},
 '_idx_to_token': {0: '<MASK>',
  1: '<UNK>',
  2: '<BEGIN>',
  3: '<END>',
  4: 'apple',
  5: 'banana'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

### lookup_token(token): Retrieve the index associated with the token or the UNK index if token isn't present.

In [21]:
# Reset to a fresh vocabulary for the lookup_token demo and display its attributes.
example_vocab = SequenceVocabulary()
example_vocab.__dict__

{'_token_to_idx': {'<MASK>': 0, '<UNK>': 1, '<BEGIN>': 2, '<END>': 3},
 '_idx_to_token': {0: '<MASK>', 1: '<UNK>', 2: '<BEGIN>', 3: '<END>'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [22]:
# Add several tokens one by one, confirming each, then display the attributes.
tokens_to_add = ['apple','banana','peach','orange','coconut']
for fruit in tokens_to_add:
    example_vocab.add_token(fruit)
    print(fruit + ' added')
example_vocab.__dict__

apple added
banana added
peach added
orange added
coconut added


{'_token_to_idx': {'<MASK>': 0,
  '<UNK>': 1,
  '<BEGIN>': 2,
  '<END>': 3,
  'apple': 4,
  'banana': 5,
  'peach': 6,
  'orange': 7,
  'coconut': 8},
 '_idx_to_token': {0: '<MASK>',
  1: '<UNK>',
  2: '<BEGIN>',
  3: '<END>',
  4: 'apple',
  5: 'banana',
  6: 'peach',
  7: 'orange',
  8: 'coconut'},
 '_mask_token': '<MASK>',
 '_unk_token': '<UNK>',
 '_begin_seq_token': '<BEGIN>',
 '_end_seq_token': '<END>',
 'mask_index': 0,
 'unk_index': 1,
 'begin_seq_index': 2,
 'end_seq_index': 3}

In [23]:
# 'orange' was added above; 'rice' was not, so lookup_token returns the UNK index for it.
tokens_list = ['orange','rice']
print(*(f"The index for {i} is {example_vocab.lookup_token(i)}" for i in tokens_list), sep='\n')

The index for orange is 7
The index for rice is 1


In [24]:
### Equivalent code: call the method through the class, passing the instance explicitly
print(*(f"The index for {i} is {SequenceVocabulary.lookup_token(example_vocab,i)}" for i in tokens_list), sep='\n')

The index for orange is 7
The index for rice is 1


### lookup_index(index): Return the token associated with the index

In [25]:
# Map a couple of indices back to their tokens.
indices_list = [1,4]
print(*(f"The token with index={i} is {example_vocab.lookup_index(i)}" for i in indices_list), sep='\n')

The token with index=1 is <UNK>
The token with index=4 is apple


In [26]:
### Equivalent code: lookup_index is defined on the base class Vocabulary
print(*(f"The token with index={i} is {Vocabulary.lookup_index(example_vocab,i)}" for i in indices_list), sep='\n')

The token with index=1 is <UNK>
The token with index=4 is apple


### \_\_len\_\_(): Return the length of _token_to_idx (i.e., the number of tokens in the vocabulary)

In [27]:
# Fresh vocabulary plus four extra tokens, then display the index -> token map.
example_vocab = SequenceVocabulary()
tokens_to_add = ['token1','token2','token3','token4']
for extra_token in tokens_to_add:
    example_vocab.add_token(extra_token)
example_vocab._idx_to_token

{0: '<MASK>',
 1: '<UNK>',
 2: '<BEGIN>',
 3: '<END>',
 4: 'token1',
 5: 'token2',
 6: 'token3',
 7: 'token4'}

In [28]:
# len() dispatches to Vocabulary.__len__, which returns the size of
# _token_to_idx, so the special tokens are included in the count.
len(example_vocab)

8