In [1]:
import pandas as pd
import collections
from collections import Counter
import string
import numpy as np
import pandas as pd

In [2]:
df_all = pd.read_csv('news_with_splits.csv')

In [3]:
print("shape of the data: ", df_all.shape)
print('-'*60)
print(df_all.head())

shape of the data:  (120000, 3)
------------------------------------------------------------
   category  split                                 title
0  Business  train    Jobs, tax cuts key issues for Bush
1  Business  train  Jarden Buying Mr. Coffee #39;s Maker
2  Business  train     Retail sales show festive fervour
3  Business  train   Intervoice's Customers Come Calling
4  Business  train     Boeing Expects Air Force Contract


In [4]:
class Vocabulary(object):

    def __init__(self, token_to_idx=None, add_unk=False, unk_token="<UNK>"):
        """
        Args:
            token_to_idx (dict): a pre-existing map of tokens to indices
            add_unk (bool): a flag that indicates whether to add the UNK token
            unk_token (str): the UNK token to add into the Vocabulary
        """
        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx
        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk   = add_unk
        self._unk_token = unk_token      
        self.unk_index  = -999
        ### the unk_token, i.e, "<UNK>" is the first added token if add_unk=True
        ### self.unk_index is changed from -999 to 0
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        

    def add_token(self, token):
        """Update mapping dicts based on the token.

        Args:
            token (str): the item to add into the Vocabulary
        Returns:
            index (int): the integer corresponding to the token
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            ### add a new element to _token_to_idx
            self._token_to_idx[token] = index
            ### add a new element to _idx_to_token
            self._idx_to_token[index] = token
        return index
   
    def lookup_token(self, token):
        """Retrieve the index associated with the token 
          or the UNK index if token isn't present.
        
        Args:
            token (str): the token to look up 
        Returns:
            index (int): the index corresponding to the token
        Notes:
            `unk_index` needs to be >=0 (having been added into the Vocabulary) 
              for the UNK functionality 
        """
        if self.unk_index >= 0:
            ### .get(): return self.unk_index if the key "token" does not exist. 
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]
    
    def lookup_index(self, index):
        """Return the token associated with the index
        
        Args: 
            index (int): the index to look up
        Returns:
            token (str): the token corresponding to the index
        Raises:
            KeyError: if the index is not in the Vocabulary
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]
    
    def __len__(self):
        return len(self._token_to_idx)
    
class SequenceVocabulary(Vocabulary):
    def __init__(self, token_to_idx = None, 
                 unk_token          = "<UNK>",
                 mask_token         = "<MASK>", 
                 begin_seq_token    = "<BEGIN>",
                 end_seq_token      = "<END>"):

        super(SequenceVocabulary, self).__init__(token_to_idx)
        """
        The follow attributes have been defined in the Vocabulary class:
            - ._token_to_idx
            - ._idx_to_token
            - ._add_unk
            - ._unk_token
            - .unk_index
        """
        self._mask_token      = mask_token
        self._unk_token       = unk_token
        self._begin_seq_token = begin_seq_token
        self._end_seq_token   = end_seq_token

        self.mask_index       = self.add_token(self._mask_token)
        self.unk_index        = self.add_token(self._unk_token)
        self.begin_seq_index  = self.add_token(self._begin_seq_token)
        self.end_seq_index    = self.add_token(self._end_seq_token)
        
        self._add_unk         = True

# 1. NewsVectorizer class

In [5]:
class NewsVectorizer(object):
    
    def __init__(self, title_vocab, category_vocab):
        self.title_vocab    = title_vocab
        self.category_vocab = category_vocab
         
    @classmethod
    def from_dataframe(cls, news_df, cutoff):
        """Instantiate the vectorizer from the dataset dataframe
        
        Args:
            news_df (pandas.DataFrame): the news dataset
            cutoff (int): the parameter for frequency-based filtering
        Returns:
            an instance of the NewsVectorizer
        """
        category_vocab = Vocabulary()
        title_vocab    = SequenceVocabulary()
        
        ########## Add tokens to category_vocab ('Business','Sci/Tech','Sports','World')
        for category in sorted(set(news_df.category)):
            category_vocab.add_token(category)
            
        ########## Add tokens to title_vocab
        ### Create a Counter() to count all tokens appears in news_df.title
        word_counts = Counter()
        for title in news_df.title:
            for word in title.split(" "):
                if word not in string.punctuation:
                    word_counts[word] += 1
        ### execute add_token if a word appears more than "cutoff" times
        for word, count in word_counts.items():
            if count > cutoff:
                title_vocab.add_token(word)
                
        return cls(title_vocab, category_vocab)
    
    ### This is the key functionality of the Vectorizer.
    ### It takes as an argument a string representing a text,
    ### and returns a vectorized representation of the text.
    def vectorize(self, title, vector_length=-1):
        """
        Args:
            context (str): the string of words separated by a space
            vector_length (int): an argument for forcing the length of index vector
        """
        ### set the first index to be begin_seq_index=2 (defined in SequenceVocabulary)
        indices = [self.title_vocab.begin_seq_index]
        
        ### adding the indeces for the title after the first index
        indices.extend(self.title_vocab.lookup_token(token)
                       for token in title.split(" "))
        
        ### set the last index to be end_seq_index=3 (defined in SequenceVocabulary)
        indices.append(self.title_vocab.end_seq_index)

        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.title_vocab.mask_index

        return out_vector

# 2. Instantiate a NewsVectorizer from the training data

In [6]:
df_sample = df_all.copy()
df_sample.head()

Unnamed: 0,category,split,title
0,Business,train,"Jobs, tax cuts key issues for Bush"
1,Business,train,Jarden Buying Mr. Coffee #39;s Maker
2,Business,train,Retail sales show festive fervour
3,Business,train,Intervoice's Customers Come Calling
4,Business,train,Boeing Expects Air Force Contract


In [7]:
### Instantiate a vectorizer
vectorizer = NewsVectorizer.from_dataframe(df_sample,cutoff=25)

### A vectorizer has two vocabularies(attributes), one for title, one for category 

In [8]:
vars(vectorizer)

{'title_vocab': <__main__.SequenceVocabulary at 0x7fc50aa1e6d0>,
 'category_vocab': <__main__.Vocabulary at 0x7fc50aa1e550>}

In [9]:
print('category_vocab')
print(vectorizer.category_vocab._token_to_idx)
print(vectorizer.category_vocab._idx_to_token)
print('-'*60)
print('title_vocab')
print(f"- Includes {len(vectorizer.title_vocab)} tokens")
print("- First ten _token_to_idx:")
print(list(vectorizer.title_vocab._token_to_idx.items())[:10])
print('-'*60)

category_vocab
{'Business': 0, 'Sci/Tech': 1, 'Sports': 2, 'World': 3}
{0: 'Business', 1: 'Sci/Tech', 2: 'Sports', 3: 'World'}
------------------------------------------------------------
title_vocab
- Includes 4794 tokens
- First ten _token_to_idx:
[('<MASK>', 0), ('<UNK>', 1), ('<BEGIN>', 2), ('<END>', 3), ('Jobs,', 4), ('tax', 5), ('cuts', 6), ('key', 7), ('issues', 8), ('for', 9)]
------------------------------------------------------------


# 3. Methods

### (classmethod) from_dataframe(news_df, cutoff): Instantiate the vectorizer from the dataset dataframe.
1. First instantiate a Vocabulariy for categories and a SequenceVocabulary for titles, based on the input data "news_with_splits.csv".
2. Use category_vocab and title_vocab (and a pre-specified cutoff value) as the inputs to instantiate a vectorizer.

### vectorize(title): It takes as an argument a string of words separated by a space, and returns a vectorized representation of the string. This is the key functionality of the Vectorizer.

In [10]:
example_text = "the sun is shining and it is a beautiful day"

In [11]:
##### Initializing NewsVectorizer
vectorizer = NewsVectorizer.from_dataframe(df_sample, 25)
indices = [vectorizer.title_vocab.lookup_token(token) for token in example_text.split(' ')]
print(f'The number of tokens in the example_text is {len(indices)}')
print('The indeces of these tokens in title_vocab:' + str(indices))

The number of tokens in the example_text is 10
The indeces of these tokens in title_vocab:[276, 1, 132, 1, 536, 1860, 132, 101, 1, 72]


In [12]:
##### Use NewsVectorizer.vectorize() with vector_length=-1
##### i.e., no pre-specified length of index vector 
##### begin_seq_index:<2> and end_seq_index<3> are added to the front and end of the vector.
vector_1 = vectorizer.vectorize(example_text,vector_length=-1)
print('vector_1:',vector_1)
print('-'*100)
print('indeces for <BEGIN> and <END>:')
print(list(vectorizer.title_vocab._token_to_idx.items())[:4])

vector_1: [   2  276    1  132    1  536 1860  132  101    1   72    3]
----------------------------------------------------------------------------------------------------
indeces for <BEGIN> and <END>:
[('<MASK>', 0), ('<UNK>', 1), ('<BEGIN>', 2), ('<END>', 3)]


In [13]:
token = 'the'
index = vectorizer.title_vocab.lookup_token(token) 
print(f"The first token: {token}")
print(f"The index of the first token in vectorizer.title_vocab: {index}")

The first token: the
The index of the first token in vectorizer.title_vocab: 276


In [14]:
token = 'shining'
index = vectorizer.title_vocab.lookup_token(token) 
print(f"The fourth token: {token}")
print(f"The index of the fourth token in vectorizer.title_vocab: {index}")

The fourth token: shining
The index of the fourth token in vectorizer.title_vocab: 1


In [15]:
##### Use NewsVectorizer.vectorize() with vector_length>len(indices)
##### out_vector[len(indices):] are assigned as NewsVectorizer.title_vocab.mask_index
##### I.e., if the number of tokens in the context is less than the max length, 
##### the remaining entries are filled with zeros. 
vector_2 = vectorizer.vectorize(example_text,vector_length=15)
vector_2

array([   2,  276,    1,  132,    1,  536, 1860,  132,  101,    1,   72,
          3,    0,    0,    0])

In [16]:
##### Use NewsVectorizer.vectorize() with vector_length<len(indices)
try:
    vector_3 = vectorizer.vectorize(example_text,vector_length=5)
    vector_3
except Exception as e:
    print(e)

could not broadcast input array from shape (12,) into shape (5,)


### Use NewsVectorizer.vectorize() with vector_length = max length among all comments, so that the vectors for different rows will have the same length.

In [17]:
context_1 = 'Earth is the third planet from the Sun.'
context_2 = 'Earth is the only astronomical object known to harbor life.'
context_3 = 'Earth has a dynamic atmosphere.'
context_df = pd.DataFrame(dict(id=[i for i in range(3)],
                               context=[context_1,context_2,context_3]))

context_df

Unnamed: 0,id,context
0,0,Earth is the third planet from the Sun.
1,1,Earth is the only astronomical object known to...
2,2,Earth has a dynamic atmosphere.


In [18]:
### A function returns the length (number of tokens) in a context
measure_len = lambda context: len(context.split(" "))

### calculate the length of each context in context_df
for i in range(3):
    print(f"Context {i}:")
    print(context_df.loc[i,'context'])
    print(f"length of context {i}: {measure_len(context_df.loc[i,'context'])}")
    print('-'*60)

Context 0:
Earth is the third planet from the Sun.
length of context 0: 8
------------------------------------------------------------
Context 1:
Earth is the only astronomical object known to harbor life.
length of context 1: 10
------------------------------------------------------------
Context 2:
Earth has a dynamic atmosphere.
length of context 2: 5
------------------------------------------------------------


In [19]:
### Use map() function
list(map(measure_len,context_df['context']))

[8, 10, 5]

In [20]:
max(map(measure_len,context_df['context']))

10

In [21]:
### +1 if only using begin_seq, +2 if using both begin and end seq tokens
max_length = max(map(measure_len,context_df['context']))+2
for i in range(3):
    text_now = context_df.loc[i,'context']
    print(text_now)
    print(vectorizer.vectorize(text_now,vector_length=max_length))

Earth is the third planet from the Sun.
[   2 4228  132  276   70    1  386  276    1    3    0    0]
Earth is the only astronomical object known to harbor life.
[   2 4228  132  276 2260    1    1    1   39    1    1    3]
Earth has a dynamic atmosphere.
[   2 4228 1850  101    1    1    3    0    0    0    0    0]


### Use NewsVectorizer.vectorize() with different cutoff values
#### The larger the cutoff, the fewer tokens with appearance counts greater than this value, and the more tokens in the text with an index of 1 in the vector representation (indicating they are recognized as "unk".

In [22]:
print('Text:', example_text)
cut_off_list = [10,50,100,1000]
for c in cut_off_list:
    vectorizer = vectorizer = NewsVectorizer.from_dataframe(df_sample, c)
    vector     = vectorizer.vectorize(example_text)
    print(f"cutoff={c}")
    print(f'Title Vocabulary: the words appear >{c} times')
    print(f'The number of tokens: {len(vectorizer.title_vocab)}')
    print('Vector representation:', vector)
    print('-'*100)

Text: the sun is shining and it is a beautiful day
cutoff=10
Title Vocabulary: the words appear >10 times
The number of tokens: 9530
Vector representation: [   2  321 5924  143    1  632 2453  143  110    1   79    3]
----------------------------------------------------------------------------------------------------
cutoff=50
Title Vocabulary: the words appear >50 times
The number of tokens: 2642
Vector representation: [   2  231    1  117    1  444 1347  117   92    1   66    3]
----------------------------------------------------------------------------------------------------
cutoff=100
Title Vocabulary: the words appear >100 times
The number of tokens: 1288
Vector representation: [  2 175   1  90   1 326 846  90  69   1  48   3]
----------------------------------------------------------------------------------------------------
cutoff=1000
Title Vocabulary: the words appear >1000 times
The number of tokens: 45
Vector representation: [ 2 29  1 20  1 39  1 20 17  1  1  3]
----------