In [1]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

In [2]:
# Load the corpus into a DataFrame.
DATA_PATH = 'bbc_text_cls.csv'
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows to check which columns were read.
df.head(n=5)

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [7]:
%%time
# Build the vocabulary (token -> integer id) and, in the same pass,
# encode every document as the list of ids of its lower-cased tokens.

word2idx = {}
tokenized_docs = []
idx = 0  # next unused id; ids are handed out in order of first appearance

for doc in df['text']:
    encoded = []
    for token in word_tokenize(doc.lower()):
        token_id = word2idx.get(token)
        if token_id is None:
            # first time this token is seen: give it the next free id
            token_id = word2idx[token] = idx
            idx += 1
        encoded.append(token_id)
    tokenized_docs.append(encoded)

CPU times: user 6.24 s, sys: 20 ms, total: 6.26 s
Wall time: 6.29 s


In [8]:
# Reverse mapping: integer id -> token.
# (A smarter layout could store it as a list indexed by id instead of a dict.)
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

In [9]:
# N = number of documents (one row of df per document)
N = df.shape[0]

In [10]:
# V = vocabulary size, i.e. the number of distinct tokens recorded in word2idx
V = len(word2idx)

In [11]:
# Allocate the dense term-frequency matrix: one row per document,
# one column per vocabulary id, all counts starting at zero.
# note: sklearn's CountVectorizer could produce the same counts
tf = np.zeros(shape=(N, V), dtype=np.float64)

In [14]:
# populate term-frequency counts
# tf[i, j] ends up holding how many times token id j occurs in document i.
#
# The counts are accumulated with `+=` into an array that is allocated in a
# different cell, so re-running this cell on its own would add every count a
# second time. Zero the array in place first so the cell is idempotent.
tf.fill(0)
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int:
        tf[i, j] += 1

In [15]:
# Inverse document frequency: idf_t = log(N / df_t), where df_t is the
# number of documents in which term t occurs at least once.
document_frequency = (tf > 0).sum(axis=0)  # one entry per vocabulary term, shape (V,)
idf = np.log(N / document_frequency)

In [19]:
# TF-IDF: scale each term-count column by that term's IDF weight.
# idf has one entry per column of tf, so it is broadcast across the rows.
tf_idf = tf * idf[np.newaxis, :]

In [17]:
# Fix NumPy's global random state so the random document pick is repeatable.
SEED = 123
np.random.seed(SEED)

In [23]:
# Draw one document at random, print its label and headline, then list the
# five vocabulary terms with the highest TF-IDF score for that document.
i = np.random.choice(N)
row = df.iloc[i]

headline = row['text'].partition("\n")[0]  # text before the first newline
print("Label: ", row['labels'])
print("Text: ", headline)
print("Top 5 terms:")

# negate the scores so that an ascending argsort yields highest-first order
doc_scores = tf_idf[i]
ranked_ids = np.argsort(-doc_scores)

for term_id in ranked_ids[:5]:
    print(idx2word[term_id])

Label:  sport
Text:  Hingis hints at playing comeback
Top 5 terms:
hingis
pattaya
thailand
95th
30th
