# Word-to-index Mapping

Instead of using TfidfVectorizer() from `sciki-learn` lib, we are writing code to compute tf-idf from scratch using word-to-index mappping. 

In [2]:
import pandas as pd
import numpy as np
import nltk

from nltk import word_tokenize

Objective: Pick random documents and look at which words having the largest tf-idf. Some requirements: 
- Their TF value is high. 
- They are unique. 

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hanhhieudao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
df = pd.read_csv('../data/bbc_text_cls.csv')

In [4]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


# Word-to-index Mapping

In [5]:
# convert documents into sequences of ints / ids / indices
idx = 0
word2idx = {} # word2idx dictionary: key is word, value is index 
tokenized_docs = []
for doc in df['text']:
    words = word_tokenize(doc.lower())
    doc_as_int = []
    for word in words:
        if word not in word2idx:
            word2idx[word] = idx
            idx += 1

        doc_as_int.append(word2idx[word]) # each doc (=row) has a list of indices of tokens
    tokenized_docs.append(doc_as_int) # add all docs lists into a big list 

In [17]:
word2idx

{'ad': 0,
 'sales': 1,
 'boost': 2,
 'time': 3,
 'warner': 4,
 'profit': 5,
 'quarterly': 6,
 'profits': 7,
 'at': 8,
 'us': 9,
 'media': 10,
 'giant': 11,
 'timewarner': 12,
 'jumped': 13,
 '76': 14,
 '%': 15,
 'to': 16,
 '$': 17,
 '1.13bn': 18,
 '(': 19,
 '£600m': 20,
 ')': 21,
 'for': 22,
 'the': 23,
 'three': 24,
 'months': 25,
 'december': 26,
 ',': 27,
 'from': 28,
 '639m': 29,
 'year-earlier': 30,
 '.': 31,
 'firm': 32,
 'which': 33,
 'is': 34,
 'now': 35,
 'one': 36,
 'of': 37,
 'biggest': 38,
 'investors': 39,
 'in': 40,
 'google': 41,
 'benefited': 42,
 'high-speed': 43,
 'internet': 44,
 'connections': 45,
 'and': 46,
 'higher': 47,
 'advert': 48,
 'said': 49,
 'fourth': 50,
 'quarter': 51,
 'rose': 52,
 '2': 53,
 '11.1bn': 54,
 '10.9bn': 55,
 'its': 56,
 'were': 57,
 'buoyed': 58,
 'by': 59,
 'one-off': 60,
 'gains': 61,
 'offset': 62,
 'a': 63,
 'dip': 64,
 'bros': 65,
 'less': 66,
 'users': 67,
 'aol': 68,
 'on': 69,
 'friday': 70,
 'that': 71,
 'it': 72,
 'owns': 73,
 '8

In [12]:
# reverse mapping: the dict 'idx2word' maps indices back to words 
idx2word = {v:k for k, v in word2idx.items()}
idx2word

{0: 'ad',
 1: 'sales',
 2: 'boost',
 3: 'time',
 4: 'warner',
 5: 'profit',
 6: 'quarterly',
 7: 'profits',
 8: 'at',
 9: 'us',
 10: 'media',
 11: 'giant',
 12: 'timewarner',
 13: 'jumped',
 14: '76',
 15: '%',
 16: 'to',
 17: '$',
 18: '1.13bn',
 19: '(',
 20: '£600m',
 21: ')',
 22: 'for',
 23: 'the',
 24: 'three',
 25: 'months',
 26: 'december',
 27: ',',
 28: 'from',
 29: '639m',
 30: 'year-earlier',
 31: '.',
 32: 'firm',
 33: 'which',
 34: 'is',
 35: 'now',
 36: 'one',
 37: 'of',
 38: 'biggest',
 39: 'investors',
 40: 'in',
 41: 'google',
 42: 'benefited',
 43: 'high-speed',
 44: 'internet',
 45: 'connections',
 46: 'and',
 47: 'higher',
 48: 'advert',
 49: 'said',
 50: 'fourth',
 51: 'quarter',
 52: 'rose',
 53: '2',
 54: '11.1bn',
 55: '10.9bn',
 56: 'its',
 57: 'were',
 58: 'buoyed',
 59: 'by',
 60: 'one-off',
 61: 'gains',
 62: 'offset',
 63: 'a',
 64: 'dip',
 65: 'bros',
 66: 'less',
 67: 'users',
 68: 'aol',
 69: 'on',
 70: 'friday',
 71: 'that',
 72: 'it',
 73: 'owns',
 74

In [7]:
# find number of doc ~ number of rows in df 
N = len(df['text'])
N

2225

In [19]:
# number of words in all docs
V = len(word2idx)
V

34762

In [21]:
# instantiate term-frequency matrix of size N by V
# note: could have also used count vectorizer
tf = np.zeros((N, V))
tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [22]:
# populate term-frequency counts (compute df)
for i, doc_as_int in enumerate(tokenized_docs):
    for j in doc_as_int: # go through indices of each doc
        tf[i, j] += 1 # we are counting the occurrences of jth term in ith doc

In [23]:
# compute IDF
document_freq = np.sum(tf > 0, axis=0) # document frequency (shape = (V,))
# logarithm is used to squash down the value 
idf = np.log(N / document_freq)

In [24]:
# compute TF-IDF
tf_idf = tf * idf

In [25]:
np.random.seed(123)

In [26]:
# pick a random document, show the top 5 terms (in terms of tf_idf score)
i = np.random.choice(N)
row = df.iloc[i]
print("Label:", row['labels'])
print("Text:", row['text'].split("\n", 1)[0])
print("Top 5 terms:")

scores = tf_idf[i]
indices = (-scores).argsort()

for j in indices[:5]:
  print(idx2word[j])

Label: sport
Text: Athens memories soar above lows
Top 5 terms:
paula
athens
1500m
her
kelly
