Computers are good at working with numbers, but much less so with textual data. One of the most widely used techniques for processing textual data is TF-IDF.

### Term Frequency Inverse Document Frequency
Term Frequency: This summarizes how often a given word appears within a document.<br>
Inverse Document Frequency: This downscales words that appear in many documents, since such words do little to distinguish one document from another.

### Preliminaries


In [1]:
# Load libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

### Create Text Data


In [2]:
# Create the corpus: three short documents, one string per array element
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

### Create Feature Matrix


In [3]:
# Create the tf-idf vectorizer with its default settings
tfidf = TfidfVectorizer()
# Fit it on the corpus and build the tf-idf feature matrix in one step
feature_matrix = tfidf.fit_transform(text_data)

# Show the tf-idf feature matrix as a dense array
# (one row per document, one column per vocabulary term)
feature_matrix.toarray()

array([[0.        , 0.        , 0.        , 0.89442719, 0.        ,
        0.        , 0.4472136 , 0.        ],
       [0.        , 0.57735027, 0.        , 0.        , 0.        ,
        0.57735027, 0.        , 0.57735027],
       [0.57735027, 0.        , 0.57735027, 0.        , 0.57735027,
        0.        , 0.        , 0.        ]])

In [4]:
# Show the feature names (the vocabulary terms behind the matrix columns)
# NOTE(review): newer scikit-learn releases provide get_feature_names_out()
# in place of get_feature_names() - confirm against the installed version.
tfidf.get_feature_names()

['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden']

### View Feature Matrix As Data Frame


In [5]:
# Create a data frame from the dense tf-idf matrix, labelling each column
# with the term it scores
# NOTE(review): newer scikit-learn releases provide get_feature_names_out()
# in place of get_feature_names() - confirm against the installed version.
pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())

Unnamed: 0,beats,best,both,brazil,germany,is,love,sweden
0,0.0,0.0,0.0,0.894427,0.0,0.0,0.447214,0.0
1,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735
2,0.57735,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0


## Implementation of TF-IDF from scratch in Python


In [6]:
# Two example documents used for the from-scratch walkthrough
docA = "The cat sat on my face"
docB = "The dog sat on my bed"

In [7]:
# Tokenize each document by splitting on single spaces
# (case and punctuation are left exactly as written)
bowA = docA.split(" ")
bowB = docB.split(" ")

In [8]:
# Inspect the token list of document B
bowB

['The', 'dog', 'sat', 'on', 'my', 'bed']

In [10]:
# Vocabulary: every distinct token that occurs in either document
wordSet = set(bowA) | set(bowB)

In [11]:
# Inspect the combined vocabulary
wordSet

{'The', 'bed', 'cat', 'dog', 'face', 'my', 'on', 'sat'}

In [12]:
# Start each document's count table with a zero for every vocabulary word
wordDictA = {word: 0 for word in wordSet}
wordDictB = {word: 0 for word in wordSet}

In [13]:
# Inspect the zero-initialised count table of document A
wordDictA

{'The': 0, 'sat': 0, 'my': 0, 'cat': 0, 'dog': 0, 'face': 0, 'on': 0, 'bed': 0}

In [14]:
# Tally how often each token occurs in its own document,
# processing document A first and then document B
for counts, tokens in ((wordDictA, bowA), (wordDictB, bowB)):
    for word in tokens:
        counts[word] += 1

In [15]:
# Inspect the raw word counts of document A
wordDictA

{'The': 1, 'sat': 1, 'my': 1, 'cat': 1, 'dog': 0, 'face': 1, 'on': 1, 'bed': 0}

In [16]:
# Show both count tables as a data frame: one row per document,
# one column per vocabulary word
import pandas as pd
pd.DataFrame([wordDictA, wordDictB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,1,0,1,0,1,1,1,1
1,1,1,0,1,0,1,1,1


In [17]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every vocabulary word in one document.

    Args:
        wordDict: Mapping of word -> raw count of that word in the document.
        bow: The document's token list (bag of words). Only its length is
            used, as the total that each count is divided by.

    Returns:
        A new dict mapping each word in ``wordDict`` to
        ``count / len(bow)``. When ``bow`` is empty every frequency is 0.0.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document contains no terms; return zeros instead of
        # dividing by zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [18]:
# Term frequencies of each document, from its count table and token list
tfBowA = computeTF(wordDictA, bowA)
tfBowB = computeTF(wordDictB, bowB)

In [19]:
# Inspect the term frequencies of document A
tfBowA

{'The': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'my': 0.16666666666666666,
 'cat': 0.16666666666666666,
 'dog': 0.0,
 'face': 0.16666666666666666,
 'on': 0.16666666666666666,
 'bed': 0.0}

In [20]:
# Inspect the term frequencies of document B
tfBowB

{'The': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'my': 0.16666666666666666,
 'cat': 0.0,
 'dog': 0.16666666666666666,
 'face': 0.0,
 'on': 0.16666666666666666,
 'bed': 0.16666666666666666}

In [21]:
def computeIDF(docList):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        docList: List of per-document dicts, each mapping
            word -> count of that word in the document.

    Returns:
        A dict mapping each word seen in any document to
        ``log10(N / df)``, where ``N`` is the number of documents and
        ``df`` is the number of documents in which the word's count is
        greater than zero. A word that occurs in no document gets 0.0.
        An empty ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)

    # Document frequency per word. Keys are collected from every document
    # (not only the first), so documents with differing vocabularies work.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq[word] = docFreq.get(word, 0) + (1 if val > 0 else 0)

    idfDict = {}
    for word, df in docFreq.items():
        # A word present in no document would divide by zero; give it 0.0.
        idfDict[word] = math.log10(N / float(df)) if df else 0.0

    return idfDict

In [22]:
# Inverse document frequencies over the two-document corpus
idfs = computeIDF([wordDictA, wordDictB])

In [23]:
def computeTFIDF(tfBow, idfs):
    """Weight each term frequency by the matching inverse document frequency.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; must contain
            every word in ``tfBow``.

    Returns:
        A new dict mapping each word in ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [24]:
# TF-IDF scores of each document: its term frequencies weighted by the
# corpus-wide inverse document frequencies
tfidfBowA = computeTFIDF(tfBowA, idfs)
tfidfBowB = computeTFIDF(tfBowB, idfs)

In [25]:
# Show the TF-IDF scores as a data frame: one row per document,
# one column per vocabulary word
import pandas as pd
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,0.0,0.0,0.050172,0.0,0.050172,0.0,0.0,0.0
1,0.0,0.050172,0.0,0.050172,0.0,0.0,0.0,0.0
