# TF-IDF Explanation:



    TF: Term Frequency, which measures how frequently a term occurs in a document. Since documents differ in length, a term is likely to appear many more times in long documents than in short ones. Thus, the term frequency is often divided by the document length (i.e., the total number of terms in the document) as a way of normalization:

    TF(t) = (Number of times term t appears in a document) / (Total number of terms in the document).

    IDF: Inverse Document Frequency, which measures how important a term is. While computing TF, all terms are considered equally important. However, it is known that certain terms, such as "is", "of", and "that", may appear many times but have little importance. Thus we need to weigh down the frequent terms while scaling up the rare ones, by computing the following:

    IDF(t) = log_e(Total number of documents / Number of documents with term t in it).

In [1]:
from time import time
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

# Toy corpus: four short sentences held in one string, one sentence per line.
data = '''Time flies like an arrow
Fruit flies like a banana,
Sam sat on the cat
The cat is white.'''

In [2]:
# Show the raw corpus text before it is split into documents.
print(data)

Time flies like an arrow
Fruit flies like a banana,
Sam sat on the cat
The cat is white.


In [3]:
print("Loading dataset...")
t0 = time()  # start timestamp for the elapsed-time message below
# Split the corpus on newlines: each line becomes one document.
dataset = data.split('\n')
print("done in %0.3fs." % (time() - t0))
dataset  # last expression of the cell, so the notebook displays the list

Loading dataset...
done in 0.000s.


['Time flies like an arrow',
 'Fruit flies like a banana,',
 'Sam sat on the cat',
 'The cat is white.']

In [106]:
# Get the TF matrix (raw term counts per document).
# ngram_range=(1,1) restricts the features to single words (unigrams).
tf_vectorizer = CountVectorizer(ngram_range=(1,1))
tf = tf_vectorizer.fit_transform(dataset)
# Printing the result lists one "(row, column)  count" entry per stored value;
# presumably row = document index and column = vocabulary index.
print(tf)

  (0, 1)	1
  (0, 0)	1
  (0, 7)	1
  (0, 4)	1
  (0, 12)	1
  (1, 2)	1
  (1, 5)	1
  (1, 7)	1
  (1, 4)	1
  (2, 3)	1
  (2, 11)	1
  (2, 8)	1
  (2, 10)	1
  (2, 9)	1
  (3, 13)	1
  (3, 6)	1
  (3, 3)	1
  (3, 11)	1


In [107]:
# Get TF-IDFs.
# Optional arguments deliberately left out here: max_df=0.95, min_df=2,
# stop_words='english' (use help(TfidfVectorizer) to see what each does).
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
tfidf = tfidf_vectorizer.fit_transform(dataset)
type(tfidf)  # not the cell's last expression, so its value is not displayed
print(tfidf)

  (0, 12)	0.485460611816
  (0, 4)	0.382742722417
  (0, 7)	0.382742722417
  (0, 0)	0.485460611816
  (0, 1)	0.485460611816
  (1, 4)	0.437791231086
  (1, 7)	0.437791231086
  (1, 5)	0.555282664941
  (1, 2)	0.555282664941
  (2, 9)	0.485460611816
  (2, 10)	0.485460611816
  (2, 8)	0.485460611816
  (2, 11)	0.382742722417
  (2, 3)	0.382742722417
  (3, 11)	0.437791231086
  (3, 3)	0.437791231086
  (3, 6)	0.555282664941
  (3, 13)	0.555282664941


In [108]:
# Look up the vocabulary learned by the TF-IDF vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    # The newer method returns an array; convert so the result is a list
    # either way.
    feature_names = list(tfidf_vectorizer.get_feature_names_out())
else:
    feature_names = tfidf_vectorizer.get_feature_names()
print(len(feature_names))
feature_names[0:14]

14


[u'an',
 u'arrow',
 u'banana',
 u'cat',
 u'flies',
 u'fruit',
 u'is',
 u'like',
 u'on',
 u'sam',
 u'sat',
 u'the',
 u'time',
 u'white']

In [109]:
# Expand the sparse TF-IDF result into a dense matrix so that every cell,
# zeros included, is visible.
dense = tfidf.todense()
dense.shape  # not the cell's last expression, so its value is not displayed
dense

matrix([[ 0.48546061,  0.48546061,  0.        ,  0.        ,  0.38274272,
          0.        ,  0.        ,  0.38274272,  0.        ,  0.        ,
          0.        ,  0.        ,  0.48546061,  0.        ],
        [ 0.        ,  0.        ,  0.55528266,  0.        ,  0.43779123,
          0.55528266,  0.        ,  0.43779123,  0.        ,  0.        ,
          0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.38274272,  0.        ,
          0.        ,  0.        ,  0.        ,  0.48546061,  0.48546061,
          0.48546061,  0.38274272,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.43779123,  0.        ,
          0.        ,  0.55528266,  0.        ,  0.        ,  0.        ,
          0.        ,  0.43779123,  0.        ,  0.55528266]])

# Manual TF and TF-IDF functions. 

In [110]:
# Two tiny documents used for the hand-written TF / IDF computation.
docA = "the cat sat on my sofa"
docB = "the dog sat on my bed" 

In [111]:
# Tokenise each document into its bag of words by splitting on single spaces.
bowA = docA.split(" ")
bowB = docB.split(" ")
bowA

['the', 'cat', 'sat', 'on', 'my', 'sofa']

In [112]:
# Distinct words of document A.
set(bowA)

{'cat', 'my', 'on', 'sat', 'sofa', 'the'}

In [113]:
# Vocabulary in the corpus
wordSet = set(bowA).union(set(bowB))
wordSet

{'bed', 'cat', 'dog', 'my', 'on', 'sat', 'sofa', 'the'}

In [114]:
#Dictionaries to keep the word count in each bag of words
wordDictA = dict.fromkeys(wordSet,0)
wordDictB = dict.fromkeys(wordSet,0)

In [115]:
# Display document A's count table before any word has been counted.
wordDictA

{'bed': 0, 'cat': 0, 'dog': 0, 'my': 0, 'on': 0, 'sat': 0, 'sofa': 0, 'the': 0}

In [116]:
# count the frequency of each word in the dictionary
for word in bowA:
    wordDictA[word]+=1
    
for word in bowB:
    wordDictB[word]+=1

In [117]:
# Show the per-document raw counts over the shared vocabulary.
print(wordDictA)
print(wordDictB)

{'on': 1, 'sofa': 1, 'the': 1, 'sat': 1, 'my': 1, 'dog': 0, 'bed': 0, 'cat': 1}
{'on': 1, 'sofa': 0, 'the': 1, 'sat': 1, 'my': 1, 'dog': 1, 'bed': 1, 'cat': 0}


In [118]:
import pandas as pd
# Put the two count dictionaries into a matrix: one row per document,
# one column per vocabulary word.
pd.DataFrame([wordDictA,wordDictB])

Unnamed: 0,bed,cat,dog,my,on,sat,sofa,the
0,0,1,0,1,1,1,1,1
1,1,0,1,1,1,1,0,1


In [119]:
def computeTF(wordDict, bow):
    """Compute the length-normalised term frequency of every vocabulary word.

    Args:
        wordDict: Mapping of word -> raw count of that word in the document.
        bow: The document's tokens (bag of words). Only its length is used,
            as the document length that normalises the counts.

    Returns:
        A new dict mapping each key of ``wordDict`` to ``count / len(bow)``.
        For an empty ``bow`` every frequency is 0.0 rather than a
        ``ZeroDivisionError``.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document contains no terms, so every frequency is zero.
        return dict.fromkeys(wordDict, 0.0)
    # float() keeps the division a true division on Python 2 as well.
    return {word: count / float(bowCount) for word, count in wordDict.items()}

In [120]:
# Term frequencies of document A over the shared vocabulary.
tfbowA = computeTF(wordDictA,bowA)
tfbowA

{'bed': 0.0,
 'cat': 0.16666666666666666,
 'dog': 0.0,
 'my': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'sofa': 0.16666666666666666,
 'the': 0.16666666666666666}

In [121]:
# Term frequencies of document B over the shared vocabulary.
tfbowB = computeTF(wordDictB,bowB)
tfbowB

{'bed': 0.16666666666666666,
 'cat': 0.0,
 'dog': 0.16666666666666666,
 'my': 0.16666666666666666,
 'on': 0.16666666666666666,
 'sat': 0.16666666666666666,
 'sofa': 0.0,
 'the': 0.16666666666666666}

In [122]:
import math
def computeIDF(docList):
    """Compute the inverse document frequency of every word in the corpus.

    Args:
        docList: List of per-document dicts, each mapping word -> count of
            that word in the document.

    Returns:
        A dict mapping each word to ``log(N / df)``, where ``N`` is the
        number of documents and ``df`` is the number of documents in which
        the word's count is greater than zero. The vocabulary is the union
        of the keys of all documents. A word that occurs in no document
        gets 0.0, and an empty ``docList`` yields an empty dict.
    """
    N = len(docList)

    # Count the number of documents that contain each word. Every key is
    # registered even when its count is zero, so the result covers the full
    # vocabulary and later documents may introduce words the first one lacks.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    # Divide N by the document frequency and take the natural log of that.
    idfDict = {}
    for word, df in docFreq.items():
        # A word present in no document would divide by zero; its TF is
        # always 0, so an IDF of 0.0 keeps the resulting TF-IDF at 0.
        idfDict[word] = math.log(N / float(df)) if df else 0.0

    return idfDict

In [123]:
# IDF of every vocabulary word across the two documents.
idfs = computeIDF([wordDictA,wordDictB])
idfs

{'bed': 0.6931471805599453,
 'cat': 0.6931471805599453,
 'dog': 0.6931471805599453,
 'my': 0.0,
 'on': 0.0,
 'sat': 0.0,
 'sofa': 0.6931471805599453,
 'the': 0.0}

In [124]:
def computeTFIDF(tfBow,idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must hold an
            entry for every word in ``tfBow``.

    Returns:
        A new dict mapping each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [125]:
# TF-IDF scores of document A.
tfIDFA = computeTFIDF(tfbowA,idfs)
tfIDFA

{'bed': 0.0,
 'cat': 0.11552453009332421,
 'dog': 0.0,
 'my': 0.0,
 'on': 0.0,
 'sat': 0.0,
 'sofa': 0.11552453009332421,
 'the': 0.0}

In [126]:
# TF-IDF scores of document B.
tfIDFB = computeTFIDF(tfbowB, idfs)
tfIDFB

{'bed': 0.11552453009332421,
 'cat': 0.0,
 'dog': 0.11552453009332421,
 'my': 0.0,
 'on': 0.0,
 'sat': 0.0,
 'sofa': 0.0,
 'the': 0.0}

In [102]:
# Put both documents' TF-IDF scores into a matrix: one row per document,
# one column per vocabulary word.
pd.DataFrame([tfIDFA,tfIDFB])

Unnamed: 0,bed,cat,dog,my,on,sat,sofa,the
0,0.0,0.115525,0.0,0.0,0.0,0.0,0.115525,0.0
1,0.115525,0.0,0.115525,0.0,0.0,0.0,0.0,0.0
