# import the necessary library

In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer

# User defined dataset ( Two simple documents containing one sentence each)

In [2]:
documentA = 'the man went out for a walk '
documentB = 'the children sat around the fire'


# BoW(To convert text into vectors of numbers)

In [3]:
#spilts two documents in individuals words
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

print(bagOfWordsA)
print(bagOfWordsB)



['the', 'man', 'went', 'out', 'for', 'a', 'walk', '']
['the', 'children', 'sat', 'around', 'the', 'fire']


In [4]:
#remove any duplicate words

uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [5]:
#Creater a dictnoary of words and their occurence for each documents in the cor

numOfWordsA = dict.fromkeys(uniqueWords,0)
print(numOfWordsA)

{'': 0, 'fire': 0, 'walk': 0, 'around': 0, 'out': 0, 'a': 0, 'went': 0, 'children': 0, 'for': 0, 'sat': 0, 'the': 0, 'man': 0}


In [6]:
numOfWordsA = dict.fromkeys(uniqueWords,0)

for word in bagOfWordsA:
    numOfWordsA[word] += 1
    
numOfWordsB = dict.fromkeys(uniqueWords,0)

for word in bagOfWordsB:
    numOfWordsB[word] += 1
    
df = pd.DataFrame([numOfWordsA,numOfWordsB])
print(df)

      fire  walk  around  out  a  went  children  for  sat  the  man
0  1     0     1       0    1  1     1         0    1    0    1    1
1  0     1     0       1    0  0     0         1    0    1    2    0


In [7]:
numOfWordsB.items()

dict_items([('', 0), ('fire', 1), ('walk', 0), ('around', 1), ('out', 0), ('a', 0), ('went', 0), ('children', 1), ('for', 0), ('sat', 1), ('the', 2), ('man', 0)])

# Term Frequency (TF)

In [10]:
def computeTF(wordDict , bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word , count in wordDict.items():
        tfDict[word]= count/ float(bagOfWordsCount)
    return tfDict

In [12]:
#computer the term Frequecy for each of Documents

tfA = computeTF(numOfWordsA , bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

tfA
tfB

{'': 0.0,
 'fire': 0.16666666666666666,
 'walk': 0.0,
 'around': 0.16666666666666666,
 'out': 0.0,
 'a': 0.0,
 'went': 0.0,
 'children': 0.16666666666666666,
 'for': 0.0,
 'sat': 0.16666666666666666,
 'the': 0.3333333333333333,
 'man': 0.0}

# Inverse Data frequency (IDF)

In [16]:
def computeIDF(documents):
    import math
    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(),0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
                
    for word, val in idfDict.items():
        idfDict[word] = math.log(N/float(val))
        
    return idfDict

In [18]:
# The  IDF is computed once for all documents

idfs = computeIDF([numOfWordsA,numOfWordsB])
df = pd.DataFrame([idfs])

print(idfs,df)


{'': 0.6931471805599453, 'fire': 0.6931471805599453, 'walk': 0.6931471805599453, 'around': 0.6931471805599453, 'out': 0.6931471805599453, 'a': 0.6931471805599453, 'went': 0.6931471805599453, 'children': 0.6931471805599453, 'for': 0.6931471805599453, 'sat': 0.6931471805599453, 'the': 0.0, 'man': 0.6931471805599453}                  fire      walk    around       out         a      went  \
0  0.693147  0.693147  0.693147  0.693147  0.693147  0.693147  0.693147   

   children       for       sat  the       man  
0  0.693147  0.693147  0.693147  0.0  0.693147  


In [21]:
# the TF-IDf is simply the TF multiplied by IDF

def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word , val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
        
    return tfidf

In [24]:
 #computer the TF- IDf scores for all the words in the corpus

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA,tfidfB])

In [25]:
print(df)

                 fire      walk    around       out         a      went  \
0  0.086643  0.000000  0.086643  0.000000  0.086643  0.086643  0.086643   
1  0.000000  0.115525  0.000000  0.115525  0.000000  0.000000  0.000000   

   children       for       sat  the       man  
0  0.000000  0.086643  0.000000  0.0  0.086643  
1  0.115525  0.000000  0.115525  0.0  0.000000  
