## TF-IDF 
### (Term Frequency-Inverse Document Frequency)
----------------------------------

In [4]:
# --- Example corpus: two short documents used throughout the notebook ---
docA, docB = (
    "The cat sat on my face",
    "The dog sat on my bed",
)

In [5]:
# Tokenize each document on single spaces to obtain its bag of words.
bowA, bowB = (text.split(" ") for text in (docA, docB))

['The', 'cat', 'sat', 'on', 'my', 'face']


In [8]:
# Show the distinct words of each document, one set per line.
print(set(bowA), set(bowB), sep="\n")

{'my', 'face', 'cat', 'on', 'sat', 'The'}
{'bed', 'my', 'dog', 'on', 'sat', 'The'}


In [11]:
# The vocabulary is every distinct word that occurs in either document.
word_set = set(bowA) | set(bowB)
print(word_set)

{'bed', 'my', 'face', 'dog', 'cat', 'on', 'sat', 'The'}


In [12]:
# Build one count table per document: every vocabulary word is a key and
# every value starts at 0, e.g.
#   {'dog': 0, 'The': 0, 'bed': 0, 'cat': 0, 'face': 0, 'my': 0, 'on': 0, 'sat': 0}
# (equivalent to dict.fromkeys(word_set, 0)).
word_dictA = {word: 0 for word in word_set}
word_dictB = {word: 0 for word in word_set}

In [14]:
# Show the zero-initialised count tables, one per line.
print(word_dictA, word_dictB, sep="\n")

{'bed': 0, 'my': 0, 'face': 0, 'dog': 0, 'cat': 0, 'on': 0, 'sat': 0, 'The': 0}
{'bed': 0, 'my': 0, 'face': 0, 'dog': 0, 'cat': 0, 'on': 0, 'sat': 0, 'The': 0}


In [15]:
# Re-create the zeroed count tables right before tallying so this cell is
# idempotent: re-running it no longer adds the counts a second time.
word_dictA = dict.fromkeys(word_set, 0)
word_dictB = dict.fromkeys(word_set, 0)

# Tally how often each word occurs in each document.
for word in bowA:
    word_dictA[word] += 1
for word in bowB:
    word_dictB[word] += 1

In [17]:
# Show the per-document word counts, one table per line.
print(word_dictA, word_dictB, sep="\n")

{'bed': 0, 'my': 1, 'face': 1, 'dog': 0, 'cat': 1, 'on': 1, 'sat': 1, 'The': 1}
{'bed': 1, 'my': 1, 'face': 0, 'dog': 1, 'cat': 0, 'on': 1, 'sat': 1, 'The': 1}


In [18]:
import pandas as pd
# Display the raw counts as a table: one row per document, one column per word.
pd.DataFrame([word_dictA, word_dictB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,1,0,1,0,1,1,1,1
1,1,1,0,1,0,1,1,1


----------------------------------
### Term Frequency (TF) 
 
 * 비율 계산: 단어 빈도 수 / 문서에 포함된 총 단어 수, 즉 $\mathrm{TF}(t, d) = \dfrac{\text{count}(t, d)}{|d|}$

In [19]:
def computeTF(word_dict, bow):
    """Compute the term frequency (TF) of every word in one document.

    TF is the word's occurrence count divided by the total number of tokens
    in the document. Each step of the calculation is printed so the notebook
    output shows the worked arithmetic.

    Args:
        word_dict: Mapping of word -> occurrence count for this document.
        bow: The document's token list (bag of words); only its length is used.

    Returns:
        dict: Mapping of word -> count / len(bow). When ``bow`` is empty every
        term frequency is 0.0 instead of raising ZeroDivisionError.
    """
    tf_dict = {}
    bow_count = len(bow)
    total = float(bow_count)
    print("\n해당 문서에 포함된 총 단어 수: ", bow_count)
    for word, count in word_dict.items():
        # An empty document has no tokens, so define every TF as 0.0 rather
        # than dividing by zero.
        tf_dict[word] = count / total if bow_count else 0.0
        print("  {} \t{} / {} = {}".format(word, count, total, tf_dict[word]))
    return tf_dict

# Term frequencies for document A, then document B (evaluated in that order).
tfBowA, tfBowB = computeTF(word_dictA, bowA), computeTF(word_dictB, bowB)


해당 문서에 포함된 총 단어 수:  6
  bed 	0 / 6.0 = 0.0
  my 	1 / 6.0 = 0.16666666666666666
  face 	1 / 6.0 = 0.16666666666666666
  dog 	0 / 6.0 = 0.0
  cat 	1 / 6.0 = 0.16666666666666666
  on 	1 / 6.0 = 0.16666666666666666
  sat 	1 / 6.0 = 0.16666666666666666
  The 	1 / 6.0 = 0.16666666666666666

해당 문서에 포함된 총 단어 수:  6
  bed 	1 / 6.0 = 0.16666666666666666
  my 	1 / 6.0 = 0.16666666666666666
  face 	0 / 6.0 = 0.0
  dog 	1 / 6.0 = 0.16666666666666666
  cat 	0 / 6.0 = 0.0
  on 	1 / 6.0 = 0.16666666666666666
  sat 	1 / 6.0 = 0.16666666666666666
  The 	1 / 6.0 = 0.16666666666666666


In [20]:
# Display the term frequencies as a table: one row per document, one column per word.
pd.DataFrame([tfBowA, tfBowB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,0.166667,0.0,0.166667,0.0,0.166667,0.166667,0.166667,0.166667
1,0.166667,0.166667,0.0,0.166667,0.0,0.166667,0.166667,0.166667


----------------------------------
### Inverse Document Frequency (IDF)

In [151]:
import math

def computeIDF(doc_list):
    """Compute the inverse document frequency (IDF) of every word in a corpus.

    Example ``doc_list``::

        [{'dog': 0, 'The': 1, 'bed': 0, 'cat': 1, 'face': 1, 'my': 1, 'on': 1, 'sat': 1},
         {'dog': 1, 'The': 1, 'bed': 1, 'cat': 0, 'face': 0, 'my': 1, 'on': 1, 'sat': 1}]

    The document frequencies and each step of the IDF calculation are printed
    so the notebook output shows the worked arithmetic.

    Args:
        doc_list: List of per-document dicts mapping word -> occurrence count.
            Documents may carry different key sets; the union is used.

    Returns:
        dict: Mapping of word -> log(N / DF), where N is the number of
        documents and DF is the number of documents in which the word's count
        is greater than 0. A word that occurs in no document (DF == 0) gets an
        IDF of 0.0 instead of raising ZeroDivisionError. An empty ``doc_list``
        yields an empty dict.
    """
    N = len(doc_list)  # number of documents in the corpus

    # 1. Document Frequency (DF): in how many documents does each word occur?
    # Keys are collected in first-seen order so every word of every document
    # is covered, not only those of the first document.
    df_dict = {}
    for doc in doc_list:
        for word, val in doc.items():
            df_dict.setdefault(word, 0)
            if val > 0:
                df_dict[word] += 1
    # e.g. {'dog': 1, 'The': 2, 'bed': 1, 'cat': 1,
    #       'face': 1, 'my': 2, 'on': 2, 'sat': 2}
    print('DF 결과: \n', df_dict)

    # 2. Inverse Document Frequency (IDF): natural log of N / DF.
    # The result goes into a separate dict so the DF counts are not
    # overwritten while they are being iterated.
    idf_dict = {}
    print("\n--------------------------------------")
    for word, val in df_dict.items():
        # DF == 0 means the word never occurs; its TF is 0 everywhere, so an
        # IDF of 0.0 keeps TF-IDF at 0 without dividing by zero.
        idf_dict[word] = math.log(N / float(val)) if val else 0.0
        print("  {} \tlog({}/{}) = {}".format(word, N, float(val), idf_dict[word]))
    print("--------------------------------------")
    return idf_dict

In [152]:
# Compute the IDF of every vocabulary word over the two-document corpus,
# then print the resulting word -> IDF mapping.
idfs = computeIDF([word_dictA, word_dictB])
print('\n IDF 결과:\n', idfs)

DF 결과: 
 {'dog': 1, 'The': 2, 'bed': 1, 'cat': 1, 'face': 1, 'my': 2, 'on': 2, 'sat': 2}

--------------------------------------
  dog 	log(2/1.0) = 0.6931471805599453
  The 	log(2/2.0) = 0.0
  bed 	log(2/1.0) = 0.6931471805599453
  cat 	log(2/1.0) = 0.6931471805599453
  face 	log(2/1.0) = 0.6931471805599453
  my 	log(2/2.0) = 0.0
  on 	log(2/2.0) = 0.0
  sat 	log(2/2.0) = 0.0
--------------------------------------

 IDF 결과:
 {'dog': 0.6931471805599453, 'The': 0.0, 'bed': 0.6931471805599453, 'cat': 0.6931471805599453, 'face': 0.6931471805599453, 'my': 0.0, 'on': 0.0, 'sat': 0.0}


----------------------------------
### TF-IDF : TF * IDF

In [170]:
def computeTFIDF(tfBow, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            every word of ``tfBow`` (a missing word raises KeyError).

    Returns:
        dict: Mapping of word -> TF * IDF, in the key order of ``tfBow``.
    """
    return {term: tf * idfs[term] for term, tf in tfBow.items()}

In [172]:
# TF-IDF scores for document A and document B, using the shared IDF weights.
tfidfBowA, tfidfBowB = computeTFIDF(tfBowA, idfs), computeTFIDF(tfBowB, idfs)

In [162]:
import pandas as pd
# Display the TF-IDF scores as a table: one row per document, one column per word.
pd.DataFrame([tfidfBowA, tfidfBowB])

Unnamed: 0,The,bed,cat,dog,face,my,on,sat
0,0.0,0.0,0.115525,0.0,0.115525,0.0,0.0,0.0
1,0.0,0.115525,0.0,0.115525,0.0,0.0,0.0,0.0


Reference: https://www.youtube.com/watch?v=hXNbFNCgPfY