## TF-IDF
### TF

In [1]:
docs = ['오늘 동물원에서 원숭이와 코끼리를 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [2]:
doc_ls =[doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [3]:
from collections import defaultdict

wordid= defaultdict(lambda : len(wordid))
wordid

defaultdict(<function __main__.<lambda>()>, {})

In [4]:
for doc in doc_ls:
    for token in doc:
        wordid[token]
wordid

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이와': 2,
             '코끼리를': 3,
             '봤어': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [5]:
import numpy as np

In [6]:
DTM = np.zeros((len(doc_ls), len(wordid)), dtype=int)
print(DTM)

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


In [7]:
for i, doc in enumerate(doc_ls):
    for token in doc:
        DTM[i, wordid[token]] += 1 # 해당 토큰의 위치 (columns)
# 행렬로 표기 (BOW와 차이점: BOW는 1차원적 배열)
DTM

array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [8]:
DTM[0].sum()

5

In [11]:
def computeTF(DTM):
    doc_len = len(DTM) # 문서개수 2개
    word_len = len(DTM[0]) # 토큰의 개수 8개
    # tf를 계산하기 전 0으로 셋팅
    tf = np.zeros((doc_len, word_len))
    print(tf)
    # TF 계산 특정단어등장빈도/문서 내 전체등장단어빈도
    for doc_i in range(doc_len) :
        for word_i in range(word_len) :
            tf[doc_i, word_i] = DTM[doc_i, word_i]/DTM[doc_i].sum()
    return tf

In [12]:
computeTF(DTM)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

In [13]:
import math
# IDF 계산 : log (총문서수/단어가등장한문서수)
def computeIDF(DTM):
    doc_len = len(DTM)
    word_len = len(DTM[0])
    
    idf = np.zeros(word_len)
    
    for i in range(word_len) :
        idf[i] = math.log10(doc_len/ np.count_nonzero(DTM[:,i]))
    return idf

In [14]:
computeIDF(DTM)

array([0.30103, 0.     , 0.30103, 0.30103, 0.30103, 0.30103, 0.30103,
       0.30103])

In [15]:
# TF-IDF 곱
def computeTFIDF(DTM):
    tf = computeTF(DTM)
    idf = computeIDF(DTM)
    tfidf = np.zeros(tf.shape)
    for doc_i in range(tf.shape[0]) :
        for word_i in range(tf.shape[1]) :
            tfidf[doc_i, word_i] = tf[doc_i, word_i] * idf[word_i]
    return tfidf

In [16]:
computeTFIDF(DTM)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


array([[0.060206, 0.      , 0.060206, 0.060206, 0.060206, 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.060206,
        0.120412, 0.060206]])

In [17]:
import pandas as pd

sorted_vocab = sorted((value,key)for key, value in wordid.items())
print(sorted_vocab)

[(0, '오늘'), (1, '동물원에서'), (2, '원숭이와'), (3, '코끼리를'), (4, '봤어'), (5, '원숭이에게'), (6, '바나나를'), (7, '줬어')]


In [18]:
vocab = [v[1] for v in sorted_vocab]
print(vocab)

['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어', '원숭이에게', '바나나를', '줬어']


In [19]:
tfidf = computeTFIDF(DTM)
pd.DataFrame(tfidf, columns=vocab)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,0.060206,0.0,0.060206,0.060206,0.060206,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.060206,0.120412,0.060206
