<a href="https://colab.research.google.com/github/leenago/NLP_edu/blob/master/200903_TF_IDF_%EC%8B%A4%EC%8A%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TF-IDF 구현하기

## 1. TF 구하기

### 1) 토큰 Index 생성하기
- 각 토큰을 그대로 사용할 수 없기 때문에 토큰에 Index를 부여해준다.

In [2]:
# Toy corpus: two whitespace-separated Korean sentences.
docs = [
    '오늘 동물원에서 원숭이와 코끼리를 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

In [3]:
# Tokenize every sentence by splitting on single spaces.
doc_list = [sentence.split(' ') for sentence in docs]

doc_list

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [4]:
from collections import defaultdict
# A missing key triggers the factory, which returns the current size of the
# dict; each new token therefore receives the next unused integer id
# (0, 1, 2, ...) in first-seen order.
word2id = defaultdict(lambda: len(word2id))

In [5]:
# Touch every token once. The bare lookup is done purely for its side effect:
# an unseen token is registered in word2id with the next free index.
for sentence in doc_list:
  for word in sentence:
    word2id[word]

word2id

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 4,
             '오늘': 0,
             '원숭이에게': 5,
             '원숭이와': 2,
             '줬어': 7,
             '코끼리를': 3})

### 2) TF 계산
- 각 토큰(단어)의 등장 빈도를 계산

#### (1) 수업 같이 듣는 친구 코드

In [6]:
import numpy as np
# Build one count vector per document; position k holds how many times the
# token with id k occurs in that document.
bag_of_words = []
for tokens in doc_list:
  counts = np.zeros(len(word2id), dtype=int)
  for tok in tokens:
    counts[word2id[tok]] += 1
  # Store plain Python lists rather than numpy arrays.
  bag_of_words.append(counts.tolist())

bag_of_words

[[1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [7]:
from IPython.core import display as ICD
import pandas as pd

In [8]:
# Pair each id with its token and sort, so tokens end up ordered by id.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
print('sorted_vocab: ', sorted_vocab)

sorted_vocab:  [(0, '오늘'), (1, '동물원에서'), (2, '원숭이와'), (3, '코끼리를'), (4, '봤어'), (5, '원숭이에게'), (6, '바나나를'), (7, '줬어')]


In [9]:
# Keep only the token from each (id, token) pair, preserving id order.
vocab = [word for _, word in sorted_vocab]

vocab

['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어', '원숭이에게', '바나나를', '줬어']

In [10]:
# Show each sentence followed by its bag-of-words row as a one-line table.
for idx, sentence in enumerate(docs):
  print('문서{}:{}'.format(idx, sentence))
  bow_row = pd.DataFrame([bag_of_words[idx]], columns=vocab)
  ICD.display(bow_row)
  print('\n\n')

문서0:오늘 동물원에서 원숭이와 코끼리를 봤어


Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,1,1,1,1,1,0,0,0





문서1:동물원에서 원숭이에게 바나나를 줬어 바나나를


Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,0,1,0,0,0,1,2,1







In [11]:
# Count table with one row per token and one column per document:
# build it document-by-token, transpose, then give the columns readable names.
mapping = {0: '문서0', 1: '문서1'}
df = (
    pd.DataFrame(bag_of_words, columns=word2id.keys())
    .transpose()
    .rename(columns=mapping)
)
df

Unnamed: 0,문서0,문서1
오늘,1,0
동물원에서,1,1
원숭이와,1,0
코끼리를,1,0
봤어,1,0
원숭이에게,0,1
바나나를,0,2
줬어,0,1


In [12]:
# Store each document's total token count as a constant column (the scalar is
# repeated on every row); these columns are the TF denominators.
df['문서0 내 전체 토큰 개수'] = len(doc_list[0])
df['문서1 내 전체 토큰 개수'] = len(doc_list[1])
df

Unnamed: 0,문서0,문서1,문서0 내 전체 토큰 개수,문서1 내 전체 토큰 개수
오늘,1,0,5,5
동물원에서,1,1,5,5
원숭이와,1,0,5,5
코끼리를,1,0,5,5
봤어,1,0,5,5
원숭이에게,0,1,5,5
바나나를,0,2,5,5
줬어,0,1,5,5


In [13]:
# TF = occurrences of the token in the document / total tokens in the document.
df['TF0'] = df['문서0'].div(df['문서0 내 전체 토큰 개수'])
df['TF1'] = df['문서1'].div(df['문서1 내 전체 토큰 개수'])
df

Unnamed: 0,문서0,문서1,문서0 내 전체 토큰 개수,문서1 내 전체 토큰 개수,TF0,TF1
오늘,1,0,5,5,0.2,0.0
동물원에서,1,1,5,5,0.2,0.2
원숭이와,1,0,5,5,0.2,0.0
코끼리를,1,0,5,5,0.2,0.0
봤어,1,0,5,5,0.2,0.0
원숭이에게,0,1,5,5,0.0,0.2
바나나를,0,2,5,5,0.0,0.4
줬어,0,1,5,5,0.0,0.2


## 2. IDF 구하기
- TDM[:,0]: 열로 읽기

In [24]:
from math import log10

In [15]:
# Total number of documents in the corpus, repeated on every row; this is the
# numerator of the IDF ratio.
df['문서수'] = len(doc_list)
df

Unnamed: 0,문서0,문서1,문서0 내 전체 토큰 개수,문서1 내 전체 토큰 개수,TF0,TF1,문서수
오늘,1,0,5,5,0.2,0.0,2
동물원에서,1,1,5,5,0.2,0.2,2
원숭이와,1,0,5,5,0.2,0.0,2
코끼리를,1,0,5,5,0.2,0.0,2
봤어,1,0,5,5,0.2,0.0,2
원숭이에게,0,1,5,5,0.0,0.2,2
바나나를,0,2,5,5,0.0,0.4,2
줬어,0,1,5,5,0.0,0.2,2


In [18]:
# Document frequency: for each vocabulary word, the number of documents that
# contain it at least once. Every document in doc_list is considered, so the
# cell keeps working when the corpus has more (or fewer) than two documents.
doc_token_sets = [set(tokens) for tokens in doc_list]  # O(1) membership tests
num_docs = np.array(
    [sum(word in tokens for tokens in doc_token_sets) for word in vocab],
    dtype=int,
)

num_docs

array([1, 2, 1, 1, 1, 1, 1, 1])

In [19]:
# Attach the per-word document frequency as a column.
# NOTE(review): the array is assigned positionally, so this assumes num_docs
# is ordered like the rows of df (token-id order) -- confirm.
df['단어가 등장한 문서수'] = num_docs
df

Unnamed: 0,문서0,문서1,문서0 내 전체 토큰 개수,문서1 내 전체 토큰 개수,TF0,TF1,문서수,단어가 등장한 문서수
오늘,1,0,5,5,0.2,0.0,2,1
동물원에서,1,1,5,5,0.2,0.2,2,2
원숭이와,1,0,5,5,0.2,0.0,2,1
코끼리를,1,0,5,5,0.2,0.0,2,1
봤어,1,0,5,5,0.2,0.0,2,1
원숭이에게,0,1,5,5,0.0,0.2,2,1
바나나를,0,2,5,5,0.0,0.4,2,1
줬어,0,1,5,5,0.0,0.2,2,1


In [26]:
# IDF = log10(total number of documents / number of documents with the word).
doc_ratio = df['문서수'].div(df['단어가 등장한 문서수'])
df['IDF'] = np.log10(doc_ratio)
df

Unnamed: 0,문서0,문서1,문서0 내 전체 토큰 개수,문서1 내 전체 토큰 개수,TF0,TF1,문서수,단어가 등장한 문서수,IDF
오늘,1,0,5,5,0.2,0.0,2,1,0.30103
동물원에서,1,1,5,5,0.2,0.2,2,2,0.0
원숭이와,1,0,5,5,0.2,0.0,2,1,0.30103
코끼리를,1,0,5,5,0.2,0.0,2,1,0.30103
봤어,1,0,5,5,0.2,0.0,2,1,0.30103
원숭이에게,0,1,5,5,0.0,0.2,2,1,0.30103
바나나를,0,2,5,5,0.0,0.4,2,1,0.30103
줬어,0,1,5,5,0.0,0.2,2,1,0.30103


In [27]:
# TF-IDF per document: the document's TF column weighted by the shared IDF.
df['TF-IDF0'] = df['TF0'].mul(df['IDF'])
df['TF-IDF1'] = df['TF1'].mul(df['IDF'])
df

Unnamed: 0,문서0,문서1,문서0 내 전체 토큰 개수,문서1 내 전체 토큰 개수,TF0,TF1,문서수,단어가 등장한 문서수,IDF,TF-IDF0,TF-IDF1
오늘,1,0,5,5,0.2,0.0,2,1,0.30103,0.060206,0.0
동물원에서,1,1,5,5,0.2,0.2,2,2,0.0,0.0,0.0
원숭이와,1,0,5,5,0.2,0.0,2,1,0.30103,0.060206,0.0
코끼리를,1,0,5,5,0.2,0.0,2,1,0.30103,0.060206,0.0
봤어,1,0,5,5,0.2,0.0,2,1,0.30103,0.060206,0.0
원숭이에게,0,1,5,5,0.0,0.2,2,1,0.30103,0.0,0.060206
바나나를,0,2,5,5,0.0,0.4,2,1,0.30103,0.0,0.120412
줬어,0,1,5,5,0.0,0.2,2,1,0.30103,0.0,0.060206


#### (2) 선생님 코드
- 선생님은 matrix 형태에서 계산하셨다.

In [28]:
# Same two-sentence toy corpus, redefined so this section runs on its own.
docs = [
    '오늘 동물원에서 원숭이와 코끼리를 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

In [29]:
# Split each sentence on single spaces to get its token list.
doc_list = [sentence.split(' ') for sentence in docs]

doc_list

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [30]:
from collections import defaultdict

# The default factory returns the dict's current size, so looking up an
# unseen token stores it with the next unused integer id.
word2id = defaultdict(lambda: len(word2id))
for sentence in doc_list:
  for word in sentence:
    word2id[word]  # lookup performed only to register the token

word2id

defaultdict(<function __main__.<lambda>>,
            {'동물원에서': 1,
             '바나나를': 6,
             '봤어': 4,
             '오늘': 0,
             '원숭이에게': 5,
             '원숭이와': 2,
             '줬어': 7,
             '코끼리를': 3})

In [35]:
import numpy as np

# Document-term matrix of zeros: rows index documents, columns index token
# ids. The shape tuple given to np.zeros is (number of rows, number of columns).
DTM = np.zeros(shape=(len(doc_list), len(word2id)), dtype=int)

DTM

array([[0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0]])

In [36]:
# Count tokens directly into the matrix. Unlike the bag-of-words version,
# the 2-D array already exists, so no per-document buffer has to be allocated.
for row, tokens in enumerate(doc_list):
  for tok in tokens:
    DTM[row, word2id[tok]] += 1

DTM

array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [37]:
# Sum of row 0, i.e. the total number of tokens counted for document 0.
DTM[0].sum()

5

In [39]:
def computeTF(DTM):
  """Return the term-frequency matrix of a document-term count matrix.

  TF[d, w] = (count of word w in document d) / (total token count of d).

  Args:
    DTM: 2-D array-like of counts, one row per document and one column
      per vocabulary word. Nested lists are accepted as well as ndarrays.

  Returns:
    A float ndarray with the same shape as ``DTM``. A document with no
    tokens (an all-zero row) gets an all-zero TF row instead of NaN.

  Raises:
    ValueError: if ``DTM`` is not two-dimensional.
  """
  counts = np.asarray(DTM, dtype=float)
  if counts.ndim != 2:
    raise ValueError('DTM must be a 2-D document-term matrix')

  # One total per document, kept as a column so it broadcasts across words.
  doc_totals = counts.sum(axis=1, keepdims=True)

  # Start from zeros and divide only where the document is non-empty, so an
  # empty document never triggers a 0/0 division.
  tf = np.zeros(counts.shape)
  np.divide(counts, doc_totals, out=tf, where=doc_totals != 0)
  return tf

In [40]:
# Term-frequency matrix for the example corpus.
computeTF(DTM)

array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

In [60]:
import math

# IDF: log10(total number of documents / number of documents containing the word)
def computeIDF(DTM):
  """Return the inverse document frequency of every vocabulary word.

  IDF[w] = log10(number of documents / number of documents containing w).

  Args:
    DTM: 2-D array-like of counts, one row per document and one column
      per vocabulary word.

  Returns:
    A 1-D float ndarray with one IDF value per column of ``DTM``. A word
    that occurs in no document gets 0.0 rather than causing a division
    by zero.

  Raises:
    ValueError: if ``DTM`` is not two-dimensional.
  """
  counts = np.asarray(DTM)
  if counts.ndim != 2:
    raise ValueError('DTM must be a 2-D document-term matrix')

  doc_len = counts.shape[0]
  # Per column: in how many documents is the count non-zero?
  doc_freq = np.count_nonzero(counts, axis=0)

  idf = np.zeros(counts.shape[1])
  seen = doc_freq > 0  # skip words that never occur to avoid dividing by 0
  idf[seen] = np.log10(doc_len / doc_freq[seen])
  return idf

In [54]:
DTM[:,7]  # column 7: the count of the token with id 7 in every document

array([0, 1])

- `np.count_nonzero()`: 배열에서 0이 아닌 원소의 개수를 센다. This function (recursively: 재귀적으로) counts how many elements in the array (and in sub-arrays thereof) have their `__nonzero__()` or `__bool__()` method evaluated to `True`.

In [58]:
# Number of documents whose count for the token with id 1 is non-zero.
np.count_nonzero(DTM[:,1]) 

2

In [61]:
computeIDF(DTM) # one IDF value per vocabulary column of DTM

array([0.30103, 0.     , 0.30103, 0.30103, 0.30103, 0.30103, 0.30103,
       0.30103])

In [64]:
# TF-IDF: product of TF and IDF

def computeTFIDF(DTM):
  """Return the TF-IDF matrix for a document-term count matrix.

  Each entry is the term frequency of a word in a document multiplied by
  that word's inverse document frequency, as produced by computeTF and
  computeIDF.

  Args:
    DTM: document-term count matrix (rows are documents, columns are words).

  Returns:
    A float array with the same shape as the TF matrix.
  """
  term_freq = computeTF(DTM)
  inv_doc_freq = computeIDF(DTM)

  # Broadcasting scales column w of every row by inv_doc_freq[w].
  return term_freq * inv_doc_freq

In [65]:
# TF-IDF matrix for the example corpus.
computeTFIDF(DTM)

array([[0.060206, 0.      , 0.060206, 0.060206, 0.060206, 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.060206,
        0.120412, 0.060206]])

In [70]:
import pandas as pd

# Recover the tokens ordered by id so the column labels line up with DTM.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
print(sorted_vocab)
print('\n')

vocab = [word for _, word in sorted_vocab]
print(vocab)
print('\n\n')

# Final result: one row per document, one labelled column per token.
tfidf = computeTFIDF(DTM)
pd.DataFrame(data=tfidf, columns=vocab)

[(0, '오늘'), (1, '동물원에서'), (2, '원숭이와'), (3, '코끼리를'), (4, '봤어'), (5, '원숭이에게'), (6, '바나나를'), (7, '줬어')]


['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어', '원숭이에게', '바나나를', '줬어']





Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,0.060206,0.0,0.060206,0.060206,0.060206,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.060206,0.120412,0.060206
