In [1]:
# TF-IDF

# Toy corpus: each entry is one document, later tokenized on whitespace.
docs = [
    '오늘 동물원에서 원숭이와 코끼리를 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

In [2]:
# Tokenize every document; str.split() with no argument splits on runs of
# whitespace, so each document becomes a list of its space-separated tokens.
doc_ls = [sentence.split() for sentence in docs]
doc_ls

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [3]:
from collections import defaultdict
# Auto-numbering vocabulary: looking up a token that is not yet a key invokes
# the factory, which returns the current size of the mapping. Each new token
# therefore receives the next consecutive id (0, 1, 2, ...) in first-seen order.
# Caution: ANY lookup of an unknown key inserts it, so read with `in` / `.get()`
# when the vocabulary must not grow.
word2id = defaultdict(lambda:len(word2id))

In [4]:
# Register every token in first-seen order. The bare subscript is intentional:
# think of the value inside [] as the key whose integer index we want; for an
# unseen key the defaultdict factory assigns the next free id as a side effect.
for sentence_tokens in doc_ls:
    for word in sentence_tokens:
        word2id[word]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이와': 2,
             '코끼리를': 3,
             '봤어': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [6]:
import numpy as np

In [7]:
# Document-term matrix: one row per document, one column per vocabulary id,
# each cell holding how many times that word occurs in that document.
DTM = np.zeros((len(doc_ls), len(word2id)), dtype=int)
for row_idx, tokens in enumerate(doc_ls):
    row_counts = DTM[row_idx]  # basic indexing returns a view, so updates land in DTM
    for tok in tokens:
        row_counts[word2id[tok]] += 1
DTM

array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [8]:
# TF

def computeTF(DTM):
    """Compute term frequencies from a document-term count matrix.

    TF[d, w] = count of word w in document d / total token count of document d.

    Parameters
    ----------
    DTM : array-like of shape (n_docs, n_words)
        Raw occurrence counts, one row per document.

    Returns
    -------
    numpy.ndarray of float, shape (n_docs, n_words)
        Row-normalised counts. A document with no tokens (row sum 0) yields
        an all-zero row instead of NaN from a 0/0 division.
    """
    counts = np.asarray(DTM, dtype=float)
    # keepdims keeps shape (n_docs, 1) so the totals broadcast across columns.
    doc_totals = counts.sum(axis=1, keepdims=True)
    tf = np.zeros_like(counts)
    # `where` skips the division for empty documents, leaving their rows at 0
    # and avoiding the divide-by-zero warning.
    np.divide(counts, doc_totals, out=tf, where=doc_totals > 0)
    return tf

In [9]:
# Show the term-frequency matrix for the toy corpus (rows: documents, columns: word ids).
computeTF(DTM)

array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

In [12]:
# IDF

import math

def computeIDF(DTM):
    """Compute inverse document frequencies from a document-term count matrix.

    IDF[w] = log10(n_docs / df[w]), where df[w] is the number of documents
    in which word w occurs at least once.

    Parameters
    ----------
    DTM : array-like of shape (n_docs, n_words)
        Raw occurrence counts, one row per document.

    Returns
    -------
    numpy.ndarray of float, shape (n_words,)
        One IDF weight per vocabulary column. A word that occurs in no
        document (df == 0) gets weight 0.0 rather than triggering a
        division by zero.
    """
    counts = np.asarray(DTM)
    doc_len = counts.shape[0]

    # Document frequency: number of rows with a non-zero count, per column.
    doc_freq = np.count_nonzero(counts, axis=0)

    idf = np.zeros(counts.shape[1])
    seen = doc_freq > 0  # only these columns have a well-defined ratio
    idf[seen] = np.log10(doc_len / doc_freq[seen])
    return idf

In [13]:
# Show the per-word IDF weights; a word present in every document gets log10(1) = 0.
computeIDF(DTM)

array([0.30103, 0.     , 0.30103, 0.30103, 0.30103, 0.30103, 0.30103,
       0.30103])

In [14]:
# TF-IDF

def computeTFIDF(DTM):
    """Compute the TF-IDF matrix for a document-term count matrix.

    Each entry is the term frequency of a word in a document multiplied by
    that word's inverse document frequency.

    Parameters
    ----------
    DTM : numpy.ndarray of shape (n_docs, n_words)
        Raw occurrence counts, one row per document.

    Returns
    -------
    numpy.ndarray of float, shape (n_docs, n_words)
        TF-IDF weights.
    """
    term_freq = computeTF(DTM)
    inv_doc_freq = computeIDF(DTM)
    # Broadcasting multiplies column j of the TF matrix by the IDF of word j,
    # which is exactly the element-by-element product tf[d, j] * idf[j].
    return term_freq * inv_doc_freq

In [15]:
# Show the TF-IDF matrix for the toy corpus (rows: documents, columns: word ids).
computeTFIDF(DTM)

array([[0.060206, 0.      , 0.060206, 0.060206, 0.060206, 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.060206,
        0.120412, 0.060206]])

In [19]:
import pandas as pd

# Pair each id with its word and sort by id, so the column labels line up
# with the column order of the DTM / TF-IDF matrices.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
print(sorted_vocab)
vocab = [word for _word_id, word in sorted_vocab]
tfidf = computeTFIDF(DTM)
# Label the TF-IDF matrix: one row per document, one named column per word.
pd.DataFrame(tfidf, columns=vocab)

[(0, '오늘'), (1, '동물원에서'), (2, '원숭이와'), (3, '코끼리를'), (4, '봤어'), (5, '원숭이에게'), (6, '바나나를'), (7, '줬어')]


Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,0.060206,0.0,0.060206,0.060206,0.060206,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.060206,0.120412,0.060206


In [21]:
# Print a 70-character horizontal rule as a visual separator.
print("-" * 70)

----------------------------------------------------------------------
