# TF-IDF 직접구현

In [1]:
# Toy corpus: two short Korean sentences, one document per string.
docs = [
    '오늘 동물원에서 원숭이와 코끼리를 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

In [2]:
# Whitespace-tokenize every document; the result is one token list per document.
doc_ls = list(map(str.split, docs))
doc_ls

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [3]:
from collections import defaultdict
# Vocabulary mapping token -> integer id. The default factory returns the
# current size of the dict, so each previously unseen token is assigned the
# next free id (0, 1, 2, ...) in order of first appearance.
# NOTE: because this is a defaultdict, looking up an unknown token later with
# word2id[token] silently adds it to the vocabulary.
word2id = defaultdict(lambda : len(word2id))

for doc in doc_ls:
    for token in doc:
        # Bare lookup on purpose: the access itself registers a new token.
        word2id[token]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이와': 2,
             '코끼리를': 3,
             '봤어': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [4]:
# Compute TF
# (count of a token within a document) / (total number of tokens in that document)
import numpy as np

# One row per document, one column per vocabulary id.
TF = np.zeros((len(doc_ls), len(word2id)))
for i, doc in enumerate(doc_ls):
    # Accumulate raw counts for this document first ...
    for token in doc:
        TF[i, word2id[token]] += 1
    # ... then normalize the row in place by the document length.
    TF[i] /= len(doc)
    # Progress output: the document length and the matrix after this row.
    print(len(doc))
    print(TF)
TF


5
[[0.2 0.2 0.2 0.2 0.2 0.  0.  0. ]
 [0.  0.  0.  0.  0.  0.  0.  0. ]]
5
[[0.2 0.2 0.2 0.2 0.2 0.  0.  0. ]
 [0.  0.2 0.  0.  0.  0.2 0.4 0.2]]


array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

In [5]:
# Compute IDF
# log10(total number of documents / number of documents containing the token)
import math

IDF = np.zeros(len(word2id))
count = np.zeros(len(word2id))

# word2id maps each token to its column id; use that single id for both
# `count` and `IDF` so the two arrays are always indexed consistently.
for word, i in word2id.items():
    for doc in doc_ls:
        if word in doc:
            # Trace output: the token and the document it was found in.
            print(word, doc)
            count[i] += 1
    print(count)
    # Every token in word2id came from doc_ls, so count[i] is at least 1 here.
    IDF[i] = math.log10(len(doc_ls) / count[i])
print(IDF)

오늘 ['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어']
[1. 0. 0. 0. 0. 0. 0. 0.]
동물원에서 ['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어']
동물원에서 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']
[1. 2. 0. 0. 0. 0. 0. 0.]
원숭이와 ['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어']
[1. 2. 1. 0. 0. 0. 0. 0.]
코끼리를 ['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어']
[1. 2. 1. 1. 0. 0. 0. 0.]
봤어 ['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어']
[1. 2. 1. 1. 1. 0. 0. 0.]
원숭이에게 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']
[1. 2. 1. 1. 1. 1. 0. 0.]
바나나를 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']
[1. 2. 1. 1. 1. 1. 1. 0.]
줬어 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']
[1. 2. 1. 1. 1. 1. 1. 1.]
[0.30103 0.      0.30103 0.30103 0.30103 0.30103 0.30103 0.30103]


In [6]:
# Compute TF-IDF
# Broadcasting multiplies every document row of TF by the per-token IDF vector,
# which is the same product the row-by-row loop would produce.
TF_IDF = TF * IDF
print(TF)
print(IDF)
print(TF_IDF)

[[0.2 0.2 0.2 0.2 0.2 0.  0.  0. ]
 [0.  0.2 0.  0.  0.  0.2 0.4 0.2]]
[0.30103 0.      0.30103 0.30103 0.30103 0.30103 0.30103 0.30103]
[[0.060206 0.       0.060206 0.060206 0.060206 0.       0.       0.      ]
 [0.       0.       0.       0.       0.       0.060206 0.120412 0.060206]]


In [7]:
# Convert to a DataFrame
import pandas as pd

# (id, token) pairs ordered by id, so the column labels line up with the
# column positions used in TF_IDF.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
print(sorted_vocab)
vocab = [token for _, token in sorted_vocab]
print(vocab)
pd.DataFrame(TF_IDF, columns=vocab)

[(0, '오늘'), (1, '동물원에서'), (2, '원숭이와'), (3, '코끼리를'), (4, '봤어'), (5, '원숭이에게'), (6, '바나나를'), (7, '줬어')]
['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어', '원숭이에게', '바나나를', '줬어']


Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,0.060206,0.0,0.060206,0.060206,0.060206,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.060206,0.120412,0.060206


- 지금까지의 코드는 직접 구현하는 과정이었다.
- 강의자료의 코드는 다음과 같다.

In [8]:
# Set up the TDM (term-document matrix of raw counts)

TDM = np.zeros((len(doc_ls), len(word2id)), dtype=int)
print(TDM)
for i, doc in enumerate(doc_ls):
    # np.add.at applies the increment once per index occurrence, so a token
    # repeated within a document is counted as many times as it appears.
    np.add.at(TDM[i], [word2id[token] for token in doc], 1)
TDM

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [9]:
# Example: total number of tokens in a document, computed as the sum of
# that document's row of counts (here, the first document).
TDM[0].sum()

5

In [10]:
# TF function
def computeTF(TDM):
    """Return the term-frequency matrix for a term-document count matrix.

    Each entry is the count of a token in a document divided by the total
    number of tokens in that document (the sum of that document's row).

    Args:
        TDM: 2-D array-like of shape (documents, tokens) holding raw counts.

    Returns:
        A float ``numpy.ndarray`` with the same shape as ``TDM``. A row whose
        counts sum to zero (an empty document) is returned as all zeros
        rather than NaN.
    """
    counts = np.asarray(TDM, dtype=float)
    # Row sums are computed once per document instead of once per cell.
    doc_totals = counts.sum(axis=1, keepdims=True)
    # An empty document has nothing to normalize; dividing by 1 keeps its
    # row at zero and avoids a 0/0 division.
    doc_totals[doc_totals == 0] = 1.0
    return counts / doc_totals

In [11]:
# Apply computeTF to the count matrix built above; the cell displays the result.
computeTF(TDM)

array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

In [12]:
# IDF function
import math  # needed for the logarithm


def computeIDF(TDM):
    """Return the inverse document frequency of every token column.

    IDF is ``log10(number of documents / number of documents containing the
    token)``, where a document contains a token when its count is non-zero.

    Args:
        TDM: 2-D array-like of shape (documents, tokens) holding raw counts.

    Returns:
        A 1-D float ``numpy.ndarray`` with one IDF value per token column.
        A token that occurs in no document gets 0.0 instead of raising
        ``ZeroDivisionError``; its term frequency is zero everywhere, so it
        contributes nothing to TF-IDF either way.
    """
    counts = np.asarray(TDM)
    doc_len = counts.shape[0]
    # Document frequency: number of rows with a non-zero count, per column.
    doc_freq = np.count_nonzero(counts, axis=0)

    # Start from zeros so columns with no occurrences keep an IDF of 0.0.
    IDF = np.zeros(counts.shape[1])
    for word_i, df in enumerate(doc_freq.tolist()):
        if df:
            IDF[word_i] = math.log10(doc_len / df)
    return IDF

In [13]:
# Apply computeIDF to the same count matrix; the cell displays one value per token.
computeIDF(TDM)

array([0.30103, 0.     , 0.30103, 0.30103, 0.30103, 0.30103, 0.30103,
       0.30103])

In [14]:
# TF-IDF function (TF * IDF)
def computeTF_IDF(TDM):
    """Return the TF-IDF matrix for a term-document count matrix.

    The result is ``computeTF(TDM)`` with every token column scaled by the
    matching entry of ``computeIDF(TDM)``.
    """
    term_freq = computeTF(TDM)
    inv_doc_freq = computeIDF(TDM)

    # Broadcasting pairs each column of term_freq with its IDF value, which
    # is the same element-wise product a double loop over cells computes.
    return term_freq * inv_doc_freq

In [15]:
# Apply computeTF_IDF to the count matrix; the cell displays the resulting array.
computeTF_IDF(TDM)

array([[0.060206, 0.      , 0.060206, 0.060206, 0.060206, 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.060206,
        0.120412, 0.060206]])

In [16]:
# Show the function-based TF-IDF result as a table, labelling columns with vocab.
pd.DataFrame(data=computeTF_IDF(TDM), columns=vocab)

Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,0.060206,0.0,0.060206,0.060206,0.060206,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.060206,0.120412,0.060206


# sklearn

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer

docs = [
    '오늘 동물원에서 원숭이와 코끼리를 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

# fit() hands back the fitted vectorizer, so construction and fitting are chained.
tfidv = TfidfVectorizer().fit(docs)
# The dense TF-IDF matrix is computed here but neither stored nor displayed;
# only the vectorizer on the last line is shown as the cell result.
tfidv.transform(docs).toarray()
tfidv

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement and returns an array, so it is
# converted to a plain list. The old method is kept as a fallback so this
# cell still runs on older installations.
if hasattr(tfidv, 'get_feature_names_out'):
    vocab = tfidv.get_feature_names_out().tolist()
else:
    vocab = tfidv.get_feature_names()
vocab

['동물원에서', '바나나를', '봤어', '오늘', '원숭이에게', '원숭이와', '줬어', '코끼리를']

In [23]:
import pandas as pd

# Dense TF-IDF matrix from the fitted vectorizer, one labelled column per
# vocabulary entry.
df = pd.DataFrame(data=tfidv.transform(docs).toarray(), columns=vocab)
df

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이에게,원숭이와,줬어,코끼리를
0,0.335176,0.0,0.471078,0.471078,0.0,0.471078,0.0,0.471078
1,0.278943,0.784088,0.0,0.0,0.392044,0.0,0.392044,0.0


# gensim

In [29]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

docs = [
    '오늘 동물원에서 원숭이와 코끼리를 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

# Whitespace tokenization, one token list per document.
doc_ls = list(map(str.split, docs))
# Dictionary built from the tokenized documents.
id2word = corpora.Dictionary(doc_ls)
# Each document converted with doc2bow; this list is the input to the model.
TDM = list(map(id2word.doc2bow, doc_ls))
model = TfidfModel(TDM)
# Indexing the model with the corpus applies the TF-IDF weighting to it.
tfidf = model[TDM]
# Weighted form of the first document.
tfidf[0]

[(1, 0.5), (2, 0.5), (3, 0.5), (4, 0.5)]

In [31]:
from gensim.matutils import sparse2full

# Column labels: the token for every id known to the dictionary.
vocab = [id2word[token_id] for token_id in id2word.keys()]
# Expand each weighted document into a dense row with one slot per vocabulary entry.
TDM_matrix = [sparse2full(weighted, len(vocab)).tolist() for weighted in tfidf]
pd.DataFrame(data=TDM_matrix, columns=vocab)

Unnamed: 0,동물원에서,봤어,오늘,원숭이와,코끼리를,바나나를,원숭이에게,줬어
0,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.816497,0.408248,0.408248
