1. TF (Term Frequency) --> 특정단어빈도수 / 전체단어수
2. IDF (TF의 역수) --> 문서등장빈도수의 역수 
3. TF X IDF

In [67]:
docs = ['오늘 동물원에서 코끼리와 동물원을 봤어', '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [68]:
doc_ls = []
for doc in docs: 
    doc_ls.append(doc.split())
doc_ls

[['오늘', '동물원에서', '코끼리와', '동물원을', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

### TF 

In [69]:
from collections import defaultdict 
word_dic = defaultdict(lambda: len(word_dic))
word_dic

defaultdict(<function __main__.<lambda>()>, {})

In [70]:
for doc in doc_ls:
    for token in doc: 
        word_dic[token]
word_dic

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '코끼리와': 2,
             '동물원을': 3,
             '봤어': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [71]:
import numpy as np
TDM = np.zeros((len(doc_ls), len(word_dic)), dtype = int)
print(TDM)
for i, doc in enumerate(doc_ls): 
    for token in doc:
        TDM[i, word_dic[token] ] += 1 # 해당 토큰의 위치
TDM # 행렬로 표시됨 

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [72]:
TDM[0].sum()

5

In [73]:
def computeTF(TDM): 
    doc_len = len(TDM) # 문서 갯수 2개 
    word_len = len(TDM[0]) # 토큰의 갯수 8개 --> 개수 구하려면 len() 이용, not count()
    #tf를 계산하기 전 0으로 셋팅 
    tf = np.zeros((doc_len, word_len))
    #print(tf)
    #TF 계산 특정단어등장빈도/문서내 전체등장단어빈도 
    for doc_i in range(doc_len) :
        for word_i in range(word_len) :
            tf[doc_i, word_i] = TDM[doc_i, word_i] / TDM[doc_i].sum() ##다시 이해필요
    return tf

### IDF

In [75]:
import math 
#IDF 계산 : log(총문서수 / 단어가 등장한 문서수)
def computeIDF(TDM) : 
    doc_len = len(TDM)
    word_len = len(TDM[0])
    
    idf = np.zeros(word_len)
    
    for i in range(word_len):
        idf[i] = math.log10(doc_len / np.count_nonzero(TDM[ :, i]))    #이 부분 설명 다시 필요! 
    return idf

In [76]:
computeIDF(TDM)

array([0.30103, 0.     , 0.30103, 0.30103, 0.30103, 0.30103, 0.30103,
       0.30103])

### TF X IDF 

In [83]:
def computeTFIDF(TDM):
    tf = computeTF(TDM)
    idf = computeIDF(TDM)
    tfidf = np.zeros(tf.shape)
    for doc_i in range(tf.shape[0]) : 
        for word_i in range(tf.shape[1]):
            tfidf[doc_i, word_i] = tf[doc_i, word_i] * idf[word_i]
    return tfidf

In [84]:
computeTFIDF(TDM)

array([[0.060206, 0.      , 0.060206, 0.060206, 0.060206, 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.060206,
        0.120412, 0.060206]])

In [85]:
import pandas as pd 
sorted_vocab = sorted((value, key) for key, value in word_dic.items())
print(sorted_vocab)
vocab = [v[1] for v in sorted_vocab]
print(vocab)
tfidf = computeTFIDF(TDM)
pd.DataFrame(tfidf, columns = vocab)

[(0, '오늘'), (1, '동물원에서'), (2, '코끼리와'), (3, '동물원을'), (4, '봤어'), (5, '원숭이에게'), (6, '바나나를'), (7, '줬어')]
['오늘', '동물원에서', '코끼리와', '동물원을', '봤어', '원숭이에게', '바나나를', '줬어']


Unnamed: 0,오늘,동물원에서,코끼리와,동물원을,봤어,원숭이에게,바나나를,줬어
0,0.060206,0.0,0.060206,0.060206,0.060206,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.060206,0.120412,0.060206


## sklearn

In [87]:
docs = ['오늘 동물원에서 코끼리와 동물원을 봤어', '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [89]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidv = TfidfVectorizer()
tfidv = tfidv.fit(docs)
tfidv.transform(docs).toarray()
vocab = tfidv.get_feature_names()

In [90]:
vocab

['동물원에서', '동물원을', '바나나를', '봤어', '오늘', '원숭이에게', '줬어', '코끼리와']

In [92]:
tfidv.transform(docs).toarray()[0]

array([0.33517574, 0.47107781, 0.        , 0.47107781, 0.47107781,
       0.        , 0.        , 0.47107781])

In [93]:
import pandas as pd 
df = pd.DataFrame(tfidv.transform(docs).toarray(), columns = vocab)
df

Unnamed: 0,동물원에서,동물원을,바나나를,봤어,오늘,원숭이에게,줬어,코끼리와
0,0.335176,0.471078,0.0,0.471078,0.471078,0.0,0.0,0.471078
1,0.278943,0.0,0.784088,0.0,0.0,0.392044,0.392044,0.0


## gensim

In [63]:
docs = ['오늘 동물원에서 원숭이와 코끼리를 봤어', '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [64]:
import gensim 
from gensim import corpora 
from gensim.models import TfidfModel

docs_ls = [doc.split() for doc in docs] # 공백으로 토큰화 
id2word = corpora.Dictionary(doc_ls)
TDM = [id2word.doc2bow(doc) for doc in doc_ls]
model = TfidfModel(TDM)
tfidf = model[TDM]
tfidf[0]

[(1, 0.5), (2, 0.5), (3, 0.5), (4, 0.5)]

In [65]:
import pandas as pd 
from gensim.matutils import sparse2full

vocab = [id2word[i] for i in id2word.keys()]
TDM_matrix = [sparse2full(doc, len(vocab)).tolist( ) for doc in tfidf]
pd.DataFrame(TDM_matrix, columns=vocab)

Unnamed: 0,동물원에서,동물원을,봤어,오늘,코끼리와,바나나를,원숭이에게,줬어
0,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.816497,0.408248,0.408248


In [66]:
tfidf[1]

[(5, 0.8164965809277261), (6, 0.4082482904638631), (7, 0.4082482904638631)]