## TDM 직접구현

In [5]:
from collections import defaultdict
import numpy as np

# Toy corpus: four short documents whose tokens are separated by spaces.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [6]:
# Tokenize every document on single spaces. The bare name on the last
# line makes the notebook display the resulting list of token lists.
doc_ls = [doc.split(' ') for doc in docs]
doc_ls

[['동물원', '코끼리'],
 ['동물원', '원숭이', '바나나'],
 ['엄마', '코끼리', '아기', '코끼리'],
 ['원숭이', '바나나', '코끼리', '바나나']]

In [7]:
# Looking up an unseen token triggers the default factory, which returns
# the current dictionary size, so tokens receive ids 0, 1, 2, ... in
# order of first appearance.
word2id = defaultdict(lambda: len(word2id))

for token in (tok for tokens in doc_ls for tok in tokens):
    word2id[token]  # lookup is done only for its side effect: registering the token
word2id

defaultdict(<function __main__.<lambda>()>,
            {'동물원': 0, '코끼리': 1, '원숭이': 2, '바나나': 3, '엄마': 4, '아기': 5})

In [8]:
# Term-document matrix: one row per word id, one column per document.
TDM = np.zeros(shape=(len(word2id), len(doc_ls)), dtype=int)

for doc_idx, tokens in enumerate(doc_ls):
    for tok in tokens:
        row = word2id[tok]
        TDM[row, doc_idx] += 1  # count one occurrence of this word in this document
TDM

array([[1, 1, 0, 0],
       [1, 0, 2, 1],
       [0, 1, 0, 1],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [9]:
import pandas as pd
# Column labels: one per document ('문서0', '문서1', ...).
doc_names = [f'문서{i}' for i in range(len(doc_ls))]
print('doc_names', doc_names)

# (id, word) pairs sorted by id, so the words line up with the rows of TDM.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
vocab = [word for _, word in sorted_vocab]

df_TDM = pd.DataFrame(TDM, columns=doc_names)
df_TDM['단어'] = vocab
# set_index returns a new frame, which is what the notebook displays;
# df_TDM itself keeps '단어' as a regular column.
df_TDM.set_index('단어')

doc_names ['문서0', '문서1', '문서2', '문서3']


Unnamed: 0_level_0,문서0,문서1,문서2,문서3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
코끼리,1,0,2,1
원숭이,0,1,0,1
바나나,0,1,0,2
엄마,0,0,1,0
아기,0,0,1,0


## sklearn으로 구현

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

## Fit the vectorizer on the corpus and build the count matrix in one step.
count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs) ## sklearn produces a document-term matrix (DTM),
                                     ## so transpose it to use it as a TDM
DTM.toarray()

array([[1, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 2],
       [0, 2, 0, 0, 1, 1]], dtype=int64)

In [11]:
DTM.toarray().T ## sklearn produces a document-term matrix (DTM),
                ## so transpose it to use it as a TDM

array([[1, 1, 0, 0],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 1],
       [1, 0, 2, 1]], dtype=int64)

## gensim으로 구현

In [12]:
# Same toy corpus as above, re-declared so the gensim section is self-contained.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [13]:
import gensim
from gensim import corpora

# Tokenize on whitespace and let gensim build the token <-> id dictionary.
doc_ls = list(map(str.split, docs))
id2word = corpora.Dictionary(doc_ls)

# Bag-of-words per document: a list of (token_id, count) pairs.
TDM = list(map(id2word.doc2bow, doc_ls))
TDM ## entries with a zero count are not shown at all

[[(0, 1), (1, 1)],
 [(0, 1), (2, 1), (3, 1)],
 [(1, 2), (4, 1), (5, 1)],
 [(1, 1), (2, 2), (3, 1)]]

In [14]:
from gensim.matutils import sparse2full

doc_names = [f'문서{i}' for i in range(len(doc_ls))]
vocab = [id2word[token_id] for token_id in id2word.keys()]

# Expand each sparse bag-of-words into a dense row with one slot per word.
DTM_matrix = []
for bow in TDM:
    DTM_matrix.append(sparse2full(bow, len(vocab)).tolist())

# Rows are documents at this point; transpose so that rows become words.
df_TDM = pd.DataFrame(np.array(DTM_matrix, dtype=int).T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

Unnamed: 0_level_0,문서0,문서1,문서2,문서3
단어,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
동물원,1,1,0,0
코끼리,1,0,2,1
바나나,0,1,0,2
원숭이,0,1,0,1
아기,0,0,1,0
엄마,0,0,1,0


## TF-IDF

In [15]:
# Two-sentence corpus used for the TF-IDF walkthrough.
docs = [
    '오늘 동물원에서 원숭이와 코끼리를 봤어',
    '동물원에서 원숭이에게 바나나를 줬어 바나나를',
]

In [20]:
# Re-tokenize first: `docs` was replaced with the TF-IDF corpus, so the
# token lists must be rebuilt before ids are assigned. Without this the
# loop below would still walk the token lists of the previous corpus.
doc_ls = [doc.split() for doc in docs]

# Looking up an unseen token triggers the default factory, which returns
# the current dictionary size, so ids follow order of first appearance.
word2id  = defaultdict(lambda : len(word2id))

for doc in doc_ls :
    for token in doc :
        word2id[token]  # lookup is done only for its side effect: registering the token
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이와': 2,
             '코끼리를': 3,
             '봤어': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

In [21]:
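## TF : (count of a given word in a document) / (total number of tokens in that document)
## IDF : log10(total number of documents / number of documents that contain the word)
## TF-IDF : TF * IDF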

In [26]:
doc_count = len(docs)

# Build one count vector per document; position k holds the number of
# times the word with id k occurs in that document.
Bow_ls = []
for tokens in doc_ls:
    counts = np.zeros(len(word2id), dtype=int)
    for tok in tokens:
        counts[word2id[tok]] += 1
    Bow_ls.append(counts.tolist())
Bow_ls ## per-document token frequencies

[[1, 1, 1, 1, 1, 0, 0, 0], [0, 1, 0, 0, 0, 1, 2, 1]]

In [35]:
# Term-frequency table for document 0: one row per word id.
df1 = pd.DataFrame(Bow_ls[0])
df1 = df1.rename({0:'문서내토큰빈도'}, axis = 'columns')
df1['문서내전체토큰빈도'] = len(doc_ls[0])

# TF = count of the word / total tokens in the document, computed for
# the whole column at once. The previous row loop was bounded by
# len(df), a different frame that is not defined at this point.
TF = (df1['문서내토큰빈도'] / df1['문서내전체토큰빈도']).tolist()
df1['TF'] = TF
df1

Unnamed: 0,문서내토큰빈도,문서내전체토큰빈도,TF
0,1,5,0.2
1,1,5,0.2
2,1,5,0.2
3,1,5,0.2
4,1,5,0.2
5,0,5,0.0
6,0,5,0.0
7,0,5,0.0


In [37]:
# Term-frequency table for document 1: one row per word id.
df2 = pd.DataFrame(Bow_ls[1])
df2 = df2.rename({0:'문서내토큰빈도'}, axis = 'columns')
df2['문서내전체토큰빈도'] = len(doc_ls[1])

# TF = count of the word / total tokens in the document, computed for
# the whole column at once. The previous row loop was bounded by
# len(df), a different frame that is not defined at this point.
TF = (df2['문서내토큰빈도'] / df2['문서내전체토큰빈도']).tolist()
df2['TF'] = TF
df2

Unnamed: 0,문서내토큰빈도,문서내전체토큰빈도,TF
0,0,5,0.0
1,1,5,0.2
2,0,5,0.0
3,0,5,0.0
4,0,5,0.0
5,1,5,0.2
6,2,5,0.4
7,1,5,0.2


In [44]:
## Reference solution: count matrix with one row per document and one
## column per word id (document-major, unlike the earlier TDM).
TDM = np.zeros(shape=(len(doc_ls), len(word2id)), dtype=int)
print(TDM)

for row, tokens in enumerate(doc_ls):
    for tok in tokens:
        TDM[row, word2id[tok]] += 1
TDM

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [45]:
## TF구하기 
def computeTF(TDM) :
    """Return per-document term frequencies for a count matrix.

    Args:
        TDM: 2-D array-like of counts indexed as ``[document, word]``
            (a NumPy array or a nested list).

    Returns:
        Float ndarray of the same shape in which every entry is the word
        count divided by the total count of its document row. A row whose
        total is zero (an empty document) stays all zeros instead of
        turning into NaN.
    """
    counts = np.asarray(TDM)
    doc_len, word_len = counts.shape

    tf = np.zeros((doc_len, word_len))
    print(tf)  # kept from the original: echoes the zero-initialised buffer

    # Row totals keep a trailing axis so they broadcast across word columns.
    totals = counts.sum(axis=1, keepdims=True)
    # Divide only where the document has tokens; elsewhere tf keeps its zeros.
    np.divide(counts, totals, out=tf, where=totals > 0)
    return tf

computeTF(TDM)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

In [48]:
import math

## IDF구하기
def computeIDF(TDM) :
    """Return the inverse document frequency of every word column.

    Args:
        TDM: 2-D array-like of counts indexed as ``[document, word]``
            (a NumPy array or a nested list).

    Returns:
        1-D float ndarray with one value per word:
        ``log10(number of documents / number of documents containing the word)``.
        A word that occurs in no document gets 0.0 instead of raising a
        division-by-zero error.
    """
    counts = np.asarray(TDM)
    doc_len = counts.shape[0]

    # Document frequency: how many rows have a non-zero count for each word.
    doc_freq = np.count_nonzero(counts, axis=0)

    idf = np.zeros(counts.shape[1])
    seen = doc_freq > 0  # only these columns have a well-defined ratio
    idf[seen] = np.log10(doc_len / doc_freq[seen])
    return idf

computeIDF(TDM)

array([0.30103, 0.     , 0.30103, 0.30103, 0.30103, 0.30103, 0.30103,
       0.30103])

In [49]:
def computeTFIDF(TDM) :
    """Return the TF-IDF matrix for a ``[document, word]`` count matrix.

    Every term frequency from ``computeTF`` is multiplied by the inverse
    document frequency of its word from ``computeIDF``; the result has the
    same shape as the term-frequency matrix.
    """
    tf = computeTF(TDM)
    idf = computeIDF(TDM)
    # idf holds one weight per word column, so broadcasting scales each
    # document row of tf by the matching word weight.
    tfidf = tf * idf
    return tfidf

computeTFIDF(TDM)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


array([[0.060206, 0.      , 0.060206, 0.060206, 0.060206, 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.060206,
        0.120412, 0.060206]])

In [52]:
# Wrap the TF-IDF matrix in a DataFrame and display it transposed, so
# that rows are word ids and columns are documents.
df = pd.DataFrame(computeTFIDF(TDM))
df.T

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


Unnamed: 0,0,1
0,0.060206,0.0
1,0.0,0.0
2,0.060206,0.0
3,0.060206,0.0
4,0.060206,0.0
5,0.0,0.060206
6,0.0,0.120412
7,0.0,0.060206


## sklearn으로 구현하기

In [53]:
docs = ['오늘 동물원에서 원숭이와 코끼리를 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

from sklearn.feature_extraction.text import TfidfVectorizer

# fit() returns the fitted vectorizer itself, so `tfidf` is the estimator.
tfidf = TfidfVectorizer().fit(docs)

# get_feature_names() was removed in scikit-learn 1.2; prefer the
# replacement and fall back only on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = tfidf.get_feature_names_out().tolist()  # plain list, as before
else:
    vocab = tfidf.get_feature_names()

In [54]:
vocab

['동물원에서', '바나나를', '봤어', '오늘', '원숭이에게', '원숭이와', '줬어', '코끼리를']

In [59]:
# Dense TF-IDF weights labelled with the vectorizer's vocabulary;
# displayed transposed so that rows are words and columns are documents.
df = pd.DataFrame(tfidf.transform(docs).toarray(), columns=vocab)
df.T

Unnamed: 0,0,1
동물원에서,0.335176,0.278943
바나나를,0.0,0.784088
봤어,0.471078,0.0
오늘,0.471078,0.0
원숭이에게,0.0,0.392044
원숭이와,0.471078,0.0
줬어,0.0,0.392044
코끼리를,0.471078,0.0


## gensim으로 구현하기

In [60]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

# Tokenize on whitespace and build the gensim token <-> id dictionary.
doc_ls = list(map(str.split, docs))
id2word = corpora.Dictionary(doc_ls)

# Bag-of-words corpus: one list of (token_id, count) pairs per document.
TDM = list(map(id2word.doc2bow, doc_ls))

# Fit the TF-IDF model on the corpus, then apply it to the same corpus.
model = TfidfModel(TDM)
tfidf = model[TDM]
tfidf[0]

[(1, 0.5), (2, 0.5), (3, 0.5), (4, 0.5)]

In [62]:
from gensim.matutils import sparse2full

# Vocabulary in gensim id order. It is used for both the column labels
# and the dense row width, so the two can never disagree (the width was
# previously taken from a vocabulary built by a different cell).
vocab = [id2word[i] for i in id2word.keys()]

# Expand each sparse TF-IDF document into a dense row, one slot per word.
TDM_matrix = [sparse2full(doc, len(vocab)).tolist() for doc in tfidf]
pd.DataFrame(TDM_matrix, columns=vocab)

Unnamed: 0,동물원에서,봤어,오늘,원숭이와,코끼리를,바나나를,원숭이에게,줬어
0,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.816497,0.408248,0.408248
