# TDM 직접구현

In [30]:
# Toy corpus: every string is one document made of space-separated words.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [31]:
# Tokenize each document on whitespace. A bare split() (rather than
# split(' ')) collapses repeated spaces and ignores leading/trailing ones,
# so no empty-string tokens can end up in the vocabulary.
doc_ls = [doc.split() for doc in docs]
doc_ls

[['동물원', '코끼리'],
 ['동물원', '원숭이', '바나나'],
 ['엄마', '코끼리', '아기', '코끼리'],
 ['원숭이', '바나나', '코끼리', '바나나']]

In [32]:
from collections import defaultdict

# Vocabulary map: looking up an unseen word inserts it with the current size
# of the map as its id, so ids are handed out as 0, 1, 2, ... in first-seen
# order.
word2id = defaultdict(lambda: len(word2id))
word2id

defaultdict(<function __main__.<lambda>()>, {})

In [33]:
# Touch every token once: the lookup alone registers unseen tokens in
# word2id (the returned id is deliberately discarded here).
for tokens in doc_ls:
    for word in tokens:
        word2id[word]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'동물원': 0, '코끼리': 1, '원숭이': 2, '바나나': 3, '엄마': 4, '아기': 5})

In [41]:
import numpy as np

# One row per vocabulary word, one column per document, all counts zero.
TDM = np.zeros((len(word2id), len(doc_ls)), dtype=int)
print(TDM)
for doc_idx, tokens in enumerate(doc_ls):
    for word in tokens:
        row = word2id[word]  # row index of this word; the column is the document
        TDM[row, doc_idx] += 1
        print(row)
# Shown as a matrix (difference from BoW: a BoW is a 1-D array).
TDM

[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]
0
1
0
2
3
4
1
5
1
2
3
1
3


array([[1, 1, 0, 0],
       [1, 0, 2, 1],
       [0, 1, 0, 1],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [43]:
from IPython.core import display as ICD
import pandas as pd

# (id, word) pairs ordered by id, so the vocabulary lines up with TDM's rows.
sorted_vocab = sorted((idx, word) for word, idx in word2id.items())
print('sorted_vocab',sorted_vocab)

vocab = [word for _, word in sorted_vocab]
print('vocab',vocab)

doc_names = ['문서'+str(n) for n in range(len(doc_ls))]
print(doc_names)

# Label the columns with the document names and show the words as the index
# of the displayed frame (df_TDM itself keeps '단어' as a regular column).
df_TDM = pd.DataFrame(TDM, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

sorted_vocab [(0, '동물원'), (1, '코끼리'), (2, '원숭이'), (3, '바나나'), (4, '엄마'), (5, '아기')]
vocab ['동물원', '코끼리', '원숭이', '바나나', '엄마', '아기']
['문서0', '문서1', '문서2', '문서3']


단어,문서0,문서1,문서2,문서3
동물원,1,1,0,0
코끼리,1,0,2,1
원숭이,0,1,0,1
바나나,0,1,0,2
엄마,0,0,1,0
아기,0,0,1,0


# sklearn으로 TDM 구현

In [39]:
# Same toy corpus, redefined so this section can run on its own.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [44]:
from sklearn.feature_extraction.text import CountVectorizer
# Note: sklearn is set up to produce a DTM (document-term matrix), i.e. one
# row per document and one column per vocabulary term.
# NOTE(review): the vectorizer runs with its default tokenization settings;
# confirm they are appropriate for Korean text (e.g. very short tokens).
count_vect = CountVectorizer()
DTM = count_vect.fit_transform(docs)
DTM.toarray()

array([[1, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 2],
       [0, 2, 0, 0, 1, 1]], dtype=int64)

In [45]:
# Turn the DTM into a TDM: transposing the dense array puts terms in rows
# and documents in columns.
DTM.toarray().T

array([[1, 1, 0, 0],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 1],
       [1, 0, 2, 1]], dtype=int64)

In [46]:
import pandas as pd

# Derive the column labels from this section's own corpus instead of doc_ls,
# which is a leftover from the manual-TDM section and is not defined here.
doc_names = ['문서'+str(i) for i in range(len(docs))]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); .tolist() keeps vocab a plain list of str so
# the printed output is unchanged. Fall back only on older releases.
if hasattr(count_vect, 'get_feature_names_out'):
    vocab = count_vect.get_feature_names_out().tolist()
else:
    vocab = count_vect.get_feature_names()
print(vocab)

# Transposing the document-term counts puts terms in rows and documents in
# columns; the displayed frame uses the words as its index.
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

['동물원', '바나나', '아기', '엄마', '원숭이', '코끼리']


단어,문서0,문서1,문서2,문서3
동물원,1,1,0,0
바나나,0,1,0,2
아기,0,0,1,0
엄마,0,0,1,0
원숭이,0,1,0,1
코끼리,1,0,2,1


# Gensim으로 TDM 구현

In [47]:
# Same toy corpus, redefined so this section can run on its own.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [48]:
import gensim
from gensim import corpora

# Tokenize on whitespace.
doc_ls = [text.split() for text in docs]

# Build the token <-> integer id dictionary from the tokenized documents.
id2word = corpora.Dictionary(doc_ls)

# One sparse bag-of-words per document: a list of (token_id, count) pairs.
# NOTE(review): despite its name this list is document-major; it is only
# turned into a term-by-document table when it is densified and transposed.
TDM = [id2word.doc2bow(tokens) for tokens in doc_ls]
TDM

[[(0, 1), (1, 1)],
 [(0, 1), (2, 1), (3, 1)],
 [(1, 2), (4, 1), (5, 1)],
 [(1, 1), (2, 2), (3, 1)]]

In [50]:
from gensim.matutils import sparse2full
import pandas as pd
import numpy as np

doc_names = ['문서'+str(n) for n in range(len(doc_ls))]
vocab = [id2word[token_id] for token_id in id2word.keys()]
n_terms = len(vocab)

# Expand every sparse (token_id, count) list into a dense row of length
# n_terms, giving one row per document.
DTM_matrix = [sparse2full(bow, n_terms).tolist() for bow in TDM]

# Cast the counts to int and transpose so terms become rows and documents
# become columns.
term_by_doc = np.array(DTM_matrix, dtype=int).T
df_TDM = pd.DataFrame(term_by_doc, columns=doc_names)
df_TDM['단어'] = vocab
df_TDM.set_index('단어')

단어,문서0,문서1,문서2,문서3
동물원,1,1,0,0
코끼리,1,0,2,1
바나나,0,1,0,2
원숭이,0,1,0,1
아기,0,0,1,0
엄마,0,0,1,0
