## TDM 직접구현

In [2]:
# Toy corpus: every string is one document, with tokens separated by spaces.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [3]:
# Tokenize every document on whitespace.
# split() with no argument collapses runs of whitespace and ignores leading or
# trailing whitespace, so no empty-string token can leak into the vocabulary
# (split(' ') would emit '' for each extra space).
doc_ls = [doc.split() for doc in docs]
doc_ls

[['동물원', '코끼리'],
 ['동물원', '원숭이', '바나나'],
 ['엄마', '코끼리', '아기', '코끼리'],
 ['원숭이', '바나나', '코끼리', '바나나']]

In [4]:
from collections import defaultdict

# Token -> integer id mapping. On a missing key the default factory returns the
# current size of the mapping, so each new token receives the next unused id
# (0, 1, 2, ...) in order of first appearance.
word2id = defaultdict(lambda: len(word2id))

# A bare lookup is enough to register a token; the returned id is not needed here.
for tok in (t for tokens in doc_ls for t in tokens):
    word2id[tok]
word2id

defaultdict(<function __main__.<lambda>()>,
            {'동물원': 0, '코끼리': 1, '원숭이': 2, '바나나': 3, '엄마': 4, '아기': 5})

In [5]:
import numpy as np

In [9]:
# Term-document matrix: one row per unique token id, one column per document.
TDM = np.zeros(shape=(len(word2id), len(doc_ls)), dtype=int)
print(TDM)

# Unlike a single bag-of-words vector (1-D), the counts are laid out as a
# matrix: row = token id, column = document index.
for col, tokens in enumerate(doc_ls):
    for tok in tokens:
        row = word2id[tok]  # row position of this token
        TDM[row, col] += 1
TDM

[[0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]
 [0 0 0 0]]


array([[1, 1, 0, 0],
       [1, 0, 2, 1],
       [0, 1, 0, 1],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0]])

In [11]:
import pandas as pd

# Column labels: one per document, numbered in corpus order.
doc_names = [f'문서{idx}' for idx, _ in enumerate(doc_ls)]
print('doc_names', doc_names)

# (id, token) pairs sorted by id, so that position r in `vocab` names the token
# stored in row r of TDM.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
vocab = [token for _, token in sorted_vocab]

df_TDM = pd.DataFrame(TDM, columns=doc_names)
df_TDM['단어'] = vocab
# set_index returns a new frame (df_TDM keeps its '단어' column); being the last
# expression of the cell, that new frame is what gets displayed.
df_TDM.set_index('단어')


doc_names ['문서0', '문서1', '문서2', '문서3']


단어,문서0,문서1,문서2,문서3
동물원,1,1,0,0
코끼리,1,0,2,1
원숭이,0,1,0,1
바나나,0,1,0,2
엄마,0,0,1,0
아기,0,0,1,0


## sklearn

In [12]:
# Same toy corpus, redefined so the sklearn section can run on its own.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [13]:
# Token frequency counting with CountVectorizer.
from sklearn.feature_extraction.text import CountVectorizer

# NOTE: with default settings CountVectorizer lowercases the text and keeps
# only tokens of two or more word characters; every token in `docs` qualifies.
count_vect = CountVectorizer()

# Learn the vocabulary, then encode the same documents as a sparse
# document-term matrix (rows = documents, columns = vocabulary terms).
count_vect.fit(docs)
DTM = count_vect.transform(docs)
DTM.toarray()

array([[1, 0, 0, 0, 0, 1],
       [1, 1, 0, 0, 1, 0],
       [0, 0, 1, 1, 0, 2],
       [0, 2, 0, 0, 1, 1]], dtype=int64)

In [15]:
# Densify the document-term matrix and transpose it, giving a term-document
# matrix: rows = terms, columns = documents.
TDM = DTM.toarray().transpose()
TDM

array([[1, 1, 0, 0],
       [0, 1, 0, 2],
       [0, 0, 1, 0],
       [0, 0, 1, 0],
       [0, 1, 0, 1],
       [1, 0, 2, 1]], dtype=int64)

In [24]:
import pandas as pd

# Column labels: one per document given to the vectorizer. They are derived
# from `docs` (the corpus this section vectorized) instead of `doc_ls`, which
# this section never defines and which only exists if another section ran first.
doc_names = [f'문서{i}' for i, _ in enumerate(docs)]
print('doc_names', doc_names)

# Row labels: the learned vocabulary in feature-index order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. The new method returns an ndarray, so
# it is converted to a plain list to keep the printed output the same.
try:
    vocab = count_vect.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0 only provides the old method
    vocab = count_vect.get_feature_names()
print(vocab)

# Transpose documents x terms into terms x documents before labelling.
df_TDM = pd.DataFrame(DTM.toarray().T, columns=doc_names)
df_TDM['단어'] = vocab
# Displayed result only: set_index returns a new frame indexed by token.
df_TDM.set_index('단어')


doc_names ['문서0', '문서1', '문서2', '문서3']
['동물원', '바나나', '아기', '엄마', '원숭이', '코끼리']


단어,문서0,문서1,문서2,문서3
동물원,1,1,0,0
바나나,0,1,0,2
아기,0,0,1,0
엄마,0,0,1,0
원숭이,0,1,0,1
코끼리,1,0,2,1


## gensim

In [25]:
# Same toy corpus, redefined so the gensim section can run on its own.
docs = [
    '동물원 코끼리',
    '동물원 원숭이 바나나',
    '엄마 코끼리 아기 코끼리',
    '원숭이 바나나 코끼리 바나나',
]

In [21]:
import gensim
from gensim import corpora

# Tokenize each document on whitespace (same result as calling doc.split()
# for every doc in a loop and appending to a list).
doc_ls = list(map(str.split, docs))
print(doc_ls)

# Build the token <-> integer id dictionary from the tokenized corpus.
id2word = corpora.Dictionary(doc_ls)
print('\n')
print(id2word)

# Encode every document as a sparse bag of words: a list of (token_id, count)
# pairs. Note that, despite the name, this holds one entry per document.
TDM = list(map(id2word.doc2bow, doc_ls))
TDM

[['동물원', '코끼리'], ['동물원', '원숭이', '바나나'], ['엄마', '코끼리', '아기', '코끼리'], ['원숭이', '바나나', '코끼리', '바나나']]


Dictionary(6 unique tokens: ['동물원', '코끼리', '바나나', '원숭이', '아기']...)


[[(0, 1), (1, 1)],
 [(0, 1), (2, 1), (3, 1)],
 [(1, 2), (4, 1), (5, 1)],
 [(1, 1), (2, 2), (3, 1)]]

In [26]:
from gensim.matutils import sparse2full
import pandas as pd
import numpy as np

# Column labels: one per document, numbered in corpus order.
doc_names = [f'문서{idx}' for idx, _ in enumerate(doc_ls)]

# Row labels: the token text for each id held by the dictionary.
vocab = [id2word[token_id] for token_id in id2word.keys()]

# Expand each sparse bag of words into a dense count vector of vocabulary
# length; the result is documents x terms.
DTM_matrix = [sparse2full(bow, len(vocab)).tolist() for bow in TDM]

# Cast the counts to int and transpose to terms x documents before labelling.
df_TDM = pd.DataFrame(
    np.array(DTM_matrix, dtype=int).transpose(),
    columns=doc_names,
)
df_TDM['단어'] = vocab
# Displayed result only: set_index returns a new frame indexed by token.
df_TDM.set_index('단어')


단어,문서0,문서1,문서2,문서3
동물원,1,1,0,0
코끼리,1,0,2,1
바나나,0,1,0,2
원숭이,0,1,0,1
아기,0,0,1,0
엄마,0,0,1,0
