*tool - jupyter notebook (python=3.6)

# 단어의 표현 (Word Representation)
- 단어의 표현: 문자를 수치화하는 방법으로 자연어 처리에 필수
- Local Representation (통계 기반): 단어만을 보고 수치화
    + One-hot Encoding
    + N-gram
    + Count Based (BoW, TDM, TF-IDF)
- Distributed Representation (딥러닝 기반): 주변단어 참고하여 수치화
    + Prediction Based (Word2Vec, FastText)
    + Count Based (Window-GloVe, Full Document-LSA)

## TF-IDF 
### (Term Frequency-Inverse Document Frequency)
- 단어와 문서의 관련성을 평가하는 방법
- TF * IDF
    + TF (단어빈도) : 특정 단어의 등장 횟수 / 문서 내 전체 단어의 등장 횟수
    + IDF (역문서빈도) : log(총 문서 수 / 단어가 등장한 문서 수) — 아래 직접 구현에서는 상용로그(log10)를 사용
- 구현 방법
    + 1. 직접 구현하기
    + 2. sklearn : 통째로 (기본값이 smoothed IDF + L2 정규화이므로 직접 구현한 결과와 수치가 다름)
    + 3. gensim : 쪼개서 (기본값이 log2 IDF + L2 정규화이므로 역시 직접 구현한 결과와 수치가 다름)
- 실제로는 공백 토큰화 X, 형태소 분석기로 정제한 후에 docs에 넣어주기

## 1. 직접 구현하기
- 토큰 index 생성 > TF 구하기 > IDF 구하기 > TF X IDF

In [1]:
# Toy corpus: one string per document (two short Korean sentences).
docs = ['오늘 동물원에서 원숭이와 코끼리를 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [2]:
# Whitespace tokenisation: every document becomes a list of its tokens.
doc_ls = [doc.split() for doc in docs]
doc_ls

[['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

In [3]:
from collections import defaultdict
# Token -> integer index mapping.
# The default factory returns the current size of the dict, so the first lookup
# of an unseen token stores the next free index (0, 1, 2, ...).
word2id = defaultdict(lambda : len(word2id))
for doc in doc_ls:
    for token in doc:
        word2id[token]        # a bare lookup is enough: a missing key gets its index assigned
# NOTE: any later lookup of an unknown token also inserts it into the mapping.
word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이와': 2,
             '코끼리를': 3,
             '봤어': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

### 1) TF 계산

In [4]:
import numpy as np

# Document-term matrix: one row per document, one column per vocabulary index.
n_docs, n_terms = len(doc_ls), len(word2id)
DTM = np.zeros(shape=(n_docs, n_terms), dtype=int)
print(DTM)  # still all zeros at this point
for i, doc in enumerate(doc_ls):
    counts = DTM[i]  # row view: increments are written straight into DTM
    for token in doc:
        counts[word2id[token]] += 1
DTM

[[0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0]]


array([[1, 1, 1, 1, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [5]:
def computeTF(DTM):
    """Compute the term-frequency (TF) matrix of a document-term count matrix.

    The TF of a word in a document is its count divided by the total number
    of tokens in that document.

    Args:
        DTM: 2-D array-like of shape (documents, vocabulary) holding raw
            token counts.

    Returns:
        Float ndarray of the same shape with the relative frequencies.
        A document without any tokens (all-zero row) yields an all-zero row
        rather than NaN.
    """
    counts = np.asarray(DTM)
    # Start from zeros; the print is kept so the tutorial still shows the
    # buffer before it is filled.
    tf = np.zeros(counts.shape)
    print(tf)
    # Token total per document, kept as a column so it broadcasts across the row.
    doc_totals = counts.sum(axis=1, keepdims=True)
    # Divide only where the total is non-zero; skipped cells keep the 0 in `tf`.
    np.divide(counts, doc_totals, out=tf, where=doc_totals != 0)
    return tf

In [6]:
# Term frequencies of the toy corpus (the function also prints its zero-initialised buffer).
computeTF(DTM)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


array([[0.2, 0.2, 0.2, 0.2, 0.2, 0. , 0. , 0. ],
       [0. , 0.2, 0. , 0. , 0. , 0.2, 0.4, 0.2]])

### 2) IDF 계산

In [7]:
import math
# IDF 계산 : log(총문서수/단어가등장한문서수)
def computeIDF(DTM):
    """Compute the inverse document frequency (IDF) of every vocabulary term.

    IDF is log10(number of documents / number of documents containing the
    term).

    Args:
        DTM: 2-D array-like of shape (documents, vocabulary) holding raw
            token counts.

    Returns:
        1-D float ndarray with one IDF value per vocabulary column. A term
        that occurs in no document gets 0.0 instead of triggering a division
        by zero (its TF is 0 everywhere, so its TF-IDF is 0 either way).
    """
    counts = np.asarray(DTM)
    doc_len = counts.shape[0]
    # Document frequency: in how many documents each term occurs at least once.
    doc_freq = np.count_nonzero(counts, axis=0)
    idf = np.zeros(counts.shape[1])
    seen = doc_freq > 0
    idf[seen] = np.log10(doc_len / doc_freq[seen])
    return idf

In [8]:
# np.count_nonzero(DTM[:,1])
# takes one column of the matrix and counts how many of its entries are non-zero,
# i.e. in how many documents that term occurs
print(DTM)
print(DTM[:,0],DTM[:,1],DTM[:,2])
print('-'*10)
print("DTM[:,0]:", DTM[:,0], ", nonzero:", np.count_nonzero(DTM[:,0]))

[[1 1 1 1 1 0 0 0]
 [0 1 0 0 0 1 2 1]]
[1 0] [1 1] [1 0]
----------
DTM[:,0]: [1 0] , nonzero: 1


In [9]:
# IDF of every vocabulary term in the toy corpus.
computeIDF(DTM)

array([0.30103, 0.     , 0.30103, 0.30103, 0.30103, 0.30103, 0.30103,
       0.30103])

### 3) TF-IDF 계산

In [10]:
# TF-IDF 곱
def computeTFIDF(DTM):
    """Return the TF-IDF matrix for a document-term count matrix.

    Each cell is the term frequency of that cell multiplied by the inverse
    document frequency of its column.

    Args:
        DTM: document-term count matrix, passed unchanged to computeTF and
            computeIDF.

    Returns:
        ndarray with the same shape as the TF matrix.
    """
    term_freq = computeTF(DTM)
    inv_doc_freq = computeIDF(DTM)
    # Broadcasting scales column j of the TF matrix by idf[j].
    return term_freq * inv_doc_freq

In [11]:
# TF-IDF matrix of the toy corpus (computeTF prints its zero-initialised buffer on the way).
computeTFIDF(DTM)

[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


array([[0.060206, 0.      , 0.060206, 0.060206, 0.060206, 0.      ,
        0.      , 0.      ],
       [0.      , 0.      , 0.      , 0.      , 0.      , 0.060206,
        0.120412, 0.060206]])

In [12]:
# DataFrame화 및 시각화
import pandas as pd

# Pair every index with its token and order the pairs by index, so the
# column labels line up with the columns of the matrix.
sorted_vocab = sorted(zip(word2id.values(), word2id.keys()))
print(sorted_vocab)    # list of (index, token) tuples
vocab = [token for _, token in sorted_vocab]
print(vocab)
tfidf = computeTFIDF(DTM)
pd.DataFrame(data=tfidf, columns=vocab)

[(0, '오늘'), (1, '동물원에서'), (2, '원숭이와'), (3, '코끼리를'), (4, '봤어'), (5, '원숭이에게'), (6, '바나나를'), (7, '줬어')]
['오늘', '동물원에서', '원숭이와', '코끼리를', '봤어', '원숭이에게', '바나나를', '줬어']
[[0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0.]]


Unnamed: 0,오늘,동물원에서,원숭이와,코끼리를,봤어,원숭이에게,바나나를,줬어
0,0.060206,0.0,0.060206,0.060206,0.060206,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.060206,0.120412,0.060206


## 2. sklearn

In [13]:
# Same toy corpus as in the manual implementation.
docs = ['오늘 동물원에서 원숭이와 코끼리를 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

from sklearn.feature_extraction.text import TfidfVectorizer
# fit() returns the vectorizer itself, so construction and fitting can be chained.
tfidv = TfidfVectorizer().fit(docs)
# Dense (documents x vocabulary) TF-IDF matrix. With the default settings the
# IDF is smoothed and every row is L2-normalised, so the numbers differ from
# the hand-written log10 version.
tfidv.transform(docs).toarray()

array([[0.33517574, 0.        , 0.47107781, 0.47107781, 0.        ,
        0.47107781, 0.        , 0.47107781],
       [0.27894255, 0.78408803, 0.        , 0.        , 0.39204401,
        0.        , 0.39204401, 0.        ]])

In [14]:
# Vocabulary in column order of the TF-IDF matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old releases.
if hasattr(tfidv, "get_feature_names_out"):
    vocab = tfidv.get_feature_names_out().tolist()  # ndarray -> plain list of str
else:
    vocab = tfidv.get_feature_names()
vocab

['동물원에서', '바나나를', '봤어', '오늘', '원숭이에게', '원숭이와', '줬어', '코끼리를']

In [15]:
# TF-IDF vector of the first document only.
tfidv.transform(docs).toarray()[0]

array([0.33517574, 0.        , 0.47107781, 0.47107781, 0.        ,
       0.47107781, 0.        , 0.47107781])

In [16]:
import pandas as pd
# Label the dense sklearn TF-IDF matrix with the vocabulary for readability.
tfidf_matrix = tfidv.transform(docs).toarray()
df = pd.DataFrame(data=tfidf_matrix, columns=vocab)
df

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이에게,원숭이와,줬어,코끼리를
0,0.335176,0.0,0.471078,0.471078,0.0,0.471078,0.0,0.471078
1,0.278943,0.784088,0.0,0.0,0.392044,0.0,0.392044,0.0


## 3. gensim

In [17]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

# Same toy corpus as in the previous sections.
docs = ['오늘 동물원에서 원숭이와 코끼리를 봤어',
       '동물원에서 원숭이에게 바나나를 줬어 바나나를']

# Whitespace tokenisation.
doc_ls = list(map(str.split, docs))
# Token <-> id mapping built from the tokenised corpus.
id2word = corpora.Dictionary(doc_ls)
# Bag-of-words corpus: per document a sparse list of (token_id, count) pairs.
# NOTE: this rebinds DTM, which held a dense numpy matrix in the manual section.
DTM = list(map(id2word.doc2bow, doc_ls))
model = TfidfModel(DTM)
# Indexing the model with the corpus gives the TF-IDF weighted documents.
tfidf = model[DTM]
tfidf[0]

[(1, 0.5), (2, 0.5), (3, 0.5), (4, 0.5)]

In [18]:
import pandas as pd
from gensim.matutils import sparse2full   # 비어있는 것을 모두 0으로 채워줌

# Column labels: the token behind every id of the gensim dictionary.
vocab = [id2word[token_id] for token_id in id2word.keys()]
n_terms = len(vocab)
# Expand each sparse (token_id, weight) document into a dense row of n_terms
# values; ids that are absent from a document become 0.
DTM_matrix = [sparse2full(doc_vec, n_terms).tolist() for doc_vec in tfidf]
pd.DataFrame(data=DTM_matrix, columns=vocab)

Unnamed: 0,동물원에서,봤어,오늘,원숭이와,코끼리를,바나나를,원숭이에게,줬어
0,0.0,0.5,0.5,0.5,0.5,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.816497,0.408248,0.408248
