In [None]:
import pandas as pd # 데이터프레임 사용을 위해
from math import log # IDF 계산을 위해

In [7]:
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나 바나나',
    '저는 과일이 좋아요',
]

# Build the vocabulary:
#   1. split every document on whitespace,
#   2. gather the tokens into a set so each distinct word is kept once,
#   3. return them as a list sorted by string (Unicode code point) order.
vocab = sorted({word for document in docs for word in document.split()})

In [10]:
# Display the sorted vocabulary list as the cell output.
vocab

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

## TF, IDF, TF-IDF 값을 구하는 함수

In [12]:
# Total number of documents in the corpus
N = len(docs) 

# In the functions below, t is a term (the word string itself) and
# d is a document (the document string itself) -- they are not index numbers.

def tf(t, d):
    """Return the term frequency of term ``t`` in document ``d``.

    The document is tokenised on whitespace and only tokens that are exactly
    equal to ``t`` are counted. A plain ``d.count(t)`` is a substring search
    and would also count ``t`` when it merely occurs inside a longer word
    (e.g. "app" inside "apple"), which disagrees with how the vocabulary is
    built (whitespace tokens).

    Args:
        t: The term to count (a word string).
        d: The document (a whitespace-separated string).

    Returns:
        int: The number of whitespace-separated tokens in ``d`` equal to ``t``.
    """
    return d.split().count(t)

def idf(t, corpus=None):
    """Return the smoothed inverse document frequency of term ``t``.

    Computes ``log(n / (df + 1))`` where ``n`` is the number of documents and
    ``df`` is the number of documents that contain ``t`` as a whole
    whitespace-separated token. Membership is tested on the token list rather
    than on the raw string, because ``t in doc`` on a string is a substring
    test and would count a document in which ``t`` only appears inside a
    longer word.

    Args:
        t: The term (a word string).
        corpus: Optional iterable of document strings. When omitted, the
            notebook-level ``docs`` list is used, so existing ``idf(t)``
            calls keep working.

    Returns:
        float: The IDF value. Because of the ``+ 1`` in the denominator the
        value is 0 when ``df == n - 1`` and negative when every document
        contains the term.

    Raises:
        ValueError: If the corpus is empty (``log(0)`` is undefined).
    """
    if corpus is None:
        corpus = docs
    corpus = list(corpus)
    # Each document contributes at most 1, no matter how often the term repeats.
    df = sum(1 for doc in corpus if t in doc.split())
    return log(len(corpus) / (df + 1))

def tfidf(t, d):
    """Return the TF-IDF weight of term ``t`` in document ``d``.

    This is the product of ``tf(t, d)`` and ``idf(t)``.

    Args:
        t: The term (a word string).
        d: The document (a string).

    Returns:
        The term frequency multiplied by the inverse document frequency.
    """
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

## TF 구하기 ; DTM을 데이터프레임에 저장하여 출력하기

In [21]:
# Document-term matrix: one row per document (first N entries of docs),
# one column per vocabulary term, each cell holding tf(term, document).
result = [
    [tf(term, docs[doc_idx]) for term in vocab]
    for doc_idx in range(N)
]

tf_ = pd.DataFrame(result, columns=vocab)
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [5]:
# One IDF value per vocabulary term, in vocabulary order.
result = [idf(term) for term in vocab]

# Single-column table indexed by the term itself.
idf_ = pd.DataFrame(result, index=vocab, columns=["IDF"])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [6]:
# TF-IDF matrix: same layout as the document-term matrix (rows = documents,
# columns = vocabulary terms), with each cell holding tfidf(term, document).
result = [
    [tfidf(term, docs[doc_idx]) for term in vocab]
    for doc_idx in range(N)
]

tfidf_ = pd.DataFrame(result, columns=vocab)
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147
