In [6]:
# Build the vocabulary (dictionary)
import re
from collections import Counter

import numpy as np
from nltk import FreqDist

def buildDict(docs):
    """Tokenize documents and build a frequency-ordered vocabulary.

    Each document is lower-cased and split on runs of whitespace, commas and
    periods. Empty tokens (produced when a document starts or ends with a
    delimiter, or is empty) are discarded.

    Args:
        docs: Iterable of document strings. May be empty.

    Returns:
        A 4-tuple ``(doc_tokens, vocab, word_to_id, id_to_word)``:
          * doc_tokens: list with one list of token strings per document.
          * vocab: list of ``(word, count)`` tuples over the whole corpus,
            sorted by descending count; ties keep first-seen order.
          * word_to_id: dict mapping each word to its rank in ``vocab``.
          * id_to_word: dict mapping each rank back to its word.
    """
    # Compile once; the pattern does not depend on the document.
    delim = re.compile(r'[\s,.]+')

    # re.split yields '' at either edge when the text starts/ends with a
    # delimiter, so filter every empty token rather than only the last one.
    doc_tokens = [
        [token for token in delim.split(doc.lower()) if token]
        for doc in docs
    ]

    # Count over a flat token stream. Counter.most_common() provides the same
    # (word, count) ordering the vocabulary ids are derived from, works for an
    # empty corpus, and keeps the words as plain Python strings.
    vocab = Counter(
        token for tokens in doc_tokens for token in tokens
    ).most_common()

    word_to_id = {word: idx for idx, (word, _count) in enumerate(vocab)}
    id_to_word = {idx: word for idx, (word, _count) in enumerate(vocab)}
    return doc_tokens, vocab, word_to_id, id_to_word

# Toy corpus: four short documents used throughout the notebook.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be. I am what I am',
    'I think therefore I am. Do be do be do.',
    'Do do do da da da. Let it be let it be.',
]

# Tokenize the corpus and build the word <-> id lookup tables.
doc_tokens, vocab, word_to_id, id_to_word = buildDict(docs)

In [8]:
from collections import Counter
import math

# Build one TF vector per document: 1 + log2(count) for every word that
# occurs in the document, 0.0 for every other vocabulary slot.
tf_vectors = []
vocab_size = len(word_to_id)
for tokens in doc_tokens:
    weights = [0.0] * vocab_size  # one slot per vocabulary id
    occurrences = Counter(tokens)  # per-word frequency within this document
    for word, count in occurrences.items():
        weights[word_to_id[word]] = 1 + math.log2(count)  # log-scaled TF
    tf_vectors.append(weights)

In [9]:
import pandas as pd

# Tabulate the TF vectors: one row per document, one column per vocabulary word.
column_labels = list(id_to_word.values())
df = pd.DataFrame(tf_vectors, columns=column_labels)
print(df)

         do   be   to    i   am        da   is  let   it   or  not  what  \
0  2.000000  2.0  3.0  0.0  0.0  0.000000  2.0  0.0  0.0  0.0  0.0   0.0   
1  0.000000  2.0  2.0  2.0  2.0  0.000000  0.0  0.0  0.0  1.0  1.0   1.0   
2  2.584963  2.0  0.0  2.0  1.0  0.000000  0.0  0.0  0.0  0.0  0.0   0.0   
3  2.584963  2.0  0.0  0.0  0.0  2.584963  0.0  2.0  2.0  0.0  0.0   0.0   

   think  therefore  
0    0.0        0.0  
1    0.0        0.0  
2    1.0        1.0  
3    0.0        0.0  


In [10]:
# For every vocabulary id, count the documents whose TF entry is non-zero.
# Despite its name, `idf` holds raw document frequencies (as floats) here;
# no logarithm or inversion is applied in this cell.
idf = {}
for word_id in id_to_word:
    containing_docs = sum(1 for tf_row in tf_vectors if tf_row[word_id] > 0)
    idf[word_id] = float(containing_docs)

In [11]:
# Display the per-id document counts as a Series indexed by vocabulary id.
word_ids = list(idf.keys())
doc_counts = list(idf.values())
df = pd.Series(doc_counts, index=word_ids)
print(df)

0     3.0
1     4.0
2     2.0
3     2.0
4     2.0
5     1.0
6     1.0
7     1.0
8     1.0
9     1.0
10    1.0
11    1.0
12    1.0
13    1.0
dtype: float64


In [13]:
import numpy as np

# Scale every TF vector element-wise by the per-word values stored in `idf`.
# NOTE(review): the cell that builds `idf` stores raw document counts, not
# log-scaled inverse document frequencies, so this product is tf * df --
# confirm this intermediate step is intended.
idf_list = list(idf.values())
tfidf = np.array([np.multiply(tf_row, idf_list) for tf_row in tf_vectors])

# One row per document, one column per vocabulary word.
df = pd.DataFrame(tfidf, columns=list(id_to_word.values()))
print(df)

         do   be   to    i   am        da   is  let   it   or  not  what  \
0  6.000000  8.0  6.0  0.0  0.0  0.000000  2.0  0.0  0.0  0.0  0.0   0.0   
1  0.000000  8.0  4.0  4.0  4.0  0.000000  0.0  0.0  0.0  1.0  1.0   1.0   
2  7.754888  8.0  0.0  4.0  2.0  0.000000  0.0  0.0  0.0  0.0  0.0   0.0   
3  7.754888  8.0  0.0  0.0  0.0  2.584963  0.0  2.0  2.0  0.0  0.0   0.0   

   think  therefore  
0    0.0        0.0  
1    0.0        0.0  
2    1.0        1.0  
3    0.0        0.0  


In [15]:
# Transposed view: one row per vocabulary word, one column per document.
print(df.transpose())

             0    1         2         3
do         6.0  0.0  7.754888  7.754888
be         8.0  8.0  8.000000  8.000000
to         6.0  4.0  0.000000  0.000000
i          0.0  4.0  4.000000  0.000000
am         0.0  4.0  2.000000  0.000000
da         0.0  0.0  0.000000  2.584963
is         2.0  0.0  0.000000  0.000000
let        0.0  0.0  0.000000  2.000000
it         0.0  0.0  0.000000  2.000000
or         0.0  1.0  0.000000  0.000000
not        0.0  1.0  0.000000  0.000000
what       0.0  1.0  0.000000  0.000000
think      0.0  0.0  1.000000  0.000000
therefore  0.0  0.0  1.000000  0.000000


In [17]:
from collections import Counter
import math
import numpy as np

def TFIDF(doc_tokens, id_to_word, word_to_id):
    """Compute log-scaled TF, IDF and TF-IDF weights for tokenized documents.

    TF is ``1 + log2(count)`` for a word that occurs in a document and 0.0
    otherwise. IDF is ``log2(N / df)``, where ``N`` is the number of documents
    and ``df`` the number of documents containing the word. A vocabulary id
    that occurs in no document gets an IDF of 0.0 instead of triggering a
    division by zero.

    Args:
        doc_tokens: List of documents, each a list of token strings.
        id_to_word: Dict mapping vocabulary id -> word. Ids are used as list
            indices into the TF vectors.
        word_to_id: Dict mapping word -> vocabulary id.

    Returns:
        A 3-tuple ``(tf_vectors, idf, tfidf)``:
          * tf_vectors: list of per-document lists of TF weights.
          * idf: dict mapping vocabulary id -> IDF weight.
          * tfidf: numpy array with one row per document holding the
            element-wise product of TF and IDF.

    Raises:
        KeyError: If a token is missing from ``word_to_id``.
    """
    vocab_size = len(id_to_word)

    # TF: one vector per document, indexed by vocabulary id.
    tf_vectors = []
    for tokens in doc_tokens:
        vec = [0.0] * vocab_size
        for word, count in Counter(tokens).items():
            vec[word_to_id[word]] = 1 + math.log2(count)
        tf_vectors.append(vec)

    # IDF: a word is present in a document exactly when its TF entry is > 0.
    n_docs = len(tf_vectors)
    idf = {}
    for word_id in id_to_word:
        doc_freq = sum(1 for vec in tf_vectors if vec[word_id] > 0)
        # Guard df == 0 (word absent from every document, or no documents).
        idf[word_id] = math.log2(n_docs / doc_freq) if doc_freq else 0.0

    # TF-IDF: place each IDF weight at the position given by its id so the
    # weights line up with the TF vectors regardless of dict iteration order.
    idf_list = [0.0] * vocab_size
    for word_id, weight in idf.items():
        idf_list[word_id] = weight
    tfidf = np.array([np.multiply(vec, idf_list) for vec in tf_vectors])

    return tf_vectors, idf, tfidf

In [20]:
# Rebuild the toy corpus and run the full pipeline end to end.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be. I am what I am',
    'I think therefore I am. Do be do be do.',
    'Do do do da da da. Let it be let it be.',
]

# Vocabulary first, then TF / IDF / TF-IDF from the tokenized documents.
doc_tokens, vocab, word_to_id, id_to_word = buildDict(docs)
tf_vectors, idf, tfidf = TFIDF(doc_tokens, id_to_word, word_to_id)

In [21]:
# Show the per-id IDF values returned by TFIDF (vocabulary id -> weight).
print(idf)

{0: 0.41503749927884376, 1: 0.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 2.0, 6: 2.0, 7: 2.0, 8: 2.0, 9: 2.0, 10: 2.0, 11: 2.0, 12: 2.0, 13: 2.0}
