In [1]:
from nltk import FreqDist
import numpy as np
import re

def buildDict(docs):
    """Tokenize documents and build a frequency-ranked vocabulary.

    Every document is lower-cased and split on runs of whitespace, commas
    and periods. Empty strings produced by a delimiter at the very start or
    end of a document are discarded, so they never become vocabulary words.

    Args:
        docs: iterable of document strings. May be empty.

    Returns:
        A 4-tuple ``(doc_tokens, vocab, word_to_id, id_to_word)``:
          * doc_tokens: list of token lists, one per document, input order.
          * vocab: list of ``(word, count)`` pairs sorted by descending
            corpus-wide count; ties keep first-seen order.
          * word_to_id: dict mapping word -> its position in ``vocab``.
          * id_to_word: dict mapping position in ``vocab`` -> word.
    """
    # Local import so the function does not rely on an import made by a
    # later cell; Counter.most_common() provides the frequency ranking.
    from collections import Counter

    # Compile once instead of once per document.
    delim = re.compile(r'[\s,.]+')

    # re.split yields '' only at the edges (leading/trailing delimiter);
    # filtering on truthiness removes both.
    doc_tokens = [
        [token for token in delim.split(doc.lower()) if token]
        for doc in docs
    ]

    # Count over a flat stream of tokens; works for an empty corpus too.
    counts = Counter(token for tokens in doc_tokens for token in tokens)
    vocab = counts.most_common()

    word_to_id = {word: idx for idx, (word, _count) in enumerate(vocab)}
    id_to_word = {idx: word for idx, (word, _count) in enumerate(vocab)}
    return doc_tokens, vocab, word_to_id, id_to_word

In [2]:
# Toy corpus: four short documents used throughout the TF-IDF walkthrough.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be. I am what I am',
    'I think therefore I am. Do be do be do.',
    'Do do do da da da. Let it be let it be.',
]

# Tokenize the corpus and build the vocabulary lookup tables.
doc_tokens, vocab, word_to_id, id_to_word = buildDict(docs)

In [3]:
from collections import Counter
import math

# One log-scaled term-frequency vector per document, indexed by vocabulary id.
tf_vectors = []
for doc in doc_tokens:
    vec = [0.0] * len(word_to_id)  # one slot per vocabulary word
    # Counter gives the in-document occurrence count of every word.
    for word, count in Counter(doc).items():
        vec[word_to_id[word]] = 1 + math.log2(count)  # tf = 1 + log2(count)
    tf_vectors.append(vec)

In [4]:
import pandas as pd

# Rows are documents, columns are vocabulary words in id order.
df = pd.DataFrame(data=tf_vectors, columns=list(id_to_word.values()))
print(df)

         do   be   to    i   am        da   is  let   it   or  not  what  \
0  2.000000  2.0  3.0  0.0  0.0  0.000000  2.0  0.0  0.0  0.0  0.0   0.0   
1  0.000000  2.0  2.0  2.0  2.0  0.000000  0.0  0.0  0.0  1.0  1.0   1.0   
2  2.584963  2.0  0.0  2.0  1.0  0.000000  0.0  0.0  0.0  0.0  0.0   0.0   
3  2.584963  2.0  0.0  0.0  0.0  2.584963  0.0  2.0  2.0  0.0  0.0   0.0   

   think  therefore  
0    0.0        0.0  
1    0.0        0.0  
2    1.0        1.0  
3    0.0        0.0  


In [5]:
# Document frequency per vocabulary id: the number of documents whose TF
# entry for that term is positive, stored as a float.
# NOTE: at this point `idf` holds raw document frequencies, not IDF weights.
# A comprehension is used so no loop variable (previously `id` and `_`)
# is left behind in the notebook namespace shadowing the builtin `id`.
idf = {
    term_id: float(sum(1 for tf_vec in tf_vectors if tf_vec[term_id] > 0))
    for term_id in id_to_word
}

In [6]:
# Convert the per-term document frequencies into IDF weights: log2(N / df).
N = len(tf_vectors)  # number of documents in the corpus
idf = dict(
    (term_id, math.log2(N / doc_freq)) for term_id, doc_freq in idf.items()
)

In [7]:
# IDF weights as a Series indexed by vocabulary id.
df = pd.Series(data=list(idf.values()), index=list(idf.keys()))
print(df)

0     0.415037
1     0.000000
2     1.000000
3     1.000000
4     1.000000
5     2.000000
6     2.000000
7     2.000000
8     2.000000
9     2.000000
10    2.000000
11    2.000000
12    2.000000
13    2.000000
dtype: float64


In [None]:
import numpy as np

# IDF weights in vocabulary-id order, then scale every TF row element-wise.
idf_list = list(idf.values())
tfidf = np.array([np.asarray(tf_row) * idf_list for tf_row in tf_vectors])

In [None]:
# TF-IDF matrix: one row per document, one column per vocabulary word.
df = pd.DataFrame(data=tfidf, columns=list(id_to_word.values()))
df

Unnamed: 0,do,be,to,i,am,da,is,let,it,or,not,what,think,therefore
0,2.446287,2.0,6.043302,0.0,0.0,0.0,3.832581,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,2.0,3.021651,3.021651,3.021651,0.0,0.0,0.0,0.0,1.916291,1.916291,1.916291,0.0,0.0
2,3.669431,2.0,0.0,3.021651,1.510826,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.916291,1.916291
3,3.669431,2.0,0.0,0.0,0.0,5.748872,0.0,3.832581,3.832581,0.0,0.0,0.0,0.0,0.0


In [None]:
# Transposed view: one row per word, one column per document.
print(df.transpose())

In [None]:
from nltk import FreqDist
import numpy as np
import re

def buildDict(docs):
    """Tokenize documents and build a frequency-ranked vocabulary.

    Every document is lower-cased and split on runs of whitespace, commas
    and periods. Empty strings produced by a delimiter at the very start or
    end of a document are discarded, so they never become vocabulary words.

    Args:
        docs: iterable of document strings. May be empty.

    Returns:
        A 4-tuple ``(doc_tokens, vocab, word_to_id, id_to_word)``:
          * doc_tokens: list of token lists, one per document, input order.
          * vocab: list of ``(word, count)`` pairs sorted by descending
            corpus-wide count; ties keep first-seen order.
          * word_to_id: dict mapping word -> its position in ``vocab``.
          * id_to_word: dict mapping position in ``vocab`` -> word.
    """
    # Local import so the function does not rely on an import made by
    # another cell; Counter.most_common() provides the frequency ranking.
    from collections import Counter

    # Compile once instead of once per document.
    delim = re.compile(r'[\s,.]+')

    # re.split yields '' only at the edges (leading/trailing delimiter);
    # filtering on truthiness removes both.
    doc_tokens = [
        [token for token in delim.split(doc.lower()) if token]
        for doc in docs
    ]

    # Count over a flat stream of tokens; works for an empty corpus too.
    counts = Counter(token for tokens in doc_tokens for token in tokens)
    vocab = counts.most_common()

    word_to_id = {word: idx for idx, (word, _count) in enumerate(vocab)}
    id_to_word = {idx: word for idx, (word, _count) in enumerate(vocab)}
    return doc_tokens, vocab, word_to_id, id_to_word

In [None]:
from collections import Counter
import math
import numpy as np

def TFIDF(doc_tokens, id_to_word):
    """Compute raw term counts, smoothed IDF weights and their TF-IDF product.

    TF is the raw number of occurrences of a word in a document (no log
    scaling). IDF is ``ln((N + 1) / (df + 1)) + 1`` where ``N`` is the number
    of documents and ``df`` the number of documents containing the word.

    Args:
        doc_tokens: list of token lists, one per document.
        id_to_word: dict mapping vocabulary id -> word. Ids are used as list
            indices, so they must be valid positions in a list of length
            ``len(id_to_word)``.

    Returns:
        A 3-tuple ``(tf_vectors, idf, tfidf)``:
          * tf_vectors: list with one count vector (list) per document.
          * idf: dict mapping vocabulary id -> IDF weight.
          * tfidf: ``np.ndarray`` whose row ``d`` is ``tf_vectors[d]``
            multiplied element-wise by the IDF weights.

    Raises:
        KeyError: if a token does not appear among ``id_to_word`` values.
    """
    # Invert the mapping that was passed in instead of reading a
    # notebook-level `word_to_id`, so the function depends only on its
    # arguments and works on a fresh kernel.
    word_to_id = {word: term_id for term_id, word in id_to_word.items()}
    vocab_size = len(id_to_word)

    # TF: raw in-document counts.
    tf_vectors = []
    for tokens in doc_tokens:
        vec = [0.0] * vocab_size
        for word, count in Counter(tokens).items():
            vec[word_to_id[word]] = count
        tf_vectors.append(vec)

    # IDF: smoothed so that a term present in every document still gets a
    # positive weight and df == 0 cannot divide by zero.
    n_docs = len(tf_vectors)
    idf = {}
    for term_id in id_to_word:
        doc_freq = sum(1 for vec in tf_vectors if vec[term_id] > 0)
        idf[term_id] = np.log((n_docs + 1) / (doc_freq + 1)) + 1

    # TF-IDF: scale each TF row by the IDF weights (same id order as idf).
    idf_list = list(idf.values())
    tfidf = np.array([np.multiply(vec, idf_list) for vec in tf_vectors])

    return tf_vectors, idf, tfidf

In [None]:
# Same toy corpus as before, run through the function-based pipeline.
docs = [
    'To do is to be. To be is to do.',
    'To be or not to be. I am what I am',
    'I think therefore I am. Do be do be do.',
    'Do do do da da da. Let it be let it be.',
]

# Vocabulary first, then TF / IDF / TF-IDF in one call.
doc_tokens, vocab, word_to_id, id_to_word = buildDict(docs)
tf_vectors, idf, tfidf = TFIDF(doc_tokens, id_to_word)

TFIDF Vector 정규화

In [None]:
import pandas as pd

# Un-normalised TF-IDF matrix, columns labelled with the vocabulary words.
pd.DataFrame(data=tfidf, columns=list(word_to_id.keys()))

In [None]:
# L1 normalisation: divide each document row by the sum of its entries.
tfidf_l1 = np.array([row / row.sum() for row in tfidf])
pd.DataFrame(data=tfidf_l1, columns=list(word_to_id.keys()))

Unnamed: 0,do,be,to,i,am,da,is,let,it,or,not,what,think,therefore
0,0.170804,0.139644,0.421954,0.0,0.0,0.0,0.267598,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.11895,0.179712,0.179712,0.179712,0.0,0.0,0.0,0.0,0.113971,0.113971,0.113971,0.0,0.0
2,0.261458,0.142506,0.0,0.215302,0.107651,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136542,0.136542
3,0.192283,0.104803,0.0,0.0,0.0,0.301249,0.0,0.200833,0.200833,0.0,0.0,0.0,0.0,0.0


In [None]:
# L2 normalisation by hand: divide each row by the square root of the sum
# of its squared entries.
tfidf_l2 = np.array([row / (np.square(row).sum() ** 0.5) for row in tfidf])
pd.DataFrame(data=tfidf_l2, columns=list(word_to_id.keys()))

Unnamed: 0,do,be,to,i,am,da,is,let,it,or,not,what,think,therefore
0,0.312717,0.255666,0.772535,0.0,0.0,0.0,0.489931,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.30712,0.464005,0.464005,0.464005,0.0,0.0,0.0,0.0,0.294266,0.294266,0.294266,0.0,0.0
2,0.609695,0.33231,0.0,0.502063,0.251031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318401,0.318401
3,0.410533,0.223758,0.0,0.0,0.0,0.643179,0.0,0.428786,0.428786,0.0,0.0,0.0,0.0,0.0


In [None]:
from numpy.linalg import norm

# L2 normalisation again, this time using numpy.linalg.norm for the length.
tfidf_l2 = np.array([row / norm(row) for row in tfidf])
pd.DataFrame(data=tfidf_l2, columns=list(word_to_id.keys()))

Unnamed: 0,do,be,to,i,am,da,is,let,it,or,not,what,think,therefore
0,0.312717,0.255666,0.772535,0.0,0.0,0.0,0.489931,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.30712,0.464005,0.464005,0.464005,0.0,0.0,0.0,0.0,0.294266,0.294266,0.294266,0.0,0.0
2,0.609695,0.33231,0.0,0.502063,0.251031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318401,0.318401
3,0.410533,0.223758,0.0,0.0,0.0,0.643179,0.0,0.428786,0.428786,0.0,0.0,0.0,0.0,0.0


코사인 유사도 계산 - 1. 질의어 순위계산

In [None]:

# Read the query and tokenize it with the same delimiter set as the corpus.
user = input('질의어 입력')
delim = re.compile(r'[\s,.]+')
tokens = delim.split(user.lower())
# A trailing delimiter (or empty input) leaves one empty string at the end.
if not tokens[-1]:
    tokens.pop()
tokens

질의어 입력to do


['to', 'do']

In [None]:
# Query vector in vocabulary space: every query token that is in the
# vocabulary gets its IDF weight; out-of-vocabulary tokens are ignored.
user_vector = np.zeros(len(vocab))
for token in tokens:
    term_id = word_to_id.get(token)
    if term_id is not None:
        user_vector[term_id] = idf[term_id]

# L2-normalise. When no query token is in the vocabulary the norm is 0;
# dividing would give NaN (and NaN similarity for every document), so the
# all-zero vector is kept as is in that case.
query_norm = norm(user_vector)
if query_norm > 0:
    user_vector = user_vector / query_norm

# Wrap in an outer array so the result stays a single-row 2-D array.
user_vector = np.array([user_vector])
user_vector

array([[0.62922751, 0.        , 0.77722116, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [None]:
# L2-normalised document vectors, shown again for comparison with the query.
pd.DataFrame(data=tfidf_l2, columns=list(word_to_id.keys()))

Unnamed: 0,do,be,to,i,am,da,is,let,it,or,not,what,think,therefore
0,0.312717,0.255666,0.772535,0.0,0.0,0.0,0.489931,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.30712,0.464005,0.464005,0.464005,0.0,0.0,0.0,0.0,0.294266,0.294266,0.294266,0.0,0.0
2,0.609695,0.33231,0.0,0.502063,0.251031,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318401,0.318401
3,0.410533,0.223758,0.0,0.0,0.0,0.643179,0.0,0.428786,0.428786,0.0,0.0,0.0,0.0,0.0


In [None]:
# Dot product of the query vector with each normalised document vector,
# one entry per document in corpus order.
user_rank = [np.dot(user_vector, doc_vec) for doc_vec in tfidf_l2]
user_rank

[array([0.7972005]),
 array([0.3606342]),
 array([0.38363656]),
 array([0.25831867])]

코사인 유사도 계산 - 2. 문서 간 유사도 계산

In [None]:
# Pairwise document similarity: row i holds the dot products of document i
# with every normalised document vector.
cos_sim = np.array([tfidf_l2.dot(doc_vec) for doc_vec in tfidf_l2])
cos_sim

array([[1.        , 0.43698001, 0.27562227, 0.1855881 ],
       [0.43698001, 1.        , 0.45149814, 0.06872066],
       [0.27562227, 0.45149814, 1.        , 0.32465691],
       [0.1855881 , 0.06872066, 0.32465691, 1.        ]])