In [29]:
from nltk import FreqDist
import numpy as np
import re

def buildDict(docs):
    """Tokenize documents and build a frequency-ranked vocabulary.

    Each document is lower-cased and split on runs of whitespace, commas and
    periods. Empty tokens (which the split produces when a document starts or
    ends with a delimiter) are dropped.

    Args:
        docs: iterable of document strings.

    Returns:
        A 4-tuple ``(doc_tokens, vocab, word_to_id, id_to_word)``:
        doc_tokens -- list with one list of tokens per document.
        vocab      -- list of ``(word, count)`` pairs, most frequent first;
                      words with equal counts keep first-seen order.
        word_to_id -- dict mapping each word to its position in ``vocab``.
        id_to_word -- dict mapping each position in ``vocab`` to its word.
    """
    # Local import keeps this cell runnable on its own. Counter.most_common()
    # gives the same ranking that nltk's FreqDist (a Counter subclass) gave.
    from collections import Counter

    # Compile once instead of once per document.
    delim = re.compile(r'[\s,.]+')

    # `if token` removes the empty strings at either end of the split result,
    # not just the trailing one, so '' never becomes a vocabulary entry.
    doc_tokens = [
        [token for token in delim.split(doc.lower()) if token]
        for doc in docs
    ]

    # Count over a flat token stream; unlike np.hstack this also works for an
    # empty corpus and keeps the words as plain Python strings.
    counts = Counter(token for tokens in doc_tokens for token in tokens)
    vocab = counts.most_common()

    word_to_id = {word: idx for idx, (word, _) in enumerate(vocab)}
    id_to_word = {idx: word for word, idx in word_to_id.items()}
    return doc_tokens, vocab, word_to_id, id_to_word

In [15]:
from collections import Counter
import math
import numpy as np

def TFIDF(doc_tokens, id_to_word):
    """Compute raw term-frequency, smoothed IDF and TF-IDF for a corpus.

    Args:
        doc_tokens: list of token lists, one per document.
        id_to_word: dict mapping vocabulary ids to words. The ids are used as
            vector positions, so they must be 0 .. len(id_to_word) - 1.

    Returns:
        A 3-tuple ``(tf_vectors, idf, tfidf)``:
        tf_vectors -- list of per-document lists holding raw term counts.
        idf        -- dict id -> ln((N + 1) / (df + 1)) + 1, where N is the
                      number of documents and df the number of documents
                      containing the term.
        tfidf      -- NumPy array, one row per document, tf * idf.

    Raises:
        KeyError: if a document contains a token that is not in id_to_word.
    """
    # Invert the mapping here rather than reading a notebook-global
    # `word_to_id`, so the function depends only on its arguments.
    word_to_id = {word: idx for idx, word in id_to_word.items()}
    vocab_size = len(id_to_word)

    # Term frequency: raw count of every token in each document.
    # (A log-scaled alternative would be 1 + log2(count).)
    tf_vectors = []
    for doc in doc_tokens:
        vec = [0.0] * vocab_size
        for word, count in Counter(doc).items():
            vec[word_to_id[word]] = count
        tf_vectors.append(vec)

    # Inverse document frequency with +1 smoothing in numerator and
    # denominator, plus 1 so terms present in every document keep weight 1.
    n_docs = len(tf_vectors)
    idf = {}
    for idx in id_to_word:
        doc_freq = sum(1 for vec in tf_vectors if vec[idx] > 0)
        idf[idx] = np.log((n_docs + 1) / (doc_freq + 1)) + 1

    # TF-IDF: look IDF up by id so the weights line up with the vector
    # columns regardless of the iteration order of id_to_word.
    idf_list = [idf[idx] for idx in range(vocab_size)]
    tfidf = np.array([np.multiply(tf, idf_list) for tf in tf_vectors])

    return tf_vectors, idf, tfidf

In [31]:
# Sample corpus: eleven sentences about Python, one sentence per line.
text = '''Python is a high-level, general-purpose programming language.
Its design philosophy emphasizes code readability with the use of significant indentation.
Its language constructs and object-oriented approach aim to help programmers write clear, logical code for small- and large-scale projects.
Python is dynamically-typed and garbage-collected.
It supports multiple programming paradigms, including structured (particularly procedural), object-oriented and functional programming.
It is often described as a "batteries included" language due to its comprehensive standard library.
Guido van Rossum began working on Python in the late 1980s as a successor to the ABC programming language and first released it in 1991 as Python 0.9.0.
Python 2.0 was released in 2000 and introduced new features such as list comprehensions, cycle-detecting garbage collection, reference counting, and Unicode support.
Python 3.0, released in 2008, was a major revision that is not completely backward-compatible with earlier versions.
Python 2 was discontinued with version 2.7.18 in 2020.
Python consistently ranks as one of the most popular programming languages'''
# Every line of the corpus is treated as one document.
docs = text.split('\n')

# Build the vocabulary, then the TF / IDF / TF-IDF representations.
doc_tokens, vocab, word_to_id, id_to_word = buildDict(docs)
tf_vectors, idf, tfidf = TFIDF(doc_tokens, id_to_word)

TFIDF Vector 정규화

In [32]:
import pandas as pd

# Show the TF-IDF matrix with one labelled column per vocabulary word
# (iterating the dict yields its keys in insertion order).
pd.DataFrame(tfidf, columns=list(word_to_id))

Unnamed: 0,python,and,programming,as,in,is,a,language,the,0,...,version,7,18,2020,consistently,ranks,one,most,popular,languages
0,1.405465,0.0,1.875469,0.0,0.0,1.875469,1.875469,1.875469,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.098612,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,3.386294,0.0,0.0,0.0,0.0,0.0,1.875469,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.405465,1.693147,0.0,0.0,0.0,1.875469,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.693147,3.750937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.875469,0.0,1.875469,1.875469,1.875469,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2.81093,1.693147,1.875469,3.750937,3.750937,0.0,1.875469,1.875469,4.197225,4.197225,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,1.405465,3.386294,0.0,1.875469,1.875469,0.0,0.0,0.0,0.0,2.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,1.405465,0.0,0.0,0.0,1.875469,1.875469,1.875469,0.0,0.0,2.098612,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,1.405465,0.0,0.0,0.0,1.875469,0.0,0.0,0.0,0.0,0.0,...,2.791759,2.791759,2.791759,2.791759,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
# L1 normalisation: scale every document row so its absolute values sum to 1.
l1_rows = []
for vec in tfidf:
    # The L1 norm is the sum of absolute values, not the plain sum.
    l1_length = np.sum(np.abs(vec))
    # An all-zero row (a document with no tokens) is left as zeros instead of
    # being divided by zero and turned into NaN.
    l1_rows.append(vec / l1_length if l1_length > 0 else vec)
tfidf_l1 = np.array(l1_rows) #l1 norm
pd.DataFrame(tfidf_l1, columns=word_to_id.keys())

Unnamed: 0,python,and,programming,as,in,is,a,language,the,0,...,version,7,18,2020,consistently,ranks,one,most,popular,languages
0,0.09699,0.0,0.129424,0.0,0.0,0.129424,0.129424,0.129424,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.068558,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.070943,0.0,0.0,0.0,0.0,0.0,0.039291,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.133124,0.160372,0.0,0.0,0.0,0.177642,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.052479,0.116261,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.051906,0.0,0.051906,0.051906,0.051906,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.040966,0.024676,0.027333,0.054666,0.054666,0.0,0.027333,0.027333,0.06117,0.06117,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.02496,0.060137,0.0,0.033306,0.033306,0.0,0.0,0.0,0.0,0.037269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.032426,0.0,0.0,0.0,0.043269,0.043269,0.043269,0.0,0.0,0.048418,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.053624,0.0,0.0,0.0,0.071557,0.0,0.0,0.0,0.0,0.0,...,0.106517,0.106517,0.106517,0.106517,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# L2 normalisation written out by hand: the Euclidean length of each row is
# the square root of the sum of its squared entries.
row_lengths = [np.sum(vec ** 2) ** 0.5 for vec in tfidf]
tfidf_l2 = np.array([vec / length for vec, length in zip(tfidf, row_lengths)]) #l2 norm
pd.DataFrame(tfidf_l2, columns=word_to_id.keys())

Unnamed: 0,python,and,programming,as,in,is,a,language,the,0,...,version,7,18,2020,consistently,ranks,one,most,popular,languages
0,0.249892,0.0,0.333458,0.0,0.0,0.333458,0.333458,0.333458,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235875,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.298484,0.0,0.0,0.0,0.0,0.0,0.165313,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.287205,0.345992,0.0,0.0,0.0,0.383249,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.179085,0.396739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.198096,0.0,0.198096,0.198096,0.198096,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.199148,0.119956,0.132873,0.265746,0.265746,0.0,0.132873,0.132873,0.297364,0.297364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.115334,0.277884,0.0,0.153903,0.153903,0.0,0.0,0.0,0.0,0.172215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.135189,0.0,0.0,0.0,0.180397,0.180397,0.180397,0.0,0.0,0.201861,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.161167,0.0,0.0,0.0,0.215063,0.0,0.0,0.0,0.0,0.0,...,0.320135,0.320135,0.320135,0.320135,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
from numpy.linalg import norm

# Same L2 normalisation, this time letting numpy.linalg.norm compute each
# row's Euclidean length.
unit_rows = []
for vec in tfidf:
    unit_rows.append(vec / norm(vec))
tfidf_l2 = np.array(unit_rows) #l2 norm
pd.DataFrame(tfidf_l2, columns=word_to_id.keys())

Unnamed: 0,python,and,programming,as,in,is,a,language,the,0,...,version,7,18,2020,consistently,ranks,one,most,popular,languages
0,0.249892,0.0,0.333458,0.0,0.0,0.333458,0.333458,0.333458,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.235875,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.298484,0.0,0.0,0.0,0.0,0.0,0.165313,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.287205,0.345992,0.0,0.0,0.0,0.383249,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.179085,0.396739,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.198096,0.0,0.198096,0.198096,0.198096,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.199148,0.119956,0.132873,0.265746,0.265746,0.0,0.132873,0.132873,0.297364,0.297364,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.115334,0.277884,0.0,0.153903,0.153903,0.0,0.0,0.0,0.0,0.172215,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.135189,0.0,0.0,0.0,0.180397,0.180397,0.180397,0.0,0.0,0.201861,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.161167,0.0,0.0,0.0,0.215063,0.0,0.0,0.0,0.0,0.0,...,0.320135,0.320135,0.320135,0.320135,0.0,0.0,0.0,0.0,0.0,0.0


코사인 유사도 계산 - 1. 질의어 순위계산

코사인 유사도 계산 - 2. 문서 간 유사도 계산

In [46]:
# Pairwise similarity between documents: each row of tfidf_l2 was divided by
# its own L2 length, so a dot product of two rows is their cosine similarity.
similarity_rows = []
for vector in tfidf_l2:
    similarity_rows.append(np.dot(tfidf_l2, vector))
cos_sim = np.array(similarity_rows)

# NOTE(review): the printed label says "함수" (function) although every row
# here is a document -- confirm whether "문서" was intended.
separator = '='*80
for i, cos_sim_of_doc in enumerate(cos_sim):
  header = '{}번째 함수 코사인 유사도\n{}\n'.format(i+1,cos_sim_of_doc)
  print(header, separator, '\n')

1번째 함수 코사인 유사도
[1.         0.         0.05512489 0.19956785 0.13229584 0.19816993
 0.18268837 0.02882109 0.15409263 0.04027423 0.12032831]

2번째 함수 코사인 유사도
[0.         1.         0.10004763 0.         0.         0.05228534
 0.07014096 0.         0.04761408 0.05676364 0.13984903]

3번째 함수 코사인 유사도
[0.05512489 0.10004763 1.         0.10327314 0.10654357 0.11475561
 0.08527395 0.08294382 0.         0.         0.        ]

4번째 함수 코사인 유사도
[0.19956785 0.         0.10327314 1.         0.06196206 0.07592003
 0.0987002  0.12927018 0.10796408 0.04628787 0.04973484]

5번째 함수 코사인 유사도
[0.13229584 0.         0.10654357 0.06196206 1.         0.04920329
 0.10720134 0.04976483 0.         0.         0.09167761]

6번째 함수 코사인 유사도
[0.19816993 0.05228534 0.11475561 0.07592003 0.04920329 1.
 0.17120151 0.03048759 0.07147188 0.         0.04577555]

7번째 함수 코사인 유사도
[0.18268837 0.07014096 0.08527395 0.0987002  0.10720134 0.17120151
 1.         0.21491668 0.18887195 0.08924815 0.20348838]

8번째 함수 코사인 유사도
[0.02882109 0