# 문서 유사도

- 문서 유사도는 일반적으로 **코사인 유사도**로 측정한다. 코사인 유사도는 두 벡터의 크기가 아니라 **상호 방향성**, 즉 두 벡터가 이루는 각도의 코사인 값 $\cos\theta = \frac{A \cdot B}{\lVert A \rVert \, \lVert B \rVert}$ 을 기준으로 두 벡터가 얼마나 유사한지를 나타낸다.

In [11]:
import numpy as np

In [28]:
def get_cos_sim(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    The result is ``dot(v1, v2) / (||v1|| * ||v2||)``, where ``||.||`` is
    the L2 (Euclidean) norm.

    Args:
        v1: First vector (1-D array-like of numbers).
        v2: Second vector, with the same length as ``v1``.

    Returns:
        The cosine of the angle between the two vectors. If either vector
        has zero norm the angle is undefined and ``0.0`` is returned
        instead of dividing by zero.
    """
    v1 = np.asarray(v1)
    v2 = np.asarray(v2)

    # Product of the two L2 norms (the denominator of the cosine formula).
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    if norm_product == 0:
        # An all-zero vector has no direction; avoid the nan that 0/0 yields.
        return 0.0

    return np.dot(v1, v2) / norm_product

numpy를 활용해서 cosine 유사도 구하기

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
# TF-IDF vectorizer for the toy corpus; input text is lower-cased first.
tfidf = TfidfVectorizer(lowercase=True)

In [31]:
# Three short sample sentences used as the toy corpus.
docs = [
    'if you take the blue pill, the story ends',
    'if you take the red pill, you stay in Wonderland',
    'if you take the red pill, I show you how deep the rabbit hole goes',
]

위와 같은 텍스트(문서)들이 있을 때, 각 텍스트 간의 코사인 유사도를 구한다.

In [32]:
# Learn the vocabulary from docs and encode each sentence as one TF-IDF row.
tfidf_vect = tfidf.fit_transform(docs)

In [33]:
# Inspect the dimensions of the TF-IDF matrix (rows correspond to documents).
tfidf_vect.shape

(3, 18)

In [34]:
# Extract the first two documents as flat 1-D dense arrays.
# Slicing the sparse row first means only that single row is densified,
# instead of converting the whole matrix once per document.
doc1 = tfidf_vect[0].toarray().ravel()
doc2 = tfidf_vect[1].toarray().ravel()

In [35]:
# Cosine similarity between the first and second documents.
get_cos_sim(doc1, doc2)

0.40207758214950134

In [36]:
# Extract the second and third documents as flat 1-D dense arrays.
# Slicing the sparse row first means only that single row is densified,
# instead of converting the whole matrix once per document.
doc2 = tfidf_vect[1].toarray().ravel()
doc3 = tfidf_vect[2].toarray().ravel()

In [37]:
# Cosine similarity between the second and third documents.
get_cos_sim(doc2, doc3)

0.45647296026166395

In [38]:
from sklearn.metrics.pairwise import cosine_similarity

In [39]:
# Pairwise cosine similarity of every document against every document;
# with a single argument the matrix is compared with itself.
cosine_similarity(tfidf_vect)

array([[1.        , 0.40207758, 0.40425045],
       [0.40207758, 1.        , 0.45647296],
       [0.40425045, 0.45647296, 1.        ]])

***
## Opinion Review

In [43]:
import pandas as pd
import glob, os
from sklearn.cluster import KMeans

In [42]:
# Directory that is searched for the '*.data' review files.
path = './topics/'

In [44]:
# Collect every '*.data' file that sits directly under the topics directory.
data_pattern = os.path.join(path, '*.data')
all_files = glob.glob(data_pattern)

In [86]:
# Parallel accumulators: one topic name and one text blob per input file.
file_names, texts = [], []

In [87]:
# Read each review file and store its topic name together with its full text.
for file in all_files:
    data = pd.read_table(file, index_col=None, header=0, encoding='latin1')
    # Derive the topic name from the base file name (text before the first
    # dot). os.path.basename works regardless of how many directory levels
    # the path has, unlike indexing into file.split('/').
    filename = os.path.basename(file).split('.')[0]
    file_names.append(filename)
    # Flatten the whole table into one string so it can be vectorized
    # as a single document.
    texts.append(data.to_string())

In [88]:
# One row per file: the topic name and the concatenated review text.
doc_data = pd.DataFrame(dict(filename=file_names, text=texts))

In [89]:
# TF-IDF over unigrams and bigrams with English stop words removed; terms
# appearing in fewer than 5% or more than 95% of the documents are dropped.
tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    ngram_range=(1, 2),
    min_df=0.05,
    max_df=0.95,
)

In [90]:
# Vectorize the review texts: one TF-IDF row per document.
corpus = doc_data['text']
features = tfidf.fit_transform(corpus)

In [91]:
# Inspect the dimensions of the feature matrix (rows correspond to documents).
features.shape

(51, 4418)

In [92]:
# K-means with three clusters; the seed is fixed so runs are repeatable.
km = KMeans(
    n_clusters=3,
    max_iter=10000,
    random_state=42,
)

In [93]:
# Cluster the documents using their TF-IDF feature rows.
km.fit(features)

KMeans(max_iter=10000, n_clusters=3, random_state=42)

In [94]:
# Cluster label assigned to each fitted document.
cls = km.labels_

In [95]:
# Coordinates of the fitted cluster centers in feature space.
cls_center = km.cluster_centers_

In [96]:
# Attach the cluster label of each document as a new 'label' column.
doc_data['label'] = cls

In [97]:
# Row index of every document that was assigned to cluster 1.
in_cluster_one = doc_data['label'] == 1
h_idx = doc_data.loc[in_cluster_one].index

In [98]:
# Show which rows belong to cluster 1.
h_idx

Int64Index([1, 18, 22, 23, 29, 35, 42, 43, 45, 47], dtype='int64')

In [99]:
# File name of the first cluster-1 document, used as the comparison baseline.
first_pos = h_idx[0]
comp_doc = doc_data['filename'].iloc[first_pos]

In [102]:
# Compare the first cluster-1 document against every cluster-1 document
# (itself included, hence the leading 1.0 in the result).
reference_vec = features[h_idx[0]]
cluster_vecs = features[h_idx]
cosine_similarity(reference_vec, cluster_vecs)

array([[1.        , 0.16857284, 0.08719997, 0.0824738 , 0.0564993 ,
        0.96717521, 0.1445608 , 0.27935015, 0.05514648, 0.20177631]])