In [None]:
# Reference: https://studymachinelearning.com/cosine-similarity-text-similarity-metric/

# Example documents compared throughout this notebook:
#   doc_1 = "Data is the oil of the digital economy"
#   doc_2 = "Data is a new oil"
#   doc_3 = "Data is an information"

In [14]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Build a bag-of-words (raw term count) representation of the example corpus.
# min_df=1 keeps every term that occurs in at least one document and
# ngram_range=(1, 1) restricts the vocabulary to single words.
vectorizer = CountVectorizer(min_df=1, ngram_range=(1, 1))

corpus = ["Data is the oil of the digital economy",
          "Data is a new oil",
          "Data is an information"]

features = vectorizer.fit_transform(corpus)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement. It returns an ndarray,
# so convert it to a plain list to keep `vocab` the same type as before.
vocab = vectorizer.get_feature_names_out().tolist()

# One row per document, one column per vocabulary term.
doc = pd.DataFrame(features.toarray(), columns=vocab)
doc.index = ['doc_1', 'doc_2', 'doc_3']
doc

Unnamed: 0,an,data,digital,economy,information,is,new,of,oil,the
doc_1,0,1,1,1,0,1,0,1,1,2
doc_2,0,1,0,0,0,1,1,0,1,0
doc_3,1,1,0,0,1,1,0,0,0,0


## 카운트 벡터 사용해서 유사도 구하기 (CountVectorizer())

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit a default CountVectorizer on the three example documents to obtain the
# document-term count matrix and its vocabulary.
count_vectorizer = CountVectorizer()

doc_1 = "Data is the oil of the digital economy"
doc_2 = "Data is a new oil"
doc_3 = "Data is an information"

data = [doc_1, doc_2, doc_3]

vector_matrix = count_vectorizer.fit_transform(data)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. `.tolist()` keeps `tokens` a plain list
# so the displayed output and downstream uses are unchanged.
tokens = count_vectorizer.get_feature_names_out().tolist()
tokens

['an',
 'data',
 'digital',
 'economy',
 'information',
 'is',
 'new',
 'of',
 'oil',
 'the']

In [17]:
# Display the fitted document-term matrix as a dense array.
vector_matrix.toarray()

array([[0, 1, 1, 1, 0, 1, 0, 1, 1, 2],
       [0, 1, 0, 0, 0, 1, 1, 0, 1, 0],
       [1, 1, 0, 0, 1, 1, 0, 0, 0, 0]], dtype=int64)

In [19]:
import pandas as pd

def create_dataframe(matrix, tokens):
    """Wrap a 2-D matrix in a DataFrame with generated document labels.

    Args:
        matrix: Row-iterable 2-D data (e.g. a dense NumPy array). Rows are
            labelled ``doc_1``, ``doc_2``, ... in iteration order.
        tokens: Column labels, one per column of ``matrix``.

    Returns:
        A ``pandas.DataFrame`` indexed by the generated document labels and
        with ``tokens`` as its columns.
    """
    row_labels = []
    # Only the number of rows matters here; the row contents are ignored.
    for position, _row in enumerate(matrix, start=1):
        row_labels.append(f'doc_{position}')
    return pd.DataFrame(data=matrix, index=row_labels, columns=tokens)

In [20]:
# Label the dense matrix with document names (rows) and vocabulary terms (columns).
create_dataframe(vector_matrix.toarray(),tokens)

Unnamed: 0,an,data,digital,economy,information,is,new,of,oil,the
doc_1,0,1,1,1,0,1,0,1,1,2
doc_2,0,1,0,0,0,1,1,0,1,0
doc_3,1,1,0,0,1,1,0,0,0,0


In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the document vectors in `vector_matrix`.
cosine_similarity_matrix = cosine_similarity(vector_matrix)
# The result is square (documents x documents), so derive the column labels
# from its size with the same doc_<n> scheme create_dataframe uses for rows,
# instead of hard-coding three names that break when the corpus changes.
create_dataframe(
    cosine_similarity_matrix,
    [f'doc_{i + 1}' for i in range(cosine_similarity_matrix.shape[0])],
)

Unnamed: 0,doc_1,doc_2,doc_3
doc_1,1.0,0.474342,0.316228
doc_2,0.474342,1.0,0.5
doc_3,0.316228,0.5,1.0


## TF-IDF 사용해서 유사도 구하기 (TfidfVectorizer())

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Re-vectorise the same documents with TF-IDF weighting.
# NOTE: this rebinds `vector_matrix` and `tokens`, which until now held the
# CountVectorizer results; cells below this point see the TF-IDF versions.
Tfidf_vect = TfidfVectorizer()
vector_matrix = Tfidf_vect.fit_transform(data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() replaces it. `.tolist()` keeps `tokens` a plain list.
tokens = Tfidf_vect.get_feature_names_out().tolist()
create_dataframe(vector_matrix.toarray(), tokens)

Unnamed: 0,an,data,digital,economy,information,is,new,of,oil,the
doc_1,0.0,0.205302,0.347607,0.347607,0.0,0.205302,0.0,0.347607,0.264364,0.695214
doc_2,0.0,0.391484,0.0,0.0,0.0,0.391484,0.66284,0.0,0.504107,0.0
doc_3,0.608845,0.359594,0.0,0.0,0.608845,0.359594,0.0,0.0,0.0,0.0


In [24]:
# Pairwise cosine similarity between the document vectors in `vector_matrix`
# (rebound to the TF-IDF matrix by the TfidfVectorizer cell).
cosine_similarity_matrix = cosine_similarity(vector_matrix)
# The result is square (documents x documents), so derive the column labels
# from its size with the same doc_<n> scheme create_dataframe uses for rows,
# instead of hard-coding three names that break when the corpus changes.
create_dataframe(
    cosine_similarity_matrix,
    [f'doc_{i + 1}' for i in range(cosine_similarity_matrix.shape[0])],
)

Unnamed: 0,doc_1,doc_2,doc_3
doc_1,1.0,0.294013,0.147651
doc_2,0.294013,1.0,0.28155
doc_3,0.147651,0.28155,1.0
