### Scikit-learn을 이용하는 방법

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: four short Korean sentences used as the documents for TF-IDF.
contents = [
    '고양이랑 같이 놀러가고 싶지만 바빠서 시간이 안되요.',
    '고양이는 동네에서 산책하고 노는 것을 싫어해요',
    '고양이는 동네에서 노는 것도 싫어해요.',
    '여행을 떠나고 싶은데 너무 바빠서 못가고 있어요.',
]

In [3]:
# Build a TF-IDF vectorizer and learn vocabulary + weights from the raw
# sentences in a single pass.
vectorizer = TfidfVectorizer(
    min_df=1,
    decode_error='ignore',
)
tfidf_matrix = vectorizer.fit_transform(contents)

# Show the sparse result as a dense array (one row per sentence).
tfidf_matrix.toarray()

array([[0.38861429, 0.        , 0.        , 0.        , 0.38861429,
        0.        , 0.        , 0.38861429, 0.        , 0.        ,
        0.        , 0.30638797, 0.        , 0.38861429, 0.        ,
        0.        , 0.38861429, 0.38861429, 0.        , 0.        ],
       [0.        , 0.        , 0.47212003, 0.37222485, 0.        ,
        0.        , 0.37222485, 0.        , 0.37222485, 0.        ,
        0.        , 0.        , 0.47212003, 0.        , 0.37222485,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.53556627, 0.        , 0.4222466 , 0.        ,
        0.        , 0.4222466 , 0.        , 0.4222466 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.4222466 ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.38861429, 0.        , 0.        , 0.        , 0.38861429,
        0.38861429, 0.30638797, 0.        , 0.        , 0.        ,
        0.38861429, 0.        , 0.        , 0.38861429, 0.38861429]])

In [4]:
# Show the learned vocabulary: each whitespace-delimited word mapped to an
# integer index (presumably its column in the TF-IDF matrix above).
vectorizer.vocabulary_

{'고양이랑': 4,
 '같이': 0,
 '놀러가고': 7,
 '싶지만': 16,
 '바빠서': 11,
 '시간이': 13,
 '안되요': 17,
 '고양이는': 3,
 '동네에서': 8,
 '산책하고': 12,
 '노는': 6,
 '것을': 2,
 '싫어해요': 14,
 '것도': 1,
 '여행을': 18,
 '떠나고': 9,
 '싶은데': 15,
 '너무': 5,
 '못가고': 10,
 '있어요': 19}

In [5]:
from konlpy.tag import Okt 
# Okt tagger instance from KoNLPy; its morphs() method is used in the next
# cell to split each sentence into morphemes.
t = Okt() 
# Earlier per-sentence list variant, left disabled in favour of the set built
# in the next cell:
# tokens = [t.morphs(row) for row in contents]

In [6]:
# Collect the unique morphemes over every sentence in `contents`.
# Looping over the corpus (instead of hard-coding indices 0..3) keeps this
# cell correct if sentences are added to or removed from `contents`.
tokens = set()
for sentence in contents:
    tokens.update(t.morphs(sentence))

In [7]:
# Display the set of unique morphemes collected in the previous cell.
tokens

{'.',
 '가고',
 '같이',
 '것',
 '고양이',
 '너무',
 '노',
 '놀러',
 '는',
 '도',
 '동네',
 '떠나고',
 '랑',
 '못',
 '바빠서',
 '산책',
 '시간',
 '싫어해요',
 '싶은데',
 '싶지만',
 '안되요',
 '에서',
 '여행',
 '을',
 '이',
 '있어요',
 '하고'}

In [8]:
# Number of unique morphemes (punctuation and single-character ones included).
len(tokens)

27

In [9]:
# Re-fit the same vectorizer on the morpheme set, i.e. every morpheme is passed
# in as its own one-word "document".
# NOTE(review): fit() presumably returns the vectorizer itself, so X is not a
# matrix; X is not used again in the visible cells -- confirm and drop/rename.
# NOTE(review): the vectorizer still applies its default word-level
# tokenization when it later transforms `contents`, so morphemes that never
# appear as a whole word there (e.g. '고양이') likely never match. Consider
# TfidfVectorizer(tokenizer=t.morphs) fitted directly on `contents` instead.
X = vectorizer.fit(tokens)

In [10]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so .tolist() keeps the plain list-of-str display.
vectorizer.get_feature_names_out().tolist()

['가고',
 '같이',
 '고양이',
 '너무',
 '놀러',
 '동네',
 '떠나고',
 '바빠서',
 '산책',
 '시간',
 '싫어해요',
 '싶은데',
 '싶지만',
 '안되요',
 '에서',
 '여행',
 '있어요',
 '하고']

In [11]:
# Vectorize the original sentences against the vocabulary learned from the
# morpheme set; only terms present in that vocabulary can get a weight.
result = vectorizer.transform(contents)
# Dense view of the sparse matrix for display (one row per sentence).
result.toarray()

array([[0.       , 0.5      , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.5      , 0.       , 0.       , 0.       , 0.       ,
        0.5      , 0.5      , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 1.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.       , 0.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 1.       , 0.       ,
        0.       , 0.       , 0.       , 0.       , 0.       , 0.       ],
       [0.       , 0.       , 0.       , 0.4472136, 0.       , 0.       ,
        0.4472136, 0.4472136, 0.       , 0.       , 0.       , 0.4472136,
        0.       , 0.       , 0.       , 0.       , 0.4472136, 0.       ]])

In [12]:
# Keep the dense array so individual sentence rows can be indexed below.
result_arr = result.toarray()

### 문장끼리의 유사도 측정

In [13]:
from numpy import dot 
from numpy.linalg import norm 
import numpy as np 

def cos_sim(A, B):
    """Return the cosine similarity of two vectors.

    Args:
        A: First vector (array-like; intended for 1-D input).
        B: Second vector, same length as ``A``.

    Returns:
        ``dot(A, B) / (norm(A) * norm(B))``. When either vector has zero
        norm the cosine is undefined; 0.0 is returned instead of the
        ``nan`` (and RuntimeWarning) that dividing by zero would produce.
    """
    denominator = norm(A) * norm(B)
    if denominator == 0:
        # A sentence with no in-vocabulary terms yields an all-zero row;
        # treat it as "no similarity" rather than propagating nan.
        return 0.0
    return dot(A, B) / denominator

In [14]:
# Cosine similarity of the first sentence against each of the other three,
# printed one value per line.
for other_idx in (1, 2, 3):
    print(cos_sim(result_arr[0], result_arr[other_idx]))

0.0
0.0
0.223606797749979
