## Libraries

In [1]:
import pandas as pd
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## Data

In [2]:
# Toy corpus for Method 1: five short sentences that all mention "mouse".
dataset = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the house mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]
print(dataset)

['the house had a tiny little mouse', 'the cat saw the mouse', 'the house mouse ran away from the house', 'the cat finally ate the mouse', 'the end of the mouse story']


## Method 1: TF-IDF vectors + cosine similarity against the last document

In [25]:
# Fit a TF-IDF vectorizer on the toy corpus, dropping scikit-learn's built-in
# English stop words, and keep the resulting document-term matrix in `X`.
tfidf_model = TfidfVectorizer(stop_words="english")
X = tfidf_model.fit_transform(dataset)
# Densify only for display; `X` itself is left as returned by fit_transform.
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.4755751 , 0.58946308, 0.28088232, 0.        , 0.        ,
        0.        , 0.58946308],
       [0.        , 0.        , 0.58873218, 0.        , 0.        ,
        0.        , 0.        , 0.34771471, 0.        , 0.72971837,
        0.        , 0.        ],
       [0.        , 0.45498177, 0.        , 0.        , 0.        ,
        0.73415285, 0.        , 0.21680125, 0.45498177, 0.        ,
        0.        , 0.        ],
       [0.58946308, 0.        , 0.4755751 , 0.        , 0.58946308,
        0.        , 0.        , 0.28088232, 0.        , 0.        ,
        0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.67009179, 0.        ,
        0.        , 0.        , 0.31930233, 0.        , 0.        ,
        0.67009179, 0.        ]])

In [26]:
# Cosine similarity of the last document against every document (itself included).
# NOTE: `X` is reassigned to a dense count matrix further down (Method 2);
# re-run the TF-IDF cell first if cells were executed out of order.
last_doc_similarities = cosine_similarity(X[-1], X)
print(last_doc_similarities)

[[0.08968638 0.11102612 0.06922514 0.08968638 1.        ]]


## Method 2: Count vectors + pairwise cosine similarity

In [5]:
# Toy corpus for Method 2: five short phrases, four of which contain "hot".
corpus = [
    "The weather is hot under the sun",
    "I make my hot chocolate with milk",
    "One hot encoding",
    "I will have a chai latte with milk",
    "There is a hot sale today",
]

In [13]:
# Document-term matrix of raw token counts (English stop words removed).
cv = CountVectorizer(stop_words="english")
doc_term_sparse = cv.fit_transform(corpus)
# Dense array with one row per phrase in `corpus`, one column per kept token.
X = doc_term_sparse.toarray()
X

array([[0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1],
       [0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0]], dtype=int64)

In [14]:
# Label the count matrix columns with the vocabulary learned by `cv`.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so prefer get_feature_names_out() and only fall back to the
# old method on releases that predate it.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
dt = pd.DataFrame(X, columns=feature_names)
dt

Unnamed: 0,chai,chocolate,encoding,hot,latte,make,milk,sale,sun,today,weather
0,0,0,0,1,0,0,0,0,1,0,1
1,0,1,0,1,0,1,1,0,0,0,0
2,0,0,1,1,0,0,0,0,0,0,0
3,1,0,0,0,1,0,1,0,0,0,0
4,0,0,0,1,0,0,0,1,0,1,0


In [15]:
# Every unordered pair of document indices: n choose 2, i.e. 10 pairs for 5 phrases.
n_docs = len(corpus)
pairs = list(combinations(range(n_docs), 2))
pairs

[(0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (1, 2),
 (1, 3),
 (1, 4),
 (2, 3),
 (2, 4),
 (3, 4)]

In [16]:
# Look up the two phrases behind each index pair, in the same order as `pairs`.
combos = [(corpus[i], corpus[j]) for i, j in pairs]
combos

[('The weather is hot under the sun', 'I make my hot chocolate with milk'),
 ('The weather is hot under the sun', 'One hot encoding'),
 ('The weather is hot under the sun', 'I will have a chai latte with milk'),
 ('The weather is hot under the sun', 'There is a hot sale today'),
 ('I make my hot chocolate with milk', 'One hot encoding'),
 ('I make my hot chocolate with milk', 'I will have a chai latte with milk'),
 ('I make my hot chocolate with milk', 'There is a hot sale today'),
 ('One hot encoding', 'I will have a chai latte with milk'),
 ('One hot encoding', 'There is a hot sale today'),
 ('I will have a chai latte with milk', 'There is a hot sale today')]

In [21]:
# Cosine similarity for each index pair, in the same order as `pairs`.
# Each entry is a 1x1 array because cosine_similarity is given two single-row
# matrices; the ranking happens in the next cell.
results = [
    cosine_similarity(X[i].reshape(1, -1), X[j].reshape(1, -1))
    for i, j in pairs
]
results

[array([[0.28867513]]),
 array([[0.40824829]]),
 array([[0.]]),
 array([[0.33333333]]),
 array([[0.35355339]]),
 array([[0.28867513]]),
 array([[0.28867513]]),
 array([[0.]]),
 array([[0.40824829]]),
 array([[0.]])]

In [22]:
# Rank the phrase pairs from most to least similar. Tuples compare element by
# element, so pairs with equal scores are ordered by their phrase tuples.
scored_pairs = zip(results, combos)
sorted(scored_pairs, reverse=True)

[(array([[0.40824829]]),
  ('The weather is hot under the sun', 'One hot encoding')),
 (array([[0.40824829]]), ('One hot encoding', 'There is a hot sale today')),
 (array([[0.35355339]]),
  ('I make my hot chocolate with milk', 'One hot encoding')),
 (array([[0.33333333]]),
  ('The weather is hot under the sun', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('The weather is hot under the sun', 'I make my hot chocolate with milk')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'There is a hot sale today')),
 (array([[0.28867513]]),
  ('I make my hot chocolate with milk', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('The weather is hot under the sun', 'I will have a chai latte with milk')),
 (array([[0.]]), ('One hot encoding', 'I will have a chai latte with milk')),
 (array([[0.]]),
  ('I will have a chai latte with milk', 'There is a hot sale today'))]