In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances
from scipy.spatial.distance import cityblock
import pandas as pd
import numpy as np
import math

# Toy corpus: three short sentences, kept both as a list (for vectorizing)
# and under individual names (for referring to a single sentence).
data_given = [
    "data science is one of the most important courses in computer science",
    "this is one of the best data science courses",
    "the data scientists perform data analysis",
]
sentence1, sentence2, sentence3 = data_given

In [3]:
# Bag-of-words: fit a CountVectorizer on the corpus and keep the resulting
# document-term matrix of raw term counts.
vectorizer = CountVectorizer()
matrix = vectorizer.fit_transform(data_given)

# Dense copy of the counts, plus the vocabulary used as column labels.
bow_array = matrix.toarray()
terms = vectorizer.get_feature_names_out()

# One row per sentence, one column per vocabulary term.
dataframe = pd.DataFrame(matrix.toarray(), columns=terms)
dataframe

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0,0,1,1,1,1,1,1,1,1,1,0,2,0,1,0
1,0,1,0,1,1,0,0,1,0,1,1,0,1,0,1,1
2,1,0,0,0,2,0,0,0,0,0,0,1,0,1,1,0


In [4]:
# Term frequency: each count divided by the total number of counted terms
# in its own sentence (row-wise normalisation).
terms_in_a_row = dataframe.sum(axis="columns")
tf_values = dataframe.div(terms_in_a_row, axis="index")
tf_values

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0.0,0.0,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.0,0.166667,0.0,0.083333,0.0
1,0.0,0.111111,0.0,0.111111,0.111111,0.0,0.0,0.111111,0.0,0.111111,0.111111,0.0,0.111111,0.0,0.111111,0.111111
2,0.166667,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.166667,0.0,0.166667,0.166667,0.0


In [5]:
# Inverse document frequency: log10(N / df), where N is the number of
# sentences and df is how many sentences contain the term at least once.
no_of_documents = len(data_given)
document_frequencies = dataframe.gt(0).sum(axis="index")
idf_values = np.log10(no_of_documents / document_frequencies)

# Single-column table of the IDF weights, indexed by term, for display.
idf_dataframe = pd.DataFrame({'idf': idf_values})
idf_dataframe

Unnamed: 0,idf
analysis,0.477121
best,0.477121
computer,0.477121
courses,0.176091
data,0.0
important,0.477121
in,0.477121
is,0.176091
most,0.477121
of,0.176091


In [6]:
# TF-IDF: scale every term-frequency column by that term's IDF weight
# (the IDF series is aligned to the columns by term name).
tfidf_values = tf_values.multiply(idf_values, axis="columns")
tfidf_dataframe = pd.DataFrame(tfidf_values)
tfidf_dataframe

Unnamed: 0,analysis,best,computer,courses,data,important,in,is,most,of,one,perform,science,scientists,the,this
0,0.0,0.0,0.03976,0.014674,0.0,0.03976,0.03976,0.014674,0.03976,0.014674,0.014674,0.0,0.029349,0.0,0.0,0.0
1,0.0,0.053013,0.0,0.019566,0.0,0.0,0.0,0.019566,0.0,0.019566,0.019566,0.0,0.019566,0.0,0.0,0.053013
2,0.07952,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07952,0.0,0.07952,0.0,0.0


In [7]:
# Pairwise cosine similarity between the sentences. Note that this is
# computed on `matrix`, the raw term-count matrix, not on the TF-IDF weights.
cosine_similarity_matrix = cosine_similarity(matrix)
cosine_similarity_dataframe = pd.DataFrame(cosine_similarity_matrix)
print(cosine_similarity_matrix)

[[1.         0.71269665 0.28347335]
 [0.71269665 1.         0.35355339]
 [0.28347335 0.35355339 1.        ]]


In [8]:
# Manhattan (city-block) distance between the TF-IDF rows of sentences 1 and 2.
print(
    "Manhattan distance\nBetween S1 & S2: ",
    cityblock(tfidf_dataframe.iloc[0], tfidf_dataframe.iloc[1]),
)

Manhattan distance
Between S1 & S2:  0.29441590690909264


In [9]:
# Manhattan (city-block) distance between the TF-IDF rows of sentences 2 and 3.
print(
    "Manhattan distance\nBetween S2 & S3: ",
    cityblock(tfidf_dataframe.iloc[1], tfidf_dataframe.iloc[2]),
)

Manhattan distance
Between S2 & S3:  0.44241605010624574


In [10]:
# Manhattan (city-block) distance between the TF-IDF rows of sentences 1 and 3.
print(
    "Manhattan distance\nBetween S1 & S3: ",
    cityblock(tfidf_dataframe.iloc[0], tfidf_dataframe.iloc[2]),
)

Manhattan distance
Between S1 & S3:  0.4856466751275593


In [11]:
# Euclidean distance between the TF-IDF rows of sentences 1 and 2.
print(
    "Euclidean distance\nBetween S1 & S2: ",
    math.dist(tfidf_dataframe.iloc[0], tfidf_dataframe.iloc[1]),
)

Euclidean distance
Between S1 & S2:  0.1101622823975075


In [12]:
# Euclidean distance between the TF-IDF rows of sentences 2 and 3.
print(
    "Euclidean distance\nBetween S2 & S3: ",
    math.dist(tfidf_dataframe.iloc[1], tfidf_dataframe.iloc[2]),
)

Euclidean distance
Between S2 & S3:  0.16280457534445458


In [13]:
# Euclidean distance between the TF-IDF rows of sentences 1 and 3.
# The label previously read "S1 & S2" even though rows 0 and 2 are compared.
print(
    "Euclidean distance\nBetween S1 & S3: ",
    math.dist(tfidf_dataframe.iloc[0], tfidf_dataframe.iloc[2]),
)

Euclidean distance
Between S1 & S2:  0.16436705450612407
