In [None]:
# 10 December 2023
# CSC461 – Assignment4 – NLP
# Komal Khizar
# FA20-BSE-096
# Q2. [CLO-2]
# This task involves calculating the similarity between three sentences (S1, S2, S3)
# using three different measures: cosine similarity, Manhattan distance, and Euclidean distance.
# These metrics will be computed on the vectorized representations of the sentences,
#  providing a quantitative measure of how similar or different the sentences
#  are from each other in terms of their word usage and structure.
# Each metric offers a different perspective on similarity, with cosine focusing on
#  the angle between vectors, and Manhattan and Euclidean measuring direct distances in the vector space.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, manhattan_distances, euclidean_distances


In [None]:
# The three input sentences that are compared pairwise in the cells below
S1, S2, S3 = (
    "data science is one of the most important courses in computer science",
    "this is one of the best data science courses",
    "the data scientists perform data analysis",
)

In [None]:
# Vectorize the sentences: CountVectorizer learns a vocabulary from the three
# sentences and fit_transform returns their token-count matrix
# (one row per sentence, one column per vocabulary term).
vectorizer = CountVectorizer()
X = vectorizer.fit_transform([S1, S2, S3])

In [None]:
# Convert the count matrix returned by fit_transform into a dense NumPy array;
# the pairwise distance calculations in the following cells all read X_dense.
X_dense = X.toarray()

In [None]:
# Compute cosine similarity between every pair of rows (sentences) of X_dense.
# Called with a single matrix, the result is a square 3x3 matrix whose entry
# [i, j] compares sentence i with sentence j (1.0 on the diagonal).
cosine_sim = cosine_similarity(X_dense)

In [None]:
# Compute Manhattan (L1) distance between every pair of rows (sentences) of X_dense.
# Entry [i, j] of the 3x3 result is the distance between sentence i and sentence j
# (0 on the diagonal).
manhattan_dist = manhattan_distances(X_dense)

In [None]:
# Compute Euclidean (L2) distance between every pair of rows (sentences) of X_dense.
# Entry [i, j] of the 3x3 result is the distance between sentence i and sentence j
# (0 on the diagonal).
euclidean_dist = euclidean_distances(X_dense)

In [None]:
# Print each pairwise matrix under its heading, in the order
# cosine similarity, Manhattan distance, Euclidean distance.
for heading, matrix in (
    ("Cosine Similarity:\n", cosine_sim),
    ("\nManhattan Distance:\n", manhattan_dist),
    ("\nEuclidean Distance:\n", euclidean_dist),
):
    print(heading, matrix)

Cosine Similarity:
 [[1.         0.71269665 0.28347335]
 [0.71269665 1.         0.35355339]
 [0.28347335 0.35355339 1.        ]]

Manhattan Distance:
 [[ 0.  7. 14.]
 [ 7.  0. 11.]
 [14. 11.  0.]]

Euclidean Distance:
 [[0.         2.64575131 4.        ]
 [2.64575131 0.         3.31662479]
 [4.         3.31662479 0.        ]]


In [1]:
import numpy as np

# TF (term-frequency) values for each sentence: one row per sentence, in the
# order S1, S2, S3, and one column per vocabulary term (16 terms).
# The entries are consistent with count(term) / number of words in the sentence
# (1/12 per occurrence in S1, 1/9 in S2, 1/6 in S3), with the columns in
# alphabetical term order: analysis, best, computer, courses, data, important,
# in, is, most, of, one, perform, science, scientists, the, this.
tf_values = np.array([
    [0.000000, 0.000000, 0.083333, 0.083333, 0.083333, 0.083333, 0.083333, 0.083333, 0.083333, 0.083333, 0.083333, 0.000000, 0.166667, 0.000000, 0.083333, 0.000000],
    [0.000000, 0.111111, 0.000000, 0.111111, 0.111111, 0.000000, 0.000000, 0.111111, 0.000000, 0.111111, 0.111111, 0.000000, 0.111111, 0.000000, 0.111111, 0.111111],
    [0.166667, 0.000000, 0.000000, 0.000000, 0.333333, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.166667, 0.000000, 0.166667, 0.166667, 0.000000]
])

# Redefine the cosine similarity function for a single pair of vectors.
# NOTE: this name shadows the cosine_similarity imported from
# sklearn.metrics.pairwise in the first cell; once this cell has run, the
# matrix-style call cosine_similarity(X_dense) fails until sklearn's function
# is imported again.
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vector1: First vector (NumPy array or other array-like of numbers).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        ``dot(vector1, vector2) / (||vector1|| * ||vector2||)``, or ``0.0``
        when either vector has zero magnitude (the cosine is undefined there;
        0.0 avoids a division by zero and matches what sklearn's
        ``cosine_similarity`` reports for all-zero rows).
    """
    dot_product = np.dot(vector1, vector2)
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    # Guard against 0/0: an all-zero vector has no direction to compare.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)

# Cosine similarity for each sentence pair, read from the rows of tf_values
# (row 0 -> S1, row 1 -> S2, row 2 -> S3)
cosine_similarity_s1_s2, cosine_similarity_s1_s3, cosine_similarity_s2_s3 = (
    cosine_similarity(tf_values[first], tf_values[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

# Show the three scores as the cell's result
(cosine_similarity_s1_s2, cosine_similarity_s1_s3, cosine_similarity_s2_s3)



(0.7126964923761216, 0.2834727270637635, 0.35355321381626914)

In [2]:
# Function to calculate Manhattan distance
def manhattan_distance(vector1, vector2):
    """Return the Manhattan (L1) distance between two vectors.

    Args:
        vector1: First vector (NumPy array, list, or tuple of numbers).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The sum of the absolute element-wise differences.
    """
    # np.asarray lets plain lists/tuples be subtracted element-wise; an input
    # that is already an ndarray is passed through unchanged.
    difference = np.asarray(vector1) - np.asarray(vector2)
    return np.sum(np.abs(difference))

# Manhattan distance for each sentence pair, read from the rows of tf_values
# (row 0 -> S1, row 1 -> S2, row 2 -> S3)
manhattan_distance_s1_s2, manhattan_distance_s1_s3, manhattan_distance_s2_s3 = (
    manhattan_distance(tf_values[first], tf_values[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

# Show the three distances as the cell's result
(manhattan_distance_s1_s2, manhattan_distance_s1_s3, manhattan_distance_s2_s3)



(0.777778, 1.666666, 1.5555560000000002)

In [3]:
# Function to calculate Euclidean distance
def euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vector1: First vector (NumPy array, list, or tuple of numbers).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The L2 norm of the element-wise difference.
    """
    # np.asarray lets plain lists/tuples be subtracted element-wise; an input
    # that is already an ndarray is passed through unchanged.
    difference = np.asarray(vector1) - np.asarray(vector2)
    return np.linalg.norm(difference)

# Euclidean distance for each sentence pair, read from the rows of tf_values
# (row 0 -> S1, row 1 -> S2, row 2 -> S3)
euclidean_distance_s1_s2, euclidean_distance_s1_s3, euclidean_distance_s2_s3 = (
    euclidean_distance(tf_values[first], tf_values[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

# Show the three distances as the cell's result
(euclidean_distance_s1_s2, euclidean_distance_s1_s3, euclidean_distance_s2_s3)



(0.24532638879256347, 0.485912772237981, 0.47140463864285426)

In [8]:
# TF-IDF values for each sentence
tfidf_values = np.array([
    # NOTE: these are TF-IDF weights, not raw TF. One row per sentence (S1, S2, S3),
    # 16 columns (one per vocabulary term). The non-zero entries are consistent with
    # TF * log10(3 / df), where df is the number of sentences containing the term;
    # terms present in all three sentences therefore get weight 0.
    [0, 0, 0.03976, 0.014674, 0, 0.03976, 0.03976, 0.014674, 0.03976, 0.014674, 0.014674, 0, 0.029349, 0, 0, 0],
    [0, 0.053013, 0, 0.019566, 0, 0, 0, 0.019566, 0, 0.019566, 0.019566, 0, 0.019566, 0, 0, 0.053013],
    [0.07952, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.07952, 0, 0.07952, 0, 0],
])
# Redefine the cosine similarity function for a single pair of vectors.
# NOTE: cosine_similarity is also defined in the earlier TF cell (and imported
# from sklearn.metrics.pairwise in the first cell); the definition below is the
# one in effect from this cell onwards.
def cosine_similarity(vector1, vector2):
    """Return the cosine of the angle between two 1-D vectors.

    Args:
        vector1: First vector (NumPy array or other array-like of numbers).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        ``dot(vector1, vector2) / (||vector1|| * ||vector2||)``, or ``0.0``
        when either vector has zero magnitude (the cosine is undefined there;
        0.0 avoids a division by zero and matches what sklearn's
        ``cosine_similarity`` reports for all-zero rows).
    """
    dot_product = np.dot(vector1, vector2)
    magnitude1 = np.linalg.norm(vector1)
    magnitude2 = np.linalg.norm(vector2)
    # Guard against 0/0: an all-zero vector has no direction to compare.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0
    return dot_product / (magnitude1 * magnitude2)

# Cosine similarity for each sentence pair, read from the rows of tfidf_values
# (row 0 -> S1, row 1 -> S2, row 2 -> S3)
cosine_similarity_s1_s2, cosine_similarity_s1_s3, cosine_similarity_s2_s3 = (
    cosine_similarity(tfidf_values[first], tfidf_values[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

# Show the three scores as the cell's result
(cosine_similarity_s1_s2, cosine_similarity_s1_s3, cosine_similarity_s2_s3)

(0.22124596584079695, 0.0, 0.0)

In [9]:
# Function to calculate Manhattan distance
# (manhattan_distance is also defined in the earlier TF cell; this definition
# is the one in effect from this cell onwards.)
def manhattan_distance(vector1, vector2):
    """Return the Manhattan (L1) distance between two vectors.

    Args:
        vector1: First vector (NumPy array, list, or tuple of numbers).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The sum of the absolute element-wise differences.
    """
    # np.asarray lets plain lists/tuples be subtracted element-wise; an input
    # that is already an ndarray is passed through unchanged.
    difference = np.asarray(vector1) - np.asarray(vector2)
    return np.sum(np.abs(difference))

# Manhattan distance for each sentence pair, read from the rows of tfidf_values
# (row 0 -> S1, row 1 -> S2, row 2 -> S3)
manhattan_distance_s1_s2, manhattan_distance_s1_s3, manhattan_distance_s2_s3 = (
    manhattan_distance(tfidf_values[first], tfidf_values[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

# Show the three distances as the cell's result
(manhattan_distance_s1_s2, manhattan_distance_s1_s3, manhattan_distance_s2_s3)



(0.29441700000000004, 0.485645, 0.442416)

In [10]:
# Function to calculate Euclidean distance
# (euclidean_distance is also defined in the earlier TF cell; this definition
# is the one in effect from this cell onwards.)
def euclidean_distance(vector1, vector2):
    """Return the Euclidean (L2) distance between two vectors.

    Args:
        vector1: First vector (NumPy array, list, or tuple of numbers).
        vector2: Second vector, same length as ``vector1``.

    Returns:
        The L2 norm of the element-wise difference.
    """
    # np.asarray lets plain lists/tuples be subtracted element-wise; an input
    # that is already an ndarray is passed through unchanged.
    difference = np.asarray(vector1) - np.asarray(vector2)
    return np.linalg.norm(difference)

# Euclidean distance for each sentence pair, read from the rows of tfidf_values
# (row 0 -> S1, row 1 -> S2, row 2 -> S3)
euclidean_distance_s1_s2, euclidean_distance_s1_s3, euclidean_distance_s2_s3 = (
    euclidean_distance(tfidf_values[first], tfidf_values[second])
    for first, second in ((0, 1), (0, 2), (1, 2))
)

# Show the three distances as the cell's result
(euclidean_distance_s1_s2, euclidean_distance_s1_s3, euclidean_distance_s2_s3)



(0.11016179230114222, 0.16436663440309288, 0.16280414404430865)