In [66]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy.spatial import distance
from scipy.special import rel_entr
from scipy.spatial.distance import pdist, squareform

In [4]:
# Two short sample documents; they are vectorized into a bag-of-words matrix below.
doc1 = "I love machine learning and data science"
doc2 = "Machine learning is my passion"

In [5]:
# Create the bag of words: fit a CountVectorizer on the two sample documents
# and keep the resulting term-count matrix (one row per document) in `bow`.
# `v` is kept so the learned vocabulary can be inspected afterwards.
v = CountVectorizer()
bow = v.fit_transform([doc1, doc2])

In [8]:
# Show the term counts as a dense array (rows = documents, columns = vocabulary terms).
bow.toarray()

array([[1, 1, 0, 1, 1, 1, 0, 0, 1],
       [0, 0, 1, 1, 0, 1, 1, 1, 0]], dtype=int64)

In [10]:
# Vocabulary learned by the vectorizer; entry i names column i of the count matrix.
v.get_feature_names_out()

array(['and', 'data', 'is', 'learning', 'love', 'machine', 'my',
       'passion', 'science'], dtype=object)

In [27]:
#Set theory based Similarity Measure (STB-SM)
def stb_sm(bow):
    """Set-theory-based similarity measure (STB-SM) between two count vectors.

    With ``d1 = bow[0]`` and ``d2 = bow[1]`` the score is::

        X = sum(d1 over shared terms)  * sum(d2 over shared terms)
        Y = sum(d1 over terms only in d1) * sum(d2 over terms only in d2)
        Z = sum(d1) * sum(d2)
        score = (X / Z) * (1 - Y / Z)

    where a term is "present" in a document when its count is non-zero.

    Parameters
    ----------
    bow : array-like or scipy sparse matrix
        At least two rows of equal length; only rows 0 and 1 are used.

    Returns
    -------
    float
        The STB-SM score. ``0.0`` is returned when ``Z`` is zero (for example
        when either document is empty or all-zero), because the measure is
        undefined there and the documents share no weighted terms.

    Raises
    ------
    ValueError
        If the two rows do not have the same number of entries.
    """
    # CountVectorizer output is sparse; densify so each row is a plain 1-D vector.
    if hasattr(bow, "toarray"):
        bow = bow.toarray()

    doc1 = np.asarray(bow[0], dtype=float).ravel()
    doc2 = np.asarray(bow[1], dtype=float).ravel()
    if doc1.shape != doc2.shape:
        raise ValueError(
            "stb_sm expects two vectors of equal length, got "
            f"{doc1.size} and {doc2.size}"
        )

    in_doc1 = doc1 != 0
    in_doc2 = doc2 != 0
    shared = in_doc1 & in_doc2

    # Sub-values: weights of shared terms (x), exclusive terms (y), all terms (z).
    x1, x2 = doc1[shared].sum(), doc2[shared].sum()
    y1 = doc1[in_doc1 & ~in_doc2].sum()
    y2 = doc2[in_doc2 & ~in_doc1].sum()
    z1, z2 = doc1.sum(), doc2.sum()

    # Final parameters.
    X = x1 * x2
    Y = y1 * y2
    Z = z1 * z2
    if Z == 0:
        # Avoid a division by zero for empty / all-zero documents.
        return 0.0
    return float((X / Z) * (1 - Y / Z))

In [29]:
# Hand-made pair of count vectors used as a small worked example for the measures below.
test_1 = [[2,5,7,8,0,9],[9,0,0,6,5,1]]
stb_sm(test_1)

0.4239348184643264

In [33]:
# Cosine similarity of the same two vectors, for comparison with the STB-SM value above.
cosine_similarity([test_1[0]], [test_1[1]])

array([[0.4199918]])

In [67]:
def get_similarities(bow):
    """Print several similarity / distance measures for the rows of ``bow``.

    Cosine, Jaccard, Euclidean, Manhattan, Bhattacharyya and Kullback-Leibler
    values are computed between row 0 and row 1; the final pairwise Euclidean
    distance matrix covers every row of ``bow``.

    Parameters
    ----------
    bow : array-like or scipy sparse matrix
        Count vectors, one row per document, all of the same length.

    Returns
    -------
    None
        Every result is written to stdout.
    """
    # CountVectorizer output is sparse; densify so each row is a plain 1-D vector.
    if hasattr(bow, "toarray"):
        bow = bow.toarray()
    vectors = np.asarray(bow)
    vec_1, vec_2 = vectors[0], vectors[1]

    # Cosine similarity: dot product of the two vectors divided by the product
    # of their magnitudes.
    print("Cosine :", cosine_similarity([vec_1], [vec_2])[0])

    # Jaccard similarity on term presence: non-zero counts become 1, zeros stay 0.
    presence_1 = (vec_1 > 0).astype(int)
    presence_2 = (vec_2 > 0).astype(int)
    print("Jaccard :", jaccard_score(presence_1, presence_2))

    # Euclidean distance
    print("Euclidean :", distance.euclidean(vec_1, vec_2))

    # Manhattan distance
    print("Manhattan :", distance.cityblock(vec_1, vec_2))

    # The Bhattacharyya distance compares two probability distributions, so the
    # count vectors are smoothed with a small epsilon (keeps every entry
    # positive) and rescaled to sum to 1. The rescaling is done inline so this
    # function does not depend on a helper defined in another cell.
    epsilon = 1e-10
    smoothed_1 = vec_1 + epsilon
    smoothed_2 = vec_2 + epsilon
    vec_1_prob = smoothed_1 / np.sum(smoothed_1)
    vec_2_prob = smoothed_2 / np.sum(smoothed_2)
    bhattacharyya_distance = -np.log(np.sum(np.sqrt(vec_1_prob * vec_2_prob)))
    print("Bhattacharyya :", bhattacharyya_distance)

    # KL divergence is not symmetric and assumes both inputs are non-negative
    # and normalized; it is computed here as KL(row 0 || row 1).
    kl_div = np.sum(rel_entr(vec_1_prob, vec_2_prob))
    print("Kullback-Leibler Divergence:", kl_div)

    # Pairwise Euclidean distances between all rows, as a square matrix.
    pds_matrix = squareform(pdist(vectors, metric='euclidean'))
    print("Pairwise Distance Similarity Matrix (Euclidean):\n", pds_matrix)

In [68]:
# Print every measure for the two hand-made count vectors in test_1.
get_similarities(test_1)

Cosine : [0.4199918]
Jaccard : 0.5
Euclidean : 14.696938456699069
Manhattan : 34
Bhattacharyya : 0.5880631896474189
Kullback-Leibler Divergence: 9.837860870113113
Pairwise Distance Similarity Matrix (Euclidean):
 [[ 0.         14.69693846]
 [14.69693846  0.        ]]


In [52]:
def normalize(vec):
    """Rescale ``vec`` so that its entries sum to 1.

    Parameters
    ----------
    vec : array-like of numbers
        Values to rescale (for example smoothed term counts).

    Returns
    -------
    numpy.ndarray
        ``vec`` divided by the sum of all its entries.

    Raises
    ------
    ValueError
        If the entries sum to zero; dividing would otherwise yield NaN/inf
        values that silently corrupt any downstream distance computation.
    """
    values = np.asarray(vec, dtype=float)
    total = values.sum()
    if total == 0:
        raise ValueError("cannot normalize a vector whose entries sum to zero")
    return values / total