In [16]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.spatial.distance import jaccard
import textdistance

# Define the phrases
phrase1 = "M/E T/C RPM"
phrase2 = "ME1 Turbo Charger 1 RPM"

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the phrases
tfidf_matrix = vectorizer.fit_transform([phrase1, phrase2])

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Compute Euclidean distance
euclidean_dist = euclidean_distances(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Compute Manhattan distance
manhattan_dist = manhattan_distances(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Compute Jaccard similarity
# Note: Jaccard needs boolean values, here we use a simple approach by using non-zero as True
jaccard_sim = 1 - jaccard(tfidf_matrix[0].toarray().astype(bool)[0], tfidf_matrix[1].toarray().astype(bool)[0])

# Compute Levenshtein distance
levenshtein_dist = textdistance.levenshtein.normalized_similarity(phrase1, phrase2)

print(f"Cosine similarity: {cosine_sim:.4f}")
print(f"Euclidean distance: {euclidean_dist:.4f}")
print(f"Manhattan distance: {manhattan_dist:.4f}")
print(f"Jaccard similarity: {jaccard_sim:.4f}")
print(f"Levenshtein similarity: {levenshtein_dist:.4f}")


Cosine similarity: 0.6694
Euclidean distance: 0.8131
Manhattan distance: 1.1499
Jaccard similarity: 0.6667
Levenshtein similarity: 0.7222
