In [5]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the dataset
data = pd.read_csv('../select_db/data_mapping_filtered.csv', encoding='ISO-8859-1')

# Handle potential NaN values
data['tag_description'] = data['tag_description'].fillna('')

# Define the specific phrase
phrase1 = "ME1 Turbo Charger 1 RPM"

# Initialize the vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the text data
# The phrase is included in the fit to ensure vocabulary is built with it
tfidf_matrix = vectorizer.fit_transform([phrase1] + data['tag_description'].tolist())

# Compute cosine similarity between the phrase and all tag_descriptions
# The first vector (0th index) is the phrase1, the rest are tag_descriptions
similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

# Finding the index of the maximum similarity score
max_index = similarity_scores.argmax()

# Access the tag_description with the highest cosine similarity
most_similar_description = data.iloc[max_index]['tag_description']
max_similarity_score = similarity_scores[0, max_index]

print(f"The tag_description with the highest cosine similarity to '{phrase1}' is: '{most_similar_description}' with a score of {max_similarity_score:.4f}")


The tag_description with the highest cosine similarity to 'ME T/C 1 RPM' is: 'ME T/C RPM' with a score of 1.0000
