In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [11]:
# Load the raw best-sellers export; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="Amazon Best Sellers Data.csv")

In [3]:
# Clean the price column: strip every character that is not a digit or a dot,
# then parse what is left as a number. Anything that still fails to parse
# becomes NaN (errors='coerce').
# NOTE(review): commas are removed, not converted, so a comma used as a decimal
# separator would be misread — confirm the price format in the CSV.
df['product_price_clean'] = pd.to_numeric(
    df['product_price'].str.replace(r'[^\d\.]', '', regex=True),
    errors='coerce',
)

In [5]:
# The three numeric columns used as the similarity feature space.
feature_columns = ['product_price_clean', 'product_star_rating', 'product_num_ratings']

# Keep only rows where every feature is present. dropna keeps the original
# index labels, so df_clean's labels are not guaranteed to equal row positions.
df_clean = df.dropna(subset=feature_columns)
features = df_clean[feature_columns]

# Standardise each feature (zero mean, unit variance) so that no single column
# dominates the similarity purely because of its scale.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

In [6]:
# Pairwise cosine similarity between every pair of rows of scaled_features.
# Entry [i, j] compares row position i with row position j.
similarity_matrix = cosine_similarity(X=scaled_features)

In [7]:
# Titles of the products to find neighbours for. They are matched by exact
# string equality against the 'product_title' column, so they must be spelled
# exactly as stored (two of them appear to be cut off mid-word).
query_titles = [
    (
        "TurboTax Deluxe 2024 Tax Software, "
        "Federal & State Tax Return [PC/MAC Download]"
    ),
    (
        "Microsoft Office Home 2024 | Classic Apps: Word, Excel, PowerPoint | "
        "One-Time Purchase for 1 PC/MAC | Instant Download | Form"
    ),
    (
        "Norton 360 Premium 2025, Antivirus software for 10 Devices with Auto-Renewal "
        "– Includes Advanced AI Scam Protection, VPN, Dar"
    ),
]

In [8]:
# Resolve each query title to the *row position* of its first match in
# df_clean.
#
# Positions (not index labels) are required because the values are used
# to index similarity_matrix and df_clean.iloc, both of which are positional.
# df_clean comes from dropna, which keeps the original index labels, so
# `.index[0]` would return a label that can differ from the row position
# whenever any row was dropped.
#
# Raises IndexError (as the original lookup did) when a title has no exact
# match, but with a message naming the missing title.
query_indices = []
for title in query_titles:
    match_positions = np.flatnonzero((df_clean['product_title'] == title).to_numpy())
    if match_positions.size == 0:
        raise IndexError(f"Query title not found in df_clean['product_title']: {title!r}")
    query_indices.append(int(match_positions[0]))

In [10]:
# Number of neighbours to report per query product.
top_k = 10
# Maps each query title to a DataFrame of its top_k most similar rows.
results = {}

# Columns shown for each neighbour.
result_columns = [
    'product_title', 'product_price_clean', 'product_star_rating', 'product_num_ratings'
]

for idx, title in zip(query_indices, query_titles):
    # idx is used as a row position: it selects a row of similarity_matrix and
    # is compared with the positions later passed to df_clean.iloc.
    similarities = similarity_matrix[idx]

    # Rank every row from most to least similar.
    ranked = np.argsort(similarities)[::-1]

    # Drop the query row explicitly instead of assuming it ranks first: ties
    # with identical rows, float round-off, or a zero-norm feature vector can
    # all put a different row at rank 0, in which case slicing off the first
    # entry would discard a real neighbour and keep the query itself.
    # NOTE(review): other rows carrying the same title (duplicate listings)
    # are still returned — de-duplicate upstream if that is not wanted.
    similar_indices = ranked[ranked != idx][:top_k]

    results[title] = (
        df_clean.iloc[similar_indices][result_columns]
        .rename(columns={'product_price_clean': 'product_price'})
        .reset_index(drop=True)
    )

# Print one plain-text table per query.
for query, similar_df in results.items():
    print("\n" + "="*100)
    print(f"Top {top_k} Most Similar Products to:\n👉 {query}")
    print("="*100)
    print(similar_df.to_string(index=False))


Top 10 Most Similar Products to:
👉 TurboTax Deluxe 2024 Tax Software, Federal & State Tax Return [PC/MAC Download]
                                                                                                                product_title  product_price  product_star_rating  product_num_ratings
                                              TurboTax Deluxe 2024 Tax Software, Federal & State Tax Return [PC/MAC Download]          55.99                  4.2               6511.0
Norton 360 Premium 2025, Antivirus software for 10 Devices with Auto Renewal - Includes VPN, PC Cloud Backup & Dark Web Monit          29.99                  4.2               6625.0
Norton 360 Premium 2025, Antivirus software for 10 Devices with Auto Renewal - Includes VPN, PC Cloud Backup & Dark Web Monit          29.99                  4.2               6625.0
Norton 360 Premium 2021 – Antivirus software for 10 Devices with Auto Renewal - Includes VPN, PC Cloud Backup & Dark Web Moni          44.13            