In [1]:
!pip install gensim numpy scikit-learn

Defaulting to user installation because normal site-packages is not writeable
Collecting gensim
  Downloading gensim-4.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.5 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.5/26.5 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Installing collected packages: gensim
Successfully installed gensim-4.3.2


In [5]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords, strip_punctuation

In [16]:
# Pre-trained word2vec embeddings (Google News, binary format) used for all lookups below
model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

# Candidate product titles that will be compared against the reference title
product_titles = [
    "Hair Serum - Anti-Dandruff",
    "Shake & Spray Serum",
    "Antibacterial Hand Sanitizer - 72% Alcohol Based Sanitizer",
    "Non-Sticky, Gentle On Hands",
    "Olive Oil - Classic",
]

# Title every candidate is scored against
reference_title = "Garlic Oil - Vegetarian Capsule 500 mg,"

def preprocess(text):
    """ Turn a raw title into a list of tokens.

    The text is lowercased, passed through gensim's ``strip_punctuation`` and
    ``remove_stopwords`` (in that order), and finally split on whitespace.
    """
    lowered = text.lower()
    without_punctuation = strip_punctuation(lowered)
    without_stopwords = remove_stopwords(without_punctuation)
    return without_stopwords.split()

def get_average_vector(words, keyed_vectors=None):
    """ Compute the average word vector for a list of words.

    Parameters
    ----------
    words : iterable of str
        Tokens to look up. Tokens missing from the vocabulary are skipped.
    keyed_vectors : optional
        Embedding lookup to use. It must expose ``key_to_index``,
        ``vector_size`` and indexing by a list of words. When omitted, the
        notebook-level ``model`` is used, so existing one-argument calls
        behave exactly as before.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the known words, or an all-zeros
        vector of length ``vector_size`` when none of the words is known.
    """
    # Resolve the lookup at call time so the global is only touched when needed.
    vectors = model if keyed_vectors is None else keyed_vectors

    known_words = [word for word in words if word in vectors.key_to_index]
    if not known_words:
        # No usable token: fall back to a zero vector of the right length.
        return np.zeros(vectors.vector_size)
    return np.mean(vectors[known_words], axis=0)

# Embed each candidate title (tokenize, then average its word vectors)
title_vectors = np.array(
    [get_average_vector(tokens) for tokens in map(preprocess, product_titles)]
)
# Same treatment for the reference title
reference_vector = get_average_vector(preprocess(reference_title))

# Cosine similarity of the reference (as a single-row matrix) against every candidate
similarities = cosine_similarity(reference_vector.reshape(1, -1), title_vectors)

# Report one score per candidate title
for title, similarity in zip(product_titles, similarities[0, :]):
    print(f"Similarity with '{reference_title}': {title} = {similarity:.3f}")

Similarity with 'Garlic Oil - Vegetarian Capsule 500 mg,': Hair Serum - Anti-Dandruff = 0.431
Similarity with 'Garlic Oil - Vegetarian Capsule 500 mg,': Shake & Spray Serum = 0.388
Similarity with 'Garlic Oil - Vegetarian Capsule 500 mg,': Antibacterial Hand Sanitizer - 72% Alcohol Based Sanitizer = 0.396
Similarity with 'Garlic Oil - Vegetarian Capsule 500 mg,': Non-Sticky, Gentle On Hands = 0.201
Similarity with 'Garlic Oil - Vegetarian Capsule 500 mg,': Olive Oil - Classic = 0.463


In [22]:
# CSV file with the product data, relative to the notebook
PATH = "./data.csv"
# Read the file and drop every row that contains a missing value
data = pd.read_csv(PATH).dropna()

In [43]:
# Tolerances applied around the reference product's rating and original price
rating_ball = 0.05
pricing_ball = 30

# The row with index label 3 is the product all others are compared to
reference_product = data.loc[3]
reference_product_type = reference_product['product_type']
print(reference_product)

product_id                                                                4
product_title             Cereal Flip Lid Container/Storage Jar - Assort...
category                                               Cleaning & Household
product_subcategory                                    Bins & Bathroom Ware
brand                                                                Nakoda
selling_price                                                         149.0
original_price                                                        176.0
product_type                                       Laundry, Storage Baskets
product_rating                                                          3.7
product_description       Multipurpose container with an attractive desi...
availability_status                                                In Stock
customer_reviews_count                                                   49
seasonal_indicator                                                   Spring
promotion_in

In [46]:
# Products other than the reference that share its product type and whose
# rating and original price lie within the (inclusive) tolerance windows
data[
    data['product_id'].ne(reference_product['product_id'])
    & data['product_type'].eq(reference_product_type)
    & data['product_rating'].between(
        reference_product['product_rating'] - rating_ball,
        reference_product['product_rating'] + rating_ball,
    )
    & data['original_price'].between(
        reference_product['original_price'] - pricing_ball,
        reference_product['original_price'] + pricing_ball,
    )
]

Unnamed: 0,product_id,product_title,category,product_subcategory,brand,selling_price,original_price,product_type,product_rating,product_description,availability_status,customer_reviews_count,seasonal_indicator,promotion_indicator,shipping_weight,bundle_indicator,customer_demographics
5832,5833,"Multi Utility Plastic Basket - Small, Black, R...",Cleaning & Household,Bins & Bathroom Ware,Ratan,129.0,169.0,"Laundry, Storage Baskets",3.7,These are multipurpose baskets that are sturdy...,In Stock,64,Winter,No,3.978802,Individual,Female
8493,8494,Kitchen Multiutility Plastic Tray No. 2 - Blue...,Cleaning & Household,Bins & Bathroom Ware,Princeware,119.0,178.0,"Laundry, Storage Baskets",3.7,This is a multipurpose basket which keeps your...,In Stock,84,Spring,No,4.469756,Bundle,Male
19578,19579,Phoenix Fruit & Vegetable Plastic Basket - Ass...,Cleaning & Household,Bins & Bathroom Ware,Aristo,154.0,154.0,"Laundry, Storage Baskets",3.7,Premium Quality BPA-free basket having a smoot...,In Stock,9,Summer,No,3.530815,Individual,Male


In [34]:
# Bare expression: displays whatever `reference_product` currently holds in the kernel.
# NOTE(review): the recorded output below shows product_id 2, while the cell that
# assigns `reference_product = data.loc[3, :]` prints product_id 4. The execution
# counts are out of order, so this output looks stale; re-run top to bottom to confirm.
reference_product

product_id                                                                2
product_title                                         Water Bottle - Orange
category                                             Kitchen, Garden & Pets
product_subcategory                                   Storage & Accessories
brand                                                            Mastercook
selling_price                                                         180.0
original_price                                                        180.0
product_type                                         Water & Fridge Bottles
product_rating                                                          2.3
product_description       Each product is microwave safe (without lid), ...
availability_status                                                In Stock
customer_reviews_count                                                   54
seasonal_indicator                                                   Winter
promotion_in