In [1]:
import pandas as pd

import numpy as np


In [2]:
df = pd.read_csv("Data/Major.csv")

In [3]:
df.head()

Unnamed: 0,Brand,Product Name,Description,Category,Rating,Discount_price,Original_price,Image_url,Url
0,ZEBRONICS,"ZEBRONICS Zeb- Thunder, With 60H Backup, BT v5...","Black, On the Ear",headphones and earphones,4.0,699,1699,https://rukminim2.flixcart.com/image/612/612/x...,https://www.flipkart.com/zebronics-zeb-thunder...
1,OnePlus,OnePlus Bullets Wireless Z2 Bluetooth,"Acoustic Red, In the Ear",headphones and earphones,4.3,1299,2299,https://rukminim2.flixcart.com/image/612/612/l...,https://www.flipkart.com/oneplus-bullets-wirel...
2,boAt,boAt Rockerz 430 w/ Beast Mode(40ms Low Latenc...,"Bold Blue, On the Ear",headphones and earphones,4.0,1199,2490,https://rukminim2.flixcart.com/image/612/612/x...,https://www.flipkart.com/boat-rockerz-430-w-be...
3,OnePlus,OnePlus Bullets Wireless Z2 Bluetooth,"Beam Blue, In the Ear",headphones and earphones,4.3,1299,2299,https://rukminim2.flixcart.com/image/612/612/l...,https://www.flipkart.com/oneplus-bullets-wirel...
4,Sennheiser,Sennheiser Accentum Wireless Over Ear Headphon...,"Black, On the Ear",headphones and earphones,4.1,12990,14990,https://rukminim2.flixcart.com/image/612/612/x...,https://www.flipkart.com/sennheiser-accentum-w...


In [4]:
## Checking the Missing values present in the dataset 

df.isnull().sum()

Brand             0
Product Name      0
Description       0
Category          0
Rating            0
Discount_price    0
Original_price    0
Image_url         0
Url               0
dtype: int64

In [5]:
df.columns

Index(['Brand', 'Product Name', 'Description', 'Category', 'Rating',
       'Discount_price', 'Original_price', 'Image_url', 'Url'],
      dtype='object')

In [6]:
df.duplicated().sum()

0

## 1. Text Features (Product Name, Description)

    Lowercasing
    
    Remove punctuation, numbers, special characters
    
    Remove stopwords
    
    Lemmatization/Stemming
    
    Convert to numeric representation (TF-IDF or embeddings)

In [7]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download("stopwords")
# nltk.download("wordnet")

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-z0-9\s]", "", text)   # keep letters + numbers
    tokens = text.split()
    tokens = [t for t in tokens if t not in stopwords.words("english")]
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return " ".join(tokens)

df["clean_name"] = df["Product Name"].apply(clean_text)
# df["clean_desc"] = df["Description"].apply(clean_text)


In [8]:
df['clean_name']

0           zebronics zeb thunder 60h backup bt v53 gamin
1                    oneplus bullet wireless z2 bluetooth
2       boat rockerz 430 w beast mode40ms low latency4...
3                    oneplus bullet wireless z2 bluetooth
4       sennheiser accentum wireless ear headphone design
                              ...                        
6061          marq flipkart 80 cm 32 inch hd ready led tv
6062           xelectron 80 cm 32 inch hd ready 3d led tv
6063                   iair 81 cm 32 inch hd ready led tv
6064                   iair 60 cm 24 inch hd ready led tv
6065        xelectron 60 cm 24 inch hd ready led linux tv
Name: clean_name, Length: 6066, dtype: object

## Model Training

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words="english", max_features=6000)
tfidf_matrix = tfidf.fit_transform(df["clean_name"])


In [10]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix)


In [11]:
df['clean_name'].values

array(['zebronics zeb thunder 60h backup bt v53 gamin',
       'oneplus bullet wireless z2 bluetooth',
       'boat rockerz 430 w beast mode40ms low latency40hrs', ...,
       'iair 81 cm 32 inch hd ready led tv',
       'iair 60 cm 24 inch hd ready led tv',
       'xelectron 60 cm 24 inch hd ready led linux tv'], dtype=object)

In [12]:
def recommendor_input(top_n=5):
    # Take input from user
    brand = input("Enter brand (or press Enter to skip): ").strip()
    category = input("Enter category (or press Enter to skip): ").strip()
    product = input("Enter product name: ").strip()

    if not product:
        print("❌ Product name is required.")
        return None

    # Start with full dataset
    filtered_df = df.copy()

    # Filter by brand if provided
    if brand:
        filtered_df = filtered_df[filtered_df['Brand'].str.lower() == brand.lower()]
        if filtered_df.empty:
            print(f"❌ No products found under Brand='{brand}'.")
            return None

    # Filter by category if provided
    if category:
        filtered_df = filtered_df[filtered_df['Category'].str.lower() == category.lower()]
        if filtered_df.empty:
            print(f"❌ No products found under Category='{category}'.")
            return None

    # Find matching products (partial or exact match)
    matched_products = [p for p in filtered_df['clean_name'].values if product.lower() in p.lower()]

    if not matched_products:
        print(f"❌ Product '{product}' not found in the selected filters.")
        return None

    # Take the first matched product
    matched_product = matched_products[0]
    product_index = df[df['clean_name'] == matched_product].index[0]
    print(f"✅ Product found: {matched_product}")

    # Get similarity scores
    similarity_list = list(cosine_sim[product_index])

    # Get top N similar products (excluding itself)
    top_similar_products = sorted(
        enumerate(similarity_list),
        key=lambda x: x[1],
        reverse=True
    )[1:top_n+1]

    # Collect recommendations
    recommendations = []
    for idx, sim_score in top_similar_products:
        rec = {
            "Product Name": df.iloc[idx]["Product Name"],
            "Brand": df.iloc[idx]["Brand"],
            "Category": df.iloc[idx]["Category"],
            "Discount Price": df.iloc[idx]["Discount_price"],
            "Original Price": df.iloc[idx]["Original_price"],
            # "Similarity Score": round(sim_score, 3)
        }
        recommendations.append(rec)

    # Print recommendations
    print(f"\n🔎 Top {top_n} recommendations for: {matched_product}\n")
    for rec in recommendations:
        print(f"- {rec['Product Name']} | {rec['Brand']} | {rec['Category']} | "
              f"Discount: {rec['Discount Price']} | Original: {rec['Original Price']}")

    # return recommendations


In [13]:
recommendor_input()

Enter brand (or press Enter to skip):  
Enter category (or press Enter to skip):  
Enter product name:  


❌ Product name is required.


In [15]:
df.Category.unique()

array(['headphones and earphones', 'Laptop Bag', 'Laptop Charges',
       'Laptop', 'Laptop Cooling Pad', 'Mobile', 'Mobile Charge',
       'Mobiles', 'Mobile Covers', 'Smart Tv'], dtype=object)

In [20]:
cat='laptop'

df[df['Category'] == cat]['Brand'].unique()

array([], dtype=object)

In [25]:
lap = "Dell 256Gb laptop windows 11 core i3"

p = 'gb'

if p in lap.lower():
    print("yes")

yes
