In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
import os


In [2]:

print("--- Setting up Recommendation App ---")

# Artifact locations, named once so the existence check, the load call and the
# error message can never drift apart.
ITEMS_PATH = '../data/100k/all_items_processed_100k.parquet'
EMBEDDINGS_PATH = '../data/100k/item_embeddings_100k.npy'

# 1. Load the Processed Item Metadata
# One row per catalog item; the recommender below reads the 'title',
# 'main_category' and 'brand' columns.
# Fail immediately with the real path: continuing without the frame would only
# surface later as an unrelated NameError.
if not os.path.exists(ITEMS_PATH):
    raise FileNotFoundError(
        f"Item metadata not found at '{ITEMS_PATH}'. "
        "Please run the previous notebook step to save it."
    )
all_items_df = pd.read_parquet(ITEMS_PATH)
print(f"Loaded Item Metadata: {len(all_items_df)} items")

# 2. Load the Pre-Computed Item Embeddings
# One vector per item, stored row-wise.
if not os.path.exists(EMBEDDINGS_PATH):
    raise FileNotFoundError(
        f"Item embeddings not found at '{EMBEDDINGS_PATH}'. "
        "Please run the previous notebook step to save them."
    )
item_embeddings = np.load(EMBEDDINGS_PATH)
print(f"Loaded Item Embeddings: {item_embeddings.shape}")

# 3. Validation Check
# The number of rows in the dataframe MUST match the number of vectors.
# NOTE: this compares counts only; it cannot prove that row i of the frame and
# vector i describe the same item - that depends on how the files were saved.
if len(all_items_df) == len(item_embeddings):
    print("SUCCESS: Data alignment verified.")
else:
    print(f"WARNING: Mismatch! Items: {len(all_items_df)}, Embeddings: {len(item_embeddings)}")

--- Setting up Recommendation App ---
Loaded Item Metadata: 48629 items
Loaded Item Embeddings: (48629, 32)
SUCCESS: Data alignment verified.


In [3]:
def recommend_similar_items(query, top_k=5):
    """
    Print the catalog items whose embeddings are closest to a queried product.

    The query is matched as a literal, case-insensitive substring against the
    'title' column of the module-level `all_items_df`. The first matching row
    becomes the target, and its row POSITION selects the vector in the
    module-level `item_embeddings`, so both objects must share the same row
    order.

    Args:
        query (str): The product name to search for (e.g., "Dell Laptop").
        top_k (int): Number of recommendations to print. Values <= 0 print
            only the header lines.

    Returns:
        None. Everything is reported via print(); if no title contains
        `query`, a "not found" message is printed instead.
    """
    # 1. Find the rows whose title contains the query text.
    # regex=False: product names routinely contain regex metacharacters
    # ("C++", "(Renewed)", "16.5"), which would otherwise raise or match the
    # wrong rows.
    mask = all_items_df['title'].str.contains(query, case=False, na=False, regex=False)
    match_positions = np.flatnonzero(mask.to_numpy(dtype=bool))

    if match_positions.size == 0:
        print(f"Item '{query}' not found in catalog.")
        return

    # 2. Take the first match.
    # (In a real app, you might let the user pick from a list)
    # Work with the row position rather than the index label: the embedding
    # matrix and the argsort result below are positional, so this stays correct
    # even when the frame's index is not a clean 0..N-1 range.
    target_idx = int(match_positions[0])
    target_row = all_items_df.iloc[target_idx]

    print(f"Selected Query Item:")
    print(f"   Title:    {target_row['title']}")
    print(f"   Category: {target_row['main_category']}")
    print(f"   Brand:    {target_row['brand']}")
    print("-" * 60)

    # 3. Retrieve the vector for this item as a single-row matrix.
    target_vector = item_embeddings[target_idx].reshape(1, -1)

    # 4. Score every item by its dot product with the target vector.
    # NOTE(review): this equals cosine similarity only if the stored vectors
    # are L2-normalized - nothing here checks that; confirm against the step
    # that exported the embeddings.
    sim_scores = np.dot(item_embeddings, target_vector.T).flatten()

    # 5. Rank best-first, drop the query item itself, keep at most top_k.
    ranked_positions = np.argsort(sim_scores)[::-1]
    recommended = ranked_positions[ranked_positions != target_idx][:max(top_k, 0)]

    # 6. Print Recommendations
    print(f"✅ Top {top_k} Recommendations:")
    for rank, idx in enumerate(recommended, start=1):
        rec_item = all_items_df.iloc[idx]
        score = sim_scores[idx]

        print(f"{rank}. [{score:.4f}] {rec_item['title']}")
        print(f"    Category: {rec_item['main_category']} | Brand: {rec_item['brand']}")
    print("\n")

In [4]:
# Smoke-test the recommender with a handful of representative searches.
# Each query is run in order with the same top_k so the outputs are comparable.
demo_queries = [
    "Dell Inspiron",          # Test 1: a specific laptop
    "Sony Noise Cancelling",  # Test 2: headphones
    "Canon DSLR",             # Test 3: a camera
    "Cable",                  # Test 4: a generic term, to see how it disambiguates
]

for demo_query in demo_queries:
    recommend_similar_items(demo_query, top_k=5)

Selected Query Item:
   Title:    Dell Inspiron 3452 HD High Performance Laptop NoteBook PC (Intel Celeron N3060, 2GB Ram, 32GB Solid State SSD, HDMI, Camera, WIFI, SC Card Reader) Windows 10 (Renewed)
   Category: Traditional Laptops
   Brand:    Amazon Renewed
------------------------------------------------------------
✅ Top 5 Recommendations:
1. [0.9944] 2018 Acer 14' FHD IPS Display Premium Flagship Business Chromebook-Intel Celeron Quad-Core Processor Up to 2.24Ghz, 4GB RAM, 32GB SSD, HDMI, WiFi, Bluetooth Chrome OS-(Renewed)
    Category: Traditional Laptops | Brand: Amazon Renewed
2. [0.9895] Angetube 1080P Webcam for Streaming, 920 PC Web Camera Calling Video Recording Cam for Windows Mac Conferencing Gaming Xbox Skype OBS Twitch Xsplit GoReact with Microphone & 100-Degree View Angle
    Category: Webcams | Brand: Angetube
3. [0.9893] Dell Latitude E6430 Laptop WEBCAM - HDMI - Intel Core i5 2.6ghz - 8GB DDR3 - 128GB SSD - DVD - Windows 10 Pro 64bit - (Renewed)
    Category: Tr