In [10]:
# 🟢 CELL 1: SETUP
!pip install -q sentence-transformers gradio
import os
import shutil
import zipfile
import pandas as pd
from sentence_transformers import CrossEncoder
import gradio as gr

print("‚úÖ Libraries Installed")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


‚úÖ Libraries Installed


In [16]:
# 🟢 CELL 2: MASTER LOAD (Model + Clean Data)
import os
import pandas as pd
from sentence_transformers import CrossEncoder

# --- 1. AUTO-DETECT PATHS ---
# Kaggle input paths can vary, so we scan for the files we need instead of
# hard-coding them.
INPUT_ROOT = '/kaggle/input'
PRODUCTS_FILENAME = "shopping_queries_dataset_products.parquet"
MODEL_WEIGHT_FILES = ("pytorch_model.bin", "model.safetensors")
CATALOG_SAMPLE_SIZE = 20000  # products kept for the fast demo search
CATALOG_SEED = 42            # fixed seed so the sampled catalog is reproducible
MIN_TITLE_CHARS = 5          # titles must be LONGER than this (drops junk like "1.0")


def find_inputs(input_root=INPUT_ROOT):
    """Walk ``input_root`` looking for the model folder and the products file.

    A model folder is any directory holding ``config.json`` together with one
    of ``MODEL_WEIGHT_FILES``. The dataset is the file named
    ``PRODUCTS_FILENAME``.

    Returns:
        ``(model_dir, products_path)``. Either element is ``None`` when nothing
        matched. If several locations match, the last one visited by
        ``os.walk`` wins; every hit is printed so this is visible.
    """
    found_model = None
    found_products = None
    for root, _dirs, files in os.walk(input_root):
        has_weights = any(name in files for name in MODEL_WEIGHT_FILES)
        if "config.json" in files and has_weights:
            found_model = root
            print(f"   ‚úÖ FOUND MODEL at: {found_model}")

        if PRODUCTS_FILENAME in files:
            found_products = os.path.join(root, PRODUCTS_FILENAME)
            print(f"   ‚úÖ FOUND DATASET at: {found_products}")
    return found_model, found_products


def prepare_catalog(products, sample_size=CATALOG_SAMPLE_SIZE, seed=CATALOG_SEED):
    """Filter, clean and sample the raw products table for the demo search.

    Args:
        products: DataFrame with ``product_locale``, ``product_title`` and
            ``product_description`` columns.
        sample_size: Maximum number of products to keep.
        seed: ``random_state`` passed to ``DataFrame.sample``.

    Returns:
        A new DataFrame of at most ``sample_size`` rows, restricted to the
        ``'us'`` locale, with missing values replaced by ``''`` and an extra
        ``text`` column (title + space + description).
    """
    catalog = products[products['product_locale'] == 'us']
    # Remove rows with empty titles, then junk titles that are too short.
    catalog = catalog.dropna(subset=['product_title'])
    catalog = catalog[catalog['product_title'].str.len() > MIN_TITLE_CHARS]

    # Cap the sample size: DataFrame.sample raises ValueError when asked for
    # more rows than exist (without replacement).
    n_keep = min(sample_size, len(catalog))
    catalog = catalog.sample(n=n_keep, random_state=seed).fillna('')

    # Text the re-ranker and the keyword pre-filter operate on.
    catalog['text'] = catalog['product_title'] + " " + catalog['product_description']
    return catalog


print("üîç Scanning Input Directory...")
model_path, products_file_path = find_inputs()

# --- 2. LOAD MODEL ---
# Reset first so a failed load cannot leave a stale model from an earlier run
# of this cell in the kernel.
model = None
try:
    if model_path:
        print("üß† Loading Model...")
        model = CrossEncoder(model_path)
        print("‚úÖ Model Loaded successfully.")
    else:
        print("‚ùå ERROR: Model not found. Did you add your previous notebook as output?")
except Exception as e:
    # Best-effort: report and continue so the catalog can still be inspected.
    print(f"‚ùå Error loading model: {e}")

# --- 3. LOAD & CLEAN DATA ---
# Same reset rationale as for `model`; only assigned once cleaning succeeded.
df_products = None
print("üìö Loading Product Catalog...")
try:
    if products_file_path:
        df_products = prepare_catalog(pd.read_parquet(products_file_path))
        print(f"‚úÖ Catalog Ready: {len(df_products)} clean products loaded.")
    else:
        print("‚ùå ERROR: Dataset not found. Did you add 'amazon-esci' to Inputs?")

except Exception as e:
    print(f"‚ùå Error loading dataset: {e}")

üîç Scanning Input Directory...
   ‚úÖ FOUND DATASET at: /kaggle/input/amazon-esci/shopping_queries_dataset/shopping_queries_dataset_products.parquet
   ‚úÖ FOUND MODEL at: /kaggle/input/dataset
üß† Loading Model...
‚úÖ Model Loaded successfully.
üìö Loading Product Catalog...
‚úÖ Catalog Ready: 20000 clean products loaded.


In [None]:
# 🟢 CELL 4: FINAL DEMO UI (Robust)
def search_products(user_query, catalog=None, reranker=None, n_candidates=50, top_k=10):
    """Retrieve candidate products for a query and re-rank them with the model.

    Args:
        user_query: Free-text search string typed by the user.
        catalog: DataFrame with ``text`` and ``product_title`` columns.
            Defaults to the notebook-level ``df_products``.
        reranker: Object exposing ``predict(pairs)`` that returns one score per
            ``[query, product_text]`` pair. Defaults to the notebook-level
            ``model``.
        n_candidates: Number of candidates handed to the re-ranker.
        top_k: Number of ranked results included in the output.

    Returns:
        A formatted multi-line string with the ``top_k`` best-scoring titles,
        or a short message when the query is empty or nothing is loaded.
    """
    # Gradio sends "" for an empty box; `"".split()` would give an empty
    # keyword, and str.contains("") matches every row.
    if not (user_query or "").strip():
        return "Please enter a search query."

    # Fall back to notebook globals; .get() avoids a NameError inside the UI
    # when the loading cell failed or was never run.
    if catalog is None:
        catalog = globals().get("df_products")
    if reranker is None:
        reranker = globals().get("model")
    if catalog is None or reranker is None:
        return "Search is unavailable: the model or product catalog is not loaded. Run the loading cell first."

    # 1. RETRIEVAL PHASE (Simulation)
    # Priority 1: rows whose text contains the first query word.
    # Priority 2: random non-matching rows (distractors the re-ranker should
    #             push to the bottom).
    primary_keyword = user_query.lower().split()[0]
    is_match = catalog['text'].str.contains(primary_keyword, case=False, regex=False, na=False)
    candidates = catalog[is_match].head(n_candidates)

    shortfall = n_candidates - len(candidates)
    if shortfall > 0:
        # Draw fillers only from non-matching rows so a product can never be
        # listed twice, and never ask for more rows than exist.
        distractor_pool = catalog[~is_match]
        n_fill = min(shortfall, len(distractor_pool))
        if n_fill > 0:
            # Intentionally unseeded: distractors vary between searches.
            candidates = pd.concat([candidates, distractor_pool.sample(n=n_fill)])

    if candidates.empty:
        return "No products available to search."

    # 2. AI RE-RANKING PHASE
    # The cross-encoder scores each full (query, product text) pair.
    pairs = [[user_query, text] for text in candidates['text'].tolist()]
    scores = reranker.predict(pairs)

    # 3. SORTING (highest score first; ties keep retrieval order)
    ranked = sorted(
        zip(candidates['product_title'].tolist(), scores),
        key=lambda item: item[1],
        reverse=True,
    )

    # 4. FORMATTING
    lines = [
        f"üîç Query: '{user_query}'\n",
        f"‚ö° AI Re-ranked {len(candidates)} candidates in real-time.\n",
        "=" * 50 + "\n\n",
    ]
    for rank, (title, score) in enumerate(ranked[:top_k], start=1):
        # NOTE(review): thresholds assume scores on a 0-1 scale; confirm
        # against the loaded model's output activation.
        if score > 0.8:
            icon = "üü¢ Excellent Match"
        elif score > 0.4:
            icon = "üü° Potential Match"
        else:
            icon = "üî¥ Low Relevance"

        lines.append(f"{rank}. {title}\n")
        lines.append(f"   [{score:.4f}] {icon}\n")
        lines.append("-" * 30 + "\n")

    return "".join(lines)

# Launch
# Input and output widgets for the demo page.
query_box = gr.Textbox(
    label="Search Amazon",
    placeholder="Try: 'running shoes', 'wireless charger', 'ps5 games'",
)
results_box = gr.Textbox(label="AI Ranked Results", lines=20)

# Sample queries offered as examples; one single-element row per query
# because the interface has exactly one input.
example_queries = [
    ["wireless gaming mouse"],
    ["running shoes for men"],
    ["iphone 12 pro max case"],
    ["yoga mat non slip"],
    ["coffee maker with timer"],
]

# Wire search_products to the widgets.
iface = gr.Interface(
    fn=search_products,
    inputs=query_box,
    outputs=results_box,
    title="üõçÔ∏è Intelligent Product Search (Task 2 Demo)",
    description="This search engine uses a BERT Cross-Encoder to semantically understand your query and re-rank products.",
    examples=example_queries,
)

# share=True requests a public gradio.live link in addition to the local URL.
iface.launch(share=True, debug=True)

* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://2ed699c261642866a1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
