BASELINE RECOMMENDATION SYSTEM
============================================================================
Purpose: Establish a baseline using pre-trained SBERT + rule-based matching.
This baseline serves as a comparison point for the trained neural network.

This notebook implements a baseline using:

  • Pre-trained Sentence-BERT for semantic similarity

  • Rule-based skin type matching
  
  • Simple weighted combination

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

from google.colab import drive
drive.mount('/content/drive')

**Load Data**

In [None]:
# Load products
products_df = pd.read_csv('/content/drive/MyDrive/cosmetic_p.csv')
print(f"\n Loaded {len(products_df)} products")

# Load pre-trained SBERT model
print("\n Loading pre-trained Sentence-BERT model...")
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
print(" Model loaded: all-MiniLM-L6-v2 (384-dim embeddings)")

# Check for existing embeddings or generate new ones
import os

embeddings_cache_path = 'product_embeddings.npy'
product_embeddings = None

if os.path.exists(embeddings_cache_path):
    print("\n Loading existing product embeddings...")
    product_embeddings = np.load(embeddings_cache_path)

    # Embeddings are matched to products purely by row position, so a cache
    # written for a different CSV would pair products with the wrong vectors.
    # Discard it and regenerate instead of continuing with misaligned data.
    if product_embeddings.shape[0] != len(products_df):
        print(
            f" Cached embeddings have {product_embeddings.shape[0]} rows "
            f"but {len(products_df)} products were loaded - regenerating"
        )
        product_embeddings = None

if product_embeddings is None:
    print("\n Generating product embeddings (2-3 minutes)...")
    # Missing ingredient lists are embedded as the empty string
    product_texts = products_df['ingredients'].fillna('').tolist()
    product_embeddings = sbert_model.encode(
        product_texts,
        show_progress_bar=True,
        batch_size=32,
        normalize_embeddings=True
    )
    np.save(embeddings_cache_path, product_embeddings)
    print(" Saved embeddings")

print(f" Product embeddings ready: {product_embeddings.shape}")

**Baseline System Components**

In [None]:
def extract_query_skin_types(query):
    """
    Detect which skin types a free-text query mentions.

    Matching is a case-insensitive substring test against a fixed keyword
    table, so a keyword embedded in a longer word also counts.

    Returns the detected skin type labels in table order (Dry, Oily,
    Combination, Normal, Sensitive); an empty list when nothing matches.
    """
    keyword_table = (
        ('Dry', ('dry', 'dehydrated', 'flaky', 'tight')),
        ('Oily', ('oily', 'greasy', 'shiny', 'sebum')),
        ('Combination', ('combination', 'combo', 't-zone')),
        ('Normal', ('normal', 'balanced')),
        ('Sensitive', ('sensitive', 'reactive', 'redness', 'irritated')),
    )
    text = query.lower()

    # Each label is reported at most once, however many of its cues appear
    return [
        label
        for label, cues in keyword_table
        if any(cue in text for cue in cues)
    ]


def calculate_skin_type_match(product_row, query_skin_types):
    """
    Rule-based (not learned) score in [0, 1] for how well a product's
    skin-type flags cover the skin types named in the query.

    product_row:       mapping with 'Dry', 'Oily', 'Combination', 'Normal'
                       and 'Sensitive' entries; a value equal to 1 means
                       the product is flagged for that skin type.
    query_skin_types:  list of skin type labels detected in the query.

    Returns 0.5 when the query names no skin type, 0.0 when the product has
    no flags or shares none with the query, otherwise the fraction of query
    types covered, scaled down by up to 30% for products flagged for many
    skin types.
    """
    if not query_skin_types:
        return 0.5

    flagged = {
        name
        for name in ('Dry', 'Oily', 'Combination', 'Normal', 'Sensitive')
        if product_row[name] == 1
    }
    if not flagged:
        return 0.0

    hit_count = sum(1 for wanted in query_skin_types if wanted in flagged)
    if hit_count == 0:
        return 0.0

    coverage = hit_count / len(query_skin_types)

    # 0.0 for a single-type product, 1.0 for a product flagged for all five
    breadth = (len(flagged) - 1) / 4.0

    # The breadth penalty is multiplicative: it removes at most 30% of coverage
    raw_score = coverage * (1.0 - breadth * 0.3)

    return max(0.0, min(1.0, raw_score))


def get_baseline_scores(query, products_df, product_embeddings, sbert_model,
                       skin_type_weight=0.7, semantic_weight=0.3):
    """
    Score every product against the query with the untrained baseline.

    The final score is a weighted sum of two components:
    1. Skin type matching (default weight 0.7) - hand-coded rules
    2. Semantic similarity (default weight 0.3) - pre-trained SBERT

    Products and embeddings are paired by row position.

    Returns (scores, query_skin_types) where scores is a list with one dict
    per product ('idx', 'final_score', 'skin_type_score', 'semantic_score')
    in product order, and query_skin_types is the list of skin types
    detected in the query.
    """
    # Query is embedded with the same pre-trained model, normalized
    query_vec = sbert_model.encode([query], normalize_embeddings=True)[0]
    wanted_types = extract_query_skin_types(query)

    def score_product(position):
        # Component 1: rule-based skin type matching
        rule_score = calculate_skin_type_match(products_df.iloc[position], wanted_types)

        # Component 2: dot product of the two embeddings, shifted from
        # [-1, 1] onto [0, 1] so both components share a scale
        cosine = np.dot(query_vec, product_embeddings[position])
        similarity = (cosine + 1) / 2

        combined = rule_score * skin_type_weight + similarity * semantic_weight
        return {
            'idx': position,
            'final_score': combined,
            'skin_type_score': rule_score,
            'semantic_score': similarity
        }

    per_product = [score_product(position) for position in range(len(products_df))]
    return per_product, wanted_types


def recommend_top_k_baseline(query, k=5):
    """
    Return up to k baseline recommendations for the query.

    Products are ranked by the baseline final score and taken in that order,
    skipping a product once two products of the same brand have already been
    selected. Reads the module-level products_df, product_embeddings and
    sbert_model.

    Each result is a dict with rank, product details, the three scores,
    the product's skin types and whether it matches the query's skin types.
    """
    skin_columns = ['Dry', 'Oily', 'Combination', 'Normal', 'Sensitive']

    scored, wanted_types = get_baseline_scores(
        query, products_df, product_embeddings, sbert_model
    )
    ranked = sorted(scored, key=lambda entry: entry['final_score'], reverse=True)

    # Greedy pick in score order with a cap of two products per brand
    picked = []
    picked_brands = []
    for entry in ranked:
        if len(picked) >= k:
            break

        brand = products_df.iloc[entry['idx']]['brand']
        already_taken = sum(1 for earlier in picked_brands if earlier == brand)
        if already_taken < 2:
            picked.append(entry)
            picked_brands.append(brand)

    recommendations = []
    for position, entry in enumerate(picked, 1):
        product = products_df.iloc[entry['idx']]
        product_types = [col for col in skin_columns if product[col] == 1]

        # With no skin type detected in the query, every product counts as a match
        if query_types_present := bool(wanted_types):
            is_match = any([st in product_types for st in wanted_types])
        else:
            is_match = True

        recommendations.append({
            'rank': position,
            'name': product['name'],
            'brand': product['brand'],
            'category': product['Label'],
            'price': product['price'],
            'rating': product['rank'],
            'final_score': entry['final_score'],
            'skin_type_score': entry['skin_type_score'],
            'semantic_score': entry['semantic_score'],
            'skin_types': ', '.join(product_types) if product_types else 'Not specified',
            'matches_query': is_match,
            'query_skin_types': ', '.join(wanted_types) if query_types_present else 'None detected'
        })

    return recommendations


def display_baseline_results(results, query):
    """
    Print the baseline recommendations for a query and return the match rate.

    results: list of result dicts as built by recommend_top_k_baseline. Keys
             read here: rank, name, brand, category, price, rating,
             skin_types, skin_type_score, semantic_score, final_score,
             matches_query, query_skin_types.
    query:   the free-text query the results were produced for.

    Returns the percentage (0-100) of results whose 'matches_query' flag is
    truthy, or 0.0 when results is empty.
    """
    print(f"\n{'='*80}")
    print(f" QUERY: '{query}'")

    # Without this guard an empty list fails on results[0] and on the
    # division by len(results) further down.
    if not results:
        print("\n No baseline recommendations were produced for this query")
        return 0.0

    # Every result carries the same detected-skin-type string
    print(f"Detected skin types: {results[0]['query_skin_types']}")
    print(f"\n TOP {len(results)} BASELINE RECOMMENDATIONS:\n")

    matches = 0
    for r in results:
        match_indicator = "✓" if r['matches_query'] else "X"
        if r['matches_query']:
            matches += 1

        print(f"#{r['rank']}. {r['name']} {match_indicator}")
        print(f"    Brand:        {r['brand']}")
        print(f"    Category:     {r['category']}")
        print(f"    Price:        ${r['price']:.2f}")
        print(f"    Rating:       {r['rating']:.1f}/5.0")
        print(f"    For:          {r['skin_types']}")
        print(f"    ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━")
        print(f"    Skin Match:   {r['skin_type_score']:.1%} (rule-based)")
        print(f"    Semantic:     {r['semantic_score']:.1%} (SBERT)")
        print(f"    FINAL:        {r['final_score']:.1%} ")
        print()

    match_rate = matches / len(results) * 100
    print(f" MATCH RATE: {matches}/{len(results)} ({match_rate:.0f}%) match query skin types")

    return match_rate

Evaluate baseline on Test Queries

In [None]:
# Sample queries used to sanity-check the baseline.
# NOTE(review): "mature skin with wrinkles" contains none of the keywords in
# extract_query_skin_types, and recommend_top_k_baseline marks every product
# as matching when no skin type is detected, so that query reports 100%.
test_queries = [
    "dry sensitive skin with redness",
    "oily acne prone skin",
    "combination skin with dark spots",
    "mature skin with wrinkles"
]

print("\n Testing baseline on sample queries...\n")

# One match-rate percentage per query, in query order; read by the summary cell
all_match_rates = []

for query in test_queries:
    results = recommend_top_k_baseline(query, k=5)
    # Prints the recommendations and returns the percentage that match
    match_rate = display_baseline_results(results, query)
    all_match_rates.append(match_rate)
    print("\n" + "-"*80 + "\n")

**Baseline Performance Summary**

In [None]:
# Aggregate the per-query match rates collected in all_match_rates
avg_match_rate = np.mean(all_match_rates)

print(f"\n Baseline Statistics:")
print(f"   Average match rate: {avg_match_rate:.1f}%")
print(f"   Best query: {max(all_match_rates):.0f}%")
print(f"   Worst query: {min(all_match_rates):.0f}%")
# np.std defaults to the population standard deviation (ddof=0)
print(f"   Standard deviation: {np.std(all_match_rates):.1f}%")

# Bucket the average into a qualitative label: >=80 strong, >=60 good,
# >=40 moderate, otherwise weak
print(f"\n Baseline Performance:")
if avg_match_rate >= 80:
    print(f"    STRONG: {avg_match_rate:.0f}% match rate")
    print(f"   The baseline is working well - will be hard to beat!")
elif avg_match_rate >= 60:
    print(f"    GOOD: {avg_match_rate:.0f}% match rate")
    print(f"   Solid baseline - neural network needs to beat this")
elif avg_match_rate >= 40:
    print(f"     MODERATE: {avg_match_rate:.0f}% match rate")
    print(f"   Baseline is okay - room for improvement")
else:
    print(f"     WEAK: {avg_match_rate:.0f}% match rate")
    print(f"   Low baseline - easier for neural network to beat")



# Evaluation

In [None]:
import torch
import torch.nn as nn
import os
import zipfile
from google.colab import files


File Upload

In [None]:
# Create the target directories: ./models for checkpoints, ./data for embeddings
os.makedirs('./data', exist_ok=True)
os.makedirs('./models', exist_ok=True)

print(' UPLOAD NEURAL NETWORK FILES')
print('\nPlease upload the following files:')
print('1. skincarefull.pth (trained model)')
print('2. product_embeddings.npy')

# Indexed below by filename, with each value written out as bytes
uploaded = files.upload()

# Route each uploaded file by extension; any other extension is ignored
for filename in uploaded.keys():
    if filename.endswith('.zip'):
        # The archive is opened by name from the working directory rather
        # than from the uploaded bytes - assumes files.upload() also saved it
        # there; TODO confirm.
        # NOTE(review): everything in the archive lands in ./models/, so a
        # zipped product_embeddings.npy is not seen by the later
        # ./data/product_embeddings.npy check.
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall('./models/')
        print(f' Extracted {filename} to ./models/')

        # List the whole ./models/ directory (not only this archive's members)
        extracted_files = os.listdir('./models/')
        print(f'   Extracted files: {extracted_files}')
    elif filename.endswith('.pth'):
        # Write the uploaded bytes to ./models/ (a copy, despite the "Moved" message)
        target_path = f'./models/{filename}'
        with open(target_path, 'wb') as f:
            f.write(uploaded[filename])
        print(f' Moved {filename} to ./models/')
    elif filename.endswith('.npy'):
        # Write the uploaded bytes to ./data/
        target_path = f'./data/{filename}'
        with open(target_path, 'wb') as f:
            f.write(uploaded[filename])
        print(f' Moved {filename} to ./data/')

print('\n Upload complete!\n')

Load Model

In [None]:
class CompatibilityNetV1(nn.Module):
    """Original architecture (512->256->128).

    Fully connected network with ReLU activations and dropout after each
    hidden layer, ending in a single sigmoid output in (0, 1). The layers
    live in ``self.network`` (an ``nn.Sequential``), so parameter names have
    the form ``network.<index>.weight`` / ``network.<index>.bias``.
    """

    def __init__(self, input_dim=770):
        super().__init__()

        hidden_widths = (512, 256, 128)
        dropout_rates = (0.3, 0.3, 0.2)

        # Layers are created in the same order as a literal Sequential would
        # list them, so Sequential indices (0, 3, 6, 9 for the Linear layers)
        # are unchanged.
        stack = []
        fan_in = input_dim
        for fan_out, rate in zip(hidden_widths, dropout_rates):
            stack.extend([nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(rate)])
            fan_in = fan_out
        stack.extend([nn.Linear(fan_in, 1), nn.Sigmoid()])

        self.network = nn.Sequential(*stack)

    def forward(self, x):
        """Run the features through the stack and return the sigmoid scores."""
        scores = self.network(x)
        return scores

class CompatibilityNetV2(nn.Module):
    """Architecture with BatchNorm (256->128->64).

    The first two hidden layers are Linear -> ReLU -> BatchNorm1d -> Dropout;
    the third is Linear -> ReLU -> Dropout; the output is a single sigmoid
    unit. The layers live in ``self.network`` (an ``nn.Sequential``), so
    parameter names have the form ``network.<index>.<param>``.
    """

    def __init__(self, input_dim=770):
        super().__init__()

        # The order below fixes the Sequential indices (Linear at 0, 4, 8, 11;
        # BatchNorm at 2, 6).
        first_stage = [
            nn.Linear(input_dim, 256),
            nn.ReLU(),
            nn.BatchNorm1d(256),
            nn.Dropout(0.3),
        ]
        second_stage = [
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(0.3),
        ]
        third_stage = [nn.Linear(128, 64), nn.ReLU(), nn.Dropout(0.2)]
        output_stage = [nn.Linear(64, 1), nn.Sigmoid()]

        self.network = nn.Sequential(
            *first_stage, *second_stage, *third_stage, *output_stage
        )

    def forward(self, x):
        """Run the features through the stack and return the sigmoid scores."""
        scores = self.network(x)
        return scores

# Find the model file.
# NOTE(review): os.listdir returns entries in arbitrary order, so with more
# than one .pth file in ./models/ the choice of model_files[0] is not
# deterministic.
model_files = [f for f in os.listdir('./models/') if f.endswith('.pth')]
if not model_files:
    raise FileNotFoundError("No .pth model file found in ./models/")

model_path = f'./models/{model_files[0]}'
print(f" Loading model from: {model_path}")

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f" Device: {device}")

# Load checkpoint and detect architecture
print(" Loading checkpoint and detecting architecture...")

try:
    # NOTE(review): weights_only=False lets torch.load unpickle arbitrary
    # objects; only use this with checkpoint files from a trusted source.
    checkpoint = torch.load(model_path, map_location=device, weights_only=False)

    # Accept either a training checkpoint that wraps the weights under
    # 'model_state_dict' or a bare state dict
    if 'model_state_dict' in checkpoint:
        state_dict = checkpoint['model_state_dict']
    else:
        state_dict = checkpoint

    # Detect architecture by checking layer shapes: the first Linear layer's
    # output width and the presence of BatchNorm running statistics
    first_layer_shape = state_dict['network.0.weight'].shape
    has_batchnorm = any('running_mean' in key for key in state_dict.keys())

    print(f"   First layer shape: {first_layer_shape}")
    print(f"   Has BatchNorm: {has_batchnorm}")

    # Choose the right architecture. Both classes are built with their default
    # input_dim; a checkpoint trained with a different input width fails in
    # load_state_dict below.
    if first_layer_shape[0] == 256 and has_batchnorm:
        print("   -> Detected architecture: V2 (256->128->64 with BatchNorm)")
        model = CompatibilityNetV2()
    elif first_layer_shape[0] == 512:
        print("   -> Detected architecture: V1 (512->256->128)")
        model = CompatibilityNetV1()
    else:
        raise ValueError(f"Unknown architecture with first layer shape {first_layer_shape}")

    # Load the weights
    model.load_state_dict(state_dict)
    print(" Loaded model weights from checkpoint")

    # Show optional training info when the checkpoint carries it
    if isinstance(checkpoint, dict) and 'hyperparameters' in checkpoint:
        print(f"   Hyperparameters: {checkpoint['hyperparameters']}")
    if isinstance(checkpoint, dict) and 'history' in checkpoint:
        history = checkpoint['history']
        if 'val_loss' in history and len(history['val_loss']) > 0:
            best_val_loss = min(history['val_loss'])
            print(f"   Best validation loss: {best_val_loss:.4f}")

except Exception as e:
    # Print troubleshooting hints, then re-raise so the cell still fails
    print(f" Error loading model: {e}")
    print("\n If this persists, please share:")
    print("   1. How you saved your model (torch.save code)")
    print("   2. The model architecture from your training notebook")
    raise

model.to(device)
# Inference mode: disables dropout and makes BatchNorm use its running statistics
model.eval()
print(" Model ready for evaluation")

# Check if we need to use uploaded embeddings or existing ones.
# Only the exact answer "yes" (case-insensitive) switches to the uploaded file.
# NOTE(review): the uploaded array is not checked against len(products_df);
# the scoring functions pair embeddings with products by row position.
if os.path.exists('./data/product_embeddings.npy'):
    print("\n  Found uploaded product_embeddings.npy")
    use_new = input("Use this instead of the existing embeddings? (yes/no): ").lower()
    if use_new == 'yes':
        product_embeddings = np.load('./data/product_embeddings.npy')
        print(" Using uploaded embeddings")
    else:
        print(" Using existing embeddings from earlier")
else:
    print(" Using existing product embeddings from earlier")

# Prepare price_rank_features (needed for model input): one row per product
# holding the raw 'price' and 'rank' column values.
# NOTE(review): the values are passed to the network unscaled - confirm this
# matches the preprocessing used when the model was trained.
print("\n Preparing features for neural network...")
price_rank_features = np.column_stack([
    products_df['price'].values,
    products_df['rank'].values
])
print(f" Features ready: {price_rank_features.shape}")

# Define enhanced functions (specificity-aware)
def calculate_specificity_score(product_row):
    """Score how narrowly targeted a product is, from its skin-type flags.

    Sums the 'Dry', 'Oily', 'Combination', 'Normal' and 'Sensitive' entries
    of product_row. A sum of 1 gives 1.0, each additional flag lowers the
    score by 0.25 (five flags give 0.0), and a sum of zero also gives 0.0.
    """
    flag_total = 0
    for column in ('Dry', 'Oily', 'Combination', 'Normal', 'Sensitive'):
        flag_total += product_row[column]

    # A product with no skin-type information is treated as not specific at all
    if flag_total > 0:
        return 1.0 - (flag_total - 1) / 4.0
    return 0.0

def extract_query_skin_types_nn(query):
    """Return the skin types mentioned in the query text.

    Uses a case-insensitive substring test against a fixed keyword table
    (the same table as extract_query_skin_types). Labels come back in table
    order; the list is empty when no keyword is found.
    """
    vocabulary = {
        'Dry': ['dry', 'dehydrated', 'flaky', 'tight'],
        'Oily': ['oily', 'greasy', 'shiny', 'sebum'],
        'Combination': ['combination', 'combo', 't-zone'],
        'Normal': ['normal', 'balanced'],
        'Sensitive': ['sensitive', 'reactive', 'redness', 'irritated']
    }
    lowered = query.lower()

    found = []
    for label in vocabulary:
        for term in vocabulary[label]:
            if term in lowered:
                # One hit is enough; move on to the next skin type
                found.append(label)
                break
    return found

def calculate_match_precision(product_row, query_skin_types):
    """Fraction of the query's skin types that the product is flagged for.

    Returns 0.5 when the query names no skin type; otherwise a value in
    [0, 1]. A product entry counts as flagged when it equals 1.
    """
    if not query_skin_types:
        return 0.5

    flagged = {
        name
        for name in ('Dry', 'Oily', 'Combination', 'Normal', 'Sensitive')
        if product_row[name] == 1
    }

    covered = sum(1 for wanted in query_skin_types if wanted in flagged)
    score = covered / len(query_skin_types)

    # Bonus for an exact match between query and product skin types.
    # NOTE(review): an exact match already means score == 1.0 here, so the
    # bonus is always removed again by the cap below.
    if set(query_skin_types) == flagged:
        score += 0.2

    return min(score, 1.0)

def get_nn_base_scores(query):
    """Return the raw network score for every product (no enhancements).

    For each product the input row is the query embedding, the product
    embedding and the product's price/rank pair concatenated in that order.
    Reads the module-level sbert_model, products_df, product_embeddings,
    price_rank_features, model and device.

    Returns a flat NumPy array with one score per product, in product order.
    """
    query_vec = sbert_model.encode([query], normalize_embeddings=True)[0]

    feature_rows = [
        np.concatenate([query_vec, product_embeddings[pos], price_rank_features[pos]])
        for pos in range(len(products_df))
    ]
    inputs = torch.FloatTensor(np.array(feature_rows)).to(device)

    # Inference only: no gradient tracking needed
    with torch.no_grad():
        predictions = model(inputs)
        flat_scores = predictions.cpu().numpy().flatten()

    return flat_scores

def get_nn_enhanced_scores(query, specificity_weight=0.3, precision_weight=0.2):
    """Boost the raw network scores with specificity and match precision.

    Each product's base score is multiplied by
    (1 + specificity * specificity_weight) and
    (1 + precision * precision_weight); the boosted scores are then divided
    by their maximum when that maximum is positive.

    Returns (enhanced_scores, base_scores, query_skin_types).
    """
    base_scores = get_nn_base_scores(query)
    wanted_types = extract_query_skin_types_nn(query)

    boosted = base_scores.copy()
    for pos in range(len(products_df)):
        row = products_df.iloc[pos]

        specificity_boost = 1.0 + calculate_specificity_score(row) * specificity_weight
        precision_boost = 1.0 + calculate_match_precision(row, wanted_types) * precision_weight

        boosted[pos] = base_scores[pos] * specificity_boost * precision_boost

    # Rescale so the best product scores 1.0; skipped when nothing is positive
    peak = boosted.max()
    if peak > 0:
        boosted = boosted / peak

    return boosted, base_scores, wanted_types

def recommend_nn_enhanced(query, k=5, filter_no_skin_type=True):
    """Return up to k recommendations ranked by the enhanced network score.

    When filter_no_skin_type is true, products whose five skin-type flags sum
    to zero are excluded. The top 3*k remaining products by enhanced score
    form the candidate pool, from which products are taken in score order
    with at most two per brand.
    """
    skin_columns = ['Dry', 'Oily', 'Combination', 'Normal', 'Sensitive']
    enhanced_scores, base_scores, wanted_types = get_nn_enhanced_scores(query)

    every_position = range(len(products_df))
    if filter_no_skin_type:
        valid_indices = [
            pos for pos in every_position
            if sum([products_df.iloc[pos][col] for col in skin_columns]) > 0
        ]
    else:
        valid_indices = list(every_position)

    # Candidate pool: best-scoring valid products, highest score first
    pool_size = min(k * 3, len(valid_indices))
    order_within_valid = enhanced_scores[valid_indices].argsort()[-pool_size:][::-1]
    pool = [valid_indices[local] for local in order_within_valid]

    # Greedy pick with a cap of two products per brand
    chosen = []
    chosen_brands = []
    for idx in pool:
        if len(chosen) >= k:
            break

        brand = products_df.iloc[idx]['brand']
        already_taken = sum(1 for earlier in chosen_brands if earlier == brand)
        if already_taken < 2:
            chosen.append(idx)
            chosen_brands.append(brand)

    recommendations = []
    for position, idx in enumerate(chosen, 1):
        row = products_df.iloc[idx]
        row_types = [col for col in skin_columns if row[col] == 1]

        # With no skin type detected in the query, every product counts as a match
        if wanted_types:
            is_match = any([st in row_types for st in wanted_types])
        else:
            is_match = True

        recommendations.append({
            'rank': position,
            'idx': idx,
            'name': row['name'],
            'brand': row['brand'],
            'category': row['Label'],
            'price': row['price'],
            'rating': row['rank'],
            'base_score': base_scores[idx],
            'enhanced_score': enhanced_scores[idx],
            'specificity': calculate_specificity_score(row),
            'precision': calculate_match_precision(row, wanted_types),
            'skin_types': ', '.join(row_types) if row_types else 'Not specified',
            'matches_query': is_match,
            'query_skin_types': ', '.join(wanted_types) if wanted_types else 'None detected'
        })

    return recommendations

# Status banner: summarizes the neural-network helpers defined in this cell
print(" Neural network functions defined")
print("  * Base NN: Raw model predictions")
print("  * Enhanced NN: Specificity + precision adjustments")
print("  * Filtering: Optional removal of products without skin type data")

# Section header for the comparison between baseline and neural network
print("STEP 5: COMPARATIVE EVALUATION")

def evaluate_all_systems(query, k=5):
    """Run the baseline, the raw network and the enhanced network on one query.

    The raw-network branch takes the top 3*k products by base score and
    selects from them in score order with at most two per brand.

    Returns (baseline_results, nn_base_results, nn_enhanced_results,
    query_skin_types).
    """
    skin_columns = ['Dry', 'Oily', 'Combination', 'Normal', 'Sensitive']

    # 1. Baseline
    baseline_results = recommend_top_k_baseline(query, k=k)

    # 2. Neural network (base): rank by raw score, then apply the brand cap
    raw_scores = get_nn_base_scores(query)
    wanted_types = extract_query_skin_types_nn(query)

    pool = raw_scores.argsort()[-k*3:][::-1]
    chosen = []
    chosen_brands = []
    for idx in pool:
        if len(chosen) >= k:
            break
        brand = products_df.iloc[idx]['brand']
        already_taken = sum(1 for earlier in chosen_brands if earlier == brand)
        if already_taken < 2:
            chosen.append(idx)
            chosen_brands.append(brand)

    nn_base_results = []
    for position, idx in enumerate(chosen, 1):
        row = products_df.iloc[idx]
        row_types = [col for col in skin_columns if row[col] == 1]

        # With no skin type detected in the query, every product counts as a match
        if wanted_types:
            is_match = any([st in row_types for st in wanted_types])
        else:
            is_match = True

        nn_base_results.append({
            'rank': position,
            'idx': idx,
            'name': row['name'],
            'brand': row['brand'],
            'final_score': raw_scores[idx],
            'skin_types': ', '.join(row_types) if row_types else 'Not specified',
            'matches_query': is_match
        })

    # 3. Neural network (enhanced), always with the no-skin-type filter on
    nn_enhanced_results = recommend_nn_enhanced(query, k=k, filter_no_skin_type=True)

    return baseline_results, nn_base_results, nn_enhanced_results, wanted_types


def _summarize_matches(results):
    """Return (hits, total, rate_percent) for one system's result list."""
    total = len(results)
    hits = sum(1 for r in results if r['matches_query'])
    # An empty list has a 0% match rate instead of dividing by zero
    rate = hits / total * 100 if total else 0.0
    return hits, total, rate


def display_comparison(query, baseline_results, nn_base_results, nn_enhanced_results, query_skin_types):
    """Display side-by-side comparison of all three systems.

    query:                the free-text query being evaluated.
    baseline_results:     result dicts from the baseline (reads name, brand,
                          category, price, skin_types, final_score,
                          matches_query).
    nn_base_results:      result dicts from the raw network (reads name,
                          brand, skin_types, final_score, matches_query).
    nn_enhanced_results:  result dicts from the enhanced network (reads name,
                          brand, skin_types, base_score, enhanced_score,
                          specificity, precision, matches_query).
    query_skin_types:     list of skin types detected in the query.

    Each system's match rate is computed over its own number of results, so
    the three lists may differ in length or be empty.

    Returns (baseline_rate, nn_base_rate, nn_enhanced_rate) as percentages.
    """
    print(f" QUERY: '{query}'")
    print(f"Detected skin types: {', '.join(query_skin_types) if query_skin_types else 'None detected'}")
    print(f"\n{'='*80}")

    # Calculate match rates, each against that system's own result count
    b_hits, b_total, b_rate = _summarize_matches(baseline_results)
    nb_hits, nb_total, nb_rate = _summarize_matches(nn_base_results)
    ne_hits, ne_total, ne_rate = _summarize_matches(nn_enhanced_results)

    print(f"\n MATCH RATE COMPARISON:")
    print(f"   Baseline:        {b_hits}/{b_total} ({b_rate:.0f}%)")
    print(f"   NN Base:         {nb_hits}/{nb_total} ({nb_rate:.0f}%)")
    print(f"   NN Enhanced:     {ne_hits}/{ne_total} ({ne_rate:.0f}%)")

    # Show top 3 from each
    print("TOP 3 PRODUCTS FROM EACH SYSTEM:")

    missing = "(no result at this rank)"
    # Systems can return fewer than k items (brand cap, filtering), so walk
    # as far as the longest list and mark the gaps instead of indexing past
    # the end of a shorter one.
    for i in range(min(3, max(b_total, nb_total, ne_total))):
        print(f"Rank #{i+1}:")
        print(f"{'─'*80}")

        # Baseline
        if i < b_total:
            b = baseline_results[i]
            match_symbol = "✓" if b['matches_query'] else "X"
            print(f"  BASELINE {match_symbol}:")
            print(f"    {b['name'][:60]}")
            print(f"    {b['brand']} | {b['category']} | ${b['price']:.2f}")
            print(f"    For: {b['skin_types']}")
            print(f"    Score: {b['final_score']:.1%}")
        else:
            print(f"  BASELINE: {missing}")

        # NN Base
        if i < nb_total:
            nb = nn_base_results[i]
            match_symbol = "✓" if nb['matches_query'] else "X"
            print(f"\n  NN BASE {match_symbol}:")
            print(f"    {nb['name'][:60]}")
            print(f"    {nb['brand']} | For: {nb['skin_types']}")
            print(f"    Score: {nb['final_score']:.1%}")
        else:
            print(f"\n  NN BASE: {missing}")

        # NN Enhanced
        if i < ne_total:
            ne = nn_enhanced_results[i]
            match_symbol = "✓" if ne['matches_query'] else "X"
            print(f"\n  NN ENHANCED {match_symbol}:")
            print(f"    {ne['name'][:60]}")
            print(f"    {ne['brand']} | For: {ne['skin_types']}")
            print(f"    Base: {ne['base_score']:.1%} → Enhanced: {ne['enhanced_score']:.1%}")
            print(f"    Specificity: {ne['specificity']:.1%} | Precision: {ne['precision']:.1%}")
        else:
            print(f"\n  NN ENHANCED: {missing}")

        print()

    return b_rate, nb_rate, ne_rate


# Run comparative evaluation
print("\n Running comparative evaluation on test queries...\n")

# Same four queries as the baseline-only test cell
test_queries = [
    "dry sensitive skin with redness",
    "oily acne prone skin",
    "combination skin with dark spots",
    "mature skin with wrinkles"
]

# Per-query match-rate percentages for each system, in query order
all_baseline_rates = []
all_nn_base_rates = []
all_nn_enhanced_rates = []

for query in test_queries:
    # Produce top-5 results from all three systems for this query
    baseline_res, nn_base_res, nn_enhanced_res, query_st = evaluate_all_systems(query, k=5)
    # Print the comparison and collect the three match rates
    b_rate, nb_rate, ne_rate = display_comparison(query, baseline_res, nn_base_res, nn_enhanced_res, query_st)

    all_baseline_rates.append(b_rate)
    all_nn_base_rates.append(nb_rate)
    all_nn_enhanced_rates.append(ne_rate)

    print("\n" + "─"*80 + "\n")

print("FINAL EVALUATION SUMMARY")

# Average each system's per-query match rates
avg_baseline = np.mean(all_baseline_rates)
avg_nn_base = np.mean(all_nn_base_rates)
avg_nn_enhanced = np.mean(all_nn_enhanced_rates)

print(f"\n AVERAGE MATCH RATES:")
print(f"   Baseline (Rule-based):         {avg_baseline:.1f}%")
print(f"   Neural Network (Base):         {avg_nn_base:.1f}%")
print(f"   Neural Network (Enhanced):     {avg_nn_enhanced:.1f}%")

# Differences are in percentage points (enhanced/base minus baseline)
print(f"\n IMPROVEMENTS:")
improvement_base = avg_nn_base - avg_baseline
improvement_enhanced = avg_nn_enhanced - avg_baseline
print(f"   NN Base vs Baseline:           {improvement_base:+.1f} percentage points")
print(f"   NN Enhanced vs Baseline:       {improvement_enhanced:+.1f} percentage points")
print(f"   NN Enhanced vs NN Base:        {avg_nn_enhanced - avg_nn_base:+.1f} percentage points")

print("Verdict:")

# Verdict compares only the enhanced network against the baseline.
# NOTE(review): the branches are asymmetric - any positive lead for the
# network is reported as a (marginal) win, so the TIE branch is only reached
# when the network is equal to or up to 2 points behind the baseline.
# Confirm this is intended.
if avg_nn_enhanced > avg_baseline + 5:
    print(" WINNER: Neural Network (Enhanced)")
    print(f"   The enhanced neural network significantly outperforms the baseline")
    print(f"   by {avg_nn_enhanced - avg_baseline:.1f} percentage points.")
    print(f"   The training complexity is JUSTIFIED.")
elif avg_nn_enhanced > avg_baseline:
    print(" WINNER: Neural Network (Enhanced) - Marginal")
    print(f"   The enhanced neural network slightly outperforms the baseline")
    print(f"   by {avg_nn_enhanced - avg_baseline:.1f} percentage points.")
    print(f"   Consider if the improvement justifies the added complexity.")
elif abs(avg_nn_enhanced - avg_baseline) <= 2:
    print("  TIE: Both systems perform similarly")
    print(f"   Difference is only {abs(avg_nn_enhanced - avg_baseline):.1f} percentage points.")
    print(f"   The simpler baseline may be preferred for production.")
else:
    print("  WINNER: Baseline")
    print(f"   The baseline outperforms the neural network by {avg_baseline - avg_nn_enhanced:.1f} points.")
    print(f"   The neural network training was NOT justified.")

# Effect of the specificity/precision boosts: enhanced minus raw network
print(f"\n KEY INSIGHTS:")
print(f"   * Enhancement impact: {avg_nn_enhanced - avg_nn_base:+.1f} points")
if avg_nn_enhanced - avg_nn_base > 5:
    print(f"   * The specificity/precision enhancements provide significant value")
elif avg_nn_enhanced - avg_nn_base > 0:
    print(f"   * The specificity/precision enhancements provide modest value")
else:
    print(f"   * The specificity/precision enhancements have minimal impact")

# Ties are resolved in the order checked: enhanced network, raw network, baseline
print(f"\n   * Best system: ", end="")
if avg_nn_enhanced == max(avg_baseline, avg_nn_base, avg_nn_enhanced):
    print("Neural Network (Enhanced)")
elif avg_nn_base == max(avg_baseline, avg_nn_base, avg_nn_enhanced):
    print("Neural Network (Base)")
else:
    print("Baseline")

Investigating the 0% scores

In [None]:
# Diagnostic: print raw network outputs for a few products plus summary
# statistics of each input feature group.
print("\n DIAGNOSTIC: Checking model outputs...")
test_query = "dry sensitive skin"
query_emb = sbert_model.encode([test_query], normalize_embeddings=True)[0]

# Check a few products (the first five rows; assumes at least five exist)
for i in range(5):
    product_emb = product_embeddings[i]
    numerical = price_rank_features[i]
    # Same feature order as get_nn_base_scores: query, product, price/rank
    features = np.concatenate([query_emb, product_emb, numerical])

    with torch.no_grad():
        # unsqueeze(0) adds the batch dimension for a single input row
        score = model(torch.FloatTensor(features).unsqueeze(0).to(device))
        print(f"Product {i}: Score = {score.item():.6f}")

# Check feature statistics: column 0 of price_rank_features is price,
# column 1 is rank
print(f"\nQuery embedding stats: mean={query_emb.mean():.3f}, std={query_emb.std():.3f}")
print(f"Product embedding stats: mean={product_embeddings.mean():.3f}, std={product_embeddings.std():.3f}")
print(f"Price stats: mean={price_rank_features[:, 0].mean():.3f}, std={price_rank_features[:, 0].std():.3f}")
print(f"Rank stats: mean={price_rank_features[:, 1].mean():.3f}, std={price_rank_features[:, 1].std():.3f}")