In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set style for visualizations
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

<b> Product Matching System with validation (Testing, Metrics) and visualizations </b>

In [7]:
# creating test set
matches_df = pd.read_csv('matching_results.csv')

test_set = [
    # Format: (Description_ID, Correct_SKU, Description, Notes)
    ('DESC0001', 'SKU1000010', 'Style Pants for Fall 2025', 'Style brand, Parka (closest to pants)'),
    ('DESC0009', 'SKU1000036', 'White Polo from Alpine, size M', 'Alpine, White, M'),
    ('DESC0015', 'SKU1000023', 'Nordic Vest in Green size M', 'Exact match'),
    ('DESC0017', 'SKU1000035', 'Comfort Coat in Beige size XXL', 'Beige Vest XXL (closest)'),
    ('DESC0023', 'SKU1000004', 'Tan Parka from Premium, size XXL', 'Premium brand issue'),
    ('DESC0031', 'SKU1000006', 'Alpine Parka Gray size L', 'Close match')]

In [8]:
# Create test DataFrame
test_df = pd.DataFrame(test_set, columns=['Description_ID', 'Ground_Truth_SKU', 'Description', 'Notes'])

print(f"Test set created with {len(test_df)} samples")
print("\nSample test entries:")
print(test_df.head())

Test set created with 6 samples

Sample test entries:
  Description_ID Ground_Truth_SKU                       Description  \
0       DESC0001       SKU1000010         Style Pants for Fall 2025   
1       DESC0009       SKU1000036    White Polo from Alpine, size M   
2       DESC0015       SKU1000023       Nordic Vest in Green size M   
3       DESC0017       SKU1000035    Comfort Coat in Beige size XXL   
4       DESC0023       SKU1000004  Tan Parka from Premium, size XXL   

                                   Notes  
0  Style brand, Parka (closest to pants)  
1                       Alpine, White, M  
2                            Exact match  
3               Beige Vest XXL (closest)  
4                    Premium brand issue  


In [9]:
# Save test set
test_df.to_csv('test_set_ground_truth.csv', index=False)
print("\nTest set saved to 'test_set_ground_truth.csv'")


Test set saved to 'test_set_ground_truth.csv'


In [10]:
# Calculate comprehensive performance metrics
def calculate_metrics(test_df, matches_df):    
    metrics = {
        'top1_correct': 0,
        'top3_correct': 0,
        'total': len(test_df),
        'confidence_scores': [],
        'correct_confidences': [],
        'incorrect_confidences': []
    }    
    detailed_results = []
    
    for idx, row in test_df.iterrows():
        desc_id = row['Description_ID']
        ground_truth = row['Ground_Truth_SKU']        
        # Get top 3 matches for this description
        desc_matches = matches_df[matches_df['Description_ID'] == desc_id].sort_values('Match_Rank')
        if len(desc_matches) == 0:
            continue
        
        # Top-1 prediction
        top1_sku = desc_matches.iloc[0]['SKU']
        top1_conf = desc_matches.iloc[0]['Confidence']
        top1_correct = (top1_sku == ground_truth)
        
        if top1_correct:
            metrics['top1_correct'] += 1
            metrics['correct_confidences'].append(top1_conf)
        else:
            metrics['incorrect_confidences'].append(top1_conf)
        
        # Top-3 prediction
        top3_skus = desc_matches['SKU'].tolist()[:3]
        top3_correct = ground_truth in top3_skus
        
        if top3_correct:
            metrics['top3_correct'] += 1
        
        metrics['confidence_scores'].append(top1_conf)
        
        detailed_results.append({
            'Description_ID': desc_id,
            'Ground_Truth': ground_truth,
            'Predicted_SKU': top1_sku,
            'Top1_Correct': top1_correct,
            'Top3_Correct': top3_correct,
            'Confidence': top1_conf,
            'Top3_SKUs': ', '.join(top3_skus)
        })    
    # Calculate percentages
    metrics['top1_accuracy'] = (metrics['top1_correct'] / metrics['total']) * 100
    metrics['top3_accuracy'] = (metrics['top3_correct'] / metrics['total']) * 100
    metrics['avg_confidence'] = np.mean(metrics['confidence_scores'])
    metrics['avg_correct_confidence'] = np.mean(metrics['correct_confidences']) if metrics['correct_confidences'] else 0
    metrics['avg_incorrect_confidence'] = np.mean(metrics['incorrect_confidences']) if metrics['incorrect_confidences'] else 0
    
    return metrics, pd.DataFrame(detailed_results)

In [11]:
# Calculate metrics
performance_metrics, detailed_results = calculate_metrics(test_df, matches_df)

print("PERFORMANCE METRICS")
print(f"Total Test Cases: {performance_metrics['total']}")
print(f"\nTop-1 Accuracy: {performance_metrics['top1_accuracy']:.2f}%")
print(f"Top-3 Accuracy: {performance_metrics['top3_accuracy']:.2f}%")
print(f"\nAverage Confidence: {performance_metrics['avg_confidence']:.2f}%")
print(f"Avg Confidence (Correct): {performance_metrics['avg_correct_confidence']:.2f}%")
print(f"Avg Confidence (Incorrect): {performance_metrics['avg_incorrect_confidence']:.2f}%")

# Save detailed results
detailed_results.to_csv('validation_results.csv', index=False)
print("\nDetailed validation results saved to 'validation_results.csv'")

PERFORMANCE METRICS
Total Test Cases: 6

Top-1 Accuracy: 16.67%
Top-3 Accuracy: 16.67%

Average Confidence: 82.42%
Avg Confidence (Correct): 89.37%
Avg Confidence (Incorrect): 81.03%

Detailed validation results saved to 'validation_results.csv'


<b> Let's investigate the error analysis </b>

In [12]:
# Analyze errors
errors_df = detailed_results[detailed_results['Top1_Correct'] == False]

print(f"\nTotal Errors: {len(errors_df)}")
print("\nError Cases:")
print(errors_df[['Description_ID', 'Ground_Truth', 'Predicted_SKU', 'Confidence']])

# Error categorization
error_analysis = {
    'wrong_brand': 0,
    'wrong_color': 0,
    'wrong_size': 0,
    'wrong_category': 0,
    'ambiguous_description': 0
}


Total Errors: 5

Error Cases:
  Description_ID Ground_Truth Predicted_SKU  Confidence
0       DESC0001   SKU1000010    SKU1000425       82.56
1       DESC0009   SKU1000036    SKU1000182       73.21
3       DESC0017   SKU1000035    SKU1000109       88.55
4       DESC0023   SKU1000004    SKU1000142       71.05
5       DESC0031   SKU1000006    SKU1000046       89.79


In [13]:
# Load catalog for comparison
catalog_df = pd.read_csv('b_product_catalog.csv')

for idx, error in errors_df.iterrows():
    desc_id = error['Description_ID']
    # Analyze why it failed - you'd need to implement detailed logic here
    # For now, we'll categorize based on confidence
    if error['Confidence'] < 40:
        error_analysis['ambiguous_description'] += 1
    elif error['Confidence'] < 60:
        error_analysis['wrong_category'] += 1
    else:
        error_analysis['wrong_brand'] += 1

In [14]:
print("\nError Breakdown:")
for error_type, count in error_analysis.items():
    print(f"  {error_type}: {count}")


Error Breakdown:
  wrong_brand: 5
  wrong_color: 0
  wrong_size: 0
  wrong_category: 0
  ambiguous_description: 0


<b> Let's analyze results by category </b>

In [15]:
# Merge with catalog to get categories
results_with_cat = detailed_results.merge(
    catalog_df[['SKU', 'Subcategory']], 
    left_on='Ground_Truth', 
    right_on='SKU',
    how='left'
)

In [16]:
# Calculate accuracy by subcategory
category_performance = results_with_cat.groupby('Subcategory').agg({
    'Top1_Correct': 'mean',
    'Top3_Correct': 'mean',
    'Confidence': 'mean'
}).round(2) * 100

print("\nPerformance by Subcategory:")
print(category_performance)


Performance by Subcategory:
             Top1_Correct  Top3_Correct  Confidence
Subcategory                                        
Jacket                0.0           0.0      8150.0
Parka                 0.0           0.0      7681.0
Vest                 50.0          50.0      8896.0


<b> cretating visualizations </b>

In [17]:
# First visulization : Confidence Distribution
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# All matches confidence distribution
all_top1 = matches_df[matches_df['Match_Rank'] == 1]
axes[0].hist(all_top1['Confidence'], bins=20, color='skyblue', edgecolor='black', alpha=0.7)
axes[0].axvline(performance_metrics['avg_confidence'], color='red', linestyle='--', 
                label=f'Mean: {performance_metrics["avg_confidence"]:.1f}%')
axes[0].set_xlabel('Confidence Score (%)', fontsize=12)
axes[0].set_ylabel('Frequency', fontsize=12)
axes[0].set_title('Distribution of Confidence Scores (All Matches)', fontsize=14, fontweight='bold')
axes[0].legend()
axes[0].grid(axis='y', alpha=0.3)

# Correct vs Incorrect confidence
axes[1].hist([performance_metrics['correct_confidences'], 
              performance_metrics['incorrect_confidences']], 
             bins=15, label=['Correct Matches', 'Incorrect Matches'],
             color=['green', 'red'], alpha=0.6, edgecolor='black')
axes[1].set_xlabel('Confidence Score (%)', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_title('Confidence: Correct vs Incorrect Matches', fontsize=14, fontweight='bold')
axes[1].legend()
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('viz1_confidence_distribution.png', dpi=300, bbox_inches='tight')
print("✓ Visualization 1 saved: viz1_confidence_distribution.png")
plt.close()

✓ Visualization 1 saved: viz1_confidence_distribution.png


In [18]:
# Second visualization: Accuracy Metrics
fig, ax = plt.subplots(figsize=(10, 6))

metrics_data = {
    'Top-1 Accuracy': performance_metrics['top1_accuracy'],
    'Top-3 Accuracy': performance_metrics['top3_accuracy'],
    'Avg Confidence': performance_metrics['avg_confidence']
}

bars = ax.bar(metrics_data.keys(), metrics_data.values(), 
              color=['#2ecc71', '#3498db', '#f39c12'], 
              edgecolor='black', linewidth=1.5)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height,
            f'{height:.1f}%',
            ha='center', va='bottom', fontsize=12, fontweight='bold')

ax.set_ylabel('Percentage (%)', fontsize=12)
ax.set_title('Model Performance Metrics', fontsize=14, fontweight='bold')
ax.set_ylim(0, 100)
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('viz2_accuracy_metrics.png', dpi=300, bbox_inches='tight')
print("✓ Visualization 2 saved: viz2_accuracy_metrics.png")
plt.close()


✓ Visualization 2 saved: viz2_accuracy_metrics.png


In [19]:
# Third visualization: Error Analysis
fig, ax = plt.subplots(figsize=(10, 6))

error_types = list(error_analysis.keys())
error_counts = list(error_analysis.values())

colors = ['#e74c3c', '#e67e22', '#f39c12', '#3498db', '#9b59b6']
wedges, texts, autotexts = ax.pie(error_counts, labels=error_types, autopct='%1.1f%%',
                                    colors=colors, startangle=90,
                                    textprops={'fontsize': 11})

# Make percentage text bold
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')

ax.set_title('Error Analysis by Type', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('viz3_error_analysis.png', dpi=300, bbox_inches='tight')
print("✓ Visualization 3 saved: viz3_error_analysis.png")
plt.close()

✓ Visualization 3 saved: viz3_error_analysis.png
