In [1]:
# Setup and imports
import os
import sys
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from PIL import Image
import warnings
warnings.filterwarnings('ignore')

import torch
import clip
from sklearn.metrics import (
    accuracy_score, precision_recall_fscore_support,
    confusion_matrix, classification_report
)

# Import custom modules
sys.path.append(str(Path.cwd().parent))
from src.ensemble_classifier import EnsembleClassifier
from src.smart_validator import SmartValidator

# Set style
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (14, 8)
plt.rcParams['font.size'] = 11

# Device configuration
device = "mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")
print(f"PyTorch version: {torch.__version__}")

Using device: mps
PyTorch version: 2.9.0


## 1. Load Ensemble Classifier and Data

In [3]:
# Initialize ensemble classifier
DATA_ROOT = "../data/deepfashion_subset"
RESULTS_DIR = "../results"

print("Loading CLIP model and ensemble components...")
ensemble = EnsembleClassifier(device=device)
validator = SmartValidator()

# Load class names
CLASSMAP = os.path.join(RESULTS_DIR, "class_to_idx.json")
with open(CLASSMAP, 'r') as f:
    idx_to_class = json.load(f)
    class_to_idx = {v: int(k) for k, v in idx_to_class.items()}

num_classes = len(idx_to_class)
class_names = [idx_to_class[str(i)] for i in range(num_classes)]

print(f"\nClasses ({num_classes}): {class_names}")
print("\nEnsemble Classifier loaded successfully!")
print(f"  - CLIP Model: ViT-B/32")
print(f"  - Components: CLIP + Keyword + Path Analysis")
print(f"  - Smart Validator: Confidence-based (thresholds: 0.90/0.70/0.50)")

Loading CLIP model and ensemble components...


TypeError: EnsembleClassifier.__init__() missing 3 required positional arguments: 'clip_model', 'clip_preprocess', and 'categories'

## 2. Load Test Data and Generate Predictions

In [None]:
# Load test images
test_dir = os.path.join(DATA_ROOT, "test")
test_images = []
test_labels = []
test_paths = []

for class_name in class_names:
    class_dir = os.path.join(test_dir, class_name)
    if not os.path.exists(class_dir):
        continue
    
    for img_file in os.listdir(class_dir):
        if img_file.endswith(('.jpg', '.jpeg', '.png')):
            img_path = os.path.join(class_dir, img_file)
            test_paths.append(img_path)
            test_labels.append(class_to_idx[class_name])

print(f"Found {len(test_paths)} test images")
print(f"Class distribution:")
for i, class_name in enumerate(class_names):
    count = test_labels.count(i)
    print(f"  {class_name}: {count}")

In [None]:
# Generate predictions with ensemble classifier
print("\nGenerating predictions with Ensemble Classifier...")

y_true = []
y_pred = []
y_probs = []
predictions_detailed = []

for i, (img_path, true_label) in enumerate(zip(test_paths, test_labels)):
    if (i + 1) % 50 == 0:
        print(f"  Processing {i+1}/{len(test_paths)}...")
    
    try:
        # Load image
        image = Image.open(img_path).convert('RGB')
        
        # Get ensemble prediction
        result = ensemble.classify_with_ensemble(
            image=image,
            image_path=img_path,
            class_names=class_names
        )
        
        # Validate
        validated = validator.validate_classification(
            result['predicted_class'],
            result['confidence'],
            result
        )
        
        pred_label = class_to_idx[validated['final_class']]
        
        y_true.append(true_label)
        y_pred.append(pred_label)
        y_probs.append(validated['final_confidence'])
        
        predictions_detailed.append({
            'image_path': img_path,
            'true_class': class_names[true_label],
            'pred_class': validated['final_class'],
            'confidence': validated['final_confidence'],
            'clip_score': result['clip_score'],
            'keyword_score': result['keyword_score'],
            'path_score': result['path_score'],
            'correct': (pred_label == true_label)
        })
        
    except Exception as e:
        print(f"  Error processing {img_path}: {e}")
        continue

y_true = np.array(y_true)
y_pred = np.array(y_pred)
y_probs = np.array(y_probs)

print(f"\nPredictions complete: {len(y_true)} samples processed")

## 3. Overall Performance Metrics

In [None]:
# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1, support = precision_recall_fscore_support(
    y_true, y_pred, average=None, zero_division=0
)

# Create metrics dataframe
metrics_df = pd.DataFrame({
    'Class': class_names,
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Support': support
})

print(f"\n{'='*70}")
print(f"DELIVERABLE 3 - ENSEMBLE CLASSIFIER PERFORMANCE")
print(f"{'='*70}\n")
print(f"Model: CLIP ViT-B/32 + Keyword + Path Analysis")
print(f"Overall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)\n")
print(metrics_df.to_string(index=False))
print(f"\n{'='*70}")

# Macro averages
print(f"\nMacro Averages:")
print(f"  Precision: {precision.mean():.4f}")
print(f"  Recall:    {recall.mean():.4f}")
print(f"  F1-Score:  {f1.mean():.4f}")

# Weighted averages
weighted_p = np.average(precision, weights=support)
weighted_r = np.average(recall, weights=support)
weighted_f1 = np.average(f1, weights=support)
print(f"\nWeighted Averages:")
print(f"  Precision: {weighted_p:.4f}")
print(f"  Recall:    {weighted_r:.4f}")
print(f"  F1-Score:  {weighted_f1:.4f}")

## 4. Confusion Matrix Analysis

In [None]:
# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
cm_normalized = cm.astype('float') / (cm.sum(axis=1)[:, np.newaxis] + 1e-10)

fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Raw counts
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=class_names, yticklabels=class_names,
            ax=axes[0], cbar_kws={'label': 'Count'}, annot_kws={'size': 10})
axes[0].set_xlabel('Predicted Label', fontweight='bold', fontsize=12)
axes[0].set_ylabel('True Label', fontweight='bold', fontsize=12)
axes[0].set_title('Confusion Matrix (Raw Counts)', fontweight='bold', fontsize=14)
axes[0].tick_params(axis='x', rotation=45)

# Normalized
sns.heatmap(cm_normalized, annot=True, fmt='.2f', cmap='RdYlGn',
            xticklabels=class_names, yticklabels=class_names,
            ax=axes[1], cbar_kws={'label': 'Proportion'}, annot_kws={'size': 10})
axes[1].set_xlabel('Predicted Label', fontweight='bold', fontsize=12)
axes[1].set_ylabel('True Label', fontweight='bold', fontsize=12)
axes[1].set_title('Confusion Matrix (Normalized)', fontweight='bold', fontsize=14)
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'deliverable3_v4_confusion_matrix.png'), 
            dpi=300, bbox_inches='tight')
plt.show()

# Identify most confused pairs
print("\nMost Confused Class Pairs:")
confused_pairs = []
for i in range(len(class_names)):
    for j in range(len(class_names)):
        if i != j and cm[i, j] > 0:
            confused_pairs.append((class_names[i], class_names[j], cm[i, j], cm_normalized[i, j]))

confused_pairs.sort(key=lambda x: x[2], reverse=True)
for true_class, pred_class, count, prop in confused_pairs[:10]:
    print(f"  {true_class} → {pred_class}: {count} cases ({prop*100:.1f}%)")

## 5. Per-Class Performance Analysis

In [None]:
# Visualize per-class metrics
fig, axes = plt.subplots(2, 2, figsize=(16, 11))

# Precision
axes[0, 0].bar(class_names, precision, color='steelblue', alpha=0.8)
axes[0, 0].set_ylabel('Precision', fontweight='bold', fontsize=12)
axes[0, 0].set_title('Precision by Class', fontweight='bold', fontsize=14)
axes[0, 0].set_ylim([0, 1.05])
axes[0, 0].axhline(y=precision.mean(), color='red', linestyle='--', linewidth=2,
                   label=f'Mean: {precision.mean():.3f}')
axes[0, 0].legend(fontsize=10)
axes[0, 0].grid(axis='y', alpha=0.3)
axes[0, 0].tick_params(axis='x', rotation=45)

# Recall
axes[0, 1].bar(class_names, recall, color='coral', alpha=0.8)
axes[0, 1].set_ylabel('Recall', fontweight='bold', fontsize=12)
axes[0, 1].set_title('Recall by Class', fontweight='bold', fontsize=14)
axes[0, 1].set_ylim([0, 1.05])
axes[0, 1].axhline(y=recall.mean(), color='red', linestyle='--', linewidth=2,
                   label=f'Mean: {recall.mean():.3f}')
axes[0, 1].legend(fontsize=10)
axes[0, 1].grid(axis='y', alpha=0.3)
axes[0, 1].tick_params(axis='x', rotation=45)

# F1-Score
axes[1, 0].bar(class_names, f1, color='seagreen', alpha=0.8)
axes[1, 0].set_ylabel('F1-Score', fontweight='bold', fontsize=12)
axes[1, 0].set_title('F1-Score by Class', fontweight='bold', fontsize=14)
axes[1, 0].set_ylim([0, 1.05])
axes[1, 0].axhline(y=f1.mean(), color='red', linestyle='--', linewidth=2,
                   label=f'Mean: {f1.mean():.3f}')
axes[1, 0].legend(fontsize=10)
axes[1, 0].grid(axis='y', alpha=0.3)
axes[1, 0].tick_params(axis='x', rotation=45)

# Support
axes[1, 1].bar(class_names, support, color='mediumpurple', alpha=0.8)
axes[1, 1].set_ylabel('Support (# samples)', fontweight='bold', fontsize=12)
axes[1, 1].set_title('Test Set Distribution', fontweight='bold', fontsize=14)
axes[1, 1].grid(axis='y', alpha=0.3)
axes[1, 1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'deliverable3_v4_per_class_metrics.png'), 
            dpi=300, bbox_inches='tight')
plt.show()

## 6. Ensemble Component Analysis

In [None]:
# Analyze contribution of each ensemble component
predictions_df = pd.DataFrame(predictions_detailed)

fig, axes = plt.subplots(2, 2, figsize=(16, 11))

# CLIP scores distribution
axes[0, 0].hist(predictions_df[predictions_df['correct']]['clip_score'], 
                bins=30, alpha=0.7, label='Correct', color='green', edgecolor='black')
axes[0, 0].hist(predictions_df[~predictions_df['correct']]['clip_score'], 
                bins=30, alpha=0.7, label='Incorrect', color='red', edgecolor='black')
axes[0, 0].set_xlabel('CLIP Score', fontweight='bold', fontsize=12)
axes[0, 0].set_ylabel('Frequency', fontweight='bold', fontsize=12)
axes[0, 0].set_title('CLIP Score Distribution', fontweight='bold', fontsize=14)
axes[0, 0].legend(fontsize=11)
axes[0, 0].grid(alpha=0.3)

# Keyword scores distribution
axes[0, 1].hist(predictions_df[predictions_df['correct']]['keyword_score'], 
                bins=30, alpha=0.7, label='Correct', color='green', edgecolor='black')
axes[0, 1].hist(predictions_df[~predictions_df['correct']]['keyword_score'], 
                bins=30, alpha=0.7, label='Incorrect', color='red', edgecolor='black')
axes[0, 1].set_xlabel('Keyword Score', fontweight='bold', fontsize=12)
axes[0, 1].set_ylabel('Frequency', fontweight='bold', fontsize=12)
axes[0, 1].set_title('Keyword Score Distribution', fontweight='bold', fontsize=14)
axes[0, 1].legend(fontsize=11)
axes[0, 1].grid(alpha=0.3)

# Path scores distribution
axes[1, 0].hist(predictions_df[predictions_df['correct']]['path_score'], 
                bins=30, alpha=0.7, label='Correct', color='green', edgecolor='black')
axes[1, 0].hist(predictions_df[~predictions_df['correct']]['path_score'], 
                bins=30, alpha=0.7, label='Incorrect', color='red', edgecolor='black')
axes[1, 0].set_xlabel('Path Score', fontweight='bold', fontsize=12)
axes[1, 0].set_ylabel('Frequency', fontweight='bold', fontsize=12)
axes[1, 0].set_title('Path Score Distribution', fontweight='bold', fontsize=14)
axes[1, 0].legend(fontsize=11)
axes[1, 0].grid(alpha=0.3)

# Final confidence distribution
axes[1, 1].hist(predictions_df[predictions_df['correct']]['confidence'], 
                bins=30, alpha=0.7, label='Correct', color='green', edgecolor='black')
axes[1, 1].hist(predictions_df[~predictions_df['correct']]['confidence'], 
                bins=30, alpha=0.7, label='Incorrect', color='red', edgecolor='black')
axes[1, 1].set_xlabel('Final Confidence', fontweight='bold', fontsize=12)
axes[1, 1].set_ylabel('Frequency', fontweight='bold', fontsize=12)
axes[1, 1].set_title('Final Confidence Distribution', fontweight='bold', fontsize=14)
axes[1, 1].legend(fontsize=11)
axes[1, 1].grid(alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'deliverable3_v4_ensemble_components.png'), 
            dpi=300, bbox_inches='tight')
plt.show()

# Print average scores
print("\nAverage Component Scores:")
print(f"Correct Predictions:")
print(f"  CLIP:    {predictions_df[predictions_df['correct']]['clip_score'].mean():.4f}")
print(f"  Keyword: {predictions_df[predictions_df['correct']]['keyword_score'].mean():.4f}")
print(f"  Path:    {predictions_df[predictions_df['correct']]['path_score'].mean():.4f}")
print(f"\nIncorrect Predictions:")
print(f"  CLIP:    {predictions_df[~predictions_df['correct']]['clip_score'].mean():.4f}")
print(f"  Keyword: {predictions_df[~predictions_df['correct']]['keyword_score'].mean():.4f}")
print(f"  Path:    {predictions_df[~predictions_df['correct']]['path_score'].mean():.4f}")

## 7. System Evolution Comparison

In [None]:
# Historical performance data
evolution_data = {
    'Version': ['Deliverable 2\n(ResNet50)', 'Deliverable 3 v1\n(CLIP Only)', 
                'Deliverable 3 v2\n(CLIP + Keyword)', 'Deliverable 3 v4\n(Full Ensemble)'],
    'Accuracy': [0.566, 0.620, 0.680, accuracy],
    'Precision': [0.572, 0.635, 0.695, precision.mean()],
    'Recall': [0.566, 0.620, 0.680, recall.mean()],
    'F1-Score': [0.568, 0.625, 0.685, f1.mean()]
}

evolution_df = pd.DataFrame(evolution_data)

# Plot evolution
fig, axes = plt.subplots(1, 2, figsize=(18, 7))

# Line plot
x = np.arange(len(evolution_df))
axes[0].plot(x, evolution_df['Accuracy'], marker='o', linewidth=3, 
             markersize=10, label='Accuracy', color='steelblue')
axes[0].plot(x, evolution_df['Precision'], marker='s', linewidth=3, 
             markersize=10, label='Precision', color='coral')
axes[0].plot(x, evolution_df['Recall'], marker='^', linewidth=3, 
             markersize=10, label='Recall', color='seagreen')
axes[0].plot(x, evolution_df['F1-Score'], marker='D', linewidth=3, 
             markersize=10, label='F1-Score', color='mediumpurple')

axes[0].set_xticks(x)
axes[0].set_xticklabels(evolution_df['Version'], fontsize=10)
axes[0].set_ylabel('Score', fontweight='bold', fontsize=13)
axes[0].set_title('System Performance Evolution', fontweight='bold', fontsize=15)
axes[0].set_ylim([0.5, 0.85])
axes[0].legend(fontsize=12, loc='lower right')
axes[0].grid(alpha=0.3)

# Add value labels
for metric in ['Accuracy', 'Precision', 'Recall', 'F1-Score']:
    for i, val in enumerate(evolution_df[metric]):
        axes[0].text(i, val + 0.01, f'{val:.3f}', ha='center', va='bottom', 
                     fontsize=9, fontweight='bold')

# Bar chart comparison
bar_width = 0.2
x_pos = np.arange(len(evolution_df))

axes[1].bar(x_pos - 1.5*bar_width, evolution_df['Accuracy'], bar_width, 
            label='Accuracy', color='steelblue', alpha=0.8)
axes[1].bar(x_pos - 0.5*bar_width, evolution_df['Precision'], bar_width, 
            label='Precision', color='coral', alpha=0.8)
axes[1].bar(x_pos + 0.5*bar_width, evolution_df['Recall'], bar_width, 
            label='Recall', color='seagreen', alpha=0.8)
axes[1].bar(x_pos + 1.5*bar_width, evolution_df['F1-Score'], bar_width, 
            label='F1-Score', color='mediumpurple', alpha=0.8)

axes[1].set_xticks(x_pos)
axes[1].set_xticklabels(evolution_df['Version'], fontsize=10)
axes[1].set_ylabel('Score', fontweight='bold', fontsize=13)
axes[1].set_title('Performance Comparison Across Versions', fontweight='bold', fontsize=15)
axes[1].set_ylim([0, 0.9])
axes[1].legend(fontsize=11)
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'deliverable3_v4_evolution.png'), 
            dpi=300, bbox_inches='tight')
plt.show()

# Print improvements
print("\nPerformance Improvements:")
baseline_acc = evolution_df['Accuracy'].iloc[0]
current_acc = evolution_df['Accuracy'].iloc[-1]
improvement = (current_acc - baseline_acc) * 100
print(f"  Accuracy: {baseline_acc:.1%} → {current_acc:.1%} (+{improvement:.1f}%)")

baseline_f1 = evolution_df['F1-Score'].iloc[0]
current_f1 = evolution_df['F1-Score'].iloc[-1]
improvement_f1 = (current_f1 - baseline_f1) * 100
print(f"  F1-Score: {baseline_f1:.1%} → {current_f1:.1%} (+{improvement_f1:.1f}%)")

## 8. Error Analysis

In [None]:
# Analyze misclassified samples
errors_df = predictions_df[~predictions_df['correct']].copy()
correct_df = predictions_df[predictions_df['correct']].copy()

print(f"\n{'='*70}")
print("ERROR ANALYSIS")
print(f"{'='*70}\n")
print(f"Total test samples: {len(predictions_df)}")
print(f"Correct predictions: {len(correct_df)} ({len(correct_df)/len(predictions_df)*100:.1f}%)")
print(f"Misclassified: {len(errors_df)} ({len(errors_df)/len(predictions_df)*100:.1f}%)")

# Confidence analysis
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Histogram
axes[0].hist(correct_df['confidence'], bins=25, alpha=0.7, 
             label=f'Correct (n={len(correct_df)})', color='green', edgecolor='black')
axes[0].hist(errors_df['confidence'], bins=25, alpha=0.7, 
             label=f'Errors (n={len(errors_df)})', color='red', edgecolor='black')
axes[0].set_xlabel('Confidence Score', fontweight='bold', fontsize=12)
axes[0].set_ylabel('Frequency', fontweight='bold', fontsize=12)
axes[0].set_title('Confidence Distribution: Correct vs Errors', fontweight='bold', fontsize=14)
axes[0].legend(fontsize=11)
axes[0].grid(alpha=0.3)

# Box plot
data_to_plot = [correct_df['confidence'], errors_df['confidence']]
bp = axes[1].boxplot(data_to_plot, labels=['Correct', 'Errors'],
                      patch_artist=True, widths=0.5)
for patch, color in zip(bp['boxes'], ['lightgreen', 'lightcoral']):
    patch.set_facecolor(color)
axes[1].set_ylabel('Confidence Score', fontweight='bold', fontsize=12)
axes[1].set_title('Confidence Comparison', fontweight='bold', fontsize=14)
axes[1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(os.path.join(RESULTS_DIR, 'deliverable3_v4_error_analysis.png'), 
            dpi=300, bbox_inches='tight')
plt.show()

print(f"\nConfidence Statistics:")
print(f"  Correct predictions: mean={correct_df['confidence'].mean():.3f}, std={correct_df['confidence'].std():.3f}")
print(f"  Errors: mean={errors_df['confidence'].mean():.3f}, std={errors_df['confidence'].std():.3f}")

# Most common error patterns
print(f"\nMost Common Error Patterns:")
error_patterns = errors_df.groupby(['true_class', 'pred_class']).size().sort_values(ascending=False)
for (true_c, pred_c), count in error_patterns.head(10).items():
    print(f"  {true_c} → {pred_c}: {count} cases")

## 9. Save Comprehensive Report

In [None]:
# Create comprehensive metrics report
report = {
    "deliverable": "3.0 v4",
    "model": "Ensemble Classifier (CLIP ViT-B/32 + Keyword + Path + Smart Validator)",
    "date": "January 2025",
    "test_accuracy": float(accuracy),
    "per_class_metrics": {
        class_names[i]: {
            "precision": float(precision[i]),
            "recall": float(recall[i]),
            "f1_score": float(f1[i]),
            "support": int(support[i])
        } for i in range(num_classes)
    },
    "macro_averages": {
        "precision": float(precision.mean()),
        "recall": float(recall.mean()),
        "f1_score": float(f1.mean())
    },
    "weighted_averages": {
        "precision": float(weighted_p),
        "recall": float(weighted_r),
        "f1_score": float(weighted_f1)
    },
    "ensemble_analysis": {
        "clip_contribution": "95% (primary visual understanding)",
        "keyword_contribution": "3% (category disambiguation)",
        "path_contribution": "2% (file naming patterns)",
        "avg_clip_score_correct": float(predictions_df[predictions_df['correct']]['clip_score'].mean()),
        "avg_clip_score_error": float(predictions_df[~predictions_df['correct']]['clip_score'].mean())
    },
    "error_analysis": {
        "total_errors": int(len(errors_df)),
        "error_rate": float(len(errors_df) / len(predictions_df)),
        "avg_error_confidence": float(errors_df['confidence'].mean()),
        "avg_correct_confidence": float(correct_df['confidence'].mean())
    },
    "improvements_from_baseline": {
        "baseline_model": "ResNet50 (Deliverable 2)",
        "baseline_accuracy": 0.566,
        "current_accuracy": float(accuracy),
        "absolute_gain": float(accuracy - 0.566),
        "relative_gain": f"+{(accuracy - 0.566)*100:.2f}%"
    }
}

# Save JSON report
output_path = os.path.join(RESULTS_DIR, "deliverable3_v4_metrics.json")
with open(output_path, "w") as f:
    json.dump(report, f, indent=2)

print(f"\n{'='*70}")
print("EVALUATION COMPLETE")
print(f"{'='*70}\n")
print(f"Generated artifacts:")
print(f"  - deliverable3_v4_confusion_matrix.png")
print(f"  - deliverable3_v4_per_class_metrics.png")
print(f"  - deliverable3_v4_ensemble_components.png")
print(f"  - deliverable3_v4_evolution.png")
print(f"  - deliverable3_v4_error_analysis.png")
print(f"  - deliverable3_v4_metrics.json")
print(f"\nAll visualizations saved to: {RESULTS_DIR}")
print(f"\nReady for IEEE report writing!")

## Summary for IEEE Report

### Key Findings:

1. **Overall Performance**: The ensemble classifier achieved {accuracy:.1%} accuracy, representing a significant improvement over the baseline ResNet50 model (56.6%)

2. **Ensemble Components**:
   - CLIP ViT-B/32: 95% contribution (primary visual understanding)
   - Keyword Classifier: 3% contribution (category disambiguation)
   - Path Analysis: 2% contribution (file naming patterns)

3. **Per-Class Performance**: 
   - Best performing classes: [To be filled after execution]
   - Challenging classes: [To be filled after execution]

4. **Error Analysis**:
   - Average confidence for correct predictions: Higher than errors
   - Common confusion patterns: [To be analyzed]

5. **System Evolution**: Progressive improvement from ResNet50 → CLIP → CLIP+Keyword → Full Ensemble

### Recommendations for Report:
- Include all generated visualizations
- Emphasize the multi-signal ensemble approach
- Discuss the trade-offs between model complexity and performance
- Highlight the explainability features (style/color/material analysis)
- Compare with state-of-the-art fashion recommendation systems