# H&M Recommendation Model Experiment Analysis

This notebook analyzes the results from model comparison experiments.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import yaml
from datetime import datetime

# Set style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
%matplotlib inline

## Load Experiment Results

In [None]:
# Find latest experiment directory.
# Sub-directories of ../experiments are ordered by modification time, newest
# first; the newest one is the experiment analyzed by the rest of the notebook.
experiments_dir = Path('../experiments')

if experiments_dir.is_dir():
    experiment_dirs = sorted(
        (d for d in experiments_dir.iterdir() if d.is_dir()),
        key=lambda d: d.stat().st_mtime,
        reverse=True,
    )
else:
    # A missing experiments folder is reported the same way as an empty one
    # instead of raising FileNotFoundError from iterdir().
    experiment_dirs = []

if experiment_dirs:
    latest_exp_dir = experiment_dirs[0]
    print(f"Latest experiment: {latest_exp_dir.name}")
else:
    print("No experiments found. Run experiments first!")
    latest_exp_dir = None

In [None]:
# Load results from all experiments
def load_experiment_results(exp_dir):
    """Load one result record per model sub-directory of an experiment.

    For each sub-directory of ``exp_dir`` the first file matching
    ``*_results.yaml`` (in sorted order) is read; if there is none,
    ``results.json`` is used as a fallback. Sub-directories with neither
    file are ignored.

    Args:
        exp_dir: ``pathlib.Path`` of the experiment directory.

    Returns:
        A ``pandas.DataFrame`` with one row per loaded result file. The
        ``model`` column always holds the sub-directory name, whichever
        file format the record came from. The frame is empty when nothing
        could be loaded.
    """
    results = []

    # Sorted so the row order is reproducible; iterdir() order is OS-dependent.
    for model_dir in sorted(exp_dir.iterdir()):
        if not model_dir.is_dir():
            continue

        yaml_files = sorted(model_dir.glob('*_results.yaml'))
        json_file = model_dir / 'results.json'

        if yaml_files:
            results_path = yaml_files[0]
            with open(results_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
        elif json_file.exists():
            # JSON is only a backup for runs that did not write YAML.
            results_path = json_file
            with open(results_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        else:
            continue

        # An empty YAML file loads as None and a JSON file may hold a list;
        # neither can be turned into a metrics row.
        if not isinstance(data, dict):
            print(f"Skipping {results_path}: expected a mapping, "
                  f"got {type(data).__name__}")
            continue

        # Label every record with its directory name so both formats yield
        # the 'model' column the analysis cells rely on.
        data['model'] = model_dir.name
        results.append(data)

    return pd.DataFrame(results)

if latest_exp_dir is not None:
    results_df = load_experiment_results(latest_exp_dir)
    print(f"Loaded {len(results_df)} experiment results")
    display(results_df)
else:
    # Keep results_df defined even without experiments so the following
    # cells are skipped by their own emptiness checks instead of failing
    # with NameError.
    results_df = pd.DataFrame()

## Performance Comparison

In [None]:
# Main metrics comparison: one bar chart per available test metric, with the
# best-scoring model highlighted in red and every bar labelled with its value.
if len(results_df) > 0:
    metrics = ['test_map', 'test_recall', 'test_precision', 'test_ndcg']
    available_metrics = [m for m in metrics if m in results_df.columns]

    if available_metrics:
        # Fixed 2x2 grid: there are at most four metrics to show.
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))
        axes = axes.flatten()

        for ax, metric in zip(axes, available_metrics):
            # Drop models without a value for this metric, then renumber the
            # rows 0..n-1 so that index labels coincide with bar positions.
            data = results_df[['model', metric]].dropna().reset_index(drop=True)

            if data.empty:
                print(f"No values recorded for {metric}; skipping its plot")
                ax.set_visible(False)
                continue

            bars = ax.bar(data['model'], data[metric])
            ax.set_title(f'{metric.upper()} by Model', fontsize=14)
            ax.set_xlabel('Model', fontsize=12)
            ax.set_ylabel(metric.upper(), fontsize=12)
            ax.tick_params(axis='x', rotation=45)

            # Color best performer. idxmax() returns an index label while
            # bars[...] is positional; the reset_index above makes them equal.
            best_pos = data[metric].idxmax()
            bars[best_pos].set_color('red')

            # Add value labels just above each bar
            for pos, value in enumerate(data[metric]):
                ax.text(pos, value + 0.001, f'{value:.4f}',
                        ha='center', va='bottom', fontsize=10)

        # Hide unused subplots
        for ax in axes[len(available_metrics):]:
            ax.set_visible(False)

        plt.tight_layout()
        plt.show()
    else:
        print("No test metrics found in results")

## Detailed Performance Analysis

In [None]:
# Performance summary table: test metrics rounded to four decimals, ordered
# by test_map (best first), plus a per-metric rank column.
if len(results_df) > 0 and 'test_map' in results_df.columns:
    metric_names = ['test_map', 'test_recall', 'test_precision', 'test_ndcg']
    summary_cols = [col for col in ['model', *metric_names] if col in results_df.columns]

    summary_df = (
        results_df[summary_cols]
        .round(4)
        .sort_values('test_map', ascending=False)
    )

    # Rank 1 is the highest value of each metric; columns are appended in
    # the same order as the metrics are listed.
    ranked_metrics = [name for name in metric_names if name in summary_df.columns]
    summary_df = summary_df.assign(**{
        f'{name}_rank': summary_df[name].rank(ascending=False)
        for name in ranked_metrics
    })

    print("Performance Summary (sorted by MAP@12):")
    display(summary_df)

    # The table is sorted by test_map, so its first row is the best model
    best_model = summary_df.iloc[0]
    print(f"\nBest Model: {best_model['model']}")
    print(f"MAP@12: {best_model['test_map']:.4f}")
    print(f"Recall@12: {best_model.get('test_recall', 'N/A')}")
    print(f"Precision@12: {best_model.get('test_precision', 'N/A')}")
    print(f"NDCG@12: {best_model.get('test_ndcg', 'N/A')}")

## Training Efficiency Analysis

In [None]:
# Training time analysis: bar chart of training time per model and, when
# test_map is available, a second chart of test_map per training minute.
if 'duration' in results_df.columns:
    # duration / 60 is plotted as minutes (presumably 'duration' is in
    # seconds - confirm against the experiment runner).
    results_df['duration_min'] = results_df['duration'].div(60)

    plt.figure(figsize=(12, 6))
    ax = sns.barplot(data=results_df, x='model', y='duration_min')
    plt.xticks(rotation=45)
    plt.ylabel('Training Time (minutes)', fontsize=12)
    plt.xlabel('Model', fontsize=12)
    plt.title('Training Time by Model', fontsize=14)

    # Write each bar's height, to one decimal, centered on top of the bar
    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center = bar.get_x() + bar.get_width() / 2.
        ax.annotate(f'{bar_height:.1f}', (bar_center, bar_height),
                    ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

    # Efficiency score (MAP per minute)
    has_map = 'test_map' in results_df.columns
    if has_map:
        results_df['efficiency'] = results_df['test_map'].div(results_df['duration_min'])

        plt.figure(figsize=(12, 6))
        ax = sns.barplot(data=results_df, x='model', y='efficiency')
        plt.xticks(rotation=45)
        plt.ylabel('MAP / Training Minute', fontsize=12)
        plt.xlabel('Model', fontsize=12)
        plt.title('Model Efficiency (MAP per Training Minute)', fontsize=14)

        plt.tight_layout()
        plt.show()

## Model Comparison Heatmap

In [None]:
# Heatmap of all test metrics: cell color is the metric min-max normalized
# across models, cell text is the raw metric value.
if len(results_df) > 0:
    metrics = ['test_map', 'test_recall', 'test_precision', 'test_ndcg']
    available_metrics = [m for m in metrics if m in results_df.columns]

    if available_metrics:
        # Prepare data for heatmap: models as rows, metrics as columns
        heatmap_data = results_df.set_index('model')[available_metrics]

        # Normalize each metric to a 0-1 scale for better comparison.
        col_min = heatmap_data.min()
        col_range = heatmap_data.max() - col_min
        # A metric that is identical for every model (always the case with a
        # single model) has a zero range; dividing by it would blank the
        # column with NaN. Use 1 instead so such columns normalize to 0.
        safe_range = col_range.where(col_range != 0, 1.0)
        heatmap_normalized = (heatmap_data - col_min) / safe_range

        plt.figure(figsize=(10, 8))
        # vmin/vmax pin the color scale to the normalized range even when
        # every plotted value is the same.
        sns.heatmap(heatmap_normalized.T, annot=heatmap_data.T, fmt='.4f',
                    cmap='YlOrRd', vmin=0, vmax=1,
                    cbar_kws={'label': 'Normalized Score'})
        plt.title('Model Performance Heatmap', fontsize=14)
        plt.xlabel('Model', fontsize=12)
        plt.ylabel('Metric', fontsize=12)
        plt.tight_layout()
        plt.show()

## Statistical Analysis

In [None]:
# Descriptive statistics of test_map across models, followed by each model's
# relative change versus the 'popularity_baseline' model when it is present.
if len(results_df) > 0 and 'test_map' in results_df.columns:
    map_scores = results_df['test_map']
    lowest, highest = map_scores.min(), map_scores.max()

    print("Statistical Summary of MAP@12 across models:")
    print(f"Mean: {map_scores.mean():.4f}")
    print(f"Std: {map_scores.std():.4f}")
    print(f"Min: {lowest:.4f}")
    print(f"Max: {highest:.4f}")
    print(f"Range: {highest - lowest:.4f}")

    # Performance improvement over baseline
    is_baseline = results_df['model'] == 'popularity_baseline'
    if is_baseline.any():
        # First matching row is taken as the baseline score
        baseline_map = results_df.loc[is_baseline, 'test_map'].values[0]

        print(f"\nImprovement over Popularity Baseline (MAP={baseline_map:.4f}):")
        for model_name, model_map in zip(results_df['model'], map_scores):
            if model_name == 'popularity_baseline':
                continue
            # Percentage change relative to the baseline score
            improvement = ((model_map - baseline_map) / baseline_map) * 100
            print(f"{model_name}: {improvement:+.1f}%")

## Recommendations and Next Steps

In [None]:
# Generate recommendations: list the three models with the highest test_map,
# print general next steps, then tips specific to the best model's family.
if len(results_df) > 0 and 'test_map' in results_df.columns:
    top_models = results_df.nlargest(3, 'test_map')

    print("Top 3 Models:")
    for i, (_, model) in enumerate(top_models.iterrows(), 1):
        print(f"{i}. {model['model']}: MAP@12 = {model['test_map']:.4f}")

    print("\nRecommendations:")
    print("1. Focus on the top performing models for production deployment")
    print("2. Consider ensemble methods combining top models")
    print("3. Run hyperparameter tuning on the best models")
    print("4. Test with larger data samples for more reliable results")
    print("5. Analyze failure modes and edge cases for top models")

    # Model-specific recommendations
    best_model_name = top_models.iloc[0]['model']
    print(f"\nFor the best model ({best_model_name}):")

    # Keyword -> tips. Keywords are matched as substrings of the lower-cased
    # model name, in the order listed; only the first match is printed.
    model_tips = {
        'neural': [
            "Try different architectures and layer sizes",
            "Experiment with different activation functions",
            "Consider adding attention mechanisms",
        ],
        'lightgcn': [
            "Experiment with different number of layers",
            "Try different graph normalization techniques",
            "Consider adding edge weights based on interaction strength",
        ],
        'wide_deep': [
            "Add more feature engineering for the wide part",
            "Try different deep network architectures",
            "Experiment with different feature combinations",
        ],
    }

    # str() keeps the lookup working if the model name is not a string
    # (e.g. a missing value).
    best_model_key = str(best_model_name).lower()
    for keyword, tips in model_tips.items():
        if keyword in best_model_key:
            for tip in tips:
                print(f"- {tip}")
            break

## Export Results

In [None]:
# Export results: write the full results table as CSV and a short Markdown
# summary into the experiment directory.
if len(results_df) > 0:
    csv_path = latest_exp_dir / 'analysis_results.csv'
    summary_path = latest_exp_dir / 'analysis_summary.md'

    # CSV
    results_df.to_csv(csv_path, index=False)
    print(f"Results exported to: {csv_path}")

    # Markdown summary
    with open(summary_path, 'w', encoding='utf-8') as f:
        f.write("# Experiment Analysis Summary\n\n")
        f.write(f"**Date**: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

        # Everything below needs test_map; without it only the header is
        # written.
        if 'test_map' in results_df.columns:
            # Keep only the columns that exist, so a run that did not record
            # every metric still exports instead of raising KeyError.
            wanted_cols = ['model', 'test_map', 'test_recall', 'test_precision', 'test_ndcg']
            summary_cols = [col for col in wanted_cols if col in results_df.columns]

            summary_df = results_df[summary_cols]
            summary_df = summary_df.round(4).sort_values('test_map', ascending=False)
            f.write("## Performance Summary\n\n")
            # NOTE: DataFrame.to_markdown depends on the optional 'tabulate'
            # package being installed.
            f.write(summary_df.to_markdown(index=False))

            # idxmax() has no valid answer when every test_map is missing
            if results_df['test_map'].notna().any():
                best = results_df.loc[results_df['test_map'].idxmax()]
                f.write("\n\n## Best Model\n\n")
                f.write(f"**{best['model']}** with MAP@12 = {best['test_map']:.4f}\n")

    print(f"Summary exported to: {summary_path}")