In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Project imports
from src.evaluation.analysis import ResultsAnalyzer

# Plot defaults applied to the figures below: seaborn's 'whitegrid' style and
# a (12, 6) default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Printed last, so seeing it means the whole setup cell ran without error.
print('Imports successful!')

## 1. Load Results and Data

In [None]:
# Analyzer bound to the results directory one level above this notebook.
analyzer = ResultsAnalyzer(results_dir='../results')

# Read the baseline results. When the file is missing, fall back to an empty
# dict so that the `if baseline:` guards in the analysis cells are falsy and
# those cells do nothing instead of failing.
try:
    baseline = analyzer.load_results('baseline_results.json')
except FileNotFoundError:
    baseline = {}
    print('✗ Baseline results not found. Run main.py first.')
else:
    print('✓ Loaded baseline results')

## 2. Model Performance Distribution

In [None]:
if baseline:
    # Flatten the nested {dataset: {model: results}} mapping into one record
    # per (dataset, model) pair. A result without 'accuracy' is recorded as 0.
    df_accuracy = pd.DataFrame([
        {
            'Model': model_name,
            'Dataset': dataset_name,
            'Accuracy': metrics.get('accuracy', 0),
        }
        for dataset_name, per_model in baseline.items()
        for model_name, metrics in per_model.items()
    ])

    # Two side-by-side box plots: accuracy grouped by model (left) and by
    # dataset (right). Both y-axes are pinned to [0, 1] so the panels are
    # directly comparable.
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    panels = (
        ('Model', 'Accuracy Distribution by Model'),
        ('Dataset', 'Accuracy Distribution by Dataset'),
    )
    for ax, (group_col, title) in zip(axes, panels):
        sns.boxplot(data=df_accuracy, x=group_col, y='Accuracy', ax=ax)
        ax.set_title(title)
        ax.set_ylim(0, 1)

    # Save before show(): the figure is written to the results directory and
    # then rendered inline.
    plt.tight_layout()
    plt.savefig('../results/accuracy_distribution.png', dpi=150)
    plt.show()

    print('\nAccuracy Summary:')
    print(df_accuracy.describe())

## 3. Model Specialization Analysis

In [None]:
if baseline:
    print('\nModel Strengths by Dataset:')
    print('=' * 60)

    for dataset, per_model in baseline.items():
        print(f'\n{dataset}:')

        # Rank this dataset's models from highest to lowest accuracy (a
        # missing 'accuracy' counts as 0). The sort is stable, so tied models
        # keep the order they have in the results mapping.
        ranking = sorted(
            ((name, metrics.get('accuracy', 0)) for name, metrics in per_model.items()),
            key=lambda pair: pair[1],
            reverse=True,
        )

        # Ranks are 1-based for display.
        for i, (model, acc) in enumerate(ranking, start=1):
            print(f"  {i}. {model:20s}: {acc:.4f}")

## 4. Error Type Analysis

In [None]:
# Analyze error patterns: derive correct/error counts for every
# (dataset, model) pair from its stored accuracy and sample count, plot the
# error rates as a heatmap, and print the full table.
if baseline:
    print('\nError Analysis by Model:')
    print('=' * 60)

    error_data = []

    for dataset, model_results in baseline.items():
        for model, results in model_results.items():
            # Missing fields default to 0, so a result lacking them shows up
            # as 0 samples with an error rate of 1.
            accuracy = results.get('accuracy', 0)
            n_samples = results.get('n_samples', 0)
            # Round instead of truncating: accuracy * n_samples is a float
            # product that can land just below the true integer count
            # (0.58 * 100 == 57.99999999999999), and a bare int() would then
            # report one correct prediction too few and one error too many.
            n_correct = int(round(accuracy * n_samples))
            n_errors = n_samples - n_correct
            error_rate = 1 - accuracy

            error_data.append({
                'Model': model,
                'Dataset': dataset,
                'Total': n_samples,
                'Correct': n_correct,
                'Errors': n_errors,
                'Error Rate': error_rate
            })

    df_errors = pd.DataFrame(error_data)

    # Visualize error rates: models as rows, datasets as columns. The reversed
    # red-yellow-green colormap is used so that higher error rates sit at the
    # red end of the scale.
    plt.figure(figsize=(12, 6))
    sns.heatmap(df_errors.pivot(index='Model', columns='Dataset', values='Error Rate'),
                cmap='RdYlGn_r', annot=True, fmt='.3f', cbar_kws={'label': 'Error Rate'})
    plt.title('Error Rate Heatmap')
    plt.tight_layout()
    plt.savefig('../results/error_rate_heatmap.png', dpi=150)
    plt.show()

    # to_string() prints every row and column without pandas' truncation.
    print(df_errors.to_string())

## 5. Complementarity Analysis

In [None]:
# Analyze which models complement each other.
# For each dataset, the best and weakest models (by accuracy) are paired and
# the accuracy gap between them is mapped to a coarse HIGH/MEDIUM/LOW label.
if baseline:
    print('\nComplementarity Insights:')
    print('=' * 60)

    for dataset, model_results in baseline.items():
        # A result without 'accuracy' is treated as 0.
        accuracies = {model: results.get('accuracy', 0)
                      for model, results in model_results.items()}

        print(f'\n{dataset}:')

        # A best-vs-weakest comparison needs two distinct models. With no
        # models the [0] / [-1] lookups below would raise IndexError; with
        # exactly one, the model would be paired with itself.
        if len(accuracies) < 2:
            print(f"  Skipped: need at least 2 models to compare, found {len(accuracies)}")
            continue

        # Find models with different strengths: order from best to weakest.
        models_sorted = sorted(accuracies.items(), key=lambda x: x[1], reverse=True)
        best_model, max_acc = models_sorted[0]
        weakest_model, min_acc = models_sorted[-1]

        print(f"  Ensemble potential: Combining {best_model} (best)"
              f" with {weakest_model} (weakest)")

        gap = max_acc - min_acc

        # Label thresholds on the accuracy gap: above 0.1 is HIGH, above 0.05
        # is MEDIUM, anything else is LOW.
        if gap > 0.1:
            potential = 'HIGH'
        elif gap > 0.05:
            potential = 'MEDIUM'
        else:
            potential = 'LOW'

        print(f"  Accuracy gap: {gap:.4f}")
        print(f"  Potential for ensemble improvement: {potential}")