In [None]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Project imports
from src.utils.config import ConfigLoader
from src.utils.dataset_loader import DatasetLoader
from src.evaluation.metrics import MetricsCalculator
from src.evaluation.analysis import ResultsAnalyzer

# Plot appearance: seaborn 'whitegrid' style plus a wider default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Printed last so its appearance confirms every import above succeeded.
print('Imports successful!')

## 1. Load Configuration and Datasets

In [None]:
# Build the config loader against ../config and fetch the experiment settings.
config_loader = ConfigLoader(config_dir='../config')
exp_config = config_loader.get_experiment_config()

print('Experiment Configuration:')
# Names of the entries under exp_config['datasets'] whose 'enabled' value is truthy.
enabled_datasets = []
for dataset_key, dataset_cfg in exp_config['datasets'].items():
    if dataset_cfg.get('enabled'):
        enabled_datasets.append(dataset_key)
print(f"Datasets enabled: {enabled_datasets}")

# Load every dataset described by the experiment config, using ../data as cache_dir.
loader = DatasetLoader(cache_dir='../data')
datasets = loader.load_all_datasets(exp_config)

# One line per dataset: its name and how many examples it holds.
print(f'\nLoaded {len(datasets)} datasets:')
for name, examples in datasets.items():
    print(f"  {name}: {len(examples)} examples")

## 2. Examine Dataset Structure

In [None]:
# Preview the first example of every loaded dataset.
# Questions longer than this many characters are cut off in the preview.
QUESTION_PREVIEW_CHARS = 200

for dataset_name, examples in datasets.items():
    print(f'\n{dataset_name.upper()}:')
    print('=' * 80)

    # An empty dataset has no first example; report that instead of letting
    # examples[0] raise IndexError and abort the whole cell.
    if len(examples) == 0:
        print('(no examples loaded)')
        continue

    sample = examples[0]

    # Only append an ellipsis when the question text was actually truncated.
    question = sample.get('question', '')
    preview = question[:QUESTION_PREVIEW_CHARS]
    if len(question) > QUESTION_PREVIEW_CHARS:
        preview += '...'

    print(f"Question: {preview}")
    # Show at most the first three options to keep the output compact.
    print(f"Options: {sample.get('options', [])[:3]}")
    print(f"Gold Label: {sample.get('gold_label', '')}")
    print(f"Dataset: {sample.get('dataset', 'N/A')}")

## 3. Load Results and Analyze

In [None]:
# Create the analyzer with ../results as its results directory.
analyzer = ResultsAnalyzer(results_dir='../results')

# A missing baseline file is not an error here: fall back to an empty dict so
# that `if baseline_results:` checks evaluate to False.
try:
    baseline_results = analyzer.load_results('baseline_results.json')
except FileNotFoundError:
    baseline_results = {}
    print('No baseline results found. Run main.py --baseline-only to generate results.')
else:
    print('Loaded baseline results')

## 4. Visualize Results

In [None]:
if baseline_results:
    # Flatten the nested {dataset: {model: result_dict}} mapping into one
    # record per (dataset, model) pair; a missing 'accuracy' is recorded as 0.
    results_list = [
        {
            'Dataset': dataset,
            'Model': model,
            'Accuracy': results.get('accuracy', 0),
        }
        for dataset, model_results in baseline_results.items()
        for model, results in model_results.items()
    ]

    # Fix the column set explicitly so the frame has the expected columns even
    # when there are no records at all.
    df_results = pd.DataFrame(results_list, columns=['Dataset', 'Model', 'Accuracy'])

    if df_results.empty:
        # Every dataset entry had an empty model mapping: there is nothing to
        # draw, and plotting an empty frame would only produce a blank figure.
        print('Baseline results contain no model entries; nothing to plot.')
    else:
        # Two side-by-side views of the same data.
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Left panel: datasets on the x-axis, one bar colour per model.
        sns.barplot(data=df_results, x='Dataset', y='Accuracy', hue='Model', ax=axes[0])
        axes[0].set_title('Model Accuracy by Dataset')
        # NOTE(review): the 0-1 limits assume accuracy is stored as a fraction,
        # not a percentage -- confirm against the results format.
        axes[0].set_ylim(0, 1)

        # Right panel: models on the x-axis, one bar colour per dataset.
        sns.barplot(data=df_results, x='Model', y='Accuracy', hue='Dataset', ax=axes[1])
        axes[1].set_title('Model Accuracy Comparison')
        axes[1].set_ylim(0, 1)

        plt.tight_layout()
        # Save before show() so the written file contains the rendered figure.
        plt.savefig('../results/baseline_comparison.png', dpi=150)
        plt.show()

        print(df_results.to_string())

## 5. Summary Statistics

In [None]:
if baseline_results:
    print('\nBaseline Summary Statistics:')
    print('=' * 60)

    # Per-dataset accuracy statistics across all models evaluated on it.
    for dataset, model_results in baseline_results.items():
        # Ignore model entries that carry no accuracy value instead of raising
        # KeyError on them.
        accuracies = [
            r['accuracy']
            for r in model_results.values()
            if r.get('accuracy') is not None
        ]

        print(f'\n{dataset}:')

        # np.min / np.max raise ValueError on an empty sequence and np.mean
        # yields nan with a warning, so report the gap explicitly.
        if not accuracies:
            print('  No accuracy values recorded.')
            continue

        print(f"  Mean Accuracy: {np.mean(accuracies):.4f}")
        # np.std defaults to the population standard deviation (ddof=0).
        print(f"  Std Dev: {np.std(accuracies):.4f}")
        print(f"  Min: {np.min(accuracies):.4f}")
        print(f"  Max: {np.max(accuracies):.4f}")