# Active Learning Experiment Analysis

This notebook visualizes and analyzes the outputs of active learning (AL) experiments: learning curves, baselines, strategy comparisons, and improvement over random sampling.

Cell order: imports → paths → load → inspect → baselines → learning curves → strategy comparison → final-round heatmap → improvement over random → summary and export.

In [None]:
# Imports
import json
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session so cell output stays
# uncluttered (note: this also hides deprecation warnings from pandas/seaborn).
warnings.simplefilter('ignore')
# Apply the seaborn theme first, then set the figure size and font size on top of it.
sns.set_theme(style='whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 5),
    'font.size': 10,
})

In [None]:
# Where the experiment outputs live; edit `experiments_dir` if the artifacts were written elsewhere.
experiments_dir = Path('../artifacts/embedding_cnn/al_experiments')
# All three input files sit directly inside the experiments directory.
baselines_path, curves_path, metadata_path = (
    experiments_dir.joinpath(filename)
    for filename in ('baseline_results.csv', 'all_al_curves.csv', 'experiment_metadata.json')
)
# Print the resolved location so a wrong working directory is obvious immediately.
print('Experiments dir:', experiments_dir.resolve())
print('Exists:', experiments_dir.exists())

In [None]:
# Read each artifact that exists on disk. A missing file leaves its variable as None,
# which is what the later cells test before plotting.
baselines_df = None
if baselines_path.exists():
    baselines_df = pd.read_csv(baselines_path)

curves_df = None
if curves_path.exists():
    curves_df = pd.read_csv(curves_path)

metadata = None
if metadata_path.exists():
    metadata = json.loads(metadata_path.read_text())

# One-line availability report for the three inputs.
print(
    'baselines:', baselines_path.exists(),
    'curves:', curves_path.exists(),
    'metadata:', metadata_path.exists(),
)

## Parse and quick sanity checks

In [None]:
# Peek at what was loaded: column names and first rows of the AL curves,
# followed by the complete baseline table.
if curves_df is None:
    print('No AL curves found; run experiments first.')
else:
    print('AL curves columns:', list(curves_df.columns))
    display(curves_df.head())

if baselines_df is None:
    print('No baseline results found.')
else:
    display(baselines_df)

## Baseline metrics (full training set)

In [None]:
if baselines_df is None:
    print('Baselines missing.')
else:
    # Bar chart of baseline accuracy, classifiers ordered from best to worst.
    plt.figure(figsize=(8, 4))
    ax = sns.barplot(
        data=baselines_df.sort_values('accuracy', ascending=False),
        x='classifier',
        y='accuracy',
    )
    ax.set_title('Baseline accuracy (trained on all labeled data)')
    # Fix the y-axis to the [0, 1] range.
    ax.set_ylim(0, 1)
    plt.tight_layout()
    plt.show()

## Learning curves by classifier

In [None]:
if curves_df is not None:
    # One figure per classifier, with one line per sampling strategy.
    for clf, g in curves_df.groupby('classifier'):
        # Rows = rounds, columns = strategies. pivot_table averages test_accuracy when a
        # (round, strategy) pair occurs more than once.
        pivot = g.pivot_table(index='round', columns='strategy', values='test_accuracy')
        # Create the axes explicitly and pass them to pandas. DataFrame.plot opens its own
        # figure when no `ax` is given, so a preceding plt.figure() call would only leave
        # a blank figure behind and its figsize would never reach the actual plot.
        fig, ax = plt.subplots(figsize=(8, 4))
        pivot.plot(ax=ax, title=f'Learning curves — {clf}', xlabel='round', ylabel='test_accuracy')
        ax.legend(title='strategy')
        fig.tight_layout()
        plt.show()
else:
    print('No curves to plot.')

## Compare sampling strategies (aggregate)

In [None]:
if curves_df is None:
    print('No curves for aggregation.')
else:
    # Mean test accuracy per (strategy, round), pooled over all classifiers.
    agg = curves_df.groupby(['strategy', 'round'], as_index=False)['test_accuracy'].mean()
    plt.figure(figsize=(8, 4))
    ax = sns.lineplot(data=agg, x='round', y='test_accuracy', hue='strategy')
    ax.set_title('Mean learning curves across classifiers')
    plt.show()

## Final-round heatmap: accuracy by classifier × strategy

In [None]:
if curves_df is None:
    print('No final results available.')
else:
    # "Final" means the largest round number in the whole table; only rows recorded at
    # exactly that round are kept.
    last_round = curves_df['round'].max()
    final = curves_df.loc[curves_df['round'].eq(last_round)].copy()
    # Classifier x strategy matrix of final-round test accuracy (also written out by the export cell).
    pivot_final = pd.pivot_table(
        final,
        index='classifier',
        columns='strategy',
        values='test_accuracy',
    )
    plt.figure(figsize=(10, 4))
    ax = sns.heatmap(pivot_final, annot=True, fmt='.3f', cmap='viridis')
    ax.set_title('Final-round accuracy by classifier and strategy')
    plt.tight_layout()
    plt.show()

## Improvement over random (final round)

In [None]:
if curves_df is not None:
    # Restrict to the rows recorded at the largest round number in the table.
    last_round = curves_df['round'].max()
    final = curves_df[curves_df['round'] == last_round].copy()
    # Reference accuracy: the 'random' strategy at the final round, reduced to exactly one
    # value per classifier. Without this reduction, several 'random' rows for the same
    # classifier would fan out the merge below, duplicating rows in `merged_final` and
    # skewing the per-strategy means computed from it.
    rnd = (
        final.loc[final['strategy'] == 'random']
        .groupby('classifier', as_index=False)['test_accuracy']
        .mean()
        .rename(columns={'test_accuracy': 'random_acc'})
    )
    if rnd.empty:
        # No baseline to subtract: every improvement would be NaN and there is nothing to plot.
        print("No 'random' strategy rows at the final round; cannot compute improvement over random.")
    else:
        # Left join keeps every final-round row; classifiers without a random baseline get NaN.
        # `validate` asserts that the baseline really has one row per classifier.
        merged_final = final.merge(rnd, on='classifier', how='left', validate='many_to_one')
        merged_final['improvement_over_random'] = merged_final['test_accuracy'] - merged_final['random_acc']
        # Classifier x strategy matrix of the accuracy gain over random (the 'random' column is 0).
        pivot_imp = merged_final.pivot_table(index='classifier', columns='strategy', values='improvement_over_random')
        plt.figure(figsize=(10,4))
        # Diverging colormap centered at 0, so gains and losses relative to random get opposite colors.
        sns.heatmap(pivot_imp, annot=True, fmt='.3f', center=0, cmap='RdBu_r')
        plt.title('Improvement over random (final round)')
        plt.tight_layout()
        plt.show()
else:
    print('No curves to compare to random.')

## Summary and export

In [None]:
# `merged_final` only exists once the improvement-over-random cell has run successfully.
if 'merged_final' in globals():
    # Per-strategy means of final-round accuracy and of the gain over random.
    summary = merged_final.groupby('strategy').agg(
        mean_final_acc=('test_accuracy', 'mean'),
        mean_imp_over_random=('improvement_over_random', 'mean'),
    ).reset_index()
    display(summary.sort_values('mean_final_acc', ascending=False))
    out_dir = experiments_dir / 'analysis_outputs'
    out_dir.mkdir(parents=True, exist_ok=True)
    summary.to_csv(out_dir / 'strategy_summary.csv', index=False)
    # The matrices are built by other cells (pivot_final by the final-round heatmap cell),
    # so check for each one instead of failing with a NameError halfway through the export.
    for var_name, file_name in (
        ('pivot_final', 'final_accuracy_matrix.csv'),
        ('pivot_imp', 'improvement_over_random_matrix.csv'),
    ):
        matrix = globals().get(var_name)
        if matrix is None:
            print(f'Skipped {file_name}: {var_name} is not defined; run the cell that builds it.')
        else:
            matrix.to_csv(out_dir / file_name)
    print('Wrote summary files to', out_dir)
else:
    print('Nothing to export; run previous cells first.')