# Field ON vs Field OFF: Statistical Comparison

30 seeds per condition, 10M training steps per seed, using the proven 64-agent configuration.

## Setup
1. Runtime > Change runtime type > **TPU v6e** + **High-RAM**
2. Run all cells (Ctrl+F9)

In [None]:
# Mount Google Drive; later cells read checkpoints from and write results to
# paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

import os
REPO_DIR = '/content/emergence-lab'
GITHUB_USERNAME = "imashishkh"

# Clone the repository if REPO_DIR does not exist yet; otherwise pull the
# latest changes into the existing checkout.
if not os.path.exists(REPO_DIR):
    !git clone https://github.com/{GITHUB_USERNAME}/emergence-lab.git {REPO_DIR}
else:
    !cd {REPO_DIR} && git pull

# Later cells import from `src.*`: switch to the repo root and install the
# package in editable mode with its `dev` extras.
os.chdir(REPO_DIR)
!pip install -e ".[dev]" -q
print(f"Working directory: {os.getcwd()}")

In [None]:
import numpy as np
import matplotlib
# NOTE(review): 'Agg' is a non-interactive backend, so the plt.show() calls in
# later cells will probably not render figures inline in Colab; figures are
# still handed to save_figure. Confirm this is intended.
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import json
import pickle
from pathlib import Path

# Project statistics helpers used by the Phase 1 analysis cells.
from src.analysis.statistics import (
    compute_iqm, compare_methods, welch_t_test, mann_whitney_test,
    probability_of_improvement,
)
# Project plotting helpers used by the figure cells.
from src.analysis.paper_figures import (
    setup_publication_style, plot_performance_profiles,
    save_figure,
)

# Drive locations: Field OFF checkpoints are discovered under FIELD_OFF_DIR and
# figures, the report and result files are written under OUTPUT_DIR (created
# here if missing). `os` is imported in the setup cell.
FIELD_OFF_DIR = '/content/drive/MyDrive/emergence-lab/field_off/'
OUTPUT_DIR = '/content/drive/MyDrive/emergence-lab/analysis_results/'
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Imports loaded. Output dir:", OUTPUT_DIR)

## Phase 1: Training-Time Data

All 60 reward values (30 per condition) are hardcoded from the training logs, so this phase needs no checkpoint loading.

In [None]:
# ========== TRAINING-TIME DATA ==========
# Source: EXPERIMENT_LOG.md (Field ON) and training output (Field OFF)
# One value per seed. Field ON rewards and populations are indexed together in
# later cells (scatter plot, correlation), so position i must be the same seed
# in both arrays.

field_on_rewards = np.array([
    5.19, 5.38, 4.70, 3.09, 4.84, 5.50, 2.54, 0.00, 5.18, 4.52,
    5.09, 5.33, 5.38, 3.70, 5.24, 4.61, 3.46, 4.56, 5.48, 4.42,
    5.43, 4.56, 5.30, 4.99, 4.22, 4.33, 5.51, 4.19, 4.67, 5.20,
])

field_on_populations = np.array([
    64, 64, 6, 22, 64, 64, 11, 0, 64, 20,
    64, 64, 40, 30, 48, 62, 39, 58, 64, 64,
    64, 58, 64, 30, 28, 50, 62, 52, 60, 58,
])

field_off_rewards = np.array([
    5.527, 5.614, 5.378, 5.728, 5.547, 5.624, 5.618, 5.386, 5.685, 5.600,
    5.430, 4.785, 5.542, 5.556, 5.610, 5.709, 5.494, 5.487, 5.695, 5.661,
    5.457, 5.605, 5.180, 5.588, 5.428, 5.399, 5.476, 5.297, 5.712, 5.670,
])

# Field OFF populations not available from training output - will get from eval
field_off_populations = None  # Will be populated in Phase 3

# A transcription slip that drops or adds a value would silently misalign the
# per-seed pairing, so fail loudly instead.
if len(field_on_rewards) != len(field_on_populations):
    raise ValueError(
        f"Field ON rewards ({len(field_on_rewards)}) and populations "
        f"({len(field_on_populations)}) must have one entry per seed"
    )

# Sample standard deviation (ddof=1) so this summary agrees with the
# descriptive-statistics cell and the generated report.
print(f"Field ON:  {len(field_on_rewards)} seeds, mean={field_on_rewards.mean():.3f} +/- {field_on_rewards.std(ddof=1):.3f}")
print(f"Field OFF: {len(field_off_rewards)} seeds, mean={field_off_rewards.mean():.3f} +/- {field_off_rewards.std(ddof=1):.3f}")
print(f"\nField ON population: mean={field_on_populations.mean():.1f}, at max(64): {np.sum(field_on_populations==64)}/{len(field_on_populations)}")
print(f"Field ON failed seeds (reward < 1.0): {np.sum(field_on_rewards < 1.0)}")

In [None]:
# ========== DESCRIPTIVE STATISTICS ==========
print("="*60)
print("DESCRIPTIVE STATISTICS")
print("="*60)

# Bootstrap each condition's IQM once; the same result objects feed the
# printout below and the later plotting/report cells (iqm_on / iqm_off).
# NOTE(review): assumes compute_iqm is deterministic for a fixed seed - confirm.
iqm_on = compute_iqm(field_on_rewards, n_bootstrap=10000, seed=42)
iqm_off = compute_iqm(field_off_rewards, n_bootstrap=10000, seed=42)

for name, rewards, iqm in [
    ("Field ON", field_on_rewards, iqm_on),
    ("Field OFF", field_off_rewards, iqm_off),
]:
    print(f"\n{name} (n={len(rewards)}):")
    # Spread is the sample standard deviation (ddof=1) across seeds.
    print(f"  Mean:   {rewards.mean():.4f} +/- {rewards.std(ddof=1):.4f}")
    print(f"  Median: {np.median(rewards):.4f}")
    print(f"  IQM:    {iqm.iqm:.4f} [{iqm.ci_lower:.4f}, {iqm.ci_upper:.4f}]")
    print(f"  Min:    {rewards.min():.4f}")
    print(f"  Max:    {rewards.max():.4f}")
    # Coefficient of variation: sample std divided by the mean.
    print(f"  CoV:    {rewards.std(ddof=1)/rewards.mean():.4f}")

In [None]:
# ========== HYPOTHESIS TESTS ==========
# Every two-sample helper below is called with Field OFF as the first sample
# and Field ON as the second.
rule = "=" * 60
print(rule)
print("HYPOTHESIS TESTS")
print(rule)

# --- 1. Welch's t-test, with a verbal label for |Cohen's d| ---
welch = welch_t_test(field_off_rewards, field_on_rewards)
print(f"\n1. Welch's t-test:")
print(f"   t = {welch.statistic:.4f}, p = {welch.p_value:.6f}")
print(f"   Cohen's d = {welch.effect_size:.4f}", end="")
d = abs(welch.effect_size)
# First threshold that |d| falls below wins; anything else is labelled LARGE.
magnitude = " (LARGE)"
for upper_bound, label in ((0.2, " (negligible)"), (0.5, " (small)"), (0.8, " (MEDIUM)")):
    if d < upper_bound:
        magnitude = label
        break
print(magnitude)
print(f"   Significant at alpha=0.05: {welch.significant}")

# --- 2. Mann-Whitney U ---
mw = mann_whitney_test(field_off_rewards, field_on_rewards)
print(f"\n2. Mann-Whitney U test:")
print(f"   U = {mw.statistic:.1f}, p = {mw.p_value:.6f}")
print(f"   Rank-biserial r = {mw.effect_size:.4f}")
print(f"   Significant at alpha=0.05: {mw.significant}")

# --- 3. Probability of improvement (bootstrap) ---
poi = probability_of_improvement(
    field_off_rewards,
    field_on_rewards,
    n_bootstrap=5000,
    seed=42,
)
print(f"\n3. Probability of Improvement:")
print(f"   P(Field OFF > Field ON) = {poi['prob_x_better']:.4f}")
print(f"   P(Field ON > Field OFF) = {poi['prob_y_better']:.4f}")
print(f"   95% CI: [{poi['ci_lower']:.4f}, {poi['ci_upper']:.4f}]")

# --- 4. Direction of the mean difference ---
mean_off = field_off_rewards.mean()
mean_on = field_on_rewards.mean()
direction = 'higher' if mean_off > mean_on else 'lower'
print(f"\n4. Direction:")
print(f"   Field OFF mean: {mean_off:.4f}")
print(f"   Field ON mean:  {mean_on:.4f}")
print(f"   Gap: {mean_off - mean_on:+.4f} (Field OFF {direction})")

# --- 5. Sensitivity: rerun Welch without Field ON seeds at or below 0.5 reward ---
mask_on = field_on_rewards > 0.5
on_filtered = field_on_rewards[mask_on]
welch_f = welch_t_test(field_off_rewards, on_filtered)
n_excluded = np.sum(~mask_on)
print(f"\n5. Sensitivity (excluding {n_excluded} failed Field ON seeds):")
print(f"   Field ON filtered: n={len(on_filtered)}, mean={on_filtered.mean():.4f}")
print(f"   Welch p = {welch_f.p_value:.6f}, Cohen's d = {welch_f.effect_size:.4f}")

In [None]:
# ========== FULL METHOD COMPARISON ==========
# Hand both conditions to the project's compare_methods helper and print the
# summary it returns.
for banner_line in ("=" * 60, "FULL METHOD COMPARISON (rliable-style)", "=" * 60):
    print(banner_line)

rewards_by_method = {
    "Field ON": field_on_rewards,
    "Field OFF": field_off_rewards,
}
comparison = compare_methods(rewards_by_method, n_bootstrap=10000, seed=42)
print(comparison.summary)

## Phase 2: Comparison Plots

In [None]:
# ========== COMPARISON PLOTS ==========
# Three-panel figure: (a) IQM bars with CI, (b) per-seed reward distributions,
# (c) Field ON final-population histogram. Relies on iqm_on / iqm_off and welch
# from the Phase 1 cells.
setup_publication_style()

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# 1. Bar chart: IQM with CI
ax = axes[0]
methods = ['Field ON\n(Stigmergy)', 'Field OFF\n(No Field)']
iqm_vals = [iqm_on.iqm, iqm_off.iqm]
# Matplotlib's yerr wants distances from the bar top, not absolute CI bounds.
iqm_lo = [iqm_on.iqm - iqm_on.ci_lower, iqm_off.iqm - iqm_off.ci_lower]
iqm_hi = [iqm_on.ci_upper - iqm_on.iqm, iqm_off.ci_upper - iqm_off.iqm]
colors = ['#009988', '#BBBBBB']

bars = ax.bar(methods, iqm_vals, yerr=[iqm_lo, iqm_hi], color=colors,
              edgecolor='black', linewidth=0.5, capsize=8)
for bar, val in zip(bars, iqm_vals):
    ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + max(iqm_hi) + 0.05,
            f'{val:.3f}', ha='center', fontsize=10)
ax.set_ylabel('IQM Reward')
ax.set_title('(a) IQM + 95% CI')
sig = f"p = {welch.p_value:.4f}"
if welch.p_value < 0.001: sig += " ***"
elif welch.p_value < 0.01: sig += " **"
elif welch.p_value < 0.05: sig += " *"
# Place the significance label midway between the two bars (categorical
# positions 0 and 1) and above the value labels. Both coordinates are data
# coordinates: the y value is computed in reward units, so it must not be
# interpreted as an axes fraction (which is what get_xaxis_transform does to y).
sig_y = max(iqm_vals) + max(iqm_hi) + 0.3
ax.text(0.5, sig_y, sig, ha='center', fontsize=11)
# Text artists do not extend the autoscaled limits, so make room explicitly.
ax.set_ylim(top=sig_y + 0.4)

# 2. Violin + swarm
ax = axes[1]
parts = ax.violinplot([field_on_rewards, field_off_rewards], positions=[1, 2],
                       showmeans=True, showmedians=True, showextrema=False)
for i, body in enumerate(parts['bodies']):
    body.set_facecolor(colors[i])
    body.set_alpha(0.4)

# Seeded horizontal jitter so the overlaid per-seed points are reproducible.
rng = np.random.default_rng(42)
for i, (data, pos) in enumerate(zip([field_on_rewards, field_off_rewards], [1, 2])):
    jitter = rng.normal(0, 0.05, size=len(data))
    ax.scatter(np.full_like(data, pos) + jitter, data, alpha=0.6, s=25,
               color=colors[i], edgecolor='black', linewidth=0.3, zorder=3)

# Red diamonds mark each condition's IQM; only the first carries a legend label.
ax.scatter([1], [iqm_on.iqm], marker='D', s=80, color='red', zorder=5, label='IQM')
ax.scatter([2], [iqm_off.iqm], marker='D', s=80, color='red', zorder=5)
ax.set_xticks([1, 2])
ax.set_xticklabels(['Field ON', 'Field OFF'])
ax.set_ylabel('Mean Reward')
ax.set_title('(b) Distribution (all 60 seeds)')
ax.legend(fontsize=9)

# 3. Population
ax = axes[2]
ax.hist(field_on_populations, bins=15, alpha=0.7, color=colors[0], label='Field ON', edgecolor='black')
ax.set_xlabel('Final Population')
ax.set_ylabel('Count')
ax.set_title('(c) Field ON Population Distribution')
ax.axvline(x=64, color='red', linestyle='--', alpha=0.5, label='Max capacity')
ax.legend(fontsize=9)

plt.suptitle('Field ON vs Field OFF: 30-Seed Comparison', fontsize=14, y=1.02)
plt.tight_layout()
save_figure(fig, os.path.join(OUTPUT_DIR, 'main_comparison'))
plt.show()

In [None]:
# ========== PERFORMANCE PROFILES ==========
# Delegates to the project's plot_performance_profiles helper, passing the
# output path under OUTPUT_DIR and a tau range of 0 to 1.05.
setup_publication_style()
profile_inputs = {
    "Field ON (Stigmergy)": field_on_rewards,
    "Field OFF (No Field)": field_off_rewards,
}
profile_path = os.path.join(OUTPUT_DIR, 'performance_profiles')
fig = plot_performance_profiles(
    profile_inputs,
    output_path=profile_path,
    tau_range=(0, 1.05),
)
plt.show()

In [None]:
# ========== REWARD vs POPULATION ==========
# Per-seed scatter of Field ON final population against reward, with seeds
# below 1.0 reward annotated, followed by a Pearson correlation that leaves out
# seeds at or below 0.5 reward.
setup_publication_style()
fig, ax = plt.subplots(figsize=(8, 6))

marker_style = dict(s=50, alpha=0.7, edgecolor='black', linewidth=0.5)
ax.scatter(field_on_populations, field_on_rewards, color='#009988',
           label='Field ON', **marker_style)

ax.set(
    xlabel='Final Population',
    ylabel='Mean Reward',
    title='Field ON: Reward vs Population (per seed)',
)
ax.legend()

# Annotate failed seed
failed_idx = np.flatnonzero(field_on_rewards < 1.0)
arrow_style = dict(arrowstyle='->', color='red')
for idx in failed_idx:
    pop_value = field_on_populations[idx]
    reward_value = field_on_rewards[idx]
    # Offset the label up and to the right so the arrow points back at the marker.
    ax.annotate(
        f'Seed {idx}\n(died)',
        xy=(pop_value, reward_value),
        xytext=(pop_value + 5, reward_value + 1),
        fontsize=8,
        color='red',
        arrowprops=arrow_style,
    )

plt.tight_layout()
figure_path = os.path.join(OUTPUT_DIR, 'reward_vs_population')
save_figure(fig, figure_path)
plt.show()

# Correlation
from scipy import stats as scipy_stats
mask = field_on_rewards > 0.5  # exclude dead seed
surviving_pops = field_on_populations[mask]
surviving_rewards = field_on_rewards[mask]
r, p = scipy_stats.pearsonr(surviving_pops, surviving_rewards)
print(f"Correlation (pop vs reward, excluding failed): r={r:.3f}, p={p:.6f}")

## Phase 3: Checkpoint Analysis (Field OFF)

Load Field OFF checkpoints from Drive, run eval episodes, compute weight divergence.

In [None]:
# ========== LOAD FIELD OFF CHECKPOINTS ==========
import glob as glob_mod
import jax
import jax.numpy as jnp
from src.training.checkpointing import load_checkpoint
from src.agents.network import ActorCritic
from src.environment.obs import obs_dim
from src.analysis.ablation import _run_episode_full

# Discover checkpoints
# Layout scanned: FIELD_OFF_DIR/batch_<0..9>/<seed dir>/step_*.pkl. One path is
# kept per seed directory: the checkpoint with the highest step number.
import re


def _checkpoint_step(path):
    """Return the integer step parsed from a ``step_<N>...`` filename, or -1 if none."""
    match = re.search(r'step_(\d+)', os.path.basename(path))
    return int(match.group(1)) if match else -1


checkpoint_paths = []
for batch_idx in range(10):
    batch_dir = os.path.join(FIELD_OFF_DIR, f'batch_{batch_idx}')
    if not os.path.isdir(batch_dir):
        continue
    for seed_dir in sorted(os.listdir(batch_dir)):
        seed_path = os.path.join(batch_dir, seed_dir)
        if not os.path.isdir(seed_path):
            continue
        pkl_files = glob_mod.glob(os.path.join(seed_path, 'step_*.pkl'))
        if pkl_files:
            # Compare steps numerically: a plain string sort ranks
            # step_9000000.pkl after step_10000000.pkl. The path itself breaks
            # ties (and orders names without a numeric step) deterministically.
            latest = max(pkl_files, key=lambda p: (_checkpoint_step(p), p))
            checkpoint_paths.append(latest)

print(f"Found {len(checkpoint_paths)} Field OFF checkpoints")

# Everything in Phase 3 needs at least one checkpoint; stop here with an
# actionable message rather than an IndexError further down.
if not checkpoint_paths:
    raise FileNotFoundError(
        f"No step_*.pkl checkpoints found under {FIELD_OFF_DIR}; "
        "check that Drive is mounted and the Field OFF runs were uploaded."
    )

# Load helper
def load_seed_data(ckpt_path):
    """Load one checkpoint and bundle the pieces the Phase 3 cells use.

    Args:
        ckpt_path: Path handed straight to ``load_checkpoint``.

    Returns:
        dict with keys ``params`` and ``config`` (taken from the checkpoint),
        ``agent_params`` (index 0 along the leading axis of every leaf of the
        checkpoint's ``agent_params`` tree), ``network`` (an ``ActorCritic``
        built from ``config.agent.hidden_dims`` with 6 actions) and
        ``seed_id`` (``-1`` when the checkpoint has no such entry).
    """
    checkpoint = load_checkpoint(ckpt_path)
    cfg = checkpoint['config']

    def _first_entry(leaf):
        # Drop the leading axis of a parameter leaf by keeping element 0.
        return leaf[0]

    per_agent = jax.tree_util.tree_map(_first_entry, checkpoint['agent_params'])
    hidden = tuple(cfg.agent.hidden_dims)
    net = ActorCritic(hidden_dims=hidden, num_actions=6)

    bundle = {}
    bundle['params'] = checkpoint['params']
    bundle['agent_params'] = per_agent
    bundle['config'] = cfg
    bundle['network'] = net
    bundle['seed_id'] = checkpoint.get('seed_id', -1)
    return bundle

# Test load
# Smoke test: load the first discovered checkpoint and echo its grid size and
# field settings.
test_data = load_seed_data(checkpoint_paths[0])
test_config = test_data['config']
test_field = test_config.field
print(f"Test load OK: seed {test_data['seed_id']}, grid={test_config.env.grid_size}")
print(f"Field OFF config: decay={test_field.decay_rate}, diffusion={test_field.diffusion_rate}, write={test_field.write_strength}")

In [None]:
# ========== EVAL EPISODES (Field OFF) ==========
# Run 5 eval episodes per seed to get population dynamics + survival stats
# Uses shared params (not per-agent); "normal" condition since field already off in config

NUM_EVAL_EPISODES = 5
eval_results = []

for i, ckpt_path in enumerate(checkpoint_paths):
    seed_data = load_seed_data(ckpt_path)
    config = seed_data['config']

    seed_pops = []
    seed_rewards = []
    seed_births = []
    seed_deaths = []

    for ep in range(NUM_EVAL_EPISODES):
        # Distinct key per (episode, checkpoint) pair as long as there are
        # fewer than 1000 checkpoints.
        key = jax.random.PRNGKey(ep * 1000 + i)
        stats = _run_episode_full(
            network=seed_data['network'],
            params=seed_data['params'],
            config=config,
            key=key,
            condition="normal",  # field already disabled in config
            evolution=True,
        )
        seed_pops.append(stats.final_population)
        seed_rewards.append(stats.total_reward)
        seed_births.append(stats.total_births)
        seed_deaths.append(stats.total_deaths)

    # Store plain Python floats/lists: these entries are written with
    # json.dump in the save cell, which rejects NumPy/JAX scalars and arrays.
    mean_pop = float(np.mean(seed_pops))
    eval_results.append({
        'seed_id': seed_data['seed_id'],
        'ckpt_path': ckpt_path,
        'mean_total_reward': float(np.mean(seed_rewards)),
        'std_total_reward': float(np.std(seed_rewards)),
        'mean_population': mean_pop,
        'mean_births': float(np.mean(seed_births)),
        'mean_deaths': float(np.mean(seed_deaths)),
        'survival_rate': mean_pop / config.evolution.max_agents,
        'all_rewards': np.asarray(seed_rewards).tolist(),
        'all_populations': np.asarray(seed_pops).tolist(),
    })

    # Progress line for the first checkpoint and then every fifth.
    if (i + 1) % 5 == 0 or i == 0:
        print(f"  [{i+1}/{len(checkpoint_paths)}] seed {seed_data['seed_id']}: "
              f"total_reward={np.mean(seed_rewards):.1f}, pop={np.mean(seed_pops):.1f}, "
              f"births={np.mean(seed_births):.0f}, deaths={np.mean(seed_deaths):.0f}")

# Extract Field OFF populations from eval
field_off_populations = np.array([r['mean_population'] for r in eval_results])
field_off_eval_total_rewards = np.array([r['mean_total_reward'] for r in eval_results])

print(f"\n{'='*60}")
print(f"FIELD OFF EVAL SUMMARY ({len(eval_results)} seeds x {NUM_EVAL_EPISODES} episodes)")
print(f"  Mean total reward: {field_off_eval_total_rewards.mean():.1f} +/- {field_off_eval_total_rewards.std():.1f}")
print(f"  Mean population:   {field_off_populations.mean():.1f} +/- {field_off_populations.std():.1f}")
# Populations here are means over episodes, so the count uses a >= 60 threshold
# rather than exact equality with the cap; the label says so explicitly.
print(f"  Near max (>=60):   {np.sum(field_off_populations >= 60)}/{len(field_off_populations)}")
print(f"  Mean births:       {np.mean([r['mean_births'] for r in eval_results]):.1f}")
print(f"  Mean deaths:       {np.mean([r['mean_deaths'] for r in eval_results]):.1f}")

In [None]:
# ========== WEIGHT DIVERGENCE (Field OFF) ==========
# For every Field OFF checkpoint, compute the weight divergence across that
# seed's agents, summarise it, and save a two-panel figure.
from src.analysis.specialization import compute_weight_divergence

divergence_results = []
total_checkpoints = len(checkpoint_paths)

for position, path in enumerate(checkpoint_paths, start=1):
    loaded = load_seed_data(path)

    # Divergence across all agents in this seed.
    div = compute_weight_divergence(loaded['agent_params'])
    seed_mean_div = div['mean_divergence']
    seed_max_div = div['max_divergence']
    divergence_results.append({
        'seed_id': loaded['seed_id'],
        'mean_divergence': float(seed_mean_div),
        'max_divergence': float(seed_max_div),
        'n_agents': len(div['agent_indices']),
    })

    # Progress line for the first checkpoint and then every tenth.
    if position == 1 or position % 10 == 0:
        print(f"  [{position}/{total_checkpoints}] seed {loaded['seed_id']}: "
              f"mean_div={seed_mean_div:.4f}, max_div={seed_max_div:.4f}, "
              f"n_agents={len(div['agent_indices'])}")

field_off_divergences = np.array([entry['mean_divergence'] for entry in divergence_results])
field_off_max_divs = np.array([entry['max_divergence'] for entry in divergence_results])

overall_mean_div = field_off_divergences.mean()
print(f"\n{'='*60}")
print(f"FIELD OFF WEIGHT DIVERGENCE SUMMARY ({len(divergence_results)} seeds)")
print(f"  Mean divergence:  {overall_mean_div:.4f} +/- {field_off_divergences.std():.4f}")
print(f"  Max divergence:   {field_off_max_divs.mean():.4f} +/- {field_off_max_divs.std():.4f}")
print(f"  Range: [{field_off_divergences.min():.4f}, {field_off_divergences.max():.4f}]")

# Plot weight divergence distribution
setup_publication_style()
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
hist_ax, scatter_ax = axes

# Panel (a): histogram of per-seed mean divergence with the overall mean marked.
hist_ax.hist(field_off_divergences, bins=15, color='#BBBBBB', edgecolor='black', alpha=0.7)
hist_ax.axvline(overall_mean_div, color='red', linestyle='--', label=f'Mean={overall_mean_div:.4f}')
hist_ax.set_xlabel('Mean Pairwise Weight Divergence (cosine)')
hist_ax.set_ylabel('Count')
hist_ax.set_title('(a) Field OFF: Weight Divergence Distribution')
hist_ax.legend()

# Panel (b): eval population vs training reward, drawn only when eval
# populations exist and match the reward array in length.
# NOTE(review): points are paired by index, which assumes checkpoint discovery
# order matches the order of the hardcoded training rewards - confirm.
populations_usable = (
    field_off_populations is not None
    and len(field_off_populations) == len(field_off_rewards)
)
if not populations_usable:
    scatter_ax.text(0.5, 0.5, 'Eval populations not yet available',
                    ha='center', va='center', transform=scatter_ax.transAxes)
    scatter_ax.set_title('(b) Field OFF: Reward vs Population')
else:
    scatter_ax.scatter(field_off_populations, field_off_rewards, color='#BBBBBB', s=50,
                       alpha=0.7, edgecolor='black', linewidth=0.5)
    scatter_ax.set_xlabel('Eval Population')
    scatter_ax.set_ylabel('Training Reward')
    scatter_ax.set_title('(b) Field OFF: Reward vs Eval Population')

plt.tight_layout()
save_figure(fig, os.path.join(OUTPUT_DIR, 'field_off_analysis'))
plt.show()

## Phase 4: Report & Save

Generate formatted comparison report and save all results to Drive.

In [None]:
# ========== COMPARISON REPORT ==========
# Builds the markdown report from the Phase 1 statistics, appends the Phase 3
# sections only when those cells have been run, and renders it inline.
from datetime import datetime

# Determine significance level
if welch.p_value < 0.001:
    sig_str = "p < 0.001 (***)"
elif welch.p_value < 0.01:
    sig_str = f"p = {welch.p_value:.4f} (**)"
elif welch.p_value < 0.05:
    sig_str = f"p = {welch.p_value:.4f} (*)"
else:
    sig_str = f"p = {welch.p_value:.4f} (not significant)"

# Verbal label for |Cohen's d| using the 0.2 / 0.5 / 0.8 thresholds.
d_val = abs(welch.effect_size)
if d_val < 0.2:
    d_str = "negligible"
elif d_val < 0.5:
    d_str = "small"
elif d_val < 0.8:
    d_str = "medium"
else:
    d_str = "large"

winner = "Field OFF" if field_off_rewards.mean() > field_on_rewards.mean() else "Field ON"

# Use sample std (ddof=1) for CoV
cov_on = field_on_rewards.std(ddof=1) / field_on_rewards.mean()
cov_off = field_off_rewards.std(ddof=1) / field_off_rewards.mean()

report = f"""# Field ON vs Field OFF: 30-Seed Comparison Report
Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}

## Experiment Setup
- **Conditions**: Field ON (stigmergy) vs Field OFF (no shared field)
- **Seeds per condition**: 30
- **Training steps**: 10M per seed
- **Config**: 64-agent, grid=32, num_food=40, starting_energy=200
- **Field ON**: diffusion=0.1, decay=0.05, write_strength=1.0
- **Field OFF**: diffusion=0.0, decay=1.0, write_strength=0.0

## Key Results

### Reward Comparison
| Metric | Field ON | Field OFF |
|--------|----------|-----------|
| Mean | {field_on_rewards.mean():.4f} +/- {field_on_rewards.std(ddof=1):.4f} | {field_off_rewards.mean():.4f} +/- {field_off_rewards.std(ddof=1):.4f} |
| Median | {np.median(field_on_rewards):.4f} | {np.median(field_off_rewards):.4f} |
| IQM | {iqm_on.iqm:.4f} [{iqm_on.ci_lower:.4f}, {iqm_on.ci_upper:.4f}] | {iqm_off.iqm:.4f} [{iqm_off.ci_lower:.4f}, {iqm_off.ci_upper:.4f}] |
| Min | {field_on_rewards.min():.4f} | {field_off_rewards.min():.4f} |
| Max | {field_on_rewards.max():.4f} | {field_off_rewards.max():.4f} |
| CoV | {cov_on:.4f} | {cov_off:.4f} |

### Statistical Tests
- **Welch's t-test**: {sig_str}, Cohen's d = {welch.effect_size:.4f} ({d_str})
- **Mann-Whitney U**: U = {mw.statistic:.1f}, p = {mw.p_value:.6f}, rank-biserial r = {mw.effect_size:.4f}
- **P(Field OFF > Field ON)**: {poi['prob_x_better']:.4f}

### Winner: **{winner}** (by mean reward)

### Population Dynamics (Field ON)
- Mean final population: {field_on_populations.mean():.1f}
- At max capacity (64): {np.sum(field_on_populations == 64)}/30
- Failed seeds (reward < 1.0): {np.sum(field_on_rewards < 1.0)}

### Sensitivity Analysis
- Excluding {np.sum(field_on_rewards < 0.5)} failed Field ON seeds:
  Field ON filtered mean = {field_on_rewards[field_on_rewards > 0.5].mean():.4f} (n={np.sum(field_on_rewards > 0.5)})
  Welch p = {welch_f.p_value:.6f}, Cohen's d = {welch_f.effect_size:.4f}
"""

# Phase 3 is optional. Look its outputs up by name so that skipping those
# cells omits the sections below instead of raising NameError here.
_phase3_divergence = globals().get('divergence_results')
_phase3_eval = globals().get('eval_results')

# Add weight divergence if available
if _phase3_divergence:
    report += f"""
### Weight Divergence (Field OFF only)
- Mean divergence: {field_off_divergences.mean():.4f} +/- {field_off_divergences.std():.4f}
- Max divergence: {field_off_max_divs.mean():.4f} +/- {field_off_max_divs.std():.4f}
"""

# Add eval results if available
# The capacity line counts seeds whose mean eval population is >= 60, so it is
# labelled as "near max" rather than "at max".
if _phase3_eval:
    report += f"""
### Eval Episodes (Field OFF, {NUM_EVAL_EPISODES} episodes/seed)
- Mean total reward: {field_off_eval_total_rewards.mean():.1f} +/- {field_off_eval_total_rewards.std():.1f}
- Mean population: {field_off_populations.mean():.1f} +/- {field_off_populations.std():.1f}
- Near max capacity (>= 60): {np.sum(field_off_populations >= 60)}/{len(field_off_populations)}
"""

report += f"""
## Interpretation

**Surprising finding**: Field OFF agents achieve {'higher' if field_off_rewards.mean() > field_on_rewards.mean() else 'lower'} mean reward than Field ON.

Key observations:
1. Field ON has MUCH higher variance (CoV {cov_on:.3f} vs {cov_off:.3f})
2. Field ON has {np.sum(field_on_rewards < 1.0)} failed seed(s) with near-zero reward
3. Field OFF is remarkably consistent across all 30 seeds
4. When excluding failed seeds, the gap {'narrows' if abs(welch_f.effect_size) < abs(welch.effect_size) else 'remains'}

Possible explanations:
- The shared field may introduce a coordination overhead that hurts some seeds
- Field ON populations are more variable (some collapse, some max out)
- The field may be a harder optimization landscape requiring more training
- Field OFF is simpler: agents just learn individual foraging without field-reading costs

## Next Steps
- Run Field ON checkpoints through eval episodes (need to upload to Drive)
- Compare weight divergence between Field ON and Field OFF
- Investigate why some Field ON seeds fail (population collapse)
- Try longer training (20M+ steps) to see if Field ON catches up
- Test with diversity_bonus and niche_pressure enabled
"""

from IPython.display import Markdown, display
display(Markdown(report))
print("\nReport generated successfully.")

In [None]:
# ========== SAVE RESULTS ==========
# Collects the Phase 1 statistics (plus Phase 3 outputs when present) into one
# dict and writes it as JSON and pickle, alongside the markdown report.
import json


def _json_default(obj):
    """Convert NumPy/JAX scalars and arrays to plain Python values for json.dump."""
    if hasattr(obj, 'tolist'):
        return obj.tolist()
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


results = {
    'metadata': {
        'generated': datetime.now().isoformat(),
        # Derived from the data so the metadata cannot drift from the arrays.
        'field_on_seeds': len(field_on_rewards),
        'field_off_seeds': len(field_off_rewards),
        'steps_per_seed': 10_000_000,
    },
    'field_on': {
        'rewards': field_on_rewards.tolist(),
        'populations': field_on_populations.tolist(),
        'mean_reward': float(field_on_rewards.mean()),
        # Sample std (ddof=1), matching the "+/-" values in the saved report.
        'std_reward': float(field_on_rewards.std(ddof=1)),
        'iqm': float(iqm_on.iqm),
        'iqm_ci': [float(iqm_on.ci_lower), float(iqm_on.ci_upper)],
    },
    'field_off': {
        'rewards': field_off_rewards.tolist(),
        'mean_reward': float(field_off_rewards.mean()),
        'std_reward': float(field_off_rewards.std(ddof=1)),
        'iqm': float(iqm_off.iqm),
        'iqm_ci': [float(iqm_off.ci_lower), float(iqm_off.ci_upper)],
    },
    'tests': {
        'welch_t': float(welch.statistic),
        'welch_p': float(welch.p_value),
        'cohens_d': float(welch.effect_size),
        'mann_whitney_u': float(mw.statistic),
        'mann_whitney_p': float(mw.p_value),
        'prob_off_better': float(poi['prob_x_better']),
    },
}

# Phase 3 is optional. Look its outputs up by name so that skipping those
# cells leaves the sections out instead of raising NameError here.
_phase3_eval = globals().get('eval_results')
_phase3_divergence = globals().get('divergence_results')

# Add eval results if available
if _phase3_eval:
    results['field_off']['eval'] = {
        'populations': field_off_populations.tolist(),
        'total_rewards': field_off_eval_total_rewards.tolist(),
        # Checkpoint paths are left out of the saved per-seed entries.
        'per_seed': [{k: v for k, v in r.items() if k != 'ckpt_path'} for r in _phase3_eval],
    }

# Add divergence results if available
if _phase3_divergence:
    results['field_off']['weight_divergence'] = {
        'mean_divergences': field_off_divergences.tolist(),
        'max_divergences': field_off_max_divs.tolist(),
        'per_seed': _phase3_divergence,
    }

# Save JSON
# `default` handles any NumPy/JAX values left in the per-seed entries, which
# the json module cannot encode on its own.
json_path = os.path.join(OUTPUT_DIR, 'field_on_vs_off_results.json')
with open(json_path, 'w') as f:
    json.dump(results, f, indent=2, default=_json_default)
print(f"JSON saved: {json_path}")

# Save pickle (same dict; any NumPy values in the per-seed entries are kept as-is)
pkl_path = os.path.join(OUTPUT_DIR, 'field_on_vs_off_results.pkl')
with open(pkl_path, 'wb') as f:
    pickle.dump(results, f)
print(f"Pickle saved: {pkl_path}")

# Save report markdown
md_path = os.path.join(OUTPUT_DIR, 'comparison_report.md')
with open(md_path, 'w') as f:
    f.write(report)
print(f"Report saved: {md_path}")

# List all output files
print(f"\n{'='*60}")
print(f"ALL OUTPUT FILES:")
for fname in sorted(os.listdir(OUTPUT_DIR)):
    fpath = os.path.join(OUTPUT_DIR, fname)
    size_mb = os.path.getsize(fpath) / 1024 / 1024
    print(f"  {fname} ({size_mb:.2f} MB)")
print(f"\nDone! All results saved to {OUTPUT_DIR}")