# RSL-RL Policy Performance Comparison

This notebook performs comprehensive performance comparisons between different trained policies:
- Flat vs Rough terrain
- Different robot platforms (Anymal-C, Anymal-D, Unitree Go2)
- Direct locomotion vs Manager-based navigation

All comparisons focus on the latest checkpoints from training.

In [None]:
# Import libraries and utilities
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Add scripts directory to path
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == 'notebooks' else Path.cwd()
sys.path.insert(0, str(PROJECT_ROOT / "scripts"))

# Import analysis utilities
from rsl_rl_analysis_utils import (
    COMPARISONS, KEY_METRICS, TASK_METRICS,
    load_all_metrics, extract_key_metrics,
    plot_comparison_bar, plot_training_curves, plot_metric_heatmap,
    TENSORBOARD_AVAILABLE
)

# Directory layout: training logs are read from under the project root, while
# generated artifacts go to a folder next to the current working directory.
LOGS_DIR = PROJECT_ROOT.joinpath("logs", "rsl_rl")
OUTPUT_DIR = Path.cwd().joinpath("comparison_results")
OUTPUT_DIR.mkdir(exist_ok=True)

# Echo the resolved configuration so a reader can confirm it before loading data.
for label, value in (
    ("Project root", PROJECT_ROOT),
    ("Logs directory", LOGS_DIR),
    ("Output directory", OUTPUT_DIR),
    ("TensorBoard available", TENSORBOARD_AVAILABLE),
):
    print(f"{label}: {value}")


## 1. Load All Metrics

Load training metrics from all TensorBoard log files for the runs defined in the comparison groups.


In [None]:
# Load the metrics for every run found under LOGS_DIR
print("Loading metrics from TensorBoard logs...")
all_metrics = load_all_metrics(LOGS_DIR)

# Report the number of runs, then one line per run with its metric count
print(f"\nTotal runs loaded: {len(all_metrics)}")
print("\nLoaded runs:")
for run_data in all_metrics.values():
    metric_count = len(run_data['metrics'])
    print(f"  - {run_data['display_name']}: {metric_count} metrics")


In [None]:
# Turn the loaded runs into a single DataFrame
metrics_df = extract_key_metrics(all_metrics)

print(f"Metrics DataFrame shape: {metrics_df.shape}")
print("\nAvailable columns:")
print(metrics_df.columns.tolist())

# Preview only the identifying columns of the first few rows
identity_cols = ['display_name', 'experiment', 'category']
display(metrics_df[identity_cols].head())

# Persist the full table next to the other comparison outputs
metrics_csv = OUTPUT_DIR / "metrics_summary.csv"
metrics_df.to_csv(metrics_csv, index=False)
print(f"\nMetrics summary saved to: {metrics_csv}")


## 2. Comparison: Flat vs Rough Terrain

Compare performance on flat vs rough terrain across different robots and tasks.


In [None]:
# Bar chart of total reward for the 'flat_vs_rough' comparison group.
# The plot is skipped (with a message) when the reward column is absent.
metric_col = 'Reward___Total_reward_mean'
if metric_col not in metrics_df.columns:
    print(f"Metric {metric_col} not available")
else:
    fig, ax = plt.subplots(figsize=(12, 6))
    plot_comparison_bar(
        metrics_df, 'flat_vs_rough', metric_col,
        title='Total Reward: Flat vs Rough Terrain',
        ylabel='Average Episode Reward',
        ax=ax,
    )
    reward_png = OUTPUT_DIR / "flat_vs_rough_reward.png"
    plt.savefig(reward_png, dpi=150, bbox_inches='tight')
    plt.show()


In [None]:
# Reward-over-training curves for the 'flat_vs_rough' comparison group
# (smoothing=10 is passed straight through to the plotting helper).
fig, ax = plt.subplots(figsize=(12, 6))
plot_training_curves(
    all_metrics, 'flat_vs_rough', 'Reward / Total reward (mean)',
    title='Training Progress: Total Reward (Flat vs Rough)',
    smoothing=10,
    ax=ax,
)
curves_png = OUTPUT_DIR / "flat_vs_rough_training_curves.png"
plt.savefig(curves_png, dpi=150, bbox_inches='tight')
plt.show()


## 3. Comparison: Robot Platforms (Rough Terrain)

Compare different robot platforms on rough terrain.


In [None]:
# Robot comparison on rough terrain
# Bar chart of total reward for the 'robot_comparison_rough' group; the figure
# is also written to OUTPUT_DIR as a PNG.
metric_col = 'Reward___Total_reward_mean'
if metric_col in metrics_df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    plot_comparison_bar(
        metrics_df,
        'robot_comparison_rough',
        metric_col,
        title='Total Reward: Robot Platform Comparison (Rough Terrain)',
        ylabel='Average Episode Reward',
        ax=ax
    )
    plt.savefig(OUTPUT_DIR / "robot_comparison_rough_reward.png", dpi=150, bbox_inches='tight')
    plt.show()
else:
    # Report the missing column instead of producing no output at all.
    print(f"Metric {metric_col} not available")


In [None]:
# Reward-over-training curves for the 'robot_comparison_rough' group
# (smoothing=10 is passed straight through to the plotting helper).
fig, ax = plt.subplots(figsize=(12, 6))
plot_training_curves(
    all_metrics, 'robot_comparison_rough', 'Reward / Total reward (mean)',
    title='Training Progress: Robot Platform Comparison (Rough)',
    smoothing=10,
    ax=ax,
)
curves_png = OUTPUT_DIR / "robot_comparison_rough_training_curves.png"
plt.savefig(curves_png, dpi=150, bbox_inches='tight')
plt.show()


## 4. Comparison: Direct vs Manager-Based Control

Compare direct locomotion control vs manager-based navigation.


In [None]:
# Direct vs Manager-based comparison
# Bar chart of total reward for the 'direct_vs_manager' group; the figure is
# also written to OUTPUT_DIR as a PNG.
metric_col = 'Reward___Total_reward_mean'
if metric_col in metrics_df.columns:
    fig, ax = plt.subplots(figsize=(12, 6))
    plot_comparison_bar(
        metrics_df,
        'direct_vs_manager',
        metric_col,
        title='Total Reward: Direct vs Manager-Based Control',
        ylabel='Average Episode Reward',
        ax=ax
    )
    plt.savefig(OUTPUT_DIR / "direct_vs_manager_reward.png", dpi=150, bbox_inches='tight')
    plt.show()
else:
    # Report the missing column instead of producing no output at all.
    print(f"Metric {metric_col} not available")


## 5. Comparison: Navigation - Flat vs Rough

Compare navigation task performance on flat vs rough terrain.


In [None]:
# Navigation terrain comparison
# Draws one bar chart per metric for the 'navigation_terrain' group: total
# reward first, then the navigation-specific position-tracking reward.
metric_col = 'Reward___Total_reward_mean'
position_col = 'Info___Episode_Reward_position_tracking'

# (column, title, y-axis label, output file name) for each chart
navigation_plots = [
    (
        metric_col,
        'Navigation: Total Reward (Flat vs Rough)',
        'Average Episode Reward',
        "navigation_terrain_reward.png",
    ),
    (
        position_col,
        'Navigation: Position Tracking Accuracy (Flat vs Rough)',
        'Position Tracking Score',
        "navigation_terrain_position_tracking.png",
    ),
]

for column, plot_title, plot_ylabel, filename in navigation_plots:
    if column not in metrics_df.columns:
        # Report the missing column instead of skipping the chart silently.
        print(f"Metric {column} not available")
        continue

    fig, ax = plt.subplots(figsize=(10, 6))
    plot_comparison_bar(
        metrics_df,
        'navigation_terrain',
        column,
        title=plot_title,
        ylabel=plot_ylabel,
        ax=ax
    )
    plt.savefig(OUTPUT_DIR / filename, dpi=150, bbox_inches='tight')
    plt.show()


## 6. Comprehensive Metric Heatmaps

Heatmaps showing multiple metrics across different runs for easy comparison.


In [None]:
# Columns shown in the heatmaps (this list is reused by the robot-comparison heatmap cell)
key_metric_cols = (
    ['Reward___Total_reward_mean', 'Reward___Instantaneous_reward_mean']
    + ['Episode___Total_timesteps_mean']
    + ['Loss___Policy_loss', 'Loss___Value_loss']
)

# Heatmap for the 'flat_vs_rough' comparison group
plot_metric_heatmap(
    metrics_df, 'flat_vs_rough', key_metric_cols,
    title="Comprehensive Metrics: Flat vs Rough Terrain",
)
heatmap_png = OUTPUT_DIR / "heatmap_flat_vs_rough.png"
plt.savefig(heatmap_png, dpi=150, bbox_inches='tight')
plt.show()


In [None]:
# Heatmap for the 'robot_comparison_rough' group, using the same metric
# columns (key_metric_cols) as the flat-vs-rough heatmap.
plot_metric_heatmap(
    metrics_df, 'robot_comparison_rough', key_metric_cols,
    title="Comprehensive Metrics: Robot Platform Comparison (Rough)",
)
heatmap_png = OUTPUT_DIR / "heatmap_robot_comparison.png"
plt.savefig(heatmap_png, dpi=150, bbox_inches='tight')
plt.show()


## 7. Statistical Summary and Insights

Generate statistical summaries and key insights from the comparisons.


In [None]:
# Statistical summary by category
# Aggregates the headline metrics per 'category', then lists every run ranked
# best-first and saves that table to full_metrics_summary.csv in OUTPUT_DIR.
print("="*80)
print("STATISTICAL SUMMARY")
print("="*80)

summary_cols = ['Reward___Total_reward_mean', 'Episode___Total_timesteps_mean']
# Keep only the summary columns that actually exist in metrics_df.
available_summary_cols = [c for c in summary_cols if c in metrics_df.columns]

if available_summary_cols:
    print("\nSummary by Category:")
    category_summary = metrics_df.groupby('category')[available_summary_cols].agg(['mean', 'std', 'min', 'max'])
    display(category_summary)

    # Rank by the first available summary column. That is total reward whenever
    # it exists (it is listed first); otherwise fall back to the remaining
    # column rather than raising KeyError on a hard-coded name.
    sort_col = available_summary_cols[0]

    # Full metrics table
    print("\nFull Metrics Table (Latest Checkpoint Values):")
    display_cols = ['display_name', 'experiment', 'category'] + available_summary_cols
    full_table = metrics_df[display_cols].sort_values(by=sort_col, ascending=False)
    display(full_table)

    # Save summary
    full_summary_csv = OUTPUT_DIR / "full_metrics_summary.csv"
    full_table.to_csv(full_summary_csv, index=False)
    print(f"\nFull summary saved to: {full_summary_csv}")
else:
    # Say why nothing was produced instead of printing only the banner.
    print(f"\nNone of the summary metrics are available: {summary_cols}")


In [None]:
# Generate key insights
# Builds a list of insight records (one dict per finding) from metrics_df,
# prints them, and saves them to insights.csv in OUTPUT_DIR. Every record has
# 'Finding' and 'Detail' keys; any other keys are finding-specific extras.
print("="*80)
print("KEY INSIGHTS AND FINDINGS")
print("="*80)

reward_col = 'Reward___Total_reward_mean'
timesteps_col = 'Episode___Total_timesteps_mean'

insights = []

# 1. Flat vs Rough performance
flat_runs = metrics_df[metrics_df['category'] == 'flat']
rough_runs = metrics_df[metrics_df['category'] == 'rough']

if reward_col in metrics_df.columns:
    flat_mean = flat_runs[reward_col].mean()
    rough_mean = rough_runs[reward_col].mean()

    # A mean is NaN when its category has no runs or no reward values; skip then.
    if pd.notna(flat_mean) and pd.notna(rough_mean):
        if flat_mean != 0:
            diff_pct = ((rough_mean - flat_mean) / abs(flat_mean)) * 100
            # The direction word is computed outside the f-string: nesting the
            # same quote type inside an f-string is a SyntaxError before
            # Python 3.12. The magnitude is printed unsigned because the
            # direction word already carries the sign.
            direction = 'higher' if diff_pct > 0 else 'lower'
            detail = f"Average reward on rough terrain is {abs(diff_pct):.1f}% {direction} than flat terrain"
        else:
            # A relative change against a zero baseline is undefined, so report
            # the absolute difference instead of a misleading 0%.
            detail = (
                f"Average reward on rough terrain differs from flat terrain by "
                f"{rough_mean - flat_mean:+.3f} (percentage change undefined: flat mean is 0)"
            )
        insights.append({
            'Finding': 'Terrain Difficulty Impact',
            'Detail': detail,
            'Flat Mean': f"{flat_mean:.3f}",
            'Rough Mean': f"{rough_mean:.3f}"
        })

# 2. Best performing policy
if reward_col in metrics_df.columns:
    # Drop missing rewards first: idxmax() cannot pick a row from an empty or
    # all-NaN column.
    reward_values = metrics_df[reward_col].dropna()
    if not reward_values.empty:
        best_run = metrics_df.loc[reward_values.idxmax()]
        insights.append({
            'Finding': 'Best Performing Policy',
            'Detail': f"{best_run['display_name']} achieves the highest total reward: {best_run[reward_col]:.3f}",
            'Experiment': best_run['experiment']
        })

# 3. Training stability
if timesteps_col in metrics_df.columns:
    # "Stable" here means strictly above the 75th percentile of mean episode timesteps.
    stability_threshold = metrics_df[timesteps_col].quantile(0.75)
    stable_runs = metrics_df[metrics_df[timesteps_col] > stability_threshold]
    # List at most five names; cast to str so non-string names cannot break join().
    top_stable_names = stable_runs['display_name'].astype(str).tolist()[:5]
    insights.append({
        'Finding': 'Training Stability',
        'Detail': f"{len(stable_runs)} policies show high episode stability (top quartile)",
        'Top Stable Policies': ', '.join(top_stable_names)
    })

# Print insights
if not insights:
    print("\nNo insights could be derived from the available metrics.")
for i, insight in enumerate(insights, 1):
    print(f"\n{i}. {insight['Finding']}")
    print(f"   {insight['Detail']}")
    for key, value in insight.items():
        if key not in ['Finding', 'Detail']:
            print(f"   {key}: {value}")

# Save insights
insights_df = pd.DataFrame(insights)
insights_df.to_csv(OUTPUT_DIR / "insights.csv", index=False)
print(f"\n\nInsights saved to: {OUTPUT_DIR / 'insights.csv'}")


## Summary

This notebook has generated comprehensive performance comparisons including:
- Bar charts comparing key metrics across different configurations
- Training curves showing learning progress
- Heatmaps for multi-metric comparisons
- Statistical summaries and key insights

All visualizations and CSV summaries have been saved to the `comparison_results/` directory.
