# Sampling Ablation Study Analysis

This notebook analyzes the sampling ablation study results to identify the best configurations for:
1. Exploration Temperature
2. Sampling Strategies
3. On-Policy vs Off-Policy
4. Preference Diversity
5. Batch Size Effects

**Inputs:**
- `results/ablations/sampling/all_results.csv`
- `configs/ablations/sampling_ablation.yaml`

**Outputs:**
- Summary reports for each experiment type
- Comparative visualizations
- Best configuration recommendations

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import yaml
from pathlib import Path
import warnings

# Silence every warning category so library notices do not clutter cell output.
warnings.filterwarnings('ignore')

# Plot defaults shared by all figures in this notebook.
sns.set_style('whitegrid')
sns.set_palette('husl')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

print("✓ Libraries imported successfully")

## 1. Load Data and Configuration

In [None]:
# Input locations, relative to the directory the notebook is run from.
results_path = 'results/ablations/sampling/all_results.csv'
config_path = 'configs/ablations/sampling_ablation.yaml'

# Load results; each CSV row is reported below as one experiment.
print(f"Loading results from: {results_path}")
df = pd.read_csv(results_path)
print(f"✓ Loaded {len(df)} experiments")
print(f"\nColumns: {list(df.columns)}")

# Load configuration.
# Decode as UTF-8 explicitly: without `encoding`, open() falls back to the
# platform default, which can mis-read non-ASCII characters in the YAML.
# NOTE(review): `config` is not referenced by any later cell visible here.
print(f"\nLoading config from: {config_path}")
with open(config_path, 'r', encoding='utf-8') as f:
    config = yaml.safe_load(f)
print(f"✓ Configuration loaded")

# Display first few rows
print("\nFirst 5 rows:")
df.head()

In [None]:
# Folder that receives every CSV, PNG and text report written below.
output_dir = Path('results/ablations/sampling/report')
output_dir.mkdir(parents=True, exist_ok=True)
print(f"✓ Output directory: {output_dir}")

# Candidate metric columns; the order matters because later cells slice the
# first 3, 5 or 6 entries of the filtered list.
key_metrics = [
    'hypervolume',
    'tds',
    'mpd',
    'mce',
    'pmd',
    'pas',
    'fci',
    'qds',
    'der',
    'rbd',
]

# Keep only the metrics that are actually columns of the results frame.
available_metrics = list(filter(lambda name: name in df.columns, key_metrics))
print(f"\nAvailable metrics for analysis: {available_metrics}")

## 2. Identify Experiment Types

Categorize experiments by keywords in their `exp_name` (for example `temp`, `greedy`, `policy`, `pref`, `batch`) to identify which ablation each experiment belongs to.

In [None]:
# Identify experiment types based on varying parameters
def categorize_experiments(df):
    """
    Label every experiment with the ablation it belongs to.

    Prints the distinct values of each known sampling-parameter column that is
    present, then derives two label columns from ``exp_name``:

    - ``experiment_type``: the text before the first underscore of the name
      (``'unknown'`` for non-string names).
    - ``experiment_category``: one of ``'temperature'``, ``'sampling_strategy'``,
      ``'policy_type'``, ``'preference_diversity'``, ``'batch_size'``,
      ``'other'`` or ``'unknown'``, chosen by case-insensitive keyword match.
      Rules are tried in that order and the first match wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Experiment results. The frame passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with both label columns added. When ``exp_name`` is
        absent, both columns are filled with ``'unknown'`` so that later
        filters on ``experiment_category`` do not raise ``KeyError``.
    """
    # Work on a copy so re-running the cell never alters the loaded frame.
    df = df.copy()

    # Check which columns exist and vary
    param_columns = ['temperature', 'sampling_strategy', 'on_policy',
                     'preference_sampling', 'alpha', 'batch_size']

    available_params = [col for col in param_columns if col in df.columns]

    print("Available parameter columns:")
    for param in available_params:
        unique_vals = df[param].unique()
        print(f"  - {param}: {unique_vals} ({len(unique_vals)} values)")

    # Ordered (category, keywords) rules; earlier rules take precedence.
    keyword_rules = (
        ('temperature', ('temp', 'temperature')),
        ('sampling_strategy', ('strategy', 'greedy', 'stochastic')),
        ('policy_type', ('policy', 'onpolicy', 'offpolicy')),
        ('preference_diversity', ('pref', 'dirichlet', 'uniform')),
        ('batch_size', ('batch',)),
    )

    def get_category(name):
        """Return the category of one experiment name."""
        if pd.isna(name):
            return 'unknown'
        name_lower = str(name).lower()
        for category, keywords in keyword_rules:
            if any(keyword in name_lower for keyword in keywords):
                return category
        return 'other'

    # Categorize based on exp_name patterns
    if 'exp_name' in df.columns:
        df['experiment_type'] = df['exp_name'].apply(
            lambda x: x.split('_')[0] if isinstance(x, str) else 'unknown'
        )
        df['experiment_category'] = df['exp_name'].apply(get_category)

        print("\nExperiment categories identified:")
        print(df['experiment_category'].value_counts())
    else:
        # Without names nothing can be classified; still create the columns
        # so downstream cells see a consistent schema.
        df['experiment_type'] = 'unknown'
        df['experiment_category'] = 'unknown'
        print("\n⚠ No 'exp_name' column found; all experiments labelled 'unknown'")

    return df

# Rebind `df` to the labelled frame; the per-experiment sections below filter
# on its `experiment_category` column.
df = categorize_experiments(df)
df.head()

## 3. Experiment 1: Exploration Temperature

Analyze how the sampling temperature affects the exploration–exploitation trade-off.

In [None]:
# Filter temperature experiments
temp_df = df[df['experiment_category'] == 'temperature'].copy()

if len(temp_df) > 0:
    print(f"Temperature experiments: {len(temp_df)}")
    print(f"Unique experiment names: {temp_df['exp_name'].unique()}")

    # Summary statistics (mean / std / count of every metric per configuration)
    temp_summary = temp_df.groupby('exp_name')[available_metrics].agg(['mean', 'std', 'count'])
    temp_summary.to_csv(output_dir / 'temperature_summary.csv')
    print(f"\n✓ Saved: {output_dir / 'temperature_summary.csv'}")

    # Display summary
    display(temp_summary.round(4))

    # Find best configuration for each metric.
    # NOTE(review): "best" means the highest mean; confirm that every metric
    # is higher-is-better.
    print("\n📊 Best Temperature Configuration per Metric:")
    for metric in available_metrics[:5]:  # first five available metrics
        metric_means = temp_summary[(metric, 'mean')].dropna()
        if metric_means.empty:
            # idxmax has no valid answer when every mean is NaN; the original
            # lookup failed on the resulting NaN label.
            print(f"  {metric.upper()}: n/a (no values recorded)")
            continue
        best_config = metric_means.idxmax()
        best_value = metric_means.loc[best_config]
        print(f"  {metric.upper()}: {best_config} ({best_value:.4f})")
else:
    print("⚠ No temperature experiments found in data")

In [None]:
# Visualize temperature effects
if len(temp_df) > 0:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('Temperature Effects on Diversity Metrics', fontsize=16, fontweight='bold')

    # Walk the 2x3 grid row by row, one panel per metric (at most six).
    for ax, metric in zip(axes.flat, available_metrics[:6]):
        panel_rows = temp_df[['exp_name', metric]].dropna()
        if panel_rows.empty:
            continue  # nothing recorded for this metric; leave the panel blank

        # Box plot of the metric for each configuration
        metric_label = metric.upper()
        sns.boxplot(data=panel_rows, x='exp_name', y=metric, ax=ax)
        ax.set_title(metric_label, fontweight='bold')
        ax.set_xlabel('Configuration')
        ax.set_ylabel(metric_label)
        ax.tick_params(axis='x', rotation=45)

    figure_path = output_dir / 'temperature_comparison.png'
    plt.tight_layout()
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    print(f"✓ Saved: {figure_path}")
    plt.show()

## 4. Experiment 2: Sampling Strategies

Compare different sampling strategies (greedy, stochastic, etc.).

In [None]:
# Filter sampling strategy experiments
strategy_df = df[df['experiment_category'] == 'sampling_strategy'].copy()

if len(strategy_df) > 0:
    print(f"Sampling strategy experiments: {len(strategy_df)}")
    print(f"Unique experiment names: {strategy_df['exp_name'].unique()}")

    # Summary statistics (mean / std / count of every metric per configuration)
    strategy_summary = strategy_df.groupby('exp_name')[available_metrics].agg(['mean', 'std', 'count'])
    strategy_summary.to_csv(output_dir / 'sampling_strategy_summary.csv')
    print(f"\n✓ Saved: {output_dir / 'sampling_strategy_summary.csv'}")

    display(strategy_summary.round(4))

    # Best configuration.
    # NOTE(review): "best" means the highest mean; confirm that every metric
    # is higher-is-better.
    print("\n📊 Best Sampling Strategy per Metric:")
    for metric in available_metrics[:5]:
        metric_means = strategy_summary[(metric, 'mean')].dropna()
        if metric_means.empty:
            # idxmax has no valid answer when every mean is NaN; the original
            # lookup failed on the resulting NaN label.
            print(f"  {metric.upper()}: n/a (no values recorded)")
            continue
        best_config = metric_means.idxmax()
        best_value = metric_means.loc[best_config]
        print(f"  {metric.upper()}: {best_config} ({best_value:.4f})")
else:
    print("⚠ No sampling strategy experiments found in data")

In [None]:
# Visualize sampling strategy effects
if len(strategy_df) > 0:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('Sampling Strategy Effects on Diversity Metrics', fontsize=16, fontweight='bold')

    # Walk the 2x3 grid row by row, one panel per metric (at most six).
    for ax, metric in zip(axes.flat, available_metrics[:6]):
        panel_rows = strategy_df[['exp_name', metric]].dropna()
        if panel_rows.empty:
            continue  # nothing recorded for this metric; leave the panel blank

        # Violin plot of the metric distribution for each strategy
        metric_label = metric.upper()
        sns.violinplot(data=panel_rows, x='exp_name', y=metric, ax=ax)
        ax.set_title(metric_label, fontweight='bold')
        ax.set_xlabel('Strategy')
        ax.set_ylabel(metric_label)
        ax.tick_params(axis='x', rotation=45)

    figure_path = output_dir / 'sampling_strategy_comparison.png'
    plt.tight_layout()
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    print(f"✓ Saved: {figure_path}")
    plt.show()

## 5. Experiment 3: On-Policy vs Off-Policy

Compare on-policy and off-policy learning approaches.

In [None]:
# Filter policy type experiments
policy_df = df[df['experiment_category'] == 'policy_type'].copy()

if len(policy_df) > 0:
    print(f"Policy type experiments: {len(policy_df)}")
    print(f"Unique experiment names: {policy_df['exp_name'].unique()}")

    # Summary statistics (mean / std / count of every metric per configuration)
    policy_summary = policy_df.groupby('exp_name')[available_metrics].agg(['mean', 'std', 'count'])
    policy_summary.to_csv(output_dir / 'policy_type_summary.csv')
    print(f"\n✓ Saved: {output_dir / 'policy_type_summary.csv'}")

    display(policy_summary.round(4))

    # Best configuration.
    # NOTE(review): "best" means the highest mean; confirm that every metric
    # is higher-is-better.
    print("\n📊 Best Policy Type per Metric:")
    for metric in available_metrics[:5]:
        metric_means = policy_summary[(metric, 'mean')].dropna()
        if metric_means.empty:
            # idxmax has no valid answer when every mean is NaN; the original
            # lookup failed on the resulting NaN label.
            print(f"  {metric.upper()}: n/a (no values recorded)")
            continue
        best_config = metric_means.idxmax()
        best_value = metric_means.loc[best_config]
        print(f"  {metric.upper()}: {best_config} ({best_value:.4f})")

    # Statistical comparison
    print("\n📈 Performance Comparison (On-Policy vs Off-Policy):")
    # Match "on"/"off" only at the start of a name token (optionally fused with
    # "policy"). A bare substring test for "on" also hits names such as
    # "off_policy_exploration", which put off-policy runs in both groups.
    on_pattern = r'(?<![a-z0-9])on(?=policy|[^a-z0-9]|$)'
    off_pattern = r'(?<![a-z0-9])off(?=policy|[^a-z0-9]|$)'
    is_off = policy_df['exp_name'].str.contains(off_pattern, case=False, na=False, regex=True)
    is_on = policy_df['exp_name'].str.contains(on_pattern, case=False, na=False, regex=True) & ~is_off
    on_policy = policy_df[is_on]
    off_policy = policy_df[is_off]

    if len(on_policy) > 0 and len(off_policy) > 0:
        for metric in available_metrics[:5]:
            on_mean = on_policy[metric].mean()
            off_mean = off_policy[metric].mean()
            if pd.isna(on_mean) or pd.isna(off_mean):
                # Comparing against NaN would silently declare Off-Policy the winner.
                print(f"  {metric.upper()}: n/a (missing values)")
                continue
            if on_mean == off_mean:
                print(f"  {metric.upper()}: tie")
                continue
            winner = "On-Policy" if on_mean > off_mean else "Off-Policy"
            if off_mean != 0:
                # Margin relative to the off-policy mean, as a percentage.
                diff = (on_mean - off_mean) / off_mean * 100
                print(f"  {metric.upper()}: {winner} wins by {abs(diff):.2f}%")
            else:
                # A relative margin is undefined for a zero baseline; report
                # the absolute gap instead of a misleading 0.00%.
                gap = abs(on_mean - off_mean)
                print(f"  {metric.upper()}: {winner} wins by {gap:.4f} (absolute; off-policy mean is 0)")
    else:
        print("  ⚠ Could not identify both on-policy and off-policy runs from exp_name")
else:
    print("⚠ No policy type experiments found in data")

In [None]:
# Visualize policy type effects
if len(policy_df) > 0:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('On-Policy vs Off-Policy Comparison', fontsize=16, fontweight='bold')

    # One panel per metric (at most six), filled row by row.
    for idx, metric in enumerate(available_metrics[:6]):
        ax = axes[idx // 3, idx % 3]

        policy_df_plot = policy_df[['exp_name', metric]].dropna()
        if len(policy_df_plot) > 0:
            # Bar height is the mean, error bars show one standard deviation.
            # `errorbar='sd'` replaces `ci='sd'`, which seaborn deprecated in
            # 0.12; this requires seaborn >= 0.12.
            sns.barplot(data=policy_df_plot, x='exp_name', y=metric, ax=ax, errorbar='sd')
            ax.set_title(f'{metric.upper()}', fontweight='bold')
            ax.set_xlabel('Policy Type')
            ax.set_ylabel(metric.upper())
            ax.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig(output_dir / 'policy_type_comparison.png', dpi=300, bbox_inches='tight')
    print(f"✓ Saved: {output_dir / 'policy_type_comparison.png'}")
    plt.show()

## 6. Experiment 4: Preference Diversity

Analyze the impact of different preference sampling distributions (Dirichlet vs Uniform).

In [None]:
# Filter preference diversity experiments
pref_df = df[df['experiment_category'] == 'preference_diversity'].copy()

if len(pref_df) > 0:
    print(f"Preference diversity experiments: {len(pref_df)}")
    print(f"Unique experiment names: {pref_df['exp_name'].unique()}")

    # Summary statistics (mean / std / count of every metric per configuration)
    pref_summary = pref_df.groupby('exp_name')[available_metrics].agg(['mean', 'std', 'count'])
    pref_summary.to_csv(output_dir / 'preference_diversity_summary.csv')
    print(f"\n✓ Saved: {output_dir / 'preference_diversity_summary.csv'}")

    display(pref_summary.round(4))

    # Best configuration.
    # NOTE(review): "best" means the highest mean; confirm that every metric
    # is higher-is-better.
    print("\n📊 Best Preference Sampling per Metric:")
    for metric in available_metrics[:5]:
        metric_means = pref_summary[(metric, 'mean')].dropna()
        if metric_means.empty:
            # idxmax has no valid answer when every mean is NaN; the original
            # lookup failed on the resulting NaN label.
            print(f"  {metric.upper()}: n/a (no values recorded)")
            continue
        best_config = metric_means.idxmax()
        best_value = metric_means.loc[best_config]
        print(f"  {metric.upper()}: {best_config} ({best_value:.4f})")

    # Analyze alpha parameter effect (if available)
    if 'alpha' in pref_df.columns:
        print("\n📈 Alpha Parameter Analysis:")
        # Drop missing alphas first: NaN never compares equal, so it would
        # select zero rows and print "nan" for every metric.
        for alpha_val in sorted(pref_df['alpha'].dropna().unique()):
            alpha_data = pref_df[pref_df['alpha'] == alpha_val]
            print(f"\n  Alpha = {alpha_val}:")
            for metric in available_metrics[:3]:
                mean_val = alpha_data[metric].mean()
                print(f"    {metric.upper()}: {mean_val:.4f}")
else:
    print("⚠ No preference diversity experiments found in data")

In [None]:
# Visualize preference diversity effects
if len(pref_df) > 0:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('Preference Diversity Effects on Metrics', fontsize=16, fontweight='bold')

    # Walk the 2x3 grid row by row, one panel per metric (at most six).
    for ax, metric in zip(axes.flat, available_metrics[:6]):
        panel_rows = pref_df[['exp_name', metric]].dropna()
        if panel_rows.empty:
            continue  # nothing recorded for this metric; leave the panel blank

        # Box plot of the metric for each preference-sampling configuration
        metric_label = metric.upper()
        sns.boxplot(data=panel_rows, x='exp_name', y=metric, ax=ax)
        ax.set_title(metric_label, fontweight='bold')
        ax.set_xlabel('Preference Sampling')
        ax.set_ylabel(metric_label)
        ax.tick_params(axis='x', rotation=45)

    figure_path = output_dir / 'preference_diversity_comparison.png'
    plt.tight_layout()
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    print(f"✓ Saved: {figure_path}")
    plt.show()

## 7. Experiment 5: Batch Size Effects

Analyze how batch size affects training dynamics and diversity metrics.

In [None]:
# Filter batch size experiments
batch_df = df[df['experiment_category'] == 'batch_size'].copy()

if len(batch_df) > 0:
    print(f"Batch size experiments: {len(batch_df)}")
    print(f"Unique experiment names: {batch_df['exp_name'].unique()}")

    # Summary statistics (mean / std / count of every metric per configuration)
    batch_summary = batch_df.groupby('exp_name')[available_metrics].agg(['mean', 'std', 'count'])
    batch_summary.to_csv(output_dir / 'batch_size_summary.csv')
    print(f"\n✓ Saved: {output_dir / 'batch_size_summary.csv'}")

    display(batch_summary.round(4))

    # Best configuration.
    # NOTE(review): "best" means the highest mean; confirm that every metric
    # is higher-is-better.
    print("\n📊 Best Batch Size per Metric:")
    for metric in available_metrics[:5]:
        metric_means = batch_summary[(metric, 'mean')].dropna()
        if metric_means.empty:
            # idxmax has no valid answer when every mean is NaN; the original
            # lookup failed on the resulting NaN label.
            print(f"  {metric.upper()}: n/a (no values recorded)")
            continue
        best_config = metric_means.idxmax()
        best_value = metric_means.loc[best_config]
        print(f"  {metric.upper()}: {best_config} ({best_value:.4f})")

    # Training efficiency analysis: mean training time next to the first
    # three metrics, per configuration.
    if 'training_time' in batch_df.columns:
        print("\n⏱️ Training Efficiency:")
        efficiency_df = batch_df.groupby('exp_name')[['training_time'] + available_metrics[:3]].mean()
        display(efficiency_df.round(4))
else:
    print("⚠ No batch size experiments found in data")

In [None]:
# Visualize batch size effects
if len(batch_df) > 0:
    fig, axes = plt.subplots(2, 3, figsize=(18, 10))
    fig.suptitle('Batch Size Effects on Metrics', fontsize=16, fontweight='bold')

    # Walk the 2x3 grid row by row, one panel per metric (at most six).
    for ax, metric in zip(axes.flat, available_metrics[:6]):
        panel_rows = batch_df[['exp_name', metric]].dropna()
        if panel_rows.empty:
            continue  # nothing recorded for this metric; leave the panel blank

        # Box plot of the metric for each batch-size configuration
        metric_label = metric.upper()
        sns.boxplot(data=panel_rows, x='exp_name', y=metric, ax=ax)
        ax.set_title(metric_label, fontweight='bold')
        ax.set_xlabel('Batch Size')
        ax.set_ylabel(metric_label)
        ax.tick_params(axis='x', rotation=45)

    figure_path = output_dir / 'batch_size_comparison.png'
    plt.tight_layout()
    plt.savefig(figure_path, dpi=300, bbox_inches='tight')
    print(f"✓ Saved: {figure_path}")
    plt.show()

## 8. Overall Best Configuration Analysis

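Rank all configurations across all experiment types to identify the overall best settings. Each metric is sorted in descending order, so every metric is treated as higher-is-better; configurations are then compared by their average rank across metrics.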

In [None]:
# Compute overall rankings
print("🏆 Overall Best Configurations Across All Experiments")
print("="*70)

# Mean of every available metric per configuration, across all ablations.
overall_summary = df.groupby('exp_name')[available_metrics].mean()

# Per-metric ordering of configurations, highest mean first.
rankings = {
    metric: overall_summary[metric].sort_values(ascending=False)
    for metric in available_metrics
}

for metric, ordered in rankings.items():
    print(f"\n{metric.upper()} - Top 5 Configurations:")
    top_five = ordered.head(5)
    for position, config in enumerate(top_five.index, start=1):
        value = top_five.iloc[position - 1]
        print(f"  {position}. {config}: {value:.4f}")

# Save overall rankings
rankings_path = output_dir / 'overall_configuration_rankings.csv'
overall_summary.to_csv(rankings_path)
print(f"\n✓ Saved: {rankings_path}")

In [None]:
# Create radar chart for top configurations
from math import pi

# Select top 5 configurations by average rank.
# Rank 1 is the highest mean. Ties share the same (lowest) rank instead of
# getting arbitrary positions from the sort order, and configurations with no
# value for a metric are ranked last. Without ties or missing values this
# equals the 1-based position in a descending sort.
rank_table = overall_summary[available_metrics].rank(
    ascending=False, method='min', na_option='bottom'
)
rank_scores = rank_table.mean(axis=1).to_dict()

# Kept as a list of (config, avg_rank) tuples; the report cell iterates it.
top_configs = sorted(rank_scores.items(), key=lambda x: x[1])[:5]
print("\n📊 Top 5 Configurations by Average Rank:")
for i, (config, avg_rank) in enumerate(top_configs, 1):
    print(f"  {i}. {config} (Avg Rank: {avg_rank:.2f})")

# Create radar chart
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='polar')

# First six metrics only, for clarity
radar_metrics = available_metrics[:6]

# Min-max normalize each metric to 0-1 so the axes are comparable; the small
# epsilon avoids division by zero when all configurations share one value.
normalized_data = {}
for metric in radar_metrics:
    min_val = overall_summary[metric].min()
    max_val = overall_summary[metric].max()
    normalized_data[metric] = (overall_summary[metric] - min_val) / (max_val - min_val + 1e-10)

# One evenly spaced angle per metric; repeat the first to close the polygon.
angles = [n / float(len(radar_metrics)) * 2 * pi for n in range(len(radar_metrics))]
angles += angles[:1]

# Plot top 3 configurations
for config, _ in top_configs[:3]:
    values = [normalized_data[metric][config] for metric in radar_metrics]
    values += values[:1]
    ax.plot(angles, values, 'o-', linewidth=2, label=config)
    ax.fill(angles, values, alpha=0.15)

ax.set_xticks(angles[:-1])
ax.set_xticklabels([m.upper() for m in radar_metrics])
ax.set_ylim(0, 1)
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
ax.set_title('Top 3 Configurations - Normalized Performance', size=14, fontweight='bold', pad=20)
ax.grid(True)

plt.tight_layout()
plt.savefig(output_dir / 'top_configurations_radar.png', dpi=300, bbox_inches='tight')
print(f"\n✓ Saved: {output_dir / 'top_configurations_radar.png'}")
plt.show()

## 9. Comprehensive Summary Report

Generate final recommendations based on all analyses.

In [None]:
# Generate comprehensive report: a plain-text summary built line by line,
# printed, and then written to the output directory.
report = []
report.append("="*70)
report.append("SAMPLING ABLATION STUDY - COMPREHENSIVE REPORT")
report.append("="*70)
report.append("")

# Summary by experiment type: (category key, section title, filtered frame).
# The frames come from the per-experiment cells earlier in the notebook.
experiment_types = [
    ('temperature', 'Exploration Temperature', temp_df),
    ('sampling_strategy', 'Sampling Strategies', strategy_df),
    ('policy_type', 'On-Policy vs Off-Policy', policy_df),
    ('preference_diversity', 'Preference Diversity', pref_df),
    ('batch_size', 'Batch Size Effects', batch_df)
]

for exp_type, title, exp_df in experiment_types:
    if len(exp_df) > 0:
        report.append(f"\n## {title}")
        report.append("-" * 70)

        # Best configuration for each key metric (highest mean).
        # NOTE(review): assumes every metric is higher-is-better; confirm.
        summary = exp_df.groupby('exp_name')[available_metrics[:5]].mean()

        for metric in available_metrics[:5]:
            metric_means = summary[metric].dropna()
            if metric_means.empty:
                # No configuration has a value for this metric; idxmax would
                # yield NaN (or raise), so state that explicitly.
                report.append(f"  Best {metric.upper()}: n/a (no values recorded)")
                continue
            best_config = metric_means.idxmax()
            best_value = metric_means.max()
            report.append(f"  Best {metric.upper()}: {best_config} ({best_value:.4f})")

report.append("\n")
report.append("="*70)
report.append("OVERALL RECOMMENDATIONS")
report.append("="*70)
report.append("")
report.append("Top 5 Configurations by Average Rank:")
for i, (config, avg_rank) in enumerate(top_configs, 1):
    report.append(f"  {i}. {config} (Average Rank: {avg_rank:.2f})")

report.append("\n")
report.append("Key Insights:")
report.append("  - Review the generated CSVs for detailed statistics")
report.append("  - Compare PNG visualizations for metric-specific trends")
report.append("  - Consider trade-offs between different metrics based on your goals")
report.append("")
report.append("="*70)

# Print report
report_text = "\n".join(report)
print(report_text)

# Save report. UTF-8 is set explicitly so configuration names containing
# non-ASCII characters are written the same way on every platform.
with open(output_dir / 'comprehensive_report.txt', 'w', encoding='utf-8') as f:
    f.write(report_text)

print(f"\n✓ Saved: {output_dir / 'comprehensive_report.txt'}")

In [None]:
# List all generated files
print("\n📁 Generated Files:")
print("="*70)

# Every entry directly inside the report folder, in sorted path order.
generated_files = list(output_dir.glob('*'))
generated_files.sort()
for report_file in generated_files:
    print(f"  ✓ {report_file.name}")

file_count = len(generated_files)
print(f"\nTotal: {file_count} files generated in {output_dir}")

## 10. Next Steps

Based on the analysis:

1. **Review the summary CSVs** for detailed statistics on each experiment type
2. **Examine the visualizations** to understand metric-specific trends
3. **Compare top configurations** using the radar chart
4. **Select the best configuration** based on your specific optimization goals
5. **Run validation experiments** with the recommended settings

**Key Files:**
- `temperature_summary.csv` - Temperature ablation results
- `sampling_strategy_summary.csv` - Strategy comparison
- `policy_type_summary.csv` - On/off-policy comparison
- `preference_diversity_summary.csv` - Preference sampling analysis
- `batch_size_summary.csv` - Batch size effects
- `overall_configuration_rankings.csv` - Complete rankings
- `comprehensive_report.txt` - Text summary with recommendations