# Learned Analytical Formula - Summary Across All Edge Types

This notebook summarizes the learned analytical formula results across all edge types.

## Contents

1. Load results from all edge types
2. Summary statistics, including the minimum permutations needed (N_min)
3. Relationship between graph properties and N_min
4. Performance comparison: Learned vs Current Analytical
5. Analyze learned parameters across graphs
6. Categorize edge types by graph density
7. Save summary tables
8. Recommendations for future graphs

## Setup

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import warnings
warnings.filterwarnings('ignore')

# Project paths: repo_dir is taken to be the parent of the current working
# directory, and all learned-formula outputs are read from / written to
# results/learned_analytical beneath it.
repo_dir = Path.cwd().parent
results_dir = repo_dir.joinpath('results', 'learned_analytical')

# Plot appearance shared by every figure in this notebook
sns.set_style('whitegrid')
plt.rcParams.update({'figure.dpi': 100})

print("Setup complete!")
print(f"Results directory: {results_dir}")

## Load Results from All Edge Types

In [None]:
# Find all edge type results. Each edge type is stored in a directory named
# '<edge_type>_results' directly under results_dir.
RESULTS_SUFFIX = '_results'
edge_type_dirs = [d for d in results_dir.glob(f'*{RESULTS_SUFFIX}') if d.is_dir()]
# Strip only the trailing suffix: str.replace would also remove any
# '_results' occurring inside the edge type name itself, and the directory
# path rebuilt from that name below would then not exist.
edge_types = [d.name[:-len(RESULTS_SUFFIX)] for d in edge_type_dirs]

print(f"Found {len(edge_types)} edge types with results:")
print(f"{edge_types}")

# Names of the learned formula parameters stored in each parameters file
PARAM_NAMES = ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'κ']


def _relative_change_pct(delta, baseline):
    """Return `delta` as a percentage of the magnitude of `baseline`.

    Dividing by abs(baseline) keeps the sign of the result equal to the sign
    of `delta`, so an increase over a negative baseline correlation is still
    reported as a positive improvement. A zero baseline has no meaningful
    relative change; NaN is returned instead of raising ZeroDivisionError.
    """
    if baseline == 0:
        return float('nan')
    return delta / abs(baseline) * 100


# Load all results
all_results = []

for edge_type in sorted(edge_types):
    edge_dir = results_dir / f'{edge_type}{RESULTS_SUFFIX}'

    params_file = edge_dir / f'{edge_type}_learned_parameters.json'
    metrics_file = edge_dir / f'{edge_type}_metrics.json'

    if not params_file.exists() or not metrics_file.exists():
        print(f"⚠ Warning: Missing files for {edge_type}")
        continue

    try:
        # Explicit UTF-8: the parameter keys are Greek letters, which the
        # platform default encoding may not decode correctly.
        with open(params_file, 'r', encoding='utf-8') as f:
            params = json.load(f)

        with open(metrics_file, 'r', encoding='utf-8') as f:
            metrics = json.load(f)

        graph_stats = params['graph_stats']
        final_metrics = metrics['final_metrics']
        baseline_metrics = metrics['baseline_metrics']

        # Combine into summary (insertion order defines the column order)
        result = {
            'edge_type': edge_type,
            'N_min': params['N_min'],
            'edges': graph_stats['m'],
            'density': graph_stats['density'],
            'n_sources': graph_stats['n_sources'],
            'n_targets': graph_stats['n_targets'],
        }
        # Learned parameters
        result.update({name: params[name] for name in PARAM_NAMES})
        # Performance metrics
        result.update({
            'learned_mae': final_metrics['mae'],
            'learned_correlation': final_metrics['correlation'],
            'baseline_mae': baseline_metrics['mae'],
            'baseline_correlation': baseline_metrics['correlation'],
        })
    except (KeyError, json.JSONDecodeError) as exc:
        # Same policy as for missing files: report the edge type and move on
        # rather than aborting the whole summary for one malformed result.
        print(f"⚠ Warning: Skipping {edge_type}: malformed results ({exc!r})")
        continue

    # Calculate improvements (positive = learned formula is better)
    result['mae_improvement'] = _relative_change_pct(
        result['baseline_mae'] - result['learned_mae'], result['baseline_mae'])
    result['corr_improvement'] = _relative_change_pct(
        result['learned_correlation'] - result['baseline_correlation'],
        result['baseline_correlation'])

    all_results.append(result)

# Create summary DataFrame
summary_df = pd.DataFrame(all_results)

print(f"\nLoaded {len(summary_df)} complete results")

if summary_df.empty:
    # Every later cell indexes columns that do not exist in an empty frame;
    # fail here with an actionable message instead of a bare KeyError.
    raise RuntimeError(f"No complete learned-analytical results found in {results_dir}")

print(f"\nFirst few rows:")
print(summary_df[['edge_type', 'N_min', 'edges', 'density', 'learned_correlation', 'baseline_correlation']].head(10))

## Summary Statistics

In [None]:
# Headline statistics over all loaded edge types.
rule_line = "=" * 80
n_edge_types = len(summary_df)

# Columns reported below, pulled out once for readability
n_min_col = summary_df['N_min']
edges_col = summary_df['edges']
density_col = summary_df['density']
learned_r = summary_df['learned_correlation']
baseline_r = summary_df['baseline_correlation']
mae_gain = summary_df['mae_improvement']
corr_gain = summary_df['corr_improvement']

print(rule_line)
print("LEARNED ANALYTICAL FORMULA - SUMMARY STATISTICS")
print(rule_line)

print(f"\nEdge types analyzed: {n_edge_types}")

print(f"\nMinimum Permutations (N_min):")
print(f"  Range: {n_min_col.min()} - {n_min_col.max()}")
print(f"  Mean: {n_min_col.mean():.1f}")
print(f"  Median: {n_min_col.median():.0f}")

print(f"\nGraph Characteristics:")
print(f"  Edges: {edges_col.min():,} - {edges_col.max():,}")
print(f"  Density: {density_col.min():.4f} - {density_col.max():.4f}")

print(f"\nPerformance (Correlation vs 200-perm empirical):")
print(f"  Learned:    {learned_r.min():.4f} - {learned_r.max():.4f} (mean: {learned_r.mean():.4f})")
print(f"  Analytical: {baseline_r.min():.4f} - {baseline_r.max():.4f} (mean: {baseline_r.mean():.4f})")

print(f"\nImprovement:")
print(f"  MAE: {mae_gain.mean():+.1f}% (range: {mae_gain.min():+.1f}% to {mae_gain.max():+.1f}%)")
print(f"  Correlation: {corr_gain.mean():+.1f}% (range: {corr_gain.min():+.1f}% to {corr_gain.max():+.1f}%)")

# Number of edge types whose correlation increased relative to the baseline
n_improved = (corr_gain > 0).sum()
print(f"\nEdge types with improved correlation: {n_improved}/{n_edge_types} ({n_improved/n_edge_types*100:.1f}%)")

print(rule_line)

## Relationship: Graph Properties vs N_min

In [None]:
def _style_axis(ax, xlabel, ylabel, title):
    """Apply the axis labels, bold title and light grid shared by all four panels."""
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.grid(True, alpha=0.3)


fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# N_min vs Density
ax_density = axes[0, 0]
ax_density.scatter(summary_df['density'], summary_df['N_min'], s=100, alpha=0.6)
_style_axis(ax_density, 'Edge Density', 'N_min (Minimum Permutations)',
            'Minimum Permutations vs Density')
ax_density.set_xscale('log')

# Add trendline: linear fit of N_min against log(density).
# log() is undefined for non-positive densities and a line needs at least two
# distinct x values, so fit only on usable rows and skip the trend otherwise
# instead of letting np.polyfit fail or return NaN coefficients.
trend_data = summary_df.loc[summary_df['density'] > 0, ['density', 'N_min']].dropna()
if trend_data['density'].nunique() >= 2:
    z = np.polyfit(np.log(trend_data['density']), trend_data['N_min'], 1)
    p = np.poly1d(z)
    x_trend = np.sort(trend_data['density'])
    ax_density.plot(x_trend, p(np.log(x_trend)), "r--", alpha=0.5, linewidth=2, label='Trend')
    ax_density.legend()

# N_min vs Number of Edges
ax_edges = axes[0, 1]
ax_edges.scatter(summary_df['edges'], summary_df['N_min'], s=100, alpha=0.6, color='green')
_style_axis(ax_edges, 'Number of Edges', 'N_min (Minimum Permutations)',
            'Minimum Permutations vs Graph Size')
ax_edges.set_xscale('log')

# Density distribution, with the 3% / 5% density cut-offs marked
ax_density_hist = axes[1, 0]
ax_density_hist.hist(summary_df['density'], bins=20, edgecolor='black', alpha=0.7)
_style_axis(ax_density_hist, 'Edge Density', 'Frequency', 'Distribution of Graph Densities')
ax_density_hist.axvline(0.03, color='red', linestyle='--', linewidth=2, label='Sparse threshold')
ax_density_hist.axvline(0.05, color='orange', linestyle='--', linewidth=2, label='Dense threshold')
ax_density_hist.legend()

# N_min distribution, with the median marked
ax_n_min_hist = axes[1, 1]
n_min_median = summary_df['N_min'].median()
ax_n_min_hist.hist(summary_df['N_min'], bins=15, edgecolor='black', alpha=0.7, color='purple')
_style_axis(ax_n_min_hist, 'N_min (Minimum Permutations)', 'Frequency',
            'Distribution of Minimum Permutations')
ax_n_min_hist.axvline(n_min_median, color='red', linestyle='--', linewidth=2,
                      label=f'Median = {n_min_median:.0f}')
ax_n_min_hist.legend()

graph_properties_file = results_dir / 'summary_graph_properties_vs_N_min.png'
plt.tight_layout()
plt.savefig(graph_properties_file, dpi=300, bbox_inches='tight')
plt.show()

print(f"Graph properties plot saved to: {graph_properties_file}")

## Performance Comparison: Learned vs Analytical

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
ax_corr, ax_gain = axes

# Left panel: side-by-side correlation bars, one pair per edge type
x_pos = np.arange(len(summary_df))
width = 0.35
bar_series = [
    (-width / 2, 'baseline_correlation', 'Current Analytical', 'orange'),
    (+width / 2, 'learned_correlation', 'Learned Formula', 'blue'),
]
for offset, column, series_label, series_color in bar_series:
    ax_corr.bar(x_pos + offset, summary_df[column], width,
                label=series_label, alpha=0.7, color=series_color)

ax_corr.set_xlabel('Edge Type', fontsize=12)
ax_corr.set_ylabel('Correlation', fontsize=12)
ax_corr.set_title('Correlation with 200-Perm Empirical', fontsize=14, fontweight='bold')
ax_corr.set_xticks(x_pos)
ax_corr.set_xticklabels(summary_df['edge_type'], rotation=90, ha='right')
ax_corr.legend()
ax_corr.grid(True, alpha=0.3, axis='y')

# Right panel: relative correlation change against the baseline correlation,
# with points coloured by N_min
gain_points = ax_gain.scatter(summary_df['baseline_correlation'], summary_df['corr_improvement'],
                              s=100, alpha=0.6, c=summary_df['N_min'], cmap='viridis')
ax_gain.axhline(0, color='red', linestyle='--', linewidth=2, alpha=0.5)
ax_gain.set_xlabel('Baseline Correlation', fontsize=12)
ax_gain.set_ylabel('Improvement (%)', fontsize=12)
ax_gain.set_title('Improvement vs Baseline Performance', fontsize=14, fontweight='bold')
ax_gain.grid(True, alpha=0.3)
cbar = plt.colorbar(gain_points, ax=ax_gain)
cbar.set_label('N_min', fontsize=10)

# Label edge types whose correlation changed by more than 5% in either direction
notable = summary_df[summary_df['corr_improvement'].abs() > 5]
for type_name, base_r, gain_pct in zip(notable['edge_type'],
                                       notable['baseline_correlation'],
                                       notable['corr_improvement']):
    ax_gain.annotate(type_name, (base_r, gain_pct), fontsize=8, alpha=0.7)

performance_plot_file = results_dir / 'summary_performance_comparison.png'
plt.tight_layout()
plt.savefig(performance_plot_file, dpi=300, bbox_inches='tight')
plt.show()

print(f"Performance comparison plot saved to: {performance_plot_file}")

## Learned Parameters Analysis

In [None]:
# Distribution of each learned parameter across edge types
param_cols = ['α', 'β', 'γ', 'δ', 'ε', 'ζ', 'η', 'θ', 'κ']

# One histogram panel per parameter on a 3x3 grid
fig, axes = plt.subplots(3, 3, figsize=(15, 12))
axes = axes.flatten()

for panel, param in zip(axes, param_cols):
    param_values = summary_df[param]
    param_mean = param_values.mean()

    panel.hist(param_values, bins=15, edgecolor='black', alpha=0.7)
    panel.set_xlabel(param, fontsize=12, fontweight='bold')
    panel.set_ylabel('Frequency', fontsize=10)
    panel.set_title(f'Distribution of {param}', fontsize=12)
    panel.grid(True, alpha=0.3)
    # Dashed red line marks the mean across edge types
    panel.axvline(param_mean, color='red', linestyle='--',
                  linewidth=2, label=f'Mean={param_mean:.3f}')
    panel.legend(fontsize=8)

parameters_plot_file = results_dir / 'summary_learned_parameters.png'
plt.tight_layout()
plt.savefig(parameters_plot_file, dpi=300, bbox_inches='tight')
plt.show()

print(f"Parameter distributions saved to: {parameters_plot_file}")

# Text table of the same distributions: mean, std and range per parameter
print("\nLearned Parameter Statistics:")
print("=" * 80)
for param in param_cols:
    param_values = summary_df[param]
    print(f"{param}: mean={param_values.mean():.4f}, std={param_values.std():.4f}, range=[{param_values.min():.4f}, {param_values.max():.4f}]")

## Categorize by Density

In [None]:
# Categorize graphs by edge density.
# Bins are left-closed / right-open (right=False) so that a density sitting
# exactly on a boundary (0.01, 0.03, 0.05) is assigned the same way as by the
# `<` / `>=` comparisons used in the Recommendations section, and a density of
# exactly 0 falls in the first bin instead of becoming NaN. The last edge is
# +inf so that everything at or above 5% is 'Dense'.
DENSITY_BIN_EDGES = [0, 0.01, 0.03, 0.05, np.inf]
DENSITY_BIN_LABELS = ['Very Sparse (<1%)', 'Sparse (1-3%)',
                      'Medium (3-5%)', 'Dense (>5%)']
summary_df['category'] = pd.cut(summary_df['density'],
                                bins=DENSITY_BIN_EDGES,
                                labels=DENSITY_BIN_LABELS,
                                right=False)

# Summary by category.
# observed=False is stated explicitly: it keeps categories that contain no
# edge types in the table (as NaN rows) and does not depend on the pandas
# default for categorical groupers, which differs between versions.
category_summary = summary_df.groupby('category', observed=False).agg({
    'edge_type': 'count',
    'N_min': ['mean', 'median', 'std'],
    'learned_correlation': ['mean', 'min', 'max'],
    'corr_improvement': ['mean', 'min', 'max']
}).round(3)

print("\n" + "="*80)
print("SUMMARY BY GRAPH DENSITY CATEGORY")
print("="*80)
print(category_summary)
print("\n")

# Visualize by category: one bar chart per aggregated column.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

category_panels = [
    # (axis, (column, statistic), bar colour, y label, title)
    (axes[0], ('N_min', 'mean'), 'steelblue', 'Mean N_min',
     'Average Minimum Permutations by Graph Density'),
    (axes[1], ('corr_improvement', 'mean'), 'green', 'Mean Correlation Improvement (%)',
     'Average Improvement by Graph Density'),
]
for ax, stat_key, bar_color, ylabel, title in category_panels:
    # Select the MultiIndex column with a single tuple key rather than
    # chained ['col']['stat'] indexing.
    category_summary[stat_key].plot(kind='bar', ax=ax, color=bar_color, alpha=0.7)
    ax.set_xlabel('Graph Category', fontsize=12)
    ax.set_ylabel(ylabel, fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
    ax.grid(True, alpha=0.3, axis='y')

# Zero line on the improvement panel separates gains from losses
axes[1].axhline(0, color='red', linestyle='--', linewidth=2)

category_plot_file = results_dir / 'summary_by_density_category.png'
plt.tight_layout()
plt.savefig(category_plot_file, dpi=300, bbox_inches='tight')
plt.show()

print(f"Category summary plot saved to: {category_plot_file}")

## Save Summary Tables

In [None]:
# Persist the per-edge-type summary (one row per edge type, no index column)
summary_file = results_dir / 'learned_analytical_summary.csv'
summary_df.to_csv(summary_file, index=False)
print(f"Full summary saved to: {summary_file}")

# Persist the per-category aggregate table (index kept: it holds the category)
category_file = results_dir / 'summary_by_category.csv'
category_summary.to_csv(category_file)
print(f"Category summary saved to: {category_file}")

# Ten edge types with the largest correlation improvement, rounded to 4 decimals
top_columns = [
    'edge_type', 'density', 'N_min', 'baseline_correlation',
    'learned_correlation', 'corr_improvement'
]
top_rows = summary_df.nlargest(10, 'corr_improvement')
top_improved = top_rows[top_columns].round(4)

table_rule = "=" * 80
print("\n" + table_rule)
print("TOP 10 EDGE TYPES WITH LARGEST IMPROVEMENT")
print(table_rule)
print(top_improved.to_string(index=False))

top_improved_file = results_dir / 'top_improved_edge_types.csv'
top_improved.to_csv(top_improved_file, index=False)
print(f"\nTop improved edge types saved to: {top_improved_file}")

## Recommendations

In [None]:
# Turn the summary into permutation-count recommendations for new graphs.
heavy_rule = "=" * 80
light_rule = "-" * 80

density_col = summary_df['density']
n_min_col = summary_df['N_min']
learned_r = summary_df['learned_correlation']
corr_gain = summary_df['corr_improvement']

print("\n" + heavy_rule)
print("RECOMMENDATIONS FOR FUTURE GRAPHS")
print(heavy_rule)

# Recommendations based on density: split the edge types at 1%, 3% and 5%
very_sparse = summary_df[density_col < 0.01]
sparse = summary_df[(density_col >= 0.01) & (density_col < 0.03)]
medium = summary_df[(density_col >= 0.03) & (density_col < 0.05)]
dense = summary_df[density_col >= 0.05]

print("\nMinimum Permutations by Graph Type:")
print(light_rule)
# Headings carry their own trailing padding so the N_min columns line up
density_groups = [
    ("Very Sparse (<1% density): ", very_sparse),
    ("Sparse (1-3% density):     ", sparse),
    ("Medium (3-5% density):     ", medium),
    ("Dense (>5% density):       ", dense),
]
for heading, group in density_groups:
    if len(group) == 0:
        # Nothing to report for a density band with no edge types
        continue
    group_n_min = group['N_min']
    print(f"{heading}N_min ≈ {group_n_min.median():.0f} (range: {group_n_min.min()}-{group_n_min.max()})")

print("\nExpected Performance:")
print(light_rule)
print(f"Correlation with 200-perm empirical: {learned_r.mean():.4f} ± {learned_r.std():.4f}")
print(f"Improvement over current analytical: {corr_gain.mean():+.1f}% ± {corr_gain.std():.1f}%")

# Starting point, fallback and conservative N, all taken from observed N_min
start_n = int(n_min_col.median())
fallback_n = int(n_min_col.quantile(0.75))
conservative_n = n_min_col.max()

print("\nRecommended Strategy:")
print(light_rule)
print("1. For NEW graph with unknown density:")
print(f"   - Start with N = {start_n} permutations (median across all graphs)")
print("   - Check convergence: if improvement < 2%, N is sufficient")
print(f"   - If needed, increase to N = {fallback_n} (75th percentile)")
print("\n2. For graphs similar to existing:")
print("   - Use N_min from most similar graph by density")
print("   - Expected accuracy: r > 0.95 with 200-perm empirical")
print("\n3. For production use:")
print(f"   - Conservative estimate: N = {conservative_n} (maximum observed)")
print("   - Ensures high confidence across all graph types")

print(heavy_rule)