# Enhanced Learned Analytical Formula with Degree-Based Error Analysis

This notebook extends the original learned analytical formula analysis (Notebook 8) with
comprehensive degree-based error analysis for better understanding of formula performance.

## Enhanced Features

- **Degree-stratified residual analysis**: Understand where the formula fails
- **Enhanced parameter sensitivity**: Analyze parameter importance by degree range
- **Bias-variance decomposition**: Break down errors by degree combinations
- **Improved convergence analysis**: Degree-aware minimum permutations

## Formula Types Supported

1. **Original**: `P = α × (u^β × v^γ) / (δ + ε×m + ζ×(u×v)^η + θ×density^κ)`
2. **Extended**: Adds logarithmic terms for high-degree nodes
3. **Polynomial**: Linear combination approach for sparse graphs

In [None]:
# Papermill parameters
# Default values for a standalone run; each name below is a tunable input of the notebook.
edge_type = "PCiC"  # Start with smallest edge type for testing
N_candidates = [2, 3, 5, 7, 10]  # Reduced for faster testing
convergence_threshold = 0.01  # 1% for faster convergence
target_metric = "correlation"
min_metric_value = 0.95
formula_type = "original"  # 'original', 'extended', 'polynomial'
small_graph_mode = True  # Use small graph optimizations

# Handle string-to-list conversion for N_candidates (from papermill)
# NOTE(review): papermill normally writes its overrides into a new cell placed
# after the cell tagged `parameters`. If this is that cell, the check below only
# ever sees the default list assigned above — confirm the cell tagging.
import json
if isinstance(N_candidates, str):
    N_candidates = json.loads(N_candidates)

## Setup

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE: this silences every warning category, including numerical RuntimeWarnings.
warnings.filterwarnings('ignore')

# Papermill injects parameter overrides in a cell placed *after* the parameters
# cell, so a JSON-encoded N_candidates (e.g. `-p N_candidates "[2, 3]"`) has to be
# decoded downstream of that injection. `json` is imported in the parameters cell.
# The check is a no-op when N_candidates is already a list.
if isinstance(N_candidates, str):
    N_candidates = json.loads(N_candidates)

# Setup paths (the notebook is expected to run from a sub-directory of the repo)
repo_dir = Path.cwd().parent
src_dir = repo_dir / 'src'
data_dir = repo_dir / 'data'
results_dir = repo_dir / 'results'
output_dir = results_dir / 'learned_analytical_enhanced'
output_dir.mkdir(parents=True, exist_ok=True)

# Make the project sources importable; skip the append when the cell is re-run
# so sys.path does not accumulate duplicate entries.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))

# Import modules
from learned_analytical import LearnedAnalyticalFormula
from degree_analysis import DegreeAnalyzer, identify_small_graphs

print("All modules imported successfully!")
print(f"Repository directory: {repo_dir}")
print(f"Analyzing edge type: {edge_type}")
print(f"Formula type: {formula_type}")
print(f"Small graph mode: {small_graph_mode}")
print(f"Testing N values: {N_candidates}")

# Set plot style
sns.set_style('whitegrid')
plt.rcParams['figure.dpi'] = 100

## Validate Edge Type Selection

In [None]:
# If in small graph mode, validate edge type is actually small.
# When the requested edge type is not among the small graphs, the first small
# graph returned by identify_small_graphs is used instead.
if small_graph_mode:
    small_graphs = identify_small_graphs(data_dir, max_edges=10000)
    small_edge_types = [g['edge_type'] for g in small_graphs]

    print(f"Available small edge types: {small_edge_types[:10]}...")  # Show first 10

    # Without any candidate there is nothing to fall back to; fail with a clear
    # message instead of an IndexError on small_edge_types[0].
    if not small_edge_types:
        raise ValueError(
            f"small_graph_mode is enabled but identify_small_graphs found no graphs "
            f"(max_edges=10000) under {data_dir}. Check the data directory or set "
            f"small_graph_mode=False."
        )

    if edge_type not in small_edge_types:
        print(f"Warning: {edge_type} not in small graphs. Using {small_edge_types[0]} instead.")
        edge_type = small_edge_types[0]

    # Get graph info for the edge type that will actually be analyzed
    edge_info = next((g for g in small_graphs if g['edge_type'] == edge_type), None)
    if edge_info:
        print(f"\nSelected edge type: {edge_type}")
        print(f"  Edges: {edge_info['n_edges']:,}")
        print(f"  Shape: {edge_info['shape']}")
        print(f"  Density: {edge_info['density']:.6f}")
else:
    print(f"Full-scale mode: analyzing {edge_type}")

## Initialize Enhanced Learner

In [None]:
# Gather the constructor settings in one place, then build the learner from them.
learner_config = dict(
    n_random_starts=5 if small_graph_mode else 10,  # fewer restarts keep small-graph runs fast
    regularization_lambda=0.001,
    formula_type=formula_type,
    bootstrap_samples=1,  # a single sample, i.e. no bootstrap resampling
    ensemble_size=1,
)
learner = LearnedAnalyticalFormula(**learner_config)

# Echo the configuration as stored on the learner itself.
print(f"Initialized {formula_type} learner with:")
for setting_label, attribute_name in (
    ("Random starts", "n_random_starts"),
    ("Regularization", "regularization_lambda"),
    ("Bootstrap samples", "bootstrap_samples"),
    ("Ensemble size", "ensemble_size"),
):
    print(f"  {setting_label}: {getattr(learner, attribute_name)}")

## Enhanced Minimum Permutations Analysis

In [None]:
# Run enhanced minimum permutations analysis over the candidate N values and
# print a short summary of the outcome.
print(f"{'='*80}")
print(f"ENHANCED MINIMUM PERMUTATIONS ANALYSIS - {edge_type}")
print(f"{'='*80}")

results = learner.find_minimum_permutations(
    graph_name=edge_type,
    data_dir=data_dir,
    results_dir=results_dir,
    N_candidates=N_candidates,
    convergence_threshold=convergence_threshold,
    target_metric=target_metric,
    min_metric_value=min_metric_value
)

final_corr = results['final_metrics']['correlation']
baseline_corr = results['baseline_metrics']['correlation']

print(f"\n{'='*80}")
print(f"RESULTS SUMMARY")
print(f"{'='*80}")
print(f"Minimum permutations: N = {results['N_min']}")
print(f"Final validation correlation: {final_corr:.6f}")

# The relative improvement is undefined for a zero baseline; report that
# explicitly instead of dividing by zero.
if baseline_corr:
    print(f"Improvement over baseline: {((final_corr - baseline_corr) / baseline_corr * 100):+.1f}%")
else:
    print("Improvement over baseline: n/a (baseline correlation is zero)")

## Enhanced Residual Analysis with Degree Decomposition

In [None]:
# Load 200-permutation empirical frequencies for enhanced analysis
empirical_file = results_dir / 'empirical_edge_frequencies' / f'edge_frequency_by_degree_{edge_type}.csv'

# Define every name that later cells test with `is not None`, so a missing
# input file leads to the analyses being skipped rather than a NameError.
empirical_200 = None
residuals_df = None
degree_error_metrics = None

if empirical_file.exists():
    empirical_df = pd.read_csv(empirical_file)

    # Map (source_degree, target_degree) -> frequency, one entry per CSV row.
    empirical_200 = {
        (int(u), int(v)): float(freq)
        for u, v, freq in zip(
            empirical_df['source_degree'],
            empirical_df['target_degree'],
            empirical_df['frequency'],
        )
    }

    print(f"Loaded empirical frequencies: {len(empirical_200)} degree combinations")

    # Run enhanced residual analysis
    print(f"\nRunning enhanced residual analysis with degree decomposition...")
    residuals_df, degree_error_metrics = learner.analyze_residuals(
        empirical_200=empirical_200,
        m=results['graph_stats']['m'],
        density=results['graph_stats']['density'],
        results_dir=output_dir,
        graph_name=edge_type,
        small_graph_mode=small_graph_mode
    )

    print(f"\nGenerated enhanced residual analysis with {len(degree_error_metrics)} degree combinations")

    # Display degree-based error summary
    print(f"\nDegree-based error metrics:")
    display_cols = ['n_samples', 'bias_learned', 'rmse_learned', 'rel_error_mean_learned']
    print(degree_error_metrics[display_cols].round(4))

else:
    print(f"Empirical frequency file not found: {empirical_file}")
    print("Skipping enhanced residual analysis")

## Enhanced Parameter Sensitivity Analysis

In [None]:
# Parameter sensitivity needs the empirical frequencies; bail out early without them.
if empirical_200 is None:
    print("Skipping parameter sensitivity analysis (no empirical data)")
else:
    print(f"\nRunning enhanced parameter sensitivity analysis...")

    graph_stats = results['graph_stats']
    sensitivity_df = learner.analyze_parameter_importance(
        empirical_200=empirical_200,
        m=graph_stats['m'],
        density=graph_stats['density'],
        graph_name=edge_type,
        results_dir=output_dir
    )

    print(f"\nParameter sensitivity analysis complete!")
    print(f"Most sensitive parameters:")

    # Report the first three rows of the returned table.
    # NOTE(review): presumably already sorted by sensitivity — confirm in analyze_parameter_importance.
    top_params = sensitivity_df.head(3)
    for param_name, param_sensitivity, param_value in zip(
        top_params['parameter'], top_params['sensitivity'], top_params['value']
    ):
        print(f"  {param_name}: {param_sensitivity:.6f} (value: {param_value:.4f})")

## Degree-Stratified Convergence Analysis

In [None]:
# Four-panel overview of the per-degree-combination error metrics, followed by
# a short textual summary. Skipped entirely when no metrics were produced.
if degree_error_metrics is None:
    print("Skipping degree-stratified convergence analysis (no degree metrics)")
else:
    print(f"\n{'='*60}")
    print(f"DEGREE-STRATIFIED CONVERGENCE ANALYSIS")
    print(f"{'='*60}")

    rmse_by_combo = degree_error_metrics['rmse_learned']
    bias_by_combo = degree_error_metrics['bias_learned']

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    (ax_scatter, ax_rmse), (ax_bias, ax_rel) = axes

    # Panel 1: RMSE against sample size, points coloured by bias
    bias_scatter = ax_scatter.scatter(
        degree_error_metrics['n_samples'], rmse_by_combo,
        alpha=0.7, s=50, c=bias_by_combo,
        cmap='RdBu_r', edgecolor='black',
    )
    ax_scatter.set_xlabel('Sample Size', fontsize=12)
    ax_scatter.set_ylabel('RMSE', fontsize=12)
    ax_scatter.set_title(f'{edge_type} - Error vs Sample Size\n(Color = Bias)', fontsize=14, fontweight='bold')
    ax_scatter.grid(alpha=0.3)
    cbar = plt.colorbar(bias_scatter, ax=ax_scatter)
    cbar.set_label('Bias', fontsize=10)

    # Panel 2: RMSE per degree combination
    rmse_by_combo.plot(kind='bar', ax=ax_rmse, color='skyblue', edgecolor='black')
    ax_rmse.set_xlabel('Degree Combination', fontsize=12)
    ax_rmse.set_ylabel('RMSE', fontsize=12)
    ax_rmse.set_title(f'{edge_type} - Error by Degree Combination', fontsize=14, fontweight='bold')

    # Panel 3: learned vs analytical bias, with a zero reference line
    bias_data = degree_error_metrics[['bias_learned', 'bias_analytical']]
    bias_data.plot(kind='bar', ax=ax_bias, color=['green', 'orange'], alpha=0.7)
    ax_bias.axhline(0, color='black', linestyle='--', linewidth=1)
    ax_bias.set_xlabel('Degree Combination', fontsize=12)
    ax_bias.set_ylabel('Bias', fontsize=12)
    ax_bias.set_title(f'{edge_type} - Bias Comparison', fontsize=14, fontweight='bold')
    ax_bias.legend(['Learned', 'Analytical'])

    # Panel 4: learned vs analytical mean relative error
    rel_error_data = degree_error_metrics[['rel_error_mean_learned', 'rel_error_mean_analytical']]
    rel_error_data.plot(kind='bar', ax=ax_rel, color=['green', 'orange'], alpha=0.7)
    ax_rel.set_xlabel('Degree Combination', fontsize=12)
    ax_rel.set_ylabel('Mean Relative Error', fontsize=12)
    ax_rel.set_title(f'{edge_type} - Relative Error Comparison', fontsize=14, fontweight='bold')
    ax_rel.legend(['Learned', 'Analytical'])

    # Shared cosmetics for the three bar panels
    for bar_ax in (ax_rmse, ax_bias, ax_rel):
        bar_ax.grid(axis='y', alpha=0.3)
        bar_ax.set_xticklabels(bar_ax.get_xticklabels(), rotation=45, ha='right')

    plt.tight_layout()
    convergence_plot = output_dir / f'{edge_type}_degree_convergence_analysis.png'
    plt.savefig(convergence_plot, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Degree-stratified convergence analysis saved to: {convergence_plot}")

    # Summary statistics: lowest/highest RMSE and combinations with |bias| > 0.01
    print(f"\nConvergence insights:")
    print(f"  Most stable combination: {rmse_by_combo.idxmin()} (RMSE: {rmse_by_combo.min():.6f})")
    print(f"  Least stable combination: {rmse_by_combo.idxmax()} (RMSE: {rmse_by_combo.max():.6f})")

    high_bias = degree_error_metrics.loc[bias_by_combo.abs() > 0.01]
    if len(high_bias) == 0:
        print(f"  No high bias combinations detected")
    else:
        print(f"  High bias combinations ({len(high_bias)}): {list(high_bias.index)}")

## Generate Enhanced Predictions

In [None]:
# Predict every source-target pair; in small-graph mode also tag each pair with
# the degree category of its endpoints and the resulting combination label.
print(f"\nGenerating enhanced predictions for all source-target combinations...")

predictions_df = learner.predict_all_edges(edge_type, data_dir)

if not small_graph_mode:
    print(f"Predictions generated:")
    print(f"  Total combinations: {len(predictions_df):,}")
    print(f"  Probability range: {predictions_df['learned_probability'].min():.6f} - {predictions_df['learned_probability'].max():.6f}")
else:
    # Small graphs are cheap enough to annotate every prediction.
    analyzer = DegreeAnalyzer(small_graph_mode=True)

    # Per-node degrees, looked up through the index columns of the predictions
    source_degrees, target_degrees = analyzer.load_graph_degrees(edge_type, data_dir)
    degrees_of_sources = source_degrees[predictions_df['source_index']]
    predictions_df['source_degree_category'] = analyzer.categorize_degrees(degrees_of_sources).astype(str)

    degrees_of_targets = target_degrees[predictions_df['target_index']]
    predictions_df['target_degree_category'] = analyzer.categorize_degrees(degrees_of_targets).astype(str)

    # Combination label built from the two category columns
    predictions_df['degree_combination'] = analyzer.create_degree_combination_labels(
        predictions_df['source_degree_category'].values,
        predictions_df['target_degree_category'].values
    )

    print(f"Enhanced predictions generated:")
    print(f"  Total combinations: {len(predictions_df):,}")
    print(f"  Degree combinations: {predictions_df['degree_combination'].nunique()}")
    print(f"  Probability range: {predictions_df['learned_probability'].min():.6f} - {predictions_df['learned_probability'].max():.6f}")

    # Pair count and mean probability for the first five combinations encountered
    print(f"\nSample predictions by degree combination:")
    first_combos = predictions_df['degree_combination'].unique()[:5]
    for combo in first_combos:
        in_combo = predictions_df['degree_combination'] == combo
        combo_rows = predictions_df[in_combo]
        print(f"  {combo}: {len(combo_rows)} pairs, avg prob: {combo_rows['learned_probability'].mean():.6f}")

## Save Enhanced Results

In [None]:
# Save enhanced results: learned parameters, predictions, optional degree
# metrics/residuals, and a JSON summary report; then list what was written.
print(f"\nSaving enhanced results to: {output_dir}")

# Save learned parameters with enhanced metadata
enhanced_results = results.copy()
enhanced_results['formula_type'] = formula_type
enhanced_results['small_graph_mode'] = small_graph_mode
enhanced_results['convergence_threshold'] = convergence_threshold

learner.save_results(enhanced_results, output_dir)

# Save enhanced predictions
predictions_file = output_dir / f'{edge_type}_enhanced_predictions.csv'
predictions_df.to_csv(predictions_file, index=False)
print(f"Enhanced predictions saved to: {predictions_file}")

# Save degree-based error metrics if available
if degree_error_metrics is not None:
    degree_metrics_file = output_dir / f'{edge_type}_degree_error_metrics.csv'
    degree_error_metrics.to_csv(degree_metrics_file)
    print(f"Degree-based error metrics saved to: {degree_metrics_file}")

# Save residual data if available
if residuals_df is not None:
    residuals_file = output_dir / f'{edge_type}_enhanced_residuals.csv'
    residuals_df.to_csv(residuals_file, index=False)
    print(f"Enhanced residual data saved to: {residuals_file}")

# Values reused by both the report and the closing messages
n_degree_combinations = len(degree_error_metrics) if degree_error_metrics is not None else 0
final_corr = results['final_metrics']['correlation']
baseline_corr = results['baseline_metrics']['correlation']
# Relative improvement is undefined for a zero baseline; store None (JSON null)
# rather than dividing by zero.
improvement_pct = ((final_corr - baseline_corr) / baseline_corr * 100) if baseline_corr else None


def _json_default(value):
    """Convert NumPy scalars/arrays to plain Python objects for json.dump.

    json.dump calls this hook only for objects it cannot serialize itself.
    Anything that is not a NumPy scalar or array still raises TypeError.
    """
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")


# Create summary report
summary_report = {
    'edge_type': edge_type,
    'formula_type': formula_type,
    'small_graph_mode': small_graph_mode,
    'N_min': results['N_min'],
    'final_correlation': final_corr,
    'final_mae': results['final_metrics']['mae'],
    'baseline_correlation': baseline_corr,
    'improvement_pct': improvement_pct,
    'graph_stats': results['graph_stats'],
    'degree_combinations_analyzed': n_degree_combinations,
    'total_predictions': len(predictions_df)
}

summary_file = output_dir / f'{edge_type}_enhanced_summary.json'
with open(summary_file, 'w') as f:
    # The results may hold NumPy scalars (e.g. counts or shapes), which the
    # default encoder rejects; _json_default converts them.
    json.dump(summary_report, f, indent=2, default=_json_default)
print(f"Summary report saved to: {summary_file}")

print(f"\n{'='*80}")
print(f"ENHANCED ANALYSIS COMPLETE")
print(f"{'='*80}")
print(f"\nGenerated files:")
for file in sorted(output_dir.glob(f'{edge_type}_*')):
    print(f"  - {file.name}")

print(f"\n✓ Enhanced learned analytical formula analysis complete!")
print(f"✓ Degree-based error decomposition: {n_degree_combinations} combinations")
print(f"✓ Formula type: {formula_type}")
print(f"✓ Minimum permutations: N = {results['N_min']}")
if improvement_pct is None:
    print("✓ Performance improvement: n/a (baseline correlation is zero)")
else:
    print(f"✓ Performance improvement: {improvement_pct:+.1f}%")

if small_graph_mode:
    print(f"\n✓ Small graph validation successful!")
    print(f"✓ Framework ready for HPC deployment on all edge types")
else:
    print(f"\n✓ Full-scale analysis complete!")
    print(f"✓ Results ready for publication")