# Model Comparison for Edge Probability Prediction

This notebook compares 4 different models for predicting edge probabilities based on source and target node degrees:

1. **Simple Neural Network**: 2-layer NN with continuous probability output
2. **Random Forest**: Ensemble method with 100 trees
3. **Logistic Regression**: Standard logistic regression with L2 regularization
4. **Polynomial Logistic Regression**: Quadratic features + logistic regression

## Data

We use edge data from a single permutation: `data/permutations/000.hetmat/edges/AeG.sparse.npz`

## Evaluation

Models are evaluated using:
- **Classification metrics**: AUC-ROC, Accuracy, Precision, Recall, F1-Score
- **Regression metrics**: RMSE, MAE, R², Correlation
- **Empirical comparison**: Compare predictions with empirical frequencies from `results/edge_frequency_by_degree.csv`
- **Visualizations**: ROC curves, Precision-Recall curves, probability heatmaps

## 1. Setup and Data Loading

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Setup paths.
# NOTE: repo_dir is derived from the current working directory, so the kernel
# must be started from a direct subdirectory of the repository root.
repo_dir = Path.cwd().parent
src_dir = repo_dir / 'src'
data_dir = repo_dir / 'data'
results_dir = repo_dir / 'results' / 'model_comparison'
# Create the output directory up front; every figure/CSV below is written into it.
results_dir.mkdir(parents=True, exist_ok=True)

# Make the project modules under src/ importable. The membership check keeps
# this cell idempotent: re-running it no longer appends duplicate sys.path entries.
if str(src_dir) not in sys.path:
    sys.path.append(str(src_dir))

# Import our custom modules
from model_comparison import ModelCollection, prepare_edge_features_and_labels, create_degree_grid
from model_training import ModelTrainer
from model_evaluation import ModelEvaluator, get_best_models
from model_visualization import ModelVisualizer, create_comparison_table_plot

# Echo the resolved locations so a wrong working directory is easy to spot.
print(
    "All modules imported successfully!",
    f"Repository directory: {repo_dir}",
    f"Results will be saved to: {results_dir}",
    sep="\n",
)

## 2. Data Preparation

In [None]:
# Parameters (everything a reader might want to tune lives here)
EDGE_FILE_PATH = data_dir.joinpath('permutations', '000.hetmat', 'edges', 'AeG.sparse.npz')
SAMPLE_RATIO = 0.1  # forwarded as sample_ratio to prepare_edge_features_and_labels
RANDOM_STATE = 42   # seed handed to ModelCollection and ModelTrainer
TEST_SIZE = 0.2
VAL_SIZE = 0.1

# Check the input once and fail fast, before any expensive work starts.
edge_file_found = EDGE_FILE_PATH.exists()
print(f"Loading edge data from: {EDGE_FILE_PATH}")
print(f"File exists: {edge_file_found}")

if not edge_file_found:
    raise FileNotFoundError(f"Edge data file not found: {EDGE_FILE_PATH}")

In [None]:
# Build the feature matrix (column 0 = source degree, column 1 = target degree)
# and the matching edge labels.
print("Preparing edge features and labels...")
features, labels = prepare_edge_features_and_labels(
    str(EDGE_FILE_PATH),
    sample_ratio=SAMPLE_RATIO,
)

n_positive = np.sum(labels)
positive_fraction = np.mean(labels)

print("\nDataset Statistics:")
print(f"  Total samples: {len(features)}")
print(f"  Features shape: {features.shape}")
print(f"  Positive examples: {n_positive} ({positive_fraction:.1%})")
print(f"  Negative examples: {len(labels) - n_positive} ({1-positive_fraction:.1%})")

print("\nFeature Statistics:")
for degree_column, degree_side in enumerate(("Source", "Target")):
    column_values = features[:, degree_column]
    print(f"  {degree_side} degrees: {column_values.min():.0f} - {column_values.max():.0f} (mean: {column_values.mean():.1f})")

In [None]:
# Visualize data distribution: degree histograms (top row), a degree scatter
# coloured by label (bottom left) and the class balance (bottom right).
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Top row: histogram of each degree column (0 = source, 1 = target)
for degree_column, degree_side in enumerate(('Source', 'Target')):
    hist_ax = axes[0, degree_column]
    hist_ax.hist(features[:, degree_column], bins=50, alpha=0.7, edgecolor='black')
    hist_ax.set_xlabel(f'{degree_side} Degree')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title(f'Distribution of {degree_side} Degrees')
    hist_ax.grid(True, alpha=0.3)

# Scatter plot of degrees (colored by edge existence)
positive_mask = labels == 1
negative_mask = labels == 0

# Sample for visualization (too many points otherwise). The generator is seeded
# with RANDOM_STATE so the saved figure is identical across runs.
n_sample = min(10000, len(features))
sample_rng = np.random.default_rng(RANDOM_STATE)
sample_idx = sample_rng.choice(len(features), n_sample, replace=False)

scatter_ax = axes[1, 0]
for class_mask, class_label, class_color in (
    (negative_mask, 'No Edge', 'red'),
    (positive_mask, 'Edge Exists', 'blue'),
):
    # Restrict the random sample to the rows belonging to this class
    class_idx = sample_idx[class_mask[sample_idx]]
    scatter_ax.scatter(features[class_idx, 0],
                       features[class_idx, 1],
                       alpha=0.3, s=1, label=class_label, color=class_color)
scatter_ax.set_xlabel('Source Degree')
scatter_ax.set_ylabel('Target Degree')
scatter_ax.set_title('Degree Relationships (Sample)')
scatter_ax.legend()
scatter_ax.grid(True, alpha=0.3)

# Label distribution. Counts come from the masks so both bars always exist and
# line up with their names; a missing class shows as a zero-height bar.
label_counts = pd.Series(
    [int(np.sum(negative_mask)), int(np.sum(positive_mask))],
    index=['No Edge', 'Edge Exists'],
)
bar_ax = axes[1, 1]
bar_ax.bar(['No Edge', 'Edge Exists'], label_counts.values,
           color=['red', 'blue'], alpha=0.7)
bar_ax.set_ylabel('Count')
bar_ax.set_title('Edge Distribution')
bar_ax.grid(True, alpha=0.3)

# Add count labels on bars (offset by 1% of the sample count to clear the bar top)
for i, count in enumerate(label_counts.values):
    bar_ax.text(i, count + len(features) * 0.01, str(count),
                ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(results_dir / 'data_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("Data distribution visualized and saved.")

## 3. Model Creation and Training

In [None]:
# Instantiate every candidate model and fetch the descriptions reported by
# ModelCollection.get_model_info().
model_collection = ModelCollection(random_state=RANDOM_STATE)
models = model_collection.create_models()
model_info = model_collection.get_model_info()

print("Models created:", "=" * 50, sep="\n")
for name, description in model_info.items():
    # One paragraph per model: name, indented description, blank separator line
    print(f"{name}:\n  {description}\n")

In [None]:
# Fit every model; the trainer also returns the data splits it used
# (read back later from training_results['data_splits']).
trainer = ModelTrainer(random_state=RANDOM_STATE)
split_options = {'test_size': TEST_SIZE, 'val_size': VAL_SIZE}
training_results = trainer.train_all_models(models, features, labels, **split_options)

print("\nAll models trained successfully!")

## 4. Model Evaluation

In [None]:
# Score every trained model on the test split produced by the trainer
evaluator = ModelEvaluator()
X_test, y_test = (
    training_results['data_splits'][split_key] for split_key in ('X_test', 'y_test')
)

evaluation_results = evaluator.evaluate_all_models(training_results, X_test, y_test)

print("Model evaluation completed!")

In [None]:
# Print detailed evaluation results.
# Delegates to ModelEvaluator.print_detailed_results; the report layout is
# defined in the model_evaluation module, not in this notebook.
evaluator.print_detailed_results(evaluation_results)

In [None]:
# Tabulate the per-model metrics, show them, and persist them as CSV
comparison_df = evaluator.create_comparison_dataframe(evaluation_results)
comparison_csv_path = results_dir / 'model_comparison.csv'

print(
    "Model Performance Comparison:",
    "=" * 80,
    comparison_df.to_string(index=False, float_format='%.4f'),
    sep="\n",
)

# Save comparison table (index dropped so the CSV holds only the metric columns)
comparison_df.to_csv(comparison_csv_path, index=False)
print(f"\nComparison table saved to: {comparison_csv_path}")

In [None]:
# Visual comparison table.
# Hands comparison_df to the model_visualization helper; the output location
# results_dir / 'comparison_table.png' is passed as save_path.
create_comparison_table_plot(comparison_df, save_path=results_dir / 'comparison_table.png')

In [None]:
# Ask the evaluation module which model wins each metric and list the winners
best_models = get_best_models(evaluation_results)

print("Best Models by Metric:", "=" * 30, sep="\n")
for metric, model_name in best_models.items():
    # Metric name upper-cased and left-aligned in a 20-character column
    print(f"{metric.upper():<20}: {model_name}")

## 4.5. Empirical Frequency Comparison

In this section, we compare model predictions with empirical edge frequencies from the full dataset. This provides insight into how well each model captures the true underlying edge probability distribution based on degree combinations.

In [None]:
# Pick up edits made to the model_evaluation module without restarting the kernel.
# NOTE(review): this is a development-time convenience. On a fresh
# "Restart & Run All" the module is unchanged, so this cell only repeats the
# evaluation already done above; consider %autoreload in the setup cell instead.
import importlib
import model_evaluation

model_evaluation = importlib.reload(model_evaluation)
ModelEvaluator = model_evaluation.ModelEvaluator
get_best_models = model_evaluation.get_best_models

# Rebuild the evaluator from the reloaded class and refresh the results so that
# later cells use objects created by the reloaded code.
evaluator = ModelEvaluator()
evaluation_results = evaluator.evaluate_all_models(training_results, X_test, y_test)

print(
    "Reloaded model_evaluation module with updated empirical comparison methods",
    "Created new evaluator instance and re-ran evaluation",
    sep="\n",
)

In [None]:
# Test Predictions vs Empirical Frequencies Comparison
EMPIRICAL_FREQ_FILE = repo_dir / 'results' / 'edge_frequency_by_degree.csv'
empirical_file_found = EMPIRICAL_FREQ_FILE.exists()

print(f"Empirical frequency file: {EMPIRICAL_FREQ_FILE}")
print(f"File exists: {empirical_file_found}")

if not empirical_file_found:
    # No empirical table available: explain how to produce it and leave None
    # placeholders so the summary cell can detect that this step was skipped.
    print("Empirical frequency file not found. Skipping empirical comparison.")
    print("Run the edge frequency analysis first to generate the empirical frequencies.")
    print("This should create the file: results/edge_frequency_by_degree.csv")
    test_empirical_comparison = None
    test_empirical_df = None
else:
    # Compare test predictions with empirical frequencies for the same degree combinations
    test_empirical_comparison = evaluator.compare_test_predictions_with_empirical(
        evaluation_results, training_results, X_test, str(EMPIRICAL_FREQ_FILE)
    )

    # Textual summary produced by the evaluator
    evaluator.print_test_empirical_comparison_summary(test_empirical_comparison)

    # Tabular view of the same comparison
    test_empirical_df = evaluator.create_test_empirical_comparison_dataframe(test_empirical_comparison)
    print(
        "\nTest Predictions vs Empirical Frequencies Comparison:",
        "=" * 80,
        test_empirical_df.to_string(index=False, float_format='%.6f'),
        sep="\n",
    )

    # Persist the table next to the other results
    test_vs_empirical_csv = results_dir / 'test_vs_empirical_comparison.csv'
    test_empirical_df.to_csv(test_vs_empirical_csv, index=False)
    print(f"\nTest vs empirical comparison saved to: {test_vs_empirical_csv}")

    # One scatter panel per model: predicted probability against empirical frequency
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    axes = axes.flatten()

    # zip stops once the four panels are used up, so at most four models are drawn
    for panel, (model_name, results) in zip(axes, test_empirical_comparison.items()):
        predictions = results['matched_predictions']
        empirical = results['matched_empirical']
        correlation = results['correlation_vs_empirical']

        panel.scatter(empirical, predictions, alpha=0.6, s=20)
        panel.plot([0, 1], [0, 1], 'r--', alpha=0.8)  # y = x reference line
        panel.set_xlabel('Empirical Frequency')
        panel.set_ylabel('Predicted Probability')
        panel.set_title(f'{model_name}\nr = {correlation:.4f}')
        panel.grid(True, alpha=0.3)
        panel.set_xlim(0, 1)
        panel.set_ylim(0, 1)

    plt.tight_layout()
    plt.savefig(results_dir / 'test_vs_empirical_scatter.png', dpi=300, bbox_inches='tight')
    plt.show()

## 5. Visualizations

In [None]:
# Create visualizer.
# This single ModelVisualizer instance is the one the later plotting and
# heatmap cells call into.
visualizer = ModelVisualizer()

# Plot ROC curves.
# evaluation_results is the output of evaluator.evaluate_all_models(...); the
# target file results_dir / 'roc_curves.png' is passed as save_path.
print("Creating ROC curves...")
visualizer.plot_roc_curves(evaluation_results, save_path=results_dir / 'roc_curves.png')

In [None]:
# Plot Precision-Recall curves.
# Uses the same evaluation_results as the ROC plot; the target file
# results_dir / 'precision_recall_curves.png' is passed as save_path.
print("Creating Precision-Recall curves...")
visualizer.plot_precision_recall_curves(evaluation_results, save_path=results_dir / 'precision_recall_curves.png')

In [None]:
# Plot performance comparison.
# Delegates to ModelVisualizer.plot_performance_comparison with the evaluation
# results; which metrics are drawn is decided inside model_visualization.
print("Creating performance comparison...")
visualizer.plot_performance_comparison(evaluation_results, save_path=results_dir / 'performance_comparison.png')

In [None]:
# Plot training history (for neural networks).
# Unlike the metric plots above, this call receives training_results (the
# output of trainer.train_all_models), not evaluation_results.
print("Creating training history plot...")
visualizer.plot_training_history(training_results, save_path=results_dir / 'training_history.png')

## 6. Edge Probability Heatmaps

In [None]:
# Build the degree grid used by the heatmaps from the two feature columns
source_degrees, target_degrees = features[:, 0], features[:, 1]

source_bins, target_bins, grid_features = create_degree_grid(
    source_degrees, target_degrees, n_bins=50
)

print("Created degree grid for visualization:")
for grid_side, grid_bins in (("Source", source_bins), ("Target", target_bins)):
    print(f"  {grid_side} degree range: {grid_bins.min():.0f} - {grid_bins.max():.0f}")
print(f"  Grid size: {len(source_bins)} x {len(target_bins)} = {len(grid_features)} points")

In [None]:
# Create individual heatmaps for each model.
# source_bins / target_bins come from create_degree_grid above. Note that
# save_dir is passed as a str here, whereas the other plotting calls pass a
# Path as save_path.
print("Creating individual prediction heatmaps...")
visualizer.create_all_prediction_heatmaps(
    training_results, source_bins, target_bins, 
    save_dir=str(results_dir)
)

In [None]:
# Create combined heatmap grid.
# figsize=(20, 5) is a wide, short canvas -- presumably one row with a panel per
# model; TODO confirm against ModelVisualizer.create_combined_heatmap_grid.
print("Creating combined heatmap grid...")
visualizer.create_combined_heatmap_grid(
    training_results, source_bins, target_bins,
    figsize=(20, 5), save_path=results_dir / 'combined_heatmaps.png'
)

## 7. Summary and Conclusions

In [None]:
# Summary analysis: dataset recap, per-metric winners and threshold-based insights.
print("=" * 80)
print("MODEL COMPARISON SUMMARY")
print("=" * 80)

print(f"\nDataset:")
print(f"  Edge file: {EDGE_FILE_PATH.name}")
print(f"  Total samples: {len(features):,}")
print(f"  Test samples: {len(X_test):,}")
print(f"  Positive ratio: {np.mean(labels):.1%}")

print(f"\nBest Performing Models:")
# Select the best-AUC row explicitly instead of assuming comparison_df arrives
# sorted by AUC; on an already sorted table this picks the same first row.
best_auc_idx = comparison_df['AUC'].idxmax()
best_auc_model = comparison_df.loc[best_auc_idx, 'Model']
best_auc_score = comparison_df.loc[best_auc_idx, 'AUC']
print(f"  Overall Best (AUC): {best_auc_model} (AUC = {best_auc_score:.4f})")

for metric in ['Accuracy', 'F1 Score', 'RMSE', 'Correlation']:
    # RMSE is an error measure, so the smallest value wins; the others are scores.
    if metric in ['RMSE']:
        best_idx = comparison_df[metric].idxmin()
        best_value = comparison_df[metric].min()
        direction = "(lower is better)"
    else:
        best_idx = comparison_df[metric].idxmax()
        best_value = comparison_df[metric].max()
        direction = "(higher is better)"

    best_model = comparison_df.loc[best_idx, 'Model']
    print(f"  Best {metric}: {best_model} ({metric} = {best_value:.4f}) {direction}")

# The empirical-comparison cell stores None when the frequency file is missing;
# .get() additionally tolerates that cell not having been run at all.
has_empirical_comparison = globals().get('test_empirical_comparison') is not None

# Add test vs empirical comparison results if available
if has_empirical_comparison:
    print(f"\nTest Predictions vs Empirical Frequencies:")
    # As above: pick the highest correlation explicitly rather than trusting row order.
    best_empirical_corr_idx = test_empirical_df['Correlation vs Empirical'].idxmax()
    best_empirical_corr_model = test_empirical_df.loc[best_empirical_corr_idx, 'Model']
    best_empirical_corr = test_empirical_df.loc[best_empirical_corr_idx, 'Correlation vs Empirical']
    print(f"  Best Correlation with Empirical: {best_empirical_corr_model} (r = {best_empirical_corr:.6f})")

    best_empirical_mae_idx = test_empirical_df['MAE vs Empirical'].idxmin()
    best_empirical_mae_model = test_empirical_df.loc[best_empirical_mae_idx, 'Model']
    best_empirical_mae = test_empirical_df.loc[best_empirical_mae_idx, 'MAE vs Empirical']
    print(f"  Best MAE vs Empirical: {best_empirical_mae_model} (MAE = {best_empirical_mae:.6f})")

    # Report matching statistics
    avg_match_ratio = test_empirical_df['Match Ratio'].mean()
    print(f"  Average match ratio: {avg_match_ratio:.1%} of test samples matched with empirical data")

print(f"\nKey Insights:")

# Performance analysis: an AUC spread below 0.05 is reported as "similar"
auc_scores = comparison_df['AUC'].values
if np.max(auc_scores) - np.min(auc_scores) < 0.05:
    print(f"  - All models show similar AUC performance (range: {np.min(auc_scores):.3f} - {np.max(auc_scores):.3f})")
else:
    print(f"  - Significant performance differences observed (AUC range: {np.min(auc_scores):.3f} - {np.max(auc_scores):.3f})")

# Model complexity vs performance: neural network against plain logistic regression.
# Look both rows up defensively so a renamed or missing model does not raise IndexError.
nn_auc_values = comparison_df.loc[comparison_df['Model'] == 'Simple NN', 'AUC'].values
logistic_auc_values = comparison_df.loc[comparison_df['Model'] == 'Logistic Regression', 'AUC'].values

if len(nn_auc_values) and len(logistic_auc_values):
    nn_auc = nn_auc_values[0]
    logistic_auc = logistic_auc_values[0]
    auc_gap = nn_auc - logistic_auc

    if abs(auc_gap) < 0.02:
        print(f"  - Neural Network and Logistic Regression show similar performance")
    else:
        # Name the actual winner: the gap can be negative, in which case it is
        # Logistic Regression that improves on the Neural Network.
        if auc_gap > 0:
            better_model, worse_model = "Neural Network", "Logistic Regression"
        else:
            better_model, worse_model = "Logistic Regression", "Neural Network"
        gap_strength = "substantial" if abs(auc_gap) > 0.05 else "modest"
        print(f"  - {better_model} shows {gap_strength} improvement over {worse_model}")
else:
    print("  - Neural Network vs Logistic Regression comparison skipped (model missing from comparison table)")

# Correlation analysis (thresholds: > 0.8 strong, > 0.5 moderate, otherwise weak)
correlations = comparison_df['Correlation'].values
best_corr = np.max(correlations)
if best_corr > 0.8:
    print(f"  - Strong correlation between predictions and true probabilities (max: {best_corr:.3f})")
elif best_corr > 0.5:
    print(f"  - Moderate correlation between predictions and true probabilities (max: {best_corr:.3f})")
else:
    print(f"  - Weak correlation between predictions and true probabilities (max: {best_corr:.3f})")

# Test vs empirical comparison insights (same thresholds as above)
if has_empirical_comparison:
    empirical_correlations = test_empirical_df['Correlation vs Empirical'].values
    best_empirical_corr_val = np.max(empirical_correlations)
    if best_empirical_corr_val > 0.8:
        print(f"  - Strong correlation between test predictions and empirical frequencies (max: {best_empirical_corr_val:.3f})")
    elif best_empirical_corr_val > 0.5:
        print(f"  - Moderate correlation between test predictions and empirical frequencies (max: {best_empirical_corr_val:.3f})")
    else:
        print(f"  - Weak correlation between test predictions and empirical frequencies (max: {best_empirical_corr_val:.3f})")

    # NOTE(review): the next line is printed unconditionally; it is not derived
    # from any metric computed in this notebook. Confirm it is still warranted.
    print(f"  - Gap between test performance and empirical accuracy reveals generalization challenges")

print(f"\nFiles Generated:")
generated_files = list(results_dir.glob('*'))
for file_path in sorted(generated_files):
    print(f"  - {file_path.name}")

print(f"\nAll results saved to: {results_dir}")
print("=" * 80)

In [None]:
# Final recommendation: restate the best-AUC model and list training times.
print("\nRECOMMENDATION:")
print("-" * 20)

# Pick the best-AUC row explicitly instead of relying on comparison_df being
# sorted; on an already sorted table this is the same first row as before.
best_overall = comparison_df.loc[comparison_df['AUC'].idxmax()]
print(f"For edge probability prediction on this dataset, the {best_overall['Model']} "
      f"performs best overall with:")
print(f"  - AUC: {best_overall['AUC']:.4f}")
print(f"  - Accuracy: {best_overall['Accuracy']:.4f}")
print(f"  - F1 Score: {best_overall['F1 Score']:.4f}")
print(f"  - RMSE: {best_overall['RMSE']:.4f}")

# Training time consideration
print(f"\nTraining time considerations:")
for model_name, result in training_results.items():
    # training_results also carries the shared 'data_splits' entry, which is not a model
    if model_name != 'data_splits':
        training_time = result['training_result']['training_time']
        print(f"  - {model_name}: {training_time:.2f} seconds")

print(f"\nFor production use, consider the trade-off between model performance and training time.")