# Minimum Permutations Summary Across Edge Types

This notebook aggregates and analyzes the results from **Notebook 6** (minimum permutations analysis for ML models) across all 24 edge types.

## Purpose

- Summarize N_min (the minimum number of permutations at which a model meets its target metric) for each model across all edge types
- Identify which models are most data-efficient
- Analyze relationship between graph characteristics and N_min
- Provide recommendations for choosing N based on graph properties

## Workflow

1. Load results from all edge types
2. Aggregate N_min by model and edge type
3. Analyze convergence patterns
4. Correlate N_min with graph characteristics (density, size, degree)
5. Generate recommendations

In [None]:
import json
import sys
import warnings
from pathlib import Path
from typing import Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
import seaborn as sns
# Silence every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# Directory layout: the repository root is taken to be the parent of the
# current working directory; inputs and outputs hang off it.
repo_dir = Path.cwd().parent
data_dir = repo_dir / 'data'
_results_root = repo_dir / 'results'
results_dir = _results_root / 'minimum_permutations_ml'
summary_dir = _results_root / 'minimum_permutations_ml_summary'
# Create the output directory up front so later savefig/to_csv calls succeed.
summary_dir.mkdir(parents=True, exist_ok=True)

for _label, _location in (
    ("Repository directory", repo_dir),
    ("Results directory", results_dir),
    ("Summary output directory", summary_dir),
):
    print(f"{_label}: {_location}")

# Plot defaults shared by every figure below
sns.set_style('whitegrid')
plt.rcParams.update({'figure.dpi': 100})

## 1. Discover and Load Results

In [None]:
# Find all edge type result directories (named "<edge_type>_results").
if not results_dir.is_dir():
    raise FileNotFoundError(f"Results directory not found: {results_dir}")

RESULTS_SUFFIX = '_results'

# Path.iterdir() yields entries in arbitrary, filesystem-dependent order;
# sorting keeps row order in every derived table and plot reproducible.
result_dirs = sorted(
    (d for d in results_dir.iterdir() if d.is_dir() and d.name.endswith(RESULTS_SUFFIX)),
    key=lambda d: d.name,
)
# Strip only the trailing suffix so an edge type that happens to contain
# "_results" elsewhere in its name is not mangled.
edge_types = [d.name[:-len(RESULTS_SUFFIX)] for d in result_dirs]

print(f"Found results for {len(edge_types)} edge types")

# Load all summaries (JSON) and, when present, the convergence tables (CSV).
all_summaries = {}
all_convergence_data = {}
successful_loads = 0
failed_loads = []  # edge types whose summary JSON is missing

for result_dir, edge_type in zip(result_dirs, edge_types):
    summary_file = result_dir / f'{edge_type}_summary.json'
    convergence_file = result_dir / f'{edge_type}_convergence_data.csv'

    if summary_file.exists():
        # Explicit encoding: JSON is UTF-8, the platform default may not be.
        with open(summary_file, 'r', encoding='utf-8') as f:
            all_summaries[edge_type] = json.load(f)
        successful_loads += 1
    else:
        failed_loads.append(edge_type)

    # Convergence data is optional and loaded independently of the summary.
    if convergence_file.exists():
        all_convergence_data[edge_type] = pd.read_csv(convergence_file)

print(f"Successfully loaded {successful_loads} summaries")
if failed_loads:
    print(f"Failed to load: {failed_loads}")

# Display sample
if all_summaries:
    sample_edge = next(iter(all_summaries))
    sample_summary = all_summaries[sample_edge]
    print(f"\nSample summary ({sample_edge}):")
    print(f"  Target metric: {sample_summary['target_metric']}")
    print(f"  Models: {list(sample_summary['models'].keys())}")

## 2. Load Graph Characteristics

In [None]:
def get_graph_characteristics(edge_type: str) -> Optional[Dict]:
    """Compute size, density and degree statistics for one edge type.

    Reads ``data_dir / 'permutations' / '000.hetmat' / 'edges' /
    '<edge_type>.sparse.npz'`` and derives descriptive statistics from the
    sparse matrix stored there.

    Args:
        edge_type: Edge-type identifier; used as the file stem.

    Returns:
        A dict with keys ``edge_type``, ``n_sources``, ``n_targets``,
        ``n_edges``, ``density``, ``mean_source_degree``,
        ``mean_target_degree``, ``max_source_degree``, ``max_target_degree``,
        ``n_sources_nz`` and ``n_targets_nz``; or ``None`` when the edge file
        does not exist.
    """
    edge_file = data_dir / 'permutations' / '000.hetmat' / 'edges' / f"{edge_type}.sparse.npz"

    if not edge_file.exists():
        return None

    # Load edge matrix (rows = sources, columns = targets)
    edge_matrix = sp.load_npz(edge_file)
    n_sources, n_targets = edge_matrix.shape
    n_edges = edge_matrix.nnz

    # Density is stored entries over possible entries; a matrix with a zero
    # dimension has no possible entries, so report 0.0 instead of dividing by 0.
    n_cells = n_sources * n_targets
    density = n_edges / n_cells if n_cells > 0 else 0.0

    # Degrees are row/column sums of the stored values, so they equal edge
    # counts only when every stored value is 1 - TODO confirm for this data.
    source_degrees = np.asarray(edge_matrix.sum(axis=1)).ravel()
    target_degrees = np.asarray(edge_matrix.sum(axis=0)).ravel()

    # Means are taken over nodes that have at least one edge
    source_degrees_nz = source_degrees[source_degrees > 0]
    target_degrees_nz = target_degrees[target_degrees > 0]

    return {
        'edge_type': edge_type,
        'n_sources': n_sources,
        'n_targets': n_targets,
        'n_edges': n_edges,
        'density': density,
        'mean_source_degree': source_degrees_nz.mean() if len(source_degrees_nz) > 0 else 0,
        'mean_target_degree': target_degrees_nz.mean() if len(target_degrees_nz) > 0 else 0,
        # .max() raises on an empty array, which happens for a zero-sized axis
        'max_source_degree': source_degrees.max() if source_degrees.size > 0 else 0,
        'max_target_degree': target_degrees.max() if target_degrees.size > 0 else 0,
        'n_sources_nz': len(source_degrees_nz),
        'n_targets_nz': len(target_degrees_nz)
    }

# Load graph characteristics for every edge type that has a summary.
graph_chars = []
missing_graph_chars = []  # edge types with a summary but no edge matrix on disk
for edge_type in all_summaries.keys():
    chars = get_graph_characteristics(edge_type)
    if chars is None:
        missing_graph_chars.append(edge_type)
    else:
        graph_chars.append(chars)

graph_chars_df = pd.DataFrame(graph_chars)
print(f"\nLoaded graph characteristics for {len(graph_chars_df)} edge types")

# Surface skipped edge types: they end up with NaN graph columns after the
# later left-merge, which is otherwise easy to miss.
if missing_graph_chars:
    print(f"No edge matrix found for: {missing_graph_chars}")

# An empty frame has no 'density'/'n_edges' columns, so only report ranges
# when at least one edge type was loaded.
if graph_chars_df.empty:
    print(f"No graph characteristics available; check the edge matrices under {data_dir}")
else:
    print(f"Density range: {graph_chars_df['density'].min():.6f} - {graph_chars_df['density'].max():.6f}")
    print(f"Edge count range: {graph_chars_df['n_edges'].min()} - {graph_chars_df['n_edges'].max()}")

## 3. Aggregate N_min by Model and Edge Type

In [None]:
# Build the wide N_min table: one row per edge type, three columns per model
# (<model>_N_min, <model>_achieved, <model>_target_met).
_model_fields = (
    ('N_min', 'N_min'),
    ('achieved', 'achieved_value'),
    ('target_met', 'target_met'),
)

n_min_data = [
    {
        'edge_type': edge_type,
        **{
            f'{model_name}_{column_suffix}': model_data[summary_key]
            for model_name, model_data in summary['models'].items()
            for column_suffix, summary_key in _model_fields
        },
    }
    for edge_type, summary in all_summaries.items()
]

# Left-merge keeps every edge type even when it has no graph characteristics.
n_min_df = pd.DataFrame(n_min_data).merge(graph_chars_df, on='edge_type', how='left')

# Bucket edge types by density; intervals are right-closed (pd.cut default).
_density_bin_edges = [0, 0.01, 0.03, 0.05, 1.0]
_density_bin_labels = ['Very Sparse (<1%)', 'Sparse (1-3%)', 'Medium (3-5%)', 'Dense (>5%)']
n_min_df['density_category'] = pd.cut(
    n_min_df['density'],
    bins=_density_bin_edges,
    labels=_density_bin_labels,
)

print(f"\nN_min data shape: {n_min_df.shape}")
print(f"\nSample:")
print(n_min_df.head())

## 4. N_min Statistics by Model

In [None]:
# Model names are recovered from the "<model>_N_min" column names.
model_names = [col.replace('_N_min', '') for col in n_min_df.columns if col.endswith('_N_min')]

_rule = "=" * 80

print("N_min Statistics by Model")
print(_rule)

# Per-model descriptive statistics over all edge types.
for model_name in model_names:
    n_mins = n_min_df[f'{model_name}_N_min']
    report_lines = [
        f"\n{model_name}:",
        f"  Mean N_min: {n_mins.mean():.1f}",
        f"  Median N_min: {n_mins.median():.0f}",
        f"  Min N_min: {n_mins.min()}",
        f"  Max N_min: {n_mins.max()}",
        f"  Std N_min: {n_mins.std():.1f}",
    ]
    print("\n".join(report_lines))

# The model with the lowest mean N_min is reported as the most data-efficient.
model_means = {}
for m in model_names:
    model_means[m] = n_min_df[f'{m}_N_min'].mean()
best_model = min(model_means, key=model_means.get)

print(f"\n{_rule}")
print(f"Most data-efficient model overall: {best_model} (mean N_min = {model_means[best_model]:.1f})")
print(f"{_rule}")

## 5. Visualizations

In [None]:
# 1. N_min heatmap (edge types × models)
fig, ax = plt.subplots(figsize=(12, 16))

# Wide table for the heatmap: rows are edge types, columns are models.
_n_min_columns = [f'{m}_N_min' for m in model_names]
heatmap_data = n_min_df.set_index('edge_type').loc[:, _n_min_columns]
heatmap_data.columns = model_names

# Order rows from lowest to highest mean N_min across models.
_row_means = heatmap_data.mean(axis=1)
heatmap_data = heatmap_data.loc[_row_means.sort_values().index]

# Reversed red-yellow-green map: low N_min is green, high N_min is red.
_heatmap_options = dict(
    annot=True,
    fmt='.0f',
    cmap='RdYlGn_r',
    cbar_kws={'label': 'N_min (Minimum Permutations)'},
)
sns.heatmap(heatmap_data, ax=ax, **_heatmap_options)

ax.set_title('Minimum Permutations by Edge Type and Model', fontsize=14, fontweight='bold')
ax.set_xlabel('Model', fontsize=12)
ax.set_ylabel('Edge Type', fontsize=12)

plt.tight_layout()
_heatmap_path = summary_dir / 'N_min_heatmap.png'
plt.savefig(_heatmap_path, dpi=300, bbox_inches='tight')
plt.show()
print("Saved N_min heatmap")

In [None]:
# 2. N_min distribution by model
fig, ax = plt.subplots(figsize=(12, 6))

# One sample per model. Missing N_min values are dropped first: a NaN in the
# sample makes the box statistics NaN and the box is drawn blank.
box_data = [n_min_df[f'{m}_N_min'].dropna().to_numpy() for m in model_names]

# The `labels=` keyword of boxplot is deprecated since Matplotlib 3.9 (renamed
# to `tick_labels`); setting the tick labels explicitly works on every version.
bp = ax.boxplot(box_data, patch_artist=True)
ax.set_xticklabels(model_names, rotation=45, ha='right')

# Color boxes
colors = sns.color_palette('Set2', len(model_names))
for patch, color in zip(bp['boxes'], colors):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)

ax.set_ylabel('N_min (Minimum Permutations)', fontsize=12)
ax.set_title('N_min Distribution by Model', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig(summary_dir / 'N_min_distribution.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved N_min distribution")

In [None]:
# 3. N_min vs Graph Density
# Size the grid from the number of models instead of assuming exactly four:
# a fixed 2x2 grid raises IndexError with more models and leaves blank panels
# with fewer. Four models still produce the 2x2, 16x12-inch layout.
n_models = len(model_names)
n_cols = 2 if n_models > 1 else 1
n_rows = max(1, int(np.ceil(n_models / n_cols)))
fig, axes = plt.subplots(n_rows, n_cols, figsize=(8 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.flatten()

for ax, model_name in zip(axes, model_names):
    n_min_col = f'{model_name}_N_min'

    ax.scatter(n_min_df['density'], n_min_df[n_min_col], alpha=0.6, s=100)

    # Label the edge types above the 75th percentile of N_min. The threshold
    # is computed once per model rather than once per row, and rows without a
    # density are skipped because they have no position on the x axis.
    outlier_threshold = n_min_df[n_min_col].quantile(0.75)
    is_outlier = (n_min_df[n_min_col] > outlier_threshold) & n_min_df['density'].notna()
    for _, row in n_min_df[is_outlier].iterrows():
        ax.annotate(row['edge_type'], (row['density'], row[n_min_col]),
                    fontsize=8, alpha=0.7)

    ax.set_xlabel('Graph Density', fontsize=12)
    ax.set_ylabel('N_min', fontsize=12)
    ax.set_title(f'{model_name}', fontsize=13, fontweight='bold')
    ax.set_xscale('log')
    ax.grid(True, alpha=0.3)

# Hide panels that have no model assigned (odd model counts, or no models).
for unused_ax in axes[n_models:]:
    unused_ax.set_visible(False)

plt.tight_layout()
plt.savefig(summary_dir / 'N_min_vs_density.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved N_min vs density plots")

In [None]:
# 4. N_min by density category
fig, ax = plt.subplots(figsize=(12, 6))

# Mean N_min per density category and model.
# Grouping on the categorical column keeps the categories in their defined
# order (very sparse -> dense); pivoting on the label strings sorted them
# alphabetically. observed=True keeps only categories that actually occur,
# and rows without a category are excluded by groupby's default dropna.
# Model columns follow the order of `model_names`.
n_min_columns = [f'{m}_N_min' for m in model_names]
pivot = (
    n_min_df
    .groupby('density_category', observed=True)[n_min_columns]
    .mean()
    .rename(columns=dict(zip(n_min_columns, model_names)))
    .rename_axis(index='Density Category', columns='Model')
)

if pivot.empty:
    # Nothing to plot (no edge type has a density category); keep the figure
    # so the cell still runs and the saved file documents the gap.
    ax.text(0.5, 0.5, 'No density categories available',
            ha='center', va='center', transform=ax.transAxes)
else:
    pivot.plot(kind='bar', ax=ax, width=0.8)
    ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

ax.set_ylabel('Mean N_min', fontsize=12)
ax.set_title('Average N_min by Graph Density Category', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=45, ha='right')

plt.tight_layout()
plt.savefig(summary_dir / 'N_min_by_density_category.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved N_min by density category")

In [None]:
# 5. Model comparison: Which is most data-efficient?
fig, ax = plt.subplots(figsize=(10, 6))

# Mean and standard deviation of N_min per model, in model_names order.
model_means = []
model_stds = []
for m in model_names:
    _column = n_min_df[f'{m}_N_min']
    model_means.append(_column.mean())
    model_stds.append(_column.std())

_positions = range(len(model_names))
_palette = sns.color_palette('Set2', len(model_names))
bars = ax.bar(
    _positions,
    model_means,
    yerr=model_stds,
    color=_palette,
    alpha=0.7,
    edgecolor='black',
    capsize=5,
)

ax.set_xticks(_positions)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.set_ylabel('Mean N_min Across All Edge Types', fontsize=12)
ax.set_title('Data Efficiency Comparison (Lower is Better)', fontsize=14, fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Write each mean just above the top of its error bar.
for bar, mean, std in zip(bars, model_means, model_stds):
    _label_x = bar.get_x() + bar.get_width()/2
    _label_y = bar.get_height() + std + 0.5
    ax.text(_label_x, _label_y, f'{mean:.1f}',
            ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig(summary_dir / 'model_efficiency_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
print("Saved model efficiency comparison")

## 6. Save Summary Tables

In [None]:
# Full per-edge-type table (N_min columns plus graph characteristics)
_n_min_csv = summary_dir / 'N_min_by_edge_type.csv'
n_min_df.to_csv(_n_min_csv, index=False)
print(f"Saved N_min data: {_n_min_csv}")


def _n_min_stats(model_name):
    """Return one row of descriptive N_min statistics for ``model_name``."""
    values = n_min_df[f'{model_name}_N_min']
    return {
        'Model': model_name,
        'Mean_N_min': values.mean(),
        'Median_N_min': values.median(),
        'Std_N_min': values.std(),
        'Min_N_min': values.min(),
        'Max_N_min': values.max()
    }


# Per-model statistics, one row per model in model_names order
model_stats = [_n_min_stats(model_name) for model_name in model_names]

model_stats_df = pd.DataFrame(model_stats)
_model_stats_csv = summary_dir / 'model_statistics.csv'
model_stats_df.to_csv(_model_stats_csv, index=False)
print(f"Saved model statistics: {_model_stats_csv}")

## 7. Recommendations

In [None]:
print("\n" + "="*80)
print("MINIMUM PERMUTATIONS RECOMMENDATIONS")
print("="*80)

# Overall champion: the model with the lowest mean N_min.
# idxmin() cannot pick a row when every mean is missing, so guard for it.
if model_stats_df['Mean_N_min'].notna().any():
    best_row = model_stats_df.loc[model_stats_df['Mean_N_min'].idxmin()]
    best_model_overall = best_row['Model']
    best_mean = best_row['Mean_N_min']
    print(f"\n1. MOST DATA-EFFICIENT MODEL OVERALL: {best_model_overall}")
    print(f"   Average N_min across all edge types: {best_mean:.1f}")
else:
    print("\n1. MOST DATA-EFFICIENT MODEL OVERALL: not available (no N_min values)")

# Best by density category
print("\n2. RECOMMENDATIONS BY GRAPH DENSITY:")
for cat in ['Very Sparse (<1%)', 'Sparse (1-3%)', 'Medium (3-5%)', 'Dense (>5%)']:
    cat_data = n_min_df[n_min_df['density_category'] == cat]
    if len(cat_data) == 0:
        continue

    # Drop models without any N_min in this category: a NaN mean compares
    # False against everything and could otherwise be selected as the minimum.
    model_means_cat = pd.Series(
        {m: cat_data[f'{m}_N_min'].mean() for m in model_names}, dtype=float
    ).dropna()
    if model_means_cat.empty:
        continue
    best_model_cat = model_means_cat.idxmin()
    best_mean_cat = model_means_cat[best_model_cat]

    print(f"   {cat}: {best_model_cat} (avg N_min = {best_mean_cat:.1f})")

# Rule of thumb
print("\n3. GENERAL GUIDELINES:")
# Pool every (edge type, model) N_min value and take one median, so the number
# matches its label; a median of per-model medians is a different statistic.
pooled_n_min = n_min_df[[f'{m}_N_min' for m in model_names]].to_numpy(dtype=float).ravel()
pooled_n_min = pooled_n_min[~np.isnan(pooled_n_min)]
if pooled_n_min.size > 0:
    overall_median = float(np.median(pooled_n_min))
    print(f"   Median N_min across all models and edge types: {overall_median:.0f}")
    # 1.5x the pooled median, rounded up, as a safety margin
    print(f"   Conservative recommendation: Use N ≥ {int(np.ceil(overall_median * 1.5))} for new graphs")
else:
    # int(np.ceil(nan)) raises, so report the gap instead of crashing
    overall_median = float('nan')
    print("   No N_min values available; cannot derive a recommendation")

# Correlation with density (Series.corr default, i.e. Pearson)
print("\n4. RELATIONSHIP WITH GRAPH DENSITY:")
for model_name in model_names:
    corr = n_min_df['density'].corr(n_min_df[f'{model_name}_N_min'])
    print(f"   {model_name}: correlation = {corr:.3f}")

print("\n" + "="*80)
print("SUMMARY COMPLETE")
print("="*80)
print(f"\nAll results saved to: {summary_dir}")
print("\nGenerated files:")
for file in sorted(summary_dir.glob('*')):
    print(f"  - {file.name}")