# Minimum Permutations Analysis for Edge Probability Learning

This notebook implements a systematic approach to determine the minimum number of permutations needed for effective edge probability distribution learning. It progressively trains models using increasing numbers of permutations and measures convergence against empirical frequencies.

## Methodology

1. **Progressive Training**: Train models using 1, 2, 3, ... up to max_permutations
2. **Performance Evaluation**: Compare predictions against empirical frequencies for each permutation count  
3. **Convergence Detection**: Identify when additional permutations provide diminishing returns
4. **Optimal Selection**: Determine the minimum number of permutations that achieves target performance

## Usage with Papermill

```bash
papermill 5_minimum_permutations_analysis.ipynb output.ipynb \
  -p edge_type "AeG" \
  -p max_permutations 10 \
  -p convergence_threshold 0.05 \
  -p random_seed 42
```

## Parameters

- `edge_type` (str): Type of edge to analyze (default: "AeG")
- `max_permutations` (int): Maximum number of permutations to test (default: 10)
- `convergence_threshold` (float): Performance improvement threshold for convergence (default: 0.05)
- `random_seed` (int): Random seed for reproducibility (default: 42)

## Output

- Performance metrics for each permutation count
- Convergence analysis showing diminishing returns
- Recommendation for optimal permutation count
- Detailed visualizations of learning progression

In [22]:
import sys
from pathlib import Path
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from sklearn.model_selection import train_test_split
import torch.optim as optim
from typing import Dict, List, Optional, Tuple

repo_dir = Path.cwd().parent
src_dir = repo_dir / 'src'
sys.path.append(str(src_dir))

from data_processing import load_permutation_data, prepare_edge_prediction_data
from models import EdgePredictionNN
from training import train_edge_prediction_model

print("All dependencies imported successfully")
print(f"Repository directory: {repo_dir}")
print(f"Source directory: {src_dir}")

All dependencies imported successfully
Repository directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability
Source directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/src


In [42]:
# Parameters (can be overridden by papermill)
edge_type = "AeG"  # Type of edge to analyze
max_permutations = 10  # Maximum number of permutations to test
convergence_threshold = 0.05  # Performance improvement threshold for convergence detection
random_seed = 42  # Random seed for reproducibility

# Training parameters
epochs_per_model = 50  # Number of epochs for each model training
batch_size = 512  # Batch size for training
learning_rate = 0.001  # Learning rate
early_stopping_patience = 5  # Early stopping patience

# Analysis parameters
negative_sampling_ratio = 1.0  # Ratio for negative sampling
performance_metric = 'correlation'  # Primary metric for convergence ('mae', 'rmse', 'correlation')

print(f"Minimum Permutations Analysis Configuration:")
print(f"  Edge type: {edge_type}")
print(f"  Max permutations to test: {max_permutations}")
print(f"  Convergence threshold: {convergence_threshold}")
print(f"  Random seed: {random_seed}")
print(f"  Performance metric: {performance_metric}")
print(f"  Training epochs: {epochs_per_model}")

# Set random seeds for reproducibility
import numpy as np
import torch
np.random.seed(random_seed)
torch.manual_seed(random_seed)

Minimum Permutations Analysis Configuration:
  Edge type: AeG
  Max permutations to test: 10
  Convergence threshold: 0.05
  Random seed: 42
  Performance metric: correlation
  Training epochs: 50


<torch._C.Generator at 0x1738ba4d0>

In [24]:
# Setup directories
from pathlib import Path

repo_dir = Path.cwd().parent
data_dir = repo_dir / 'data'
permutations_dir = data_dir / 'permutations'
downloads_dir = data_dir / 'downloads'
models_dir = repo_dir / 'models'
output_dir = repo_dir / 'results' / 'minimum_permutations_basic_2d'
output_dir.mkdir(parents=True, exist_ok=True)

print(f"Directory setup:")
print(f"  Repository: {repo_dir}")
print(f"  Data: {data_dir}")
print(f"  Permutations: {permutations_dir}")
print(f"  Output: {output_dir}")

def load_empirical_frequencies(results_dir: Path = repo_dir / 'results') -> pd.DataFrame:
    """Load empirical edge frequencies from CSV file."""
    freq_file = results_dir / 'edge_frequency_by_degree.csv'
    try:
        if not freq_file.exists():
            raise FileNotFoundError(f"Empirical frequency file not found: {freq_file}")
        
        empirical_freq_df = pd.read_csv(freq_file)
        empirical_freq_df = empirical_freq_df.rename(columns={'frequency': 'empirical_frequency'})
        
        print(f"Loaded {len(empirical_freq_df)} empirical frequency records")
        print(f"Degree range - Source: {empirical_freq_df['source_degree'].min()}-{empirical_freq_df['source_degree'].max()}")
        print(f"Degree range - Target: {empirical_freq_df['target_degree'].min()}-{empirical_freq_df['target_degree'].max()}")
        print(f"Frequency range: {empirical_freq_df['empirical_frequency'].min():.3f}-{empirical_freq_df['empirical_frequency'].max():.3f}")
        
        return empirical_freq_df
        
    except Exception as e:
        print(f"Error loading empirical frequencies: {e}")
        raise

def validate_data_directories() -> Tuple[List[Path], Path]:
    """Validate and discover available data directories."""
    print("Discovering available data directories...")
    
    # Find available permutation directories
    available_permutations = []
    if permutations_dir.exists():
        for perm_dir in permutations_dir.iterdir():
            if perm_dir.is_dir() and perm_dir.name.endswith('.hetmat'):
                available_permutations.append(perm_dir)
    
    # Sort permutations by name for consistent ordering
    permutations_dirs = sorted(available_permutations)
    
    print(f"Original data directory: {data_dir}")
    print(f"Permutations directory: {permutations_dir}")
    print(f"Found {len(permutations_dirs)} permutation directories")
    
    if len(permutations_dirs) <= 5:
        for i, perm_dir in enumerate(permutations_dirs):
            print(f"  {i+1}. {perm_dir.name}")
    else:
        for i, perm_dir in enumerate(permutations_dirs[:3]):
            print(f"  {i+1}. {perm_dir.name}")
        print(f"  ... and {len(permutations_dirs) - 3} more")
    
    # Validate permutation availability
    if len(permutations_dirs) < max_permutations:
        print(f"Warning: Only {len(permutations_dirs)} permutations available, but max_permutations = {max_permutations}")
        print("   Will reuse permutations if needed.")
    else:
        print(f"Sufficient permutations available for experiment")
    
    # Check original edge data
    original_edge_file = data_dir / 'edges' / f"{edge_type}.sparse.npz"
    if original_edge_file.exists():
        print(f"Original edge data found: {original_edge_file}")
    else:
        print(f"Original edge data not found: {original_edge_file}")
        if (data_dir / 'edges').exists():
            edge_files = list((data_dir / 'edges').glob('*.npz'))
            print(f"Available edge files: {[f.name for f in edge_files[:5]]}")
    
    return permutations_dirs, original_edge_file

# Load data and validate directories
empirical_freq_df = load_empirical_frequencies()
permutations_dirs, original_edge_file = validate_data_directories()

print("\nData loading and validation completed successfully!")

Directory setup:
  Repository: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability
  Data: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data
  Permutations: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations
  Output: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/results/minimum_permutations_basic_2d
Loaded 13167 empirical frequency records
Degree range - Source: 1-15036
Degree range - Target: 1-98
Frequency range: 0.000-1.005
Discovering available data directories...
Original data directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data
Permutations directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColora

## 2. Data Loading & Validation

This section loads empirical frequencies and sets up data directories for permutation analysis.

In [43]:
def prepare_training_data_from_permutations(permutation_dirs: List[Path], num_permutations: int) -> pd.DataFrame:
    """
    Prepare training data by combining data from multiple permutations.
    
    Parameters:
    -----------
    permutation_dirs : List[Path]
        List of available permutation directories
    num_permutations : int
        Number of permutations to use for training
        
    Returns:
    --------
    pd.DataFrame
        Combined training data from all specified permutations
    """
    print(f"Preparing training data using {num_permutations} permutations...")
    
    if not permutation_dirs:
        raise ValueError("No permutation directories available")
    
    if num_permutations > len(permutation_dirs):
        print(f"Warning: Requested {num_permutations} permutations but only {len(permutation_dirs)} available")
        num_permutations = len(permutation_dirs)
    
    all_training_data = []
    
    for i in range(num_permutations):
        perm_dir = permutation_dirs[i]
        print(f"  Loading permutation {i+1}/{num_permutations}: {perm_dir.name}")
        
        try:
            # Load permutation data using proper data processing function
            permutation_data = load_permutation_data(
                permutation_name=perm_dir.name,
                permutations_dir=permutations_dir,
                edge_type=edge_type,
                source_node_type="Anatomy",
                target_node_type="Gene"
            )
            
            # Prepare features and labels for this permutation
            features, labels = prepare_edge_prediction_data(
                permutation_data,
                sample_negative_ratio=negative_sampling_ratio
            )
            
            # Convert to DataFrame and add permutation identifier
            perm_df = pd.DataFrame({
                'source_degree': features[:, 0],
                'target_degree': features[:, 1],
                'edge_probability': labels.astype(float),
                'permutation_id': i
            })
            
            all_training_data.append(perm_df)
            print(f"    Added {len(perm_df)} samples from {perm_dir.name}")
            
        except Exception as e:
            print(f"    Error loading {perm_dir.name}: {e}")
            # Try alternative loading method
            try:
                from data_processing_helpers import load_permutation_data as load_perm_helper
                from data_processing_helpers import extract_improved_edge_features_and_labels
                
                edge_matrix, source_degrees, target_degrees = load_perm_helper(perm_dir, edge_type)
                features, targets = extract_improved_edge_features_and_labels(
                    edge_matrix, source_degrees, target_degrees, 
                    negative_ratio=negative_sampling_ratio,
                    use_normalized_features=False,
                    use_regression=True
                )
                
                perm_df = pd.DataFrame({
                    'source_degree': features[:, 0],
                    'target_degree': features[:, 1],
                    'edge_probability': targets,
                    'permutation_id': i
                })
                
                all_training_data.append(perm_df)
                print(f"    Alternative loading successful: {len(perm_df)} samples")
                
            except Exception as e2:
                print(f"    Alternative loading also failed: {e2}")
                continue
    
    if not all_training_data:
        raise ValueError("Failed to load any permutation data")
    
    # Combine all permutation data
    combined_data = pd.concat(all_training_data, ignore_index=True)
    
    print(f"Combined training data: {len(combined_data)} total samples from {len(all_training_data)} permutations")
    print(f"Feature ranges:")
    print(f"  Source degrees: {combined_data['source_degree'].min():.0f}-{combined_data['source_degree'].max():.0f}")
    print(f"  Target degrees: {combined_data['target_degree'].min():.0f}-{combined_data['target_degree'].max():.0f}")
    print(f"  Edge probabilities: {combined_data['edge_probability'].min():.3f}-{combined_data['edge_probability'].max():.3f}")
    
    return combined_data

def train_model_with_n_permutations(num_permutations: int, permutation_dirs: List[Path]) -> Tuple[EdgePredictionNN, Dict, Dict]:
    """
    Train a model using data from n permutations.
    
    Parameters:
    -----------
    num_permutations : int
        Number of permutations to use for training
    permutation_dirs : List[Path]
        List of available permutation directories
        
    Returns:
    --------
    Tuple[EdgePredictionNN, Dict, Dict]
        Trained model, training history, and test metrics
    """
    print(f"\nTraining model with {num_permutations} permutations...")
    
    # Prepare training data
    training_data = prepare_training_data_from_permutations(permutation_dirs, num_permutations)
    
    # Extract features and labels
    features = training_data[['source_degree', 'target_degree']].values.astype(np.float32)
    labels = training_data['edge_probability'].values.astype(np.float32)
    
    print(f"Training with {len(features)} samples")
    
    # Train model using existing utilities
    model, train_history, test_metrics = train_edge_prediction_model(
        features=features,
        labels=labels,
        test_size=0.2,
        epochs=epochs_per_model,
        batch_size=batch_size,
        learning_rate=learning_rate,
        patience=early_stopping_patience
    )
    
    print(f"Training completed. Final train loss: {train_history['train_losses'][-1]:.4f}")
    
    return model, train_history, test_metrics

# Test with a single permutation first to verify the pipeline
print("Testing data loading pipeline...")
if permutations_dirs:
    test_data = prepare_training_data_from_permutations(permutations_dirs, 1)
    print("Data loading pipeline verified successfully")
else:
    print("Warning: No permutation directories found for testing")

Testing data loading pipeline...
Preparing training data using 1 permutations...
  Loading permutation 1/1: 000.hetmat
Loading data from permutation: 000.hetmat
Permutation path: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations/000.hetmat
Edge type: AeG (Anatomy -> Gene)
Loaded AeG edges: (402, 20945) matrix with 526407 non-zero entries
Preparing AeG edge prediction data (Anatomy -> Gene)
Anatomy degree range: 0 - 15036
Gene degree range: 0 - 98
Number of positive examples (existing edges): 526407
Number of positive examples (existing edges): 526407
Number of negative examples (non-existing edges): 526407
    Added 1052814 samples from 000.hetmat
Combined training data: 1052814 total samples from 1 permutations
Feature ranges:
  Source degrees: 0-15036
  Target degrees: 0-98
  Edge probabilities: 0.000-1.000
Data loading pipeline verified successfully
Number of negative examples (non-existing edges): 

In [44]:
def evaluate_model_against_empirical(model: EdgePredictionNN, empirical_freq_df: pd.DataFrame) -> Dict:
    """
    Evaluate model predictions against empirical frequencies.
    
    Parameters:
    -----------
    model : EdgePredictionNN
        Trained model to evaluate
    empirical_freq_df : pd.DataFrame
        DataFrame with empirical frequencies
        
    Returns:
    --------
    Dict
        Performance metrics including MAE, RMSE, correlation
    """
    # Prepare input features
    test_features = empirical_freq_df[['source_degree', 'target_degree']].values.astype(np.float32)
    test_tensor = torch.tensor(test_features)
    
    # Generate predictions
    model.eval()
    with torch.no_grad():
        predictions = model(test_tensor).squeeze().numpy()
    
    predictions = np.clip(predictions, 0.0, 1.0)
    empirical_values = empirical_freq_df['empirical_frequency'].values
    
    # Calculate metrics
    mae = np.mean(np.abs(predictions - empirical_values))
    rmse = np.sqrt(np.mean((predictions - empirical_values) ** 2))
    correlation = np.corrcoef(predictions, empirical_values)[0, 1]
    
    # Handle NaN correlation (can occur with constant predictions)
    if np.isnan(correlation):
        correlation = 0.0
    
    return {
        'mae': mae,
        'rmse': rmse,
        'correlation': correlation,
        'predictions': predictions,
        'empirical': empirical_values
    }

def run_progressive_permutation_analysis(permutation_dirs: List[Path], empirical_freq_df: pd.DataFrame) -> pd.DataFrame:
    """
    Run the complete progressive permutation analysis.
    
    Parameters:
    -----------
    permutation_dirs : List[Path]
        List of available permutation directories
    empirical_freq_df : pd.DataFrame
        DataFrame with empirical frequencies for evaluation
        
    Returns:
    --------
    pd.DataFrame
        Results dataframe with performance metrics for each permutation count
    """
    print("="*60)
    print("PROGRESSIVE PERMUTATION ANALYSIS")
    print("="*60)
    
    results = []
    max_perms_to_test = min(max_permutations, len(permutation_dirs))
    
    print(f"Testing 1 to {max_perms_to_test} permutations...")
    
    for num_perms in range(1, max_perms_to_test + 1):
        print(f"\n{'='*20} Testing {num_perms} Permutations {'='*20}")
        
        try:
            # Train model with n permutations
            model, train_history, test_metrics = train_model_with_n_permutations(num_perms, permutation_dirs)
            
            # Evaluate against empirical frequencies
            evaluation_results = evaluate_model_against_empirical(model, empirical_freq_df)
            
            # Store results
            result = {
                'num_permutations': num_perms,
                'mae': evaluation_results['mae'],
                'rmse': evaluation_results['rmse'],
                'correlation': evaluation_results['correlation'],
                'train_loss': train_history['train_losses'][-1],
                'test_auc': test_metrics.get('auc', 0.0),
                'test_ap': test_metrics.get('average_precision', 0.0)
            }
            
            results.append(result)
            
            print(f"Results for {num_perms} permutations:")
            print(f"  MAE: {result['mae']:.4f}")
            print(f"  RMSE: {result['rmse']:.4f}")
            print(f"  Correlation: {result['correlation']:.4f}")
            print(f"  Train Loss: {result['train_loss']:.4f}")
            
            # Check for convergence if we have enough data points
            if len(results) >= 2:
                current_metric = result[performance_metric] if performance_metric != 'mae' and performance_metric != 'rmse' else -result[performance_metric]
                previous_metric = results[-2][performance_metric] if performance_metric != 'mae' and performance_metric != 'rmse' else -results[-2][performance_metric]
                improvement = current_metric - previous_metric
                
                print(f"  Improvement in {performance_metric}: {improvement:.4f}")
                
                if abs(improvement) < convergence_threshold:
                    print(f"  Convergence detected! Improvement ({abs(improvement):.4f}) below threshold ({convergence_threshold})")
                    break
            
        except Exception as e:
            print(f"Error training with {num_perms} permutations: {e}")
            continue
    
    return pd.DataFrame(results)

def analyze_convergence(results_df: pd.DataFrame) -> Dict:
    """
    Analyze convergence patterns in the results.
    
    Parameters:
    -----------
    results_df : pd.DataFrame
        Results from progressive permutation analysis
        
    Returns:
    --------
    Dict
        Convergence analysis results
    """
    if len(results_df) < 2:
        return {"message": "Insufficient data for convergence analysis"}
    
    # Find optimal permutation count based on primary metric
    if performance_metric == 'correlation':
        best_idx = results_df['correlation'].idxmax()
        best_value = results_df['correlation'].max()
    elif performance_metric == 'mae':
        best_idx = results_df['mae'].idxmin()
        best_value = results_df['mae'].min()
    elif performance_metric == 'rmse':
        best_idx = results_df['rmse'].idxmin()
        best_value = results_df['rmse'].min()
    else:
        best_idx = results_df['correlation'].idxmax()
        best_value = results_df['correlation'].max()
    
    optimal_permutations = results_df.loc[best_idx, 'num_permutations']
    
    # Calculate improvement curves
    results_df['mae_improvement'] = results_df['mae'].diff().fillna(0)
    results_df['rmse_improvement'] = results_df['rmse'].diff().fillna(0)
    results_df['correlation_improvement'] = results_df['correlation'].diff().fillna(0)
    
    # Find first point where improvement falls below threshold
    if performance_metric == 'correlation':
        convergence_points = results_df[abs(results_df['correlation_improvement']) < convergence_threshold]
    elif performance_metric == 'mae':
        convergence_points = results_df[abs(results_df['mae_improvement']) < convergence_threshold]
    else:
        convergence_points = results_df[abs(results_df['correlation_improvement']) < convergence_threshold]
    
    if len(convergence_points) > 0:
        convergence_permutations = convergence_points.iloc[0]['num_permutations']
    else:
        convergence_permutations = results_df.iloc[-1]['num_permutations']
    
    return {
        'optimal_permutations': int(optimal_permutations),
        'optimal_value': best_value,
        'convergence_permutations': int(convergence_permutations),
        'total_tested': len(results_df),
        'final_mae': results_df.iloc[-1]['mae'],
        'final_rmse': results_df.iloc[-1]['rmse'],
        'final_correlation': results_df.iloc[-1]['correlation']
    }

# Run the progressive analysis
if permutations_dirs:
    print(f"Starting progressive analysis with {len(permutations_dirs)} available permutations")
    results_df = run_progressive_permutation_analysis(permutations_dirs, empirical_freq_df)
    convergence_analysis = analyze_convergence(results_df)
    
    print("\n" + "="*60)
    print("ANALYSIS COMPLETE")
    print("="*60)
    print(f"Optimal number of permutations: {convergence_analysis['optimal_permutations']}")
    print(f"Convergence detected at: {convergence_analysis['convergence_permutations']} permutations")
    print(f"Final performance - MAE: {convergence_analysis['final_mae']:.4f}, Correlation: {convergence_analysis['final_correlation']:.4f}")
else:
    print("No permutation directories available for analysis")
    results_df = pd.DataFrame()
    convergence_analysis = {"message": "No data available"}

Starting progressive analysis with 2 available permutations
PROGRESSIVE PERMUTATION ANALYSIS
Testing 1 to 2 permutations...


Training model with 1 permutations...
Preparing training data using 1 permutations...
  Loading permutation 1/1: 000.hetmat
Loading data from permutation: 000.hetmat
Permutation path: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations/000.hetmat
Edge type: AeG (Anatomy -> Gene)
Loaded AeG edges: (402, 20945) matrix with 526407 non-zero entries
Preparing AeG edge prediction data (Anatomy -> Gene)
Anatomy degree range: 0 - 15036
Gene degree range: 0 - 98
Number of positive examples (existing edges): 526407
Number of positive examples (existing edges): 526407
Number of negative examples (non-existing edges): 526407
    Added 1052814 samples from 000.hetmat
Combined training data: 1052814 total samples from 1 permutations
Feature ranges:
  Source degrees: 0-15036
  Target degrees: 0-



Training on 842251 samples, testing on 210563 samples
Feature shapes: (842251, 2), Labels shape: (842251,)
Early stopping patience: 5 epochs


Training:  16%|█▌        | 8/50 [01:05<05:46,  8.25s/it]

Early stopping triggered after 9 epochs
Best validation loss: 0.1284 at epoch 4
Restored best model weights
Final Test AUC: 0.9877
Final Test AP: 0.9850
Training completed. Final train loss: 0.1318
Results for 1 permutations:
  MAE: 0.6622
  RMSE: 0.7702
  Correlation: 0.1508
  Train Loss: 0.1318


Training model with 2 permutations...
Preparing training data using 2 permutations...
  Loading permutation 1/2: 000.hetmat
Loading data from permutation: 000.hetmat
Permutation path: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations/000.hetmat
Edge type: AeG (Anatomy -> Gene)
Loaded AeG edges: (402, 20945) matrix with 526407 non-zero entries
Preparing AeG edge prediction data (Anatomy -> Gene)
Anatomy degree range: 0 - 15036
Gene degree range: 0 - 98





Number of positive examples (existing edges): 526407
Number of negative examples (non-existing edges): 526407
    Added 1052814 samples from 000.hetmat
  Loading permutation 2/2: 001.hetmat
Loading data from permutation: 001.hetmat
Permutation path: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations/001.hetmat
Edge type: AeG (Anatomy -> Gene)
Loaded AeG edges: (402, 20945) matrix with 526407 non-zero entries
Preparing AeG edge prediction data (Anatomy -> Gene)
Anatomy degree range: 0 - 15036
Gene degree range: 0 - 98
Number of negative examples (non-existing edges): 526407
    Added 1052814 samples from 000.hetmat
  Loading permutation 2/2: 001.hetmat
Loading data from permutation: 001.hetmat
Permutation path: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations/001.hetmat
Edge type: AeG (Anatomy -> Gene)
Loaded AeG edges:



Training on 1684502 samples, testing on 421126 samples
Feature shapes: (1684502, 2), Labels shape: (1684502,)
Early stopping patience: 5 epochs


Training:  20%|██        | 10/50 [02:34<10:21, 15.53s/it]

Epoch 10/50: Train Loss: 0.1309, Val Loss: 0.1282, Val AUC: 0.9877, LR: 1.00e-03, Best Val Loss: 0.1272


Training:  40%|████      | 20/50 [05:10<07:40, 15.35s/it]

Epoch 20/50: Train Loss: 0.1307, Val Loss: 0.1272, Val AUC: 0.9878, LR: 1.00e-03, Best Val Loss: 0.1270


Training:  50%|█████     | 25/50 [06:44<06:44, 16.18s/it]

Early stopping triggered after 26 epochs
Best validation loss: 0.1269 at epoch 21
Restored best model weights
Final Test AUC: 0.9878
Final Test AP: 0.9851
Training completed. Final train loss: 0.1306
Results for 2 permutations:
  MAE: 0.3046
  RMSE: 0.4855
  Correlation: -0.0310
  Train Loss: 0.1306
  Improvement in correlation: -0.1819

ANALYSIS COMPLETE
Optimal number of permutations: 1
Convergence detected at: 1 permutations
Final performance - MAE: 0.3046, Correlation: -0.0310





## 4. Model Evaluation & Prediction Analysis

This section evaluates the trained model and compares predictions with empirical frequencies.

In [45]:
def evaluate_model_predictions(model: EdgePredictionNN, empirical_freq_df: pd.DataFrame) -> pd.DataFrame:
    """Evaluate model predictions against empirical frequencies."""
    
    print("Evaluating model predictions...")
    
    # Prepare input features from empirical data
    test_features = empirical_freq_df[['source_degree', 'target_degree']].values.astype(np.float32)
    test_tensor = torch.tensor(test_features)
    
    # Generate predictions
    model.eval()
    with torch.no_grad():
        raw_predictions = model(test_tensor).squeeze().numpy()
    
    # Ensure predictions are valid probabilities
    predictions = raw_predictions
    # predictions = np.clip(raw_predictions, 0.0, 1.0)
    
    # Create results dataframe
    results_df = empirical_freq_df.copy()
    results_df['predicted_probability'] = predictions
    results_df['prediction_error'] = np.abs(results_df['predicted_probability'] - results_df['empirical_frequency'])
    results_df['relative_error'] = results_df['prediction_error'] / (results_df['empirical_frequency'] + 1e-8)
    
    # Calculate summary statistics
    mae = results_df['prediction_error'].mean()
    rmse = np.sqrt((results_df['prediction_error'] ** 2).mean())
    median_error = results_df['prediction_error'].median()
    correlation = results_df['predicted_probability'].corr(results_df['empirical_frequency'])
    
    print(f"Prediction Quality Metrics:")
    print(f"  Mean Absolute Error: {mae:.4f}")
    print(f"  Root Mean Square Error: {rmse:.4f}")
    print(f"  Median Absolute Error: {median_error:.4f}")
    print(f"  Pearson Correlation: {correlation:.4f}")
    
    return results_df

def demonstrate_edge_predictions(model: EdgePredictionNN, n_examples: int = 10):
    """Demonstrate edge probability predictions for sample degree pairs."""
    
    print(f"Demonstrating predictions for {n_examples} example degree pairs...")
    
    # Create sample degree pairs with diverse ranges
    example_pairs = [
        (10, 5), (20, 15), (50, 25), (100, 50), (200, 100),
        (5, 100), (15, 200), (25, 300), (0, 10), (500, 1000)
    ][:n_examples]
    
    test_features = np.array(example_pairs, dtype=np.float32)
    test_tensor = torch.tensor(test_features)
    
    model.eval()
    with torch.no_grad():
        predictions = model(test_tensor).squeeze().numpy()
    
    predictions = np.clip(predictions, 0.0, 1.0)
    
    print("Sample Predictions:")
    print("Source Deg | Target Deg | Predicted Probability")
    print("-" * 45)
    for i, (src_deg, tgt_deg) in enumerate(example_pairs):
        print(f"{src_deg:>9} | {tgt_deg:>9} | {predictions[i]:>17.4f}")
    
    return example_pairs, predictions

# Evaluate the model
results_df = evaluate_model_predictions(model, empirical_freq_df)

# Demonstrate predictions
example_pairs, example_predictions = demonstrate_edge_predictions(model)

print("Model evaluation completed!")

Evaluating model predictions...
Prediction Quality Metrics:
  Mean Absolute Error: 0.6115
  Root Mean Square Error: 0.7365
  Median Absolute Error: 0.8333
  Pearson Correlation: 0.2401
Demonstrating predictions for 10 example degree pairs...
Sample Predictions:
Source Deg | Target Deg | Predicted Probability
---------------------------------------------
       10 |         5 |            1.0000
       20 |        15 |            1.0000
       50 |        25 |            1.0000
      100 |        50 |            1.0000
      200 |       100 |            1.0000
        5 |       100 |            0.0000
       15 |       200 |            0.0000
       25 |       300 |            0.0000
        0 |        10 |            0.0000
      500 |      1000 |            1.0000
Model evaluation completed!


## 5. Results Visualization & Analysis

This section provides visualization of training progress and prediction quality.

In [41]:
results_df

Unnamed: 0,source_degree,target_degree,empirical_frequency,predicted_probability,prediction_error,relative_error
0,7939,20,0.272037,1.0,0.727963,2.675964
1,7939,75,0.979326,1.0,0.020674,0.021110
2,7939,38,0.627020,1.0,0.372980,0.594846
3,7939,56,0.874961,1.0,0.125039,0.142908
4,7939,67,0.954649,1.0,0.045351,0.047505
...,...,...,...,...,...,...
13162,56,3,0.000008,1.0,0.999992,121732.629836
13163,101,1,0.000011,1.0,0.999989,87642.120332
13164,25,3,0.000016,1.0,0.999984,60754.066388
13165,46,5,0.000011,1.0,0.999989,87031.188872


In [None]:
def plot_progressive_analysis_results(results_df: pd.DataFrame, convergence_analysis: Dict, figsize: Tuple[int, int] = (15, 12), x = max_permutations):
    """
    Create comprehensive visualizations of the progressive permutation analysis.
    
    Parameters:
    -----------
    results_df : pd.DataFrame
        Results from progressive analysis
    convergence_analysis : Dict
        Convergence analysis results
    figsize : Tuple[int, int]
        Figure size for plots
    x : int
        Number of permutations
    """
    if len(results_df) == 0:
        print("No results to plot")
        return
    
    fig, axes = plt.subplots(2, 3, figsize=figsize)
    
    # 1. Performance metrics vs number of permutations
 
    axes[0,0].plot(x, results_df['mae'], 'b-o', label='MAE', linewidth=2, markersize=6)
    axes[0,0].plot(x, results_df['rmse'], 'r-s', label='RMSE', linewidth=2, markersize=6)
    axes[0,0].axvline(convergence_analysis.get('optimal_permutations', 0), color='green', linestyle='--', alpha=0.7, label='Optimal')
    axes[0,0].axvline(convergence_analysis.get('convergence_permutations', 0), color='orange', linestyle='--', alpha=0.7, label='Convergence')
    axes[0,0].set_xlabel('Number of Permutations')
    axes[0,0].set_ylabel('Error')
    axes[0,0].set_title('Error Metrics vs Permutations')
    axes[0,0].legend()
    axes[0,0].grid(True, alpha=0.3)
    
    # 2. Correlation vs number of permutations
    axes[0,1].plot(x, results_df['correlation'], 'g-^', label='Correlation', linewidth=2, markersize=6)
    axes[0,1].axvline(convergence_analysis.get('optimal_permutations', 0), color='green', linestyle='--', alpha=0.7, label='Optimal')
    axes[0,1].axvline(convergence_analysis.get('convergence_permutations', 0), color='orange', linestyle='--', alpha=0.7, label='Convergence')
    axes[0,1].set_xlabel('Number of Permutations')
    axes[0,1].set_ylabel('Correlation with Empirical')
    axes[0,1].set_title('Correlation vs Permutations')
    axes[0,1].legend()
    axes[0,1].grid(True, alpha=0.3)
    
    # 3. Training loss vs number of permutations
    axes[0,2].plot(x, results_df['train_loss'], 'm-d', label='Train Loss', linewidth=2, markersize=6)
    axes[0,2].axvline(convergence_analysis.get('optimal_permutations', 0), color='green', linestyle='--', alpha=0.7, label='Optimal')
    axes[0,2].set_xlabel('Number of Permutations')
    axes[0,2].set_ylabel('Final Training Loss')
    axes[0,2].set_title('Training Loss vs Permutations')
    axes[0,2].legend()
    axes[0,2].grid(True, alpha=0.3)
    
    # 4. Improvement curves (if enough data points)
    if len(results_df) > 1:
        mae_improvement = results_df['mae'].diff().fillna(0)
        correlation_improvement = results_df['correlation'].diff().fillna(0)
        
        axes[1,0].plot(x[1:], mae_improvement[1:], 'b-o', label='MAE Improvement', linewidth=2)
        axes[1,0].axhline(convergence_threshold, color='red', linestyle='--', alpha=0.7, label=f'Threshold ({convergence_threshold})')
        axes[1,0].axhline(-convergence_threshold, color='red', linestyle='--', alpha=0.7)
        axes[1,0].set_xlabel('Number of Permutations')
        axes[1,0].set_ylabel('Change in MAE')
        axes[1,0].set_title('MAE Improvement per Additional Permutation')
        axes[1,0].legend()
        axes[1,0].grid(True, alpha=0.3)
        
        axes[1,1].plot(x[1:], correlation_improvement[1:], 'g-^', label='Correlation Improvement', linewidth=2)
        axes[1,1].axhline(convergence_threshold, color='red', linestyle='--', alpha=0.7, label=f'Threshold ({convergence_threshold})')
        axes[1,1].axhline(-convergence_threshold, color='red', linestyle='--', alpha=0.7)
        axes[1,1].set_xlabel('Number of Permutations')
        axes[1,1].set_ylabel('Change in Correlation')
        axes[1,1].set_title('Correlation Improvement per Additional Permutation')
        axes[1,1].legend()
        axes[1,1].grid(True, alpha=0.3)
    else:
        axes[1,0].text(0.5, 0.5, 'Insufficient data\nfor improvement analysis', ha='center', va='center', transform=axes[1,0].transAxes)
        axes[1,1].text(0.5, 0.5, 'Insufficient data\nfor improvement analysis', ha='center', va='center', transform=axes[1,1].transAxes)
    
    # 5. Performance summary bar chart
    metrics = ['MAE', 'RMSE', 'Correlation']
    if len(results_df) > 0:
        final_values = [results_df.iloc[-1]['mae'], results_df.iloc[-1]['rmse'], results_df.iloc[-1]['correlation']]
        optimal_idx = convergence_analysis.get('optimal_permutations', 1) - 1
        if optimal_idx < len(results_df):
            optimal_values = [results_df.iloc[optimal_idx]['mae'], results_df.iloc[optimal_idx]['rmse'], results_df.iloc[optimal_idx]['correlation']]
        else:
            optimal_values = final_values
        
        x_pos = np.arange(len(metrics))
        width = 0.35
        
        axes[1,2].bar(x_pos - width/2, [final_values[0], final_values[1], final_values[2]], width, 
                     label=f'Final ({len(results_df)} perms)', alpha=0.7)
        axes[1,2].bar(x_pos + width/2, [optimal_values[0], optimal_values[1], optimal_values[2]], width, 
                     label=f'Optimal ({convergence_analysis.get("optimal_permutations", "N/A")} perms)', alpha=0.7)
        
        axes[1,2].set_xlabel('Metrics')
        axes[1,2].set_ylabel('Values')
        axes[1,2].set_title('Final vs Optimal Performance')
        axes[1,2].set_xticks(x_pos)
        axes[1,2].set_xticklabels(metrics)
        axes[1,2].legend()
        axes[1,2].grid(True, alpha=0.3)
    
    plt.tight_layout()
    plt.show()

def save_analysis_results(results_df: pd.DataFrame, convergence_analysis: Dict):
    """Save the analysis results to files."""
    
    # Save detailed results
    results_file = output_dir / f'progressive_analysis_results_{edge_type}.csv'
    results_df.to_csv(results_file, index=False)
    
    # Save convergence analysis
    convergence_file = output_dir / f'convergence_analysis_{edge_type}.json'
    import json
    with open(convergence_file, 'w') as f:
        json.dump(convergence_analysis, f, indent=2, default=str)
    
    # Save summary report
    summary_file = output_dir / f'minimum_permutations_summary_{edge_type}.txt'
    with open(summary_file, 'w') as f:
        f.write("MINIMUM PERMUTATIONS ANALYSIS SUMMARY\n")
        f.write("="*50 + "\n\n")
        f.write(f"Edge Type: {edge_type}\n")
        f.write(f"Analysis Date: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(f"Performance Metric: {performance_metric}\n")
        f.write(f"Convergence Threshold: {convergence_threshold}\n\n")
        
        if 'optimal_permutations' in convergence_analysis:
            f.write(f"RESULTS:\n")
            f.write(f"- Optimal number of permutations: {convergence_analysis['optimal_permutations']}\n")
            f.write(f"- Convergence detected at: {convergence_analysis['convergence_permutations']} permutations\n")
            f.write(f"- Total permutations tested: {convergence_analysis['total_tested']}\n\n")
            
            f.write(f"FINAL PERFORMANCE:\n")
            f.write(f"- MAE: {convergence_analysis['final_mae']:.4f}\n")
            f.write(f"- RMSE: {convergence_analysis['final_rmse']:.4f}\n")
            f.write(f"- Correlation: {convergence_analysis['final_correlation']:.4f}\n\n")
        else:
            f.write(f"RESULTS: {convergence_analysis.get('message', 'Analysis incomplete')}\n")
    
    print(f"Analysis results saved:")
    print(f"  Detailed results: {results_file}")
    print(f"  Convergence analysis: {convergence_file}")
    print(f"  Summary report: {summary_file}")

def print_final_analysis_report(results_df: pd.DataFrame, convergence_analysis: Dict):
    """Print a comprehensive final report."""
    
    print("\n" + "="*80)
    print("MINIMUM PERMUTATIONS ANALYSIS - FINAL REPORT")
    print("="*80)
    
    if 'optimal_permutations' in convergence_analysis:
        print(f"\nRECOMMENDATION:")
        print(f"  Minimum effective permutations: {convergence_analysis['optimal_permutations']}")
        print(f"  Convergence detected at: {convergence_analysis['convergence_permutations']} permutations")
        
        print(f"\nPERFORMANCE ACHIEVED:")
        print(f"  Mean Absolute Error: {convergence_analysis['final_mae']:.4f}")
        print(f"  Root Mean Square Error: {convergence_analysis['final_rmse']:.4f}")
        print(f"  Correlation with Empirical: {convergence_analysis['final_correlation']:.4f}")
        
        print(f"\nANALYSIS DETAILS:")
        print(f"  Total permutations tested: {convergence_analysis['total_tested']}")
        print(f"  Primary performance metric: {performance_metric}")
        print(f"  Convergence threshold: {convergence_threshold}")
        
        if len(results_df) > 1:
            improvement_1_to_2 = results_df.iloc[1]['correlation'] - results_df.iloc[0]['correlation'] if len(results_df) > 1 else 0
            print(f"  Improvement from 1 to 2 permutations: {improvement_1_to_2:.4f}")
        
        print(f"\nINTERPRETATION:")
        if convergence_analysis['optimal_permutations'] == 1:
            print("  - Single permutation provides optimal performance")
            print("  - Additional permutations may not improve learning significantly")
        elif convergence_analysis['optimal_permutations'] <= 3:
            print("  - Few permutations needed for effective learning")
            print("  - Model converges quickly with limited data diversity")
        else:
            print("  - Multiple permutations required for optimal performance")
            print("  - Model benefits from increased data diversity")
    else:
        print(f"\nANALYSIS INCOMPLETE: {convergence_analysis.get('message', 'Unknown error')}")
    
    print(f"\n" + "="*80)

# Create visualizations and save results
if len(results_df) > 0:
    print("Creating visualization of progressive analysis results...")
    plot_progressive_analysis_results(results_df, convergence_analysis, x = min(len(permutations_dirs), max_permutations))
    
    print("Saving analysis results...")
    save_analysis_results(results_df, convergence_analysis)
    
    print_final_analysis_report(results_df, convergence_analysis)
else:
    print("No results available for visualization and reporting")

## 6. Summary & Conclusions

This section provides a summary of the analysis and potential next steps.

In [None]:
# Summary of Analysis Configuration and Results

print("="*80)
print("MINIMUM PERMUTATIONS ANALYSIS CONFIGURATION")
print("="*80)

print(f"Analysis Parameters:")
print(f"  Edge type analyzed: {edge_type}")
print(f"  Maximum permutations tested: {max_permutations}")
print(f"  Convergence threshold: {convergence_threshold}")
print(f"  Performance metric: {performance_metric}")
print(f"  Random seed: {random_seed}")

print(f"\nTraining Parameters:")
print(f"  Epochs per model: {epochs_per_model}")
print(f"  Batch size: {batch_size}")
print(f"  Learning rate: {learning_rate}")
print(f"  Early stopping patience: {early_stopping_patience}")
print(f"  Negative sampling ratio: {negative_sampling_ratio}")

if 'optimal_permutations' in convergence_analysis:
    print(f"\nKey Findings:")
    print(f"  - Minimum effective permutations: {convergence_analysis['optimal_permutations']}")
    print(f"  - Model performance stabilizes at: {convergence_analysis['convergence_permutations']} permutations")
    print(f"  - Final correlation with empirical data: {convergence_analysis['final_correlation']:.4f}")
    print(f"  - Final mean absolute error: {convergence_analysis['final_mae']:.4f}")

print(f"\nFiles Generated:")
print(f"  - Results: {output_dir}/progressive_analysis_results_{edge_type}.csv")
print(f"  - Convergence Analysis: {output_dir}/convergence_analysis_{edge_type}.json")  
print(f"  - Summary Report: {output_dir}/minimum_permutations_summary_{edge_type}.txt")

print(f"\nReproducibility:")
print(f"  Run with papermill using:")
print(f"  papermill 5_minimum_permutations_analysis.ipynb output.ipynb \\")
print(f"    -p edge_type '{edge_type}' \\")
print(f"    -p max_permutations {max_permutations} \\")
print(f"    -p convergence_threshold {convergence_threshold} \\")
print(f"    -p random_seed {random_seed}")

print("="*80)
print("ANALYSIS COMPLETE")
print("="*80)

## 3. Neural Network Training

This section trains the edge prediction model using the existing EdgePredictionNN architecture and training utilities.