# Minimum Permutations Analysis with Unique Edge Sampling

This notebook implements a progressive training approach to determine the minimum number of permutations needed for effective edge probability distribution learning. It uses unique edge sampling to eliminate data leakage and maximize the benefit of multiple permutations.

In [1]:
import sys
import numpy as np
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import os
import torch


# Make the repository's `src` directory importable.
# The repository root is taken to be the parent of the current working
# directory, i.e. the notebook is expected to run from a sub-directory of it.
repo_dir = Path.cwd().parent
src_dir = str(repo_dir / 'src')
# Only append when missing, so re-running this cell does not stack up
# duplicate sys.path entries.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# # Import helper modules moved from notebook
# from data_processing_helpers import load_permutation_data, get_available_permutations, extract_improved_edge_features_and_labels
# from optimized_model import DistributionAwareNN, OptimizedModelTrainer
# from unique_sampling import UniqueEdgeSampler
# from validation_utils import compute_adaptive_degree_based_probability_distribution, compute_enhanced_distribution_difference
#
# NOTE(review): the helper imports above are disabled, yet the analysis cell
# further down calls load_permutation_data, prepare_edge_prediction_data and
# EdgePredictionNN. On a fresh kernel those names are undefined; re-enable or
# add the matching imports here once their modules are confirmed.

print("All imports successful!")
print(f"Repository directory: {repo_dir}")

All imports successful!
Repository directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability


In [2]:
# Papermill parameters cell for reproducible runs
# Every parameter is read from an environment variable and falls back to the
# default shown here when the variable is not set (interactive use).


def env_flag(name, default):
    """Read a boolean option from the environment.

    Args:
        name: Environment variable to look up.
        default: Value returned when the variable is not set.

    Returns:
        True when the variable is set to 'true', '1', 'yes' or 'on'
        (case-insensitive, surrounding whitespace ignored), False for any
        other value, and `default` when the variable is absent.
    """
    raw_value = os.environ.get(name)
    if raw_value is None:
        return default
    return raw_value.strip().lower() in ('true', '1', 'yes', 'on')


# Parameters (these will be overridden by papermill if run as a pipeline)
edge_type = os.environ.get('EDGE_TYPE', 'AeG')  # Anatomy-expresses-Gene
max_permutations = int(os.environ.get('MAX_PERMUTATIONS', 2))
validation_networks = int(os.environ.get('VALIDATION_NETWORKS', 3))
convergence_threshold = float(os.environ.get('CONVERGENCE_THRESHOLD', 0.2))
n_bins = int(os.environ.get('N_BINS', 8))
negative_sampling_ratio = float(os.environ.get('NEGATIVE_SAMPLING_RATIO', 0.5))
random_seed = int(os.environ.get('RANDOM_SEED', 42))
# Strip whitespace and drop empty entries so 'NN, RF' parses as ['NN', 'RF'].
model_types = [
    model_name.strip()
    for model_name in os.environ.get('MODEL_TYPES', 'NN,RF').split(',')
    if model_name.strip()
]
use_normalized_features = env_flag('USE_NORMALIZED_FEATURES', False)
use_regression_approach = env_flag('USE_REGRESSION_APPROACH', True)
use_distribution_loss = env_flag('USE_DISTRIBUTION_LOSS', True)
use_adaptive_binning = env_flag('USE_ADAPTIVE_BINNING', True)
use_ensemble_methods = env_flag('USE_ENSEMBLE_METHODS', True)
early_stopping_patience = int(os.environ.get('EARLY_STOPPING_PATIENCE', 3))
relative_improvement_threshold = float(os.environ.get('RELATIVE_IMPROVEMENT_THRESHOLD', 0.02))
use_relative_convergence = env_flag('USE_RELATIVE_CONVERGENCE', True)

# Set random seeds for reproducibility (NumPy global RNG and PyTorch)
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Directory setup: the repository root is the parent of the working directory
repo_dir = Path.cwd().parent

data_dir = repo_dir / 'data'
permutations_dir = data_dir / 'permutations'
downloads_dir = data_dir / 'downloads'
models_dir = repo_dir / 'models'
output_dir = repo_dir / 'results' / 'minimum_permutations_basic_2d'
# Create the output directory (and any missing parents) up front.
output_dir.mkdir(parents=True, exist_ok=True)

print("Papermill Parameters:")
print(f"  edge_type: {edge_type}")
print(f"  max_permutations: {max_permutations}")
print(f"  validation_networks: {validation_networks}")
print(f"  convergence_threshold: {convergence_threshold}")
print(f"  n_bins: {n_bins}")
print(f"  random_seed: {random_seed}")
print(f"  model_types: {model_types}")
print(f"  use_distribution_loss: {use_distribution_loss}")
print(f"  use_adaptive_binning: {use_adaptive_binning}")
print(f"  early_stopping_patience: {early_stopping_patience}")
print(f"  relative_improvement_threshold: {relative_improvement_threshold}")
print(f"  use_relative_convergence: {use_relative_convergence}")
print(f"\nDirectories:")
print(f"  Data: {data_dir}")
print(f"  Permutations: {permutations_dir}")
print(f"  Downloads: {downloads_dir}")
print(f"  Output: {output_dir}")

Papermill Parameters:
  edge_type: AeG
  max_permutations: 2
  validation_networks: 3
  convergence_threshold: 0.2
  n_bins: 8
  random_seed: 42
  model_types: ['NN', 'RF']
  use_distribution_loss: True
  use_adaptive_binning: True
  early_stopping_patience: 3
  relative_improvement_threshold: 0.02
  use_relative_convergence: True

Directories:
  Data: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data
  Permutations: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations
  Downloads: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/downloads
  Output: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/results/minimum_permutations_basic_2d


In [3]:
# Read the per-degree-pair edge frequencies and rename the 'frequency' column
# to 'empirical_frequency' in a single chained expression.
empirical_freq_df = (
    pd.read_csv('../results/edge_frequency_by_degree.csv')
    .rename(columns={'frequency': 'empirical_frequency'})
)
print(empirical_freq_df.head())

   source_degree  target_degree  empirical_frequency
0           7939             20             0.272037
1           7939             75             0.979326
2           7939             38             0.627020
3           7939             56             0.874961
4           7939             67             0.954649


In [4]:
# Locate the original network and the permuted networks on disk
print("Setting up data directories...")

# The original (unpermuted) network is read straight from the main data directory
original_data_dir = data_dir

# A permutation is any sub-directory of permutations_dir whose name ends in
# '.hetmat'; a missing permutations_dir simply yields an empty list.
available_permutations = (
    [
        entry
        for entry in permutations_dir.iterdir()
        if entry.is_dir() and entry.name.endswith('.hetmat')
    ]
    if permutations_dir.exists()
    else []
)

# Name-sorted so the ordering is the same on every run
permutations_dirs = sorted(available_permutations)

print(f"Original data directory: {original_data_dir}")
print(f"Permutations directory: {permutations_dir}")
print(f"Found {len(permutations_dirs)} permutation directories:")
# List at most the first five, then summarise the remainder
for position, perm_dir in enumerate(permutations_dirs[:5], start=1):
    print(f"  {position}. {perm_dir.name}")
remaining = len(permutations_dirs) - 5
if remaining > 0:
    print(f"  ... and {remaining} more")

# Compare what is on disk with what the experiment asks for
if len(permutations_dirs) >= max_permutations:
    print("✅ Sufficient permutations available for experiment")
else:
    print(f"⚠️  Warning: Only {len(permutations_dirs)} permutations available, but max_permutations = {max_permutations}")
    print("   Will reuse permutations if needed.")

# Confirm the original edge matrix for the configured edge type is present;
# when it is not, list the .npz files that do exist to help spot a typo.
edges_dir = original_data_dir / 'edges'
original_edge_file = edges_dir / f"{edge_type}.sparse.npz"
if not original_edge_file.exists():
    print(f"❌ Original edge data not found: {original_edge_file}")
    print("Available edge files:")
    if edges_dir.exists():
        for edge_file in edges_dir.iterdir():
            if edge_file.suffix == '.npz':
                print(f"  - {edge_file.name}")
else:
    print(f"✅ Original edge data found: {original_edge_file}")

print("\nDirectory setup complete!")

Setting up data directories...
Original data directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data
Permutations directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations
Found 2 permutation directories:
  1. 000.hetmat
  2. 001.hetmat
✅ Sufficient permutations available for experiment
✅ Original edge data found: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/edges/AeG.sparse.npz

Directory setup complete!


In [27]:
from scipy.stats import ttest_rel

# Combine permutations iteratively to create larger datasets.
# For each permutation: load it, build features/labels, append them to the
# running combined dataset, score the combined dataset with the network, and
# compare the mean prediction per (source_degree, target_degree) pair with the
# empirical frequencies via a paired t-test.
combined_data = None
combined_labels = None
combined_source_degrees = None
combined_target_degrees = None

for idx, perm_dir in enumerate(permutations_dirs[:max_permutations]):
    print(f"\n=== Combining permutation {idx+1} | Using permutation: {perm_dir.name} ===")
    # Load permutation data
    perm_data = load_permutation_data(
        permutation_name=perm_dir.name,
        permutations_dir=permutations_dir,
        edge_type=edge_type,
        source_node_type='Anatomy',  # adjust as needed
        target_node_type='Gene'      # adjust as needed
    )

    # Degrees are the row sums (source nodes) and column sums (target nodes)
    # of the edge matrix.
    source_degrees = np.array(perm_data['edges'].sum(axis=1)).flatten()
    target_degrees = np.array(perm_data['edges'].sum(axis=0)).flatten()
    perm_data['source_degrees'] = source_degrees
    perm_data['target_degrees'] = target_degrees

    # Prepare features and labels.
    # NOTE(review): the ratio is hard-coded to 1 here although the parameters
    # cell defines negative_sampling_ratio; confirm which value is intended.
    X, y = prepare_edge_prediction_data(perm_data, sample_negative_ratio=1)
    print(f"Shape of X: {X.shape}")

    # Append this permutation to the running totals. The degree arrays are now
    # accumulated under one consistent name; previously the first iteration
    # and later iterations wrote to different variables, so the accumulator
    # never grew beyond "first permutation + current permutation".
    if combined_data is None:
        combined_data = X
        combined_labels = y
        combined_source_degrees = source_degrees
        combined_target_degrees = target_degrees
    else:
        combined_data = np.vstack([combined_data, X])
        combined_labels = np.hstack([combined_labels, y])
        combined_source_degrees = np.hstack([combined_source_degrees, source_degrees])
        combined_target_degrees = np.hstack([combined_target_degrees, target_degrees])
    # Keep the names the earlier version of this cell bound, as aliases.
    source_degrees_combined = combined_source_degrees
    target_degrees_combined = combined_target_degrees

    print(f"Combined dataset size: {combined_data.shape[0]} samples")

    # Predict edge probabilities using EdgePredictionNN.
    # NOTE(review): the network is constructed and put straight into eval mode;
    # no training step appears in this cell, so the predictions below come
    # from freshly initialised weights. Confirm whether training is meant to
    # happen here before drawing conclusions from the t-test.
    model = EdgePredictionNN(
        input_dim=combined_data.shape[1],
        hidden_dims=[64, 32],  # adjust as needed
        dropout_rate=0.2      # adjust as needed
    )
    model.eval()
    with torch.no_grad():
        X_tensor = torch.tensor(combined_data, dtype=torch.float32)
        predictions = model(X_tensor).numpy()
    print(f"Predicted probabilities for {len(predictions)} edges.")

    # One row per sample. Assumes feature columns 0 and 1 hold the source and
    # target degree respectively -- TODO confirm against
    # prepare_edge_prediction_data.
    prediction_df = pd.DataFrame({
        'source_degree': combined_data[:, 0],
        'target_degree': combined_data[:, 1],
        'predicted_probability': predictions.flatten()
    })

    # Group by source and target degree and compute mean predicted probability
    predicted_means = prediction_df.groupby(['source_degree', 'target_degree'])['predicted_probability'].mean().reset_index()

    # Left-join onto the empirical table: degree pairs without any prediction
    # get NaN and are dropped before the test.
    comparison_df = empirical_freq_df.merge(predicted_means, on=['source_degree', 'target_degree'], how='left')

    # Print comparison
    print(comparison_df.shape)

    valid_comparison_df = comparison_df.dropna(subset=['predicted_probability'])
    # A paired t-test needs at least two pairs; with fewer the statistic is
    # NaN and would be reported below as "not statistically significant".
    if len(valid_comparison_df) < 2:
        print(f"Only {len(valid_comparison_df)} matched degree pairs; skipping paired t-test.")
        continue

    # Perform paired t-test between predicted probabilities and empirical frequencies
    t_stat, p_value = ttest_rel(valid_comparison_df['predicted_probability'], valid_comparison_df['empirical_frequency'])

    print(f"T-statistic: {t_stat}")
    print(f"P-value: {p_value}")

    # Interpret the result
    if p_value < 0.05:
        print("The difference between predicted probabilities and empirical frequencies is statistically significant.")
    else:
        print("The difference between predicted probabilities and empirical frequencies is not statistically significant.")



=== Combining permutation 1 | Using permutation: 000.hetmat ===
Loading data from permutation: 000.hetmat
Permutation path: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations/000.hetmat
Edge type: AeG (Anatomy -> Gene)
Loaded AeG edges: (402, 20945) matrix with 526407 non-zero entries
Loaded Anatomy nodes: 402 nodes
Anatomy columns: ['position', 'identifier', 'name']
Loaded Gene nodes: 20945 nodes
Gene columns: ['position', 'identifier', 'name']
Preparing AeG edge prediction data (Anatomy -> Gene)
Anatomy degree range: 0 - 15036
Gene degree range: 0 - 98
Number of positive examples (existing edges): 526407
Number of positive examples (existing edges): 526407
Number of negative examples (non-existing edges): 526407
Shape of X: (1052814, 2)
Combined dataset size: 1052814 samples
Predicted probabilities for 1052814 edges.
(13167, 4)
T-statistic: -78.3737010429412
P-value: 0.0
The difference between predic