# Minimum Permutations Analysis with Unique Edge Sampling

This notebook uses a progressive training approach to determine the minimum number of permutations needed to learn edge probability distributions effectively. It samples only unique edges across permutations, which eliminates data leakage and maximizes the benefit of each additional permutation.

In [None]:
import sys
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# The repository root is taken to be the parent of the current working
# directory; its src/ folder is appended to sys.path so the helper modules
# below can be imported.
repo_dir = Path.cwd().parent
src_dir = repo_dir / 'src'
sys.path.append(str(src_dir))

# Helper code that used to live in this notebook and now sits under src/.
# These imports must stay below the sys.path.append call above.
from data_processing_helpers import (
    load_permutation_data,
    get_available_permutations,
    extract_improved_edge_features_and_labels,
)
from optimized_model import DistributionAwareNN, OptimizedModelTrainer
from unique_sampling import UniqueEdgeSampler
from validation_utils import (
    compute_adaptive_degree_based_probability_distribution,
    compute_enhanced_distribution_difference,
)

# Reaching this point means every import above resolved.
print("All imports successful!")
print(f"Repository directory: {repo_dir}")

All imports successful!
Repository directory: /projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability
PyTorch available: 2.6.0
CUDA available: False


In [34]:
# Configuration - OPTION 1: BASIC 2D MODEL (Source + Target Degrees Only)
#
# This cell defines the experiment settings (CONFIG), seeds the NumPy and
# PyTorch random number generators, derives the data/output directories from
# `repo_dir`, creates the output directory, and prints a summary.
#
# Requires `np` and `torch` from the imports cell and `repo_dir` from the
# path-setup code; `torch` must be imported or the seeding call below raises
# NameError on a fresh kernel.
CONFIG = {
    'edge_type': 'CtD',  # Compound-treats-Disease
    'max_permutations': 20,  # Upper bound on permutations used for training
    'validation_networks': 3,  # Number of held-out networks for validation
    # Original note: "based on achieved performance (~0.55 MAE)".
    # NOTE(review): confirm how the 0.2 threshold relates to that figure.
    'convergence_threshold': 0.2,
    'n_bins': 8,  # Number of bins (original note: granularity vs. per-bin statistics)
    # NOTE(review): confirm whether 0.5 means negatives-per-positive or the
    # negative fraction of the sample; the sampler is defined outside this cell.
    'negative_sampling_ratio': 0.5,
    'random_seed': 42,  # Seed applied to NumPy and PyTorch below
    'model_types': ['NN', 'RF'],  # Models to train
    'use_normalized_features': False,  # Basic 2D model: source and target degrees only
    'use_regression_approach': True,
    'use_distribution_loss': True,  # Direct distribution-based training
    'use_adaptive_binning': True,   # Adaptive binning based on data density
    'use_ensemble_methods': True,   # Ensemble of specialized models
    'early_stopping_patience': 3,   # Stop early if no improvement
    'relative_improvement_threshold': 0.02,  # 2% improvement required
    'use_relative_convergence': True  # Use relative improvement criteria
}

# Set random seeds for reproducibility (NumPy global RNG and PyTorch)
np.random.seed(CONFIG['random_seed'])
torch.manual_seed(CONFIG['random_seed'])

# Directory setup (all paths are derived from the repository root)
data_dir = repo_dir / 'data'
permutations_dir = data_dir / 'permutations'
downloads_dir = data_dir / 'downloads'
models_dir = repo_dir / 'models'
output_dir = repo_dir / 'results' / 'minimum_permutations_basic_2d'

# Create output directory (no error if it already exists)
output_dir.mkdir(parents=True, exist_ok=True)

# Echo every configuration entry so the settings are recorded in the cell output
print("Configuration - BASIC 2D MODEL (Source + Target Degrees Only):")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")
print("\n🎯 BASIC 2D MODEL CONFIGURATION:")
print("  - FEATURES: Only source_degree + target_degree (2 dimensions)")
print("  - NO log transforms, degree products, sums, or ratios")
print("  - SIMPLIFIED model to study pure degree effects")
print(f"  - convergence_threshold: {CONFIG['convergence_threshold']}")
print(f"  - negative_sampling_ratio: {CONFIG['negative_sampling_ratio']}")
print(f"  - model_types: {CONFIG['model_types']}")
print("\n🚀 EXPECTED OUTCOME:")
print("  - Focus: Pure effects of source and target node degrees")
print("  - Feature interpretability: Direct degree relationships")
print("  - Simplified neural network architecture for 2D input")
print("  - Clear understanding of degree-based edge prediction")
print("\nDirectories:")
print(f"  Data: {data_dir}")
print(f"  Permutations: {permutations_dir}")
print(f"  Downloads: {downloads_dir}")
print(f"  Output: {output_dir}")

Configuration - BASIC 2D MODEL (Source + Target Degrees Only):
  edge_type: CtD
  max_permutations: 20
  validation_networks: 3
  convergence_threshold: 0.2
  n_bins: 8
  negative_sampling_ratio: 0.5
  random_seed: 42
  model_types: ['NN', 'RF']
  use_normalized_features: False
  use_regression_approach: True
  use_distribution_loss: True
  use_adaptive_binning: True
  use_ensemble_methods: True
  early_stopping_patience: 3
  relative_improvement_threshold: 0.02
  use_relative_convergence: True

🎯 BASIC 2D MODEL CONFIGURATION:
  - FEATURES: Only source_degree + target_degree (2 dimensions)
  - NO log transforms, degree products, sums, or ratios
  - SIMPLIFIED model to study pure degree effects
  - convergence_threshold: 0.2
  - negative_sampling_ratio: 0.5
  - model_types: ['NN', 'RF']

🚀 EXPECTED OUTCOME:
  - Focus: Pure effects of source and target node degrees
  - Feature interpretability: Direct degree relationships
  - Simplified neural network architecture for 2D input
  - Clear 

In [35]:
# Setup data directories and paths
# Discovers the permutation networks on disk, reports how many were found,
# and checks that the original edge matrix for the configured edge type exists.
print("Setting up data directories...")

# The original hetionet data sits directly in the main data directory.
original_data_dir = data_dir

# Every sub-directory ending in `.hetmat` counts as one permutation; the list
# stays empty when the permutations folder does not exist.
available_permutations = (
    [
        candidate
        for candidate in permutations_dir.iterdir()
        if candidate.is_dir() and candidate.name.endswith('.hetmat')
    ]
    if permutations_dir.exists()
    else []
)

# Work on a sorted copy so the ordering is the same on every run.
permutations_dirs = list(available_permutations)
permutations_dirs.sort()
n_found = len(permutations_dirs)

print(f"Original data directory: {original_data_dir}")
print(f"Permutations directory: {permutations_dir}")
print(f"Found {n_found} permutation directories:")
# Preview only the first five entries and summarise the remainder.
for i, perm_dir in enumerate(permutations_dirs[:5]):
    print(f"  {i+1}. {perm_dir.name}")
if n_found > 5:
    print(f"  ... and {n_found - 5} more")

# Compare the number of permutations found with what the experiment asks for.
if n_found >= CONFIG['max_permutations']:
    print("✅ Sufficient permutations available for experiment")
else:
    print(f"⚠️  Warning: Only {n_found} permutations available, but max_permutations = {CONFIG['max_permutations']}")
    print("   Will reuse permutations if needed.")

# Look for the original edge matrix; when it is missing, list the .npz files
# that are present in the edges folder.
edges_dir = original_data_dir / 'edges'
original_edge_file = edges_dir / f"{CONFIG['edge_type']}.sparse.npz"
if not original_edge_file.exists():
    print(f"❌ Original edge data not found: {original_edge_file}")
    print("Available edge files:")
    if edges_dir.exists():
        for edge_file in edges_dir.iterdir():
            if edge_file.suffix != '.npz':
                continue
            print(f"  - {edge_file.name}")
else:
    print(f"✅ Original edge data found: {original_edge_file}")

print("\nDirectory setup complete!")

Setting up data directories...
Original data directory: /projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability/data
Permutations directory: /projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability/data/permutations
Found 51 permutation directories:
  1. 000.hetmat
  2. 001.hetmat
  3. 002.hetmat
  4. 003.hetmat
  5. 004.hetmat
  ... and 46 more
✅ Sufficient permutations available for experiment
✅ Original edge data found: /projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability/data/edges/CtD.sparse.npz

Directory setup complete!


In [None]:
# Functions moved to src/data_processing_helpers.py
# See: load_permutation_data, get_available_permutations, extract_improved_edge_features_and_labels

Testing improved data loading...
Available permutations: ['000.hetmat', '001.hetmat', '002.hetmat', '003.hetmat', '004.hetmat', '005.hetmat', '006.hetmat', '007.hetmat', '008.hetmat', '009.hetmat', '010.hetmat', '011.hetmat', '012.hetmat', '013.hetmat', '014.hetmat', '015.hetmat', '016.hetmat', '017.hetmat', '018.hetmat', '019.hetmat', '020.hetmat', '021.hetmat', '022.hetmat', '023.hetmat', '024.hetmat', '025.hetmat', '026.hetmat', '027.hetmat', '028.hetmat', '029.hetmat', '030.hetmat', '031.hetmat', '032.hetmat', '033.hetmat', '034.hetmat', '035.hetmat', '036.hetmat', '037.hetmat', '038.hetmat', '039.hetmat', '040.hetmat', '041.hetmat', '042.hetmat', '043.hetmat', '044.hetmat', '045.hetmat', '046.hetmat', '047.hetmat', '048.hetmat', '049.hetmat', '050.hetmat']

Test permutation: 000.hetmat
Edge matrix shape: (1552, 137)
Number of edges: 755
Edge density: 0.003551
Source node degree range: 0 - 19
Target node degree range: 0 - 68

Improved Features:
  Features shape: (1132, 2)
  Targets

In [None]:
# Classes moved to src/optimized_model.py
# See: DistributionAwareNN, OptimizedModelTrainer

In [None]:
# Functions moved to src/validation_utils.py
# See: compute_adaptive_degree_based_probability_distribution, compute_enhanced_distribution_difference

Setting up optimized validation framework...
Loaded validation network: 151.hetmat
Loaded validation network: 015.hetmat
Loaded validation network: 098.hetmat
Loaded 3 validation networks


In [None]:
# Class moved to src/unique_sampling.py
# See: UniqueEdgeSampler

Testing improved Unique Edge Sampling approach...
\nTesting unique sampling with permutations: [PosixPath('/projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability/data/permutations/014.hetmat'), PosixPath('/projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability/data/permutations/044.hetmat')]
Sampling unique edges from 2 permutations...
  Loading /projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability/data/permutations/014.hetmat (1/2)...
    Found 755 positive edges, 755 are unique
  Loading /projects/lgillenwater@xsede.org/repositories/Context-Aware-Path-Probability/data/permutations/044.hetmat (2/2)...
    Found 755 positive edges, 654 are unique
\nUnique positive edges summary:
  Total positive edges attempted: 1,510
  Unique positive edges found: 1,409
  Positive deduplication rate: 6.7%
\nBuilding features for 1409 unique positive edges...
Generating 1409 negative edges...
  Generated 1409 negative edges
\nFinal datas