# Edge Prediction for Heterogeneous Networks

## Single Permutation Analysis with Neural Networks and Baseline Models

This notebook performs comprehensive edge prediction analysis on a single permutation of a heterogeneous network. It trains and evaluates multiple machine learning models to predict the existence of edges between nodes based on network topology features.

In [5]:
# Import all required libraries and functions
import sys
import pathlib
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import sparse
import warnings
# Silence every warning category for the rest of the kernel session so that
# cell output stays compact.
# NOTE(review): this blanket filter also hides deprecation and numerical
# warnings raised by the libraries imported below — consider narrowing it to
# specific categories/modules once the noisy sources are known.
warnings.filterwarnings('ignore')

# Additional imports for neural network training
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

# Sklearn imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Other utility imports
import importlib
import time

# Put the repository's src/ directory at the front of the import search path
# so the project modules imported below resolve from this checkout.
# The notebook is expected to run from a directory one level below the repo root.
repo_dir = Path.cwd().parent
src_dir = repo_dir.joinpath('src')
sys.path.insert(0, str(src_dir))

# Import sampling functions
from sampling import (
    stratified_positive_sampling,
    representative_negative_sampling, 
    create_representative_dataset
)

# Import enhanced experiment functions  
from enhanced_experiments import (
    run_enhanced_experiment,
    analyze_enhanced_experiment_results,
    calculate_prediction_stability
)

# Import data processing functions.
# Reload the module FIRST and only then bind the function names:
# importlib.reload() re-executes the module in place but does not rebind names
# that were already copied out of it with `from ... import`, so importing
# before the reload would leave this notebook calling the stale, pre-edit
# function objects.
import data_processing
importlib.reload(data_processing)
from data_processing import load_permutation_data, prepare_edge_prediction_data

# Import models
from models import EdgePredictionNN

In [6]:
# Parameters for papermill
# Default parameter values - can be overridden by papermill
permutations_subdirectory = "permutations"  # Default: use 'permutations' (local generated)
permutation_name = None  # Specific permutation to process (e.g., "000", "001", etc.)
output_dir = "models"  # Directory to save trained models

# Edge and node type parameters for flexible relationship modeling
edge_type = "AeG"  # Edge type to model (e.g., "AeG", "CbG", "DaG", "GiG", etc.)
source_node_type = "Anatomy"  # Source node type (e.g., "Anatomy", "Compound", "Disease", "Gene")
target_node_type = "Gene"  # Target node type (e.g., "Gene", "Anatomy", "Disease", "Compound")

# Validation: every parameter listed here must be a string; entries flagged as
# optional may also be None. Checks run in the listed order and stop at the
# first failure.
# NOTE(review): if this is the papermill `parameters`-tagged cell, overrides are
# injected in a new cell placed after this one, so these checks would only ever
# see the defaults — confirm, and consider moving validation to a later cell.
_string_params = (
    # (parameter name, current value, None also accepted?)
    ("permutations_subdirectory", permutations_subdirectory, False),
    ("permutation_name", permutation_name, True),
    ("edge_type", edge_type, False),
    ("source_node_type", source_node_type, False),
    ("target_node_type", target_node_type, False),
)
for _name, _value, _none_allowed in _string_params:
    if _none_allowed and _value is None:
        continue
    if not isinstance(_value, str):
        _expected = "a string or None" if _none_allowed else "a string"
        raise ValueError(f"{_name} must be {_expected}, got: {_value}")

# Echo the effective configuration
print(f"Using permutations subdirectory: {permutations_subdirectory}")
print(f"Edge type: {edge_type} ({source_node_type} -> {target_node_type})")
print(
    f"Processing single permutation: {permutation_name}"
    if permutation_name
    else "No specific permutation specified - will use first available"
)

Using permutations subdirectory: permutations
Edge type: AeG (Anatomy -> Gene)
No specific permutation specified - will use first available


In [7]:
# Set up paths for data access using parameterized directory.
# The notebook is expected to run from a directory one level below the repo root.
repo_dir = pathlib.Path.cwd().parent
data_dir = repo_dir / "data"
output_models_dir = repo_dir / output_dir

# Resolve the permutations directory from the parameter:
#   - a value containing "/" or the literal "permutations" is taken relative to data/
#   - any other value names a subdirectory of data/permutations/
if "/" in permutations_subdirectory or permutations_subdirectory == "permutations":
    permutations_dir = data_dir / permutations_subdirectory
else:
    permutations_dir = data_dir / "permutations" / permutations_subdirectory

print(f"Repository directory: {repo_dir}")
print(f"Data directory: {data_dir}")
print(f"Permutations directory: {permutations_dir}")
print(f"Models output directory: {output_models_dir}")

# Create output directory if it doesn't exist; parents=True lets a nested
# output_dir (e.g. "results/models") work on a fresh checkout.
output_models_dir.mkdir(parents=True, exist_ok=True)

# List available permutations
if not permutations_dir.is_dir():
    raise ValueError(f"Permutations directory not found: {permutations_dir}")

# iterdir() yields entries in arbitrary, filesystem-dependent order; sort so
# that "first available" is the same permutation on every run and machine.
available_permutations = sorted(p.name for p in permutations_dir.iterdir() if p.is_dir())
print(f"Available permutations: {len(available_permutations)} total")

# Select specific permutation or first available
if permutation_name:
    # Accept the exact directory name, or the bare id (e.g. "001") for a
    # directory stored with the ".hetmat" suffix (e.g. "001.hetmat").
    _candidates = (permutation_name, f"{permutation_name}.hetmat")
    selected_permutation = next(
        (name for name in _candidates if name in available_permutations), None
    )
    if selected_permutation is None:
        raise ValueError(f"Permutation '{permutation_name}' not found. Available: {available_permutations[:5]}...")
    print(f"Selected permutation: {selected_permutation}")
elif available_permutations:
    selected_permutation = available_permutations[0]
    print(f"Using first available permutation: {selected_permutation}")
else:
    raise ValueError("No permutations found!")

Repository directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability
Data directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data
Permutations directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations
Models output directory: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/models
Available permutations: 2 total
Using first available permutation: 001.hetmat


In [8]:
# Load the edge matrix and node tables for the permutation chosen above
print(f"Loading permutation: {selected_permutation}")
print(f"Edge type: {edge_type} ({source_node_type} -> {target_node_type})")

perm_data = load_permutation_data(
    selected_permutation,
    permutations_dir,
    edge_type=edge_type,
    source_node_type=source_node_type,
    target_node_type=target_node_type,
)

# A falsy result (None / empty container) is treated as a failed load
if not perm_data:
    raise ValueError(f"Failed to load permutation data for: {selected_permutation}")

print(f"Successfully loaded permutation: {selected_permutation}")

# Pull out the components stored under the parameterized keys
edges, source_nodes, target_nodes = (
    perm_data[key] for key in ("edges", "source_nodes", "target_nodes")
)

# Legacy AeG-specific aliases kept for backwards compatibility
# (per the original notes these mirror edges / source_nodes / target_nodes;
# that equivalence is decided inside load_permutation_data — TODO confirm)
aeg_edges, anatomy_nodes, gene_nodes = (
    perm_data[key] for key in ("aeg_edges", "anatomy_nodes", "gene_nodes")
)

# Summary: matrix shape, stored-entry count, node counts, and the fraction of
# matrix cells that hold a stored entry
_total_cells = edges.shape[0] * edges.shape[1]
for _line in (
    f"Permutation {selected_permutation} data summary:",
    f"  {edge_type} edges matrix shape: {edges.shape}",
    f"  Number of edges: {edges.nnz}",
    f"  {source_node_type} nodes: {len(source_nodes)}",
    f"  {target_node_type} nodes: {len(target_nodes)}",
    f"  Matrix density: {edges.nnz / _total_cells:.6f}",
):
    print(_line)

Loading permutation: 001.hetmat
Edge type: AeG (Anatomy -> Gene)
Loading data from permutation: 001.hetmat
Permutation path: /Users/lucas/Library/CloudStorage/OneDrive-TheUniversityofColoradoDenver/Repositories/Context-Aware-Path-Probability/data/permutations/001.hetmat
Edge type: AeG (Anatomy -> Gene)
Loaded AeG edges: (402, 20945) matrix with 526407 non-zero entries
Loaded Anatomy nodes: 402 nodes
Anatomy columns: ['position', 'identifier', 'name']
Loaded Gene nodes: 20945 nodes
Gene columns: ['position', 'identifier', 'name']
Successfully loaded permutation: 001.hetmat
Permutation 001.hetmat data summary:
  AeG edges matrix shape: (402, 20945)
  Number of edges: 526407
  Anatomy nodes: 402
  Gene nodes: 20945
  Matrix density: 0.062519


In [9]:
# Prepare degrees dictionary for run_enhanced_experiment.
# Row sums give one value per source node, column sums one per target node.
# np.asarray(...).ravel() flattens the result whether `edges` is a SciPy sparse
# *matrix* (sum returns np.matrix) or a sparse *array* (sum returns a plain
# ndarray, which has no `.A1` attribute).
edges_coo = edges.tocoo()
source_degrees = np.asarray(edges_coo.sum(axis=1)).ravel()
target_degrees = np.asarray(edges_coo.sum(axis=0)).ravel()
degrees_dict = {'source': source_degrees, 'target': target_degrees}

# Use all available positive edges (or as many as possible)
# NOTE(review): the saved output of this cell shows the degree-matched negative
# sampler producing far fewer negatives than requested at this size before the
# run was interrupted — consider a smaller sample_size for routine runs.
sample_size = edges_coo.nnz
run_id = 0

# Seed the global RNGs so the sampling is repeatable across kernel restarts;
# offsetting by run_id keeps distinct runs on distinct streams.
# Assumes the sampling/training code draws from numpy's / torch's global
# generators — TODO confirm against src/sampling.py and src/enhanced_experiments.py.
random_seed = 42 + run_id
np.random.seed(random_seed)
torch.manual_seed(random_seed)

# Run enhanced experiment
results = run_enhanced_experiment(
    sample_size=sample_size,
    run_id=run_id,
    edges=edges,
    degrees_dict=degrees_dict,
    verbose=True
)


  Enhanced Experiment Run 1: Sample size 526407

Creating representative dataset: 526407 positive + 526407 negative edges
Positive method: stratified, Negative method: degree_matched
Stratified sampling: Found 25 degree-based strata
Sampled 526407 positive edges using stratified sampling

Generating 526407 negative edges using degree_matched method...
Positive edge degree stats - Source: 8702.6±3478.9
Positive edge degree stats - Target: 41.8±17.1
Using fast batch sampling approach...
Generated 34133 negative edges (success rate: 0.341)

Dataset created successfully:
  Total samples: 560540
  Positive: 526407, Negative: 34133
  Feature correlation: -0.274


KeyboardInterrupt: 