In [None]:
# Standard library imports
import sys
import json
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Tuple, Any, Optional
import warnings
warnings.filterwarnings('ignore')

# Scientific computing
import scipy.sparse as sp
from scipy.stats import wasserstein_distance, ks_2samp
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.neural_network import MLPClassifier

# PyTorch for neural networks
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Add src to path
# NOTE(review): assumes the kernel's working directory sits one level below the
# repository root (e.g. <repo>/notebooks); Path.cwd() depends on how Jupyter was launched.
repo_dir = Path.cwd().parent
sys.path.append(str(repo_dir / 'src'))

# Import custom modules
# These imports resolve only because <repo>/src was appended to sys.path above,
# so this cell must run before any cell that uses them.
from models import EdgePredictionNN
from data_processing import prepare_edge_prediction_data
from training import train_edge_prediction_model
from sampling import negative_sampling

# Status report so a reader can confirm the environment before running the experiment
print("All imports successful!")
print(f"Repository directory: {repo_dir}")
# The value printed here is the installed PyTorch version string
print(f"PyTorch available: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

In [None]:
# Experiment configuration: every tunable value of the notebook lives in this dict
CONFIG = dict(
    edge_type='CtD',               # Compound-treats-Disease
    max_permutations=10,
    validation_networks=3,         # Number of held-out networks for validation
    convergence_threshold=0.05,    # Threshold for distribution difference
    n_bins=20,                     # Number of bins for degree-based probability distribution
    negative_sampling_ratio=1.0,   # Ratio of negative to positive edges
    random_seed=42,
    models=['NN', 'LR', 'PLR', 'RF'],  # Neural Net, Logistic Reg, Penalized LR, Random Forest
)

# Seed both NumPy and PyTorch so sampling and NN initialisation are reproducible
np.random.seed(CONFIG['random_seed'])
torch.manual_seed(CONFIG['random_seed'])

# Directory layout, all relative to the repository root
data_dir = repo_dir.joinpath('data')
permutations_dir = data_dir.joinpath('permutations')
downloads_dir = data_dir.joinpath('downloads')
models_dir = repo_dir.joinpath('models')
output_dir = repo_dir.joinpath('results', 'minimum_permutations')

# Results are written below output_dir, so make sure it exists
output_dir.mkdir(parents=True, exist_ok=True)

# Echo the configuration and the resolved directories
print("Configuration:")
for key, value in CONFIG.items():
    print(f"  {key}: {value}")
print("\nDirectories:")
print(f"  Data: {data_dir}")
print(f"  Permutations: {permutations_dir}")
print(f"  Downloads: {downloads_dir}")
print(f"  Output: {output_dir}")

In [None]:
def load_permutation_data(perm_dir: Path, edge_type: str) -> Tuple[sp.csr_matrix, np.ndarray, np.ndarray]:
    """
    Read one permutation's edge matrix and derive the node degrees from it.

    Parameters:
    -----------
    perm_dir : Path
        Permutation directory (e.g., data/permutations/000.hetmat/); the matrix is
        read from ``<perm_dir>/edges/<edge_type>.sparse.npz``
    edge_type : str
        Edge type abbreviation used in the file name (e.g., 'CtD')

    Returns:
    --------
    edge_matrix : scipy.sparse.csr_matrix
        Boolean CSR matrix of edges
    source_degrees : np.ndarray
        1-D array of row sums (one entry per source node)
    target_degrees : np.ndarray
        1-D array of column sums (one entry per target node)

    Raises:
    -------
    FileNotFoundError
        If the edge file does not exist
    """
    edge_file = perm_dir.joinpath('edges', f'{edge_type}.sparse.npz')

    if edge_file.exists():
        # Cast to bool so every stored entry counts as exactly one edge
        stored = sp.load_npz(edge_file)
        edge_matrix = stored.astype(bool).tocsr()

        # Row sums give source degrees, column sums give target degrees
        per_row = edge_matrix.sum(axis=1)
        per_col = edge_matrix.sum(axis=0)
        return edge_matrix, np.array(per_row).flatten(), np.array(per_col).flatten()

    raise FileNotFoundError(f"Edge file not found: {edge_file}")


def get_available_permutations(permutations_dir: Path) -> List[str]:
    """Return the sorted names of all ``*.hetmat`` sub-directories of ``permutations_dir``."""
    return sorted(
        entry.name
        for entry in permutations_dir.iterdir()
        if entry.is_dir() and entry.name.endswith('.hetmat')
    )


def extract_edge_features_and_labels(edge_matrix: sp.csr_matrix, 
                                   source_degrees: np.ndarray, 
                                   target_degrees: np.ndarray,
                                   negative_ratio: float = 1.0) -> Tuple[np.ndarray, np.ndarray]:
    """
    Extract features (source/target degrees) and labels for edge prediction.

    Positive samples are all non-zero entries of ``edge_matrix`` (in the order
    returned by ``edge_matrix.nonzero()``). Negative samples are distinct
    (source, target) pairs without an edge, drawn uniformly with the global
    NumPy RNG (``np.random``), so seed it for reproducible output.

    Parameters:
    -----------
    edge_matrix : scipy.sparse.csr_matrix
        Sparse matrix of edges
    source_degrees : np.ndarray
        Degrees of source nodes (indexed by row)
    target_degrees : np.ndarray
        Degrees of target nodes (indexed by column)
    negative_ratio : float
        Ratio of negative to positive edges to generate

    Returns:
    --------
    features : np.ndarray
        Float matrix with shape (n_samples, 2) holding source and target degrees;
        positives first, negatives after
    labels : np.ndarray
        Float labels (1 for edge exists, 0 for no edge)

    Notes:
    ------
    Fewer negatives than requested are returned when the matrix does not contain
    enough non-edges, or when the sampling budget of ``10 * n_neg`` random draws
    is exhausted.
    """
    source_degrees = np.asarray(source_degrees)
    target_degrees = np.asarray(target_degrees)
    n_source, n_target = edge_matrix.shape

    # Positive edges; int64 so the flat pair id below cannot overflow
    pos_rows, pos_cols = edge_matrix.nonzero()
    pos_rows = np.asarray(pos_rows, dtype=np.int64)
    pos_cols = np.asarray(pos_cols, dtype=np.int64)
    n_pos = len(pos_rows)

    # Never ask for more distinct negatives than there are non-edges
    n_neg = int(n_pos * negative_ratio)
    n_neg = max(0, min(n_neg, n_source * n_target - n_pos))

    # Encode each pair as row * n_target + col for O(1) membership tests
    positive_ids = set((pos_rows * n_target + pos_cols).tolist())
    negative_ids = []
    taken = set()
    attempts_left = n_neg * 10  # Avoid infinite loop

    # Rejection sampling in batches: drop existing edges and repeated pairs
    while len(negative_ids) < n_neg and attempts_left > 0:
        batch = min(attempts_left, max(2 * (n_neg - len(negative_ids)), 32))
        attempts_left -= batch
        cand_rows = np.random.randint(0, n_source, size=batch).astype(np.int64)
        cand_cols = np.random.randint(0, n_target, size=batch).astype(np.int64)
        for pair_id in (cand_rows * n_target + cand_cols).tolist():
            if pair_id in positive_ids or pair_id in taken:
                continue
            taken.add(pair_id)
            negative_ids.append(pair_id)
            if len(negative_ids) == n_neg:
                break

    # Decode flat ids back to (row, col); max(..., 1) only guards the empty-matrix case
    neg_rows, neg_cols = np.divmod(np.asarray(negative_ids, dtype=np.int64), max(n_target, 1))

    # Positives first, negatives after, so the first n_pos labels are 1
    rows = np.concatenate([pos_rows, neg_rows])
    cols = np.concatenate([pos_cols, neg_cols])

    features = np.empty((len(rows), 2), dtype=float)
    features[:, 0] = source_degrees[rows]
    features[:, 1] = target_degrees[cols]

    labels = np.zeros(len(rows))
    labels[:n_pos] = 1

    return features, labels


# Smoke-test data loading with the first available permutation.
# The variables defined here (edge_matrix, source_degrees, target_degrees)
# are reused by the model-training smoke test in a later cell.
print("Testing data loading...")
available_perms = get_available_permutations(permutations_dir)
print(f"Available permutations: {available_perms}")

if available_perms:
    test_perm_dir = permutations_dir / available_perms[0]
    edge_matrix, source_degrees, target_degrees = load_permutation_data(test_perm_dir, CONFIG['edge_type'])
    
    # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
    print(f"\nTest permutation: {available_perms[0]}")
    print(f"Edge matrix shape: {edge_matrix.shape}")
    print(f"Number of edges: {edge_matrix.nnz}")
    print(f"Source node degree range: {source_degrees.min():.0f} - {source_degrees.max():.0f}")
    print(f"Target node degree range: {target_degrees.min():.0f} - {target_degrees.max():.0f}")
    
    # Test feature extraction with a small negative ratio to keep the check fast
    features, labels = extract_edge_features_and_labels(edge_matrix, source_degrees, target_degrees, 0.1)
    print(f"\nFeatures shape: {features.shape}")
    print(f"Labels shape: {labels.shape}")
    print(f"Positive samples: {labels.sum():.0f}, Negative samples: {(1-labels).sum():.0f}")
else:
    print("No permutations found!")

In [None]:
class ModelTrainer:
    """
    Unified interface for training different model types.

    Supported ``model_type`` values: 'NN' (PyTorch network), 'LR' (logistic
    regression), 'PLR' (L1-penalized logistic regression) and 'RF' (random forest).
    After ``train`` the fitted model and the feature scaler are kept on the
    instance so ``predict_probabilities`` can be applied to raw features.
    """
    
    def __init__(self, model_type: str, random_seed: int = 42):
        self.model_type = model_type
        self.random_seed = random_seed
        self.model = None   # set by train()
        self.scaler = None  # StandardScaler fitted on the training split, set by train()
        
    def train(self, features: np.ndarray, labels: np.ndarray, test_size: float = 0.2) -> Dict[str, Any]:
        """
        Train the specified model type.

        The data is split into stratified train/test parts, standardised with a
        scaler fitted on the training part only, and passed to the model-specific
        training routine.

        Returns:
        --------
        results : dict
            Dictionary containing model, scaler, and performance metrics

        Raises:
        -------
        ValueError
            If ``model_type`` is not one of the supported types
        """
        train_fns = {
            'NN': self._train_neural_network,
            'LR': self._train_logistic_regression,
            'PLR': self._train_penalized_logistic_regression,
            'RF': self._train_random_forest,
        }
        # Fail fast: reject an unknown model type before doing any work
        if self.model_type not in train_fns:
            raise ValueError(f"Unknown model type: {self.model_type}")
        
        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            features, labels, test_size=test_size, random_state=self.random_seed, stratify=labels
        )
        
        # Scale features (fit on the training split only to avoid test leakage)
        self.scaler = StandardScaler()
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)
        
        # Train model based on type
        self.model, train_metrics = train_fns[self.model_type](
            X_train_scaled, y_train, X_test_scaled, y_test
        )
        
        return {
            'model': self.model,
            'scaler': self.scaler,
            'metrics': train_metrics,
            'model_type': self.model_type
        }
    
    @staticmethod
    def _compute_metrics(y_train, train_pred, y_test, test_pred) -> Dict[str, float]:
        """Compute ROC-AUC and average precision on the train and test splits."""
        return {
            'train_auc': roc_auc_score(y_train, train_pred),
            'test_auc': roc_auc_score(y_test, test_pred),
            'train_ap': average_precision_score(y_train, train_pred),
            'test_ap': average_precision_score(y_test, test_pred)
        }
    
    def _fit_and_score(self, model, X_train, y_train, X_test, y_test):
        """Fit a scikit-learn classifier and score its positive-class probabilities."""
        model.fit(X_train, y_train)
        
        train_pred = model.predict_proba(X_train)[:, 1]
        test_pred = model.predict_proba(X_test)[:, 1]
        
        return model, self._compute_metrics(y_train, train_pred, y_test, test_pred)
    
    def _train_neural_network(self, X_train, y_train, X_test, y_test):
        """Train PyTorch neural network."""
        # Convert to tensors
        X_train_tensor = torch.FloatTensor(X_train)
        y_train_tensor = torch.FloatTensor(y_train)
        X_test_tensor = torch.FloatTensor(X_test)
        
        # Initialize model
        model = EdgePredictionNN(input_dim=2, hidden_dims=[64, 32], dropout_rate=0.2)
        criterion = nn.BCELoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        
        # Training parameters
        epochs = 100
        # At least 1: DataLoader rejects batch_size=0, which len // 4 gives for < 4 rows
        batch_size = max(1, min(1024, len(X_train) // 4))
        
        # Create data loader
        train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        
        # Training loop
        model.train()
        for _ in range(epochs):
            for batch_X, batch_y in train_loader:
                optimizer.zero_grad()
                # NOTE(review): EdgePredictionNN's output shape is not visible here;
                # flattening makes (N,) and (N, 1) outputs both match the (N,) targets.
                outputs = model(batch_X).reshape(-1)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
        
        # Evaluation
        model.eval()
        with torch.no_grad():
            train_pred = model(X_train_tensor).numpy().reshape(-1)
            test_pred = model(X_test_tensor).numpy().reshape(-1)
        
        return model, self._compute_metrics(y_train, train_pred, y_test, test_pred)
    
    def _train_logistic_regression(self, X_train, y_train, X_test, y_test):
        """Train logistic regression."""
        model = LogisticRegression(random_state=self.random_seed, max_iter=1000)
        return self._fit_and_score(model, X_train, y_train, X_test, y_test)
    
    def _train_penalized_logistic_regression(self, X_train, y_train, X_test, y_test):
        """Train L1-penalized logistic regression."""
        model = LogisticRegression(penalty='l1', solver='liblinear', random_state=self.random_seed, max_iter=1000)
        return self._fit_and_score(model, X_train, y_train, X_test, y_test)
    
    def _train_random_forest(self, X_train, y_train, X_test, y_test):
        """Train random forest."""
        model = RandomForestClassifier(n_estimators=100, random_state=self.random_seed, n_jobs=-1)
        return self._fit_and_score(model, X_train, y_train, X_test, y_test)
    
    def predict_probabilities(self, features: np.ndarray) -> np.ndarray:
        """
        Predict edge probabilities for given features.

        ``features`` are raw (unscaled) values; the scaler fitted in ``train`` is
        applied first. Returns a 1-D array with one probability per row.

        Raises:
        -------
        ValueError
            If the model has not been trained yet
        """
        if self.scaler is None or self.model is None:
            raise ValueError("Model must be trained first")
        
        features_scaled = self.scaler.transform(features)
        
        if self.model_type == 'NN':
            self.model.eval()
            with torch.no_grad():
                features_tensor = torch.FloatTensor(features_scaled)
                # Flatten so the result is 1-D even if the network returns (N, 1)
                predictions = self.model(features_tensor).numpy().reshape(-1)
        else:
            predictions = self.model.predict_proba(features_scaled)[:, 1]
        
        return predictions


# Smoke-test the training pipeline.
# Relies on edge_matrix / source_degrees / target_degrees defined by the
# data-loading smoke test in an earlier cell.
print("Testing model training...")
if available_perms:
    # Use small sample for testing
    test_features, test_labels = extract_edge_features_and_labels(
        edge_matrix, source_degrees, target_degrees, 0.1
    )
    
    for model_type in ['LR', 'RF']:  # Test faster models first
        # "\n" (not "\\n") so a blank line is printed instead of a literal backslash-n
        print(f"\nTesting {model_type}...")
        trainer = ModelTrainer(model_type, CONFIG['random_seed'])
        results = trainer.train(test_features, test_labels)
        print(f"  Test AUC: {results['metrics']['test_auc']:.3f}")
        print(f"  Test AP: {results['metrics']['test_ap']:.3f}")
    
    print("\nModel training pipeline ready!")

In [None]:
def _degree_bin_edges(degrees: np.ndarray, n_bins: int) -> np.ndarray:
    """Return n_bins + 1 bin edges: percentiles of the positive degrees, else equal-width."""
    if degrees.size == 0:
        return np.zeros(n_bins + 1)
    positive = degrees[degrees > 0]
    if positive.size > 0:
        edges = np.unique(np.percentile(positive, np.linspace(0, 100, n_bins + 1)))
        # Percentile edges are only usable when none of them coincide
        if len(edges) >= n_bins + 1:
            return edges
    return np.linspace(degrees.min(), degrees.max(), n_bins + 1)


def compute_degree_based_probability_distribution(edge_matrix: sp.csr_matrix, 
                                               source_degrees: np.ndarray, 
                                               target_degrees: np.ndarray,
                                               n_bins: int = 20) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Compute observed edge probability distribution based on source and target degree bins.

    Every (source, target) node pair is assigned to a (source degree bin,
    target degree bin) cell; the probability of a cell is the number of pairs
    in it that are connected divided by the number of pairs in it. Degrees
    outside the bin range are clamped into the first/last bin. Cells without
    any pair get probability 0.

    Bin edges are percentiles of the positive degrees. If those percentiles are
    not all distinct (or no degree is positive), ``n_bins`` equal-width bins
    between the minimum and maximum degree are used instead.

    Parameters:
    -----------
    edge_matrix : scipy.sparse.csr_matrix
        Sparse matrix of edges
    source_degrees : np.ndarray
        Degrees of source nodes (one per matrix row)
    target_degrees : np.ndarray
        Degrees of target nodes (one per matrix column)
    n_bins : int
        Number of degree bins per axis

    Returns:
    --------
    prob_matrix : np.ndarray
        Probability matrix (n_bins x n_bins) where prob_matrix[i,j] is the probability
        of an edge between source degree bin i and target degree bin j
    source_bin_edges : np.ndarray
        Bin edges for source degrees
    target_bin_edges : np.ndarray
        Bin edges for target degrees
    """
    source_degrees = np.asarray(source_degrees)
    target_degrees = np.asarray(target_degrees)

    # Create degree bins
    source_bin_edges = _degree_bin_edges(source_degrees, n_bins)
    target_bin_edges = _degree_bin_edges(target_degrees, n_bins)
    n_source_bins = len(source_bin_edges) - 1
    n_target_bins = len(target_bin_edges) - 1

    # Bin every node once (instead of once per node pair) and clamp into range
    source_bins = np.clip(np.digitize(source_degrees, source_bin_edges) - 1, 0, n_source_bins - 1)
    target_bins = np.clip(np.digitize(target_degrees, target_bin_edges) - 1, 0, n_target_bins - 1)

    # Number of node pairs per cell = (#sources in bin i) * (#targets in bin j)
    total_counts = np.outer(
        np.bincount(source_bins, minlength=n_source_bins),
        np.bincount(target_bins, minlength=n_target_bins),
    ).astype(float)

    # Number of existing edges per cell; only nodes covered by the degree arrays count
    rows, cols = edge_matrix.nonzero()
    covered = (rows < len(source_degrees)) & (cols < len(target_degrees))
    rows, cols = rows[covered], cols[covered]
    edge_counts = np.zeros((n_source_bins, n_target_bins))
    np.add.at(edge_counts, (source_bins[rows], target_bins[cols]), 1)

    # Compute probabilities (cells without node pairs stay 0)
    prob_matrix = np.divide(edge_counts, total_counts, 
                           out=np.zeros_like(edge_counts), where=total_counts!=0)
    
    return prob_matrix, source_bin_edges, target_bin_edges


def predict_degree_based_probability_distribution(model_trainer: "ModelTrainer",
                                                source_degrees: np.ndarray,
                                                target_degrees: np.ndarray,
                                                source_bin_edges: np.ndarray,
                                                target_bin_edges: np.ndarray) -> np.ndarray:
    """
    Predict edge probability distribution using trained model.

    Each (source bin, target bin) cell is represented by the centres of its two
    bins; all cells are scored with a single ``predict_probabilities`` call.

    Parameters:
    -----------
    model_trainer : ModelTrainer
        Trained model wrapper; only ``predict_probabilities(features)`` is used
    source_degrees, target_degrees : np.ndarray
        Not used by the computation; kept so the signature stays unchanged
    source_bin_edges, target_bin_edges : np.ndarray
        Bin edges defining the grid of cells

    Returns:
    --------
    predicted_prob_matrix : np.ndarray
        Predicted probability matrix with same shape as observed
    """
    source_bin_edges = np.asarray(source_bin_edges, dtype=float)
    target_bin_edges = np.asarray(target_bin_edges, dtype=float)
    n_source_bins = len(source_bin_edges) - 1
    n_target_bins = len(target_bin_edges) - 1

    # Nothing to predict for an empty grid
    if n_source_bins <= 0 or n_target_bins <= 0:
        return np.zeros((max(n_source_bins, 0), max(n_target_bins, 0)))

    # Use bin centers as representative degrees
    source_centers = (source_bin_edges[:-1] + source_bin_edges[1:]) / 2
    target_centers = (target_bin_edges[:-1] + target_bin_edges[1:]) / 2

    # One feature row per cell, in row-major (source bin, target bin) order
    grid_source, grid_target = np.meshgrid(source_centers, target_centers, indexing='ij')
    features = np.column_stack([grid_source.ravel(), grid_target.ravel()])

    # Single batched prediction; reshape also flattens (N, 1)-shaped model outputs
    predictions = np.asarray(model_trainer.predict_probabilities(features), dtype=float)
    return predictions.reshape(n_source_bins, n_target_bins)


def compute_distribution_difference(observed_dist: np.ndarray, 
                                  predicted_dist: np.ndarray) -> Dict[str, float]:
    """
    Compare an observed and a predicted probability matrix.

    Both matrices are flattened; positions where either value is NaN are
    dropped before any metric is computed.

    Returns:
    --------
    metrics : dict
        'mse' and 'mae' are element-wise errors; 'wasserstein' and
        'ks_statistic' compare the two sets of values as 1-D samples.
        If no valid position remains, the errors are ``inf`` and
        'ks_statistic' is 1.0.
    """
    observed = observed_dist.flatten()
    predicted = predicted_dist.flatten()

    # Keep only positions where both values are defined
    keep = ~np.isnan(observed) & ~np.isnan(predicted)
    observed, predicted = observed[keep], predicted[keep]

    if len(observed) == 0:
        return {'mse': np.inf, 'mae': np.inf, 'wasserstein': np.inf, 'ks_statistic': 1.0}

    residual = observed - predicted
    metrics = {}
    metrics['mse'] = np.mean(residual ** 2)
    metrics['mae'] = np.mean(np.abs(residual))
    metrics['wasserstein'] = wasserstein_distance(observed, predicted)
    metrics['ks_statistic'] = ks_2samp(observed, predicted).statistic
    return metrics


class ValidationFramework:
    """
    Framework for validating model predictions against held-out networks.

    The held-out networks are loaded once at construction time and reused by
    every ``validate_model`` call.
    """
    
    def __init__(self, validation_dir: Path, edge_type: str, n_validation_networks: int = 3):
        self.validation_dir = validation_dir
        self.edge_type = edge_type
        self.n_validation_networks = n_validation_networks
        
        # Load validation networks (would be from downloads directory)
        # For now, we'll use the existing permutations as proxy validation networks
        self.validation_networks = self._load_validation_networks()
    
    def _load_validation_networks(self) -> List[Tuple[sp.csr_matrix, np.ndarray, np.ndarray]]:
        """
        Load validation networks from downloads or use existing permutations.

        Directory listings are sorted by name so the selection does not depend on
        the arbitrary order of ``Path.iterdir()``. Networks that fail to load
        are reported and skipped (best effort).
        """
        validation_networks = []
        n_wanted = max(0, int(self.n_validation_networks))
        
        # Check if downloads directory exists
        downloads_permutations_dir = self.validation_dir / 'downloads' / 'hetionet-permutations' / 'permutations'
        if downloads_permutations_dir.exists():
            # Use downloaded permutations: random subset drawn with the global NumPy RNG
            available_dirs = sorted(d for d in downloads_permutations_dir.iterdir() if d.is_dir())
            n_pick = min(n_wanted, len(available_dirs))
            if n_pick > 0:
                picked = np.random.choice(len(available_dirs), n_pick, replace=False)
                selected_dirs = [available_dirs[index] for index in picked]
            else:
                selected_dirs = []
        else:
            # Use existing permutations as validation (exclude training permutations)
            permutations_dir = self.validation_dir / 'permutations'
            available_dirs = sorted(
                d for d in permutations_dir.iterdir() if d.is_dir() and d.name.endswith('.hetmat')
            )
            # Use last few permutations (by sorted name) as validation.
            # The explicit n_wanted check matters: available_dirs[-0:] would select everything.
            selected_dirs = available_dirs[-n_wanted:] if n_wanted > 0 else []
        
        for perm_dir in selected_dirs:
            try:
                edge_matrix, source_degrees, target_degrees = load_permutation_data(perm_dir, self.edge_type)
                validation_networks.append((edge_matrix, source_degrees, target_degrees))
                print(f"Loaded validation network: {perm_dir.name}")
            except Exception as e:
                # Deliberate best effort: one unreadable network must not abort the rest
                print(f"Failed to load validation network {perm_dir}: {e}")
        
        return validation_networks
    
    def validate_model(self, model_trainer: "ModelTrainer", 
                      reference_bin_edges: Tuple[np.ndarray, np.ndarray],
                      n_bins: int = 20) -> Dict[str, Any]:
        """
        Validate model against held-out networks.

        Parameters:
        -----------
        model_trainer : ModelTrainer
            Trained model wrapper used to predict the distribution
        reference_bin_edges : tuple of np.ndarray
            (source_bin_edges, target_bin_edges) used for the predicted distribution
        n_bins : int
            Number of bins used for the observed distribution

        Returns:
        --------
        validation_results : dict
            Dictionary with validation metrics and distributions

        Raises:
        -------
        ValueError
            If no validation network is loaded
        """
        if not self.validation_networks:
            raise ValueError("No validation networks loaded; cannot validate model")
        
        source_bin_edges, target_bin_edges = reference_bin_edges
        
        observed_distributions = []
        predicted_distributions = []
        individual_metrics = []
        
        for i, (edge_matrix, source_degrees, target_degrees) in enumerate(self.validation_networks):
            # Compute observed distribution
            # NOTE(review): the observed distribution is binned with edges derived from
            # this network, while the prediction uses reference_bin_edges. The cell-wise
            # comparison is only meaningful if both binnings coincide - confirm.
            obs_dist, _, _ = compute_degree_based_probability_distribution(
                edge_matrix, source_degrees, target_degrees, n_bins
            )
            
            # Predict distribution
            pred_dist = predict_degree_based_probability_distribution(
                model_trainer, source_degrees, target_degrees, 
                source_bin_edges, target_bin_edges
            )
            
            # Compute metrics
            metrics = compute_distribution_difference(obs_dist, pred_dist)
            
            observed_distributions.append(obs_dist)
            predicted_distributions.append(pred_dist)
            individual_metrics.append(metrics)
            
            print(f"Validation network {i+1}: MAE = {metrics['mae']:.4f}, MSE = {metrics['mse']:.4f}")
        
        # Aggregate metrics: mean and standard deviation over the validation networks
        aggregate_metrics = {}
        for metric_name in individual_metrics[0].keys():
            values = [m[metric_name] for m in individual_metrics]
            aggregate_metrics[f'{metric_name}_mean'] = np.mean(values)
            aggregate_metrics[f'{metric_name}_std'] = np.std(values)
        
        return {
            'observed_distributions': observed_distributions,
            'predicted_distributions': predicted_distributions,
            'individual_metrics': individual_metrics,
            'aggregate_metrics': aggregate_metrics,
            'validation_networks_count': len(self.validation_networks)
        }


# Initialize validation framework
# Constructing ValidationFramework loads the held-out networks immediately
# (from disk, below data_dir), so this cell performs file I/O.
print("Setting up validation framework...")
validator = ValidationFramework(data_dir, CONFIG['edge_type'], CONFIG['validation_networks'])
print(f"Loaded {len(validator.validation_networks)} validation networks")

In [None]:
def run_minimum_permutation_experiment(config: Dict[str, Any], 
                                      validator: ValidationFramework) -> Dict[str, Any]:
    """
    Run the main experiment to find minimum permutations needed for each model.

    For every model type in ``config['models']`` the model is trained on 1, 2, ...
    permutations until the mean validation MAE drops below
    ``config['convergence_threshold']`` or the permutations are exhausted.
    A converged model is pickled to ``output_dir``.

    Uses the notebook-level globals ``permutations_dir`` and ``output_dir``.

    Parameters:
    -----------
    config : dict
        Experiment configuration (see CONFIG)
    validator : ValidationFramework
        Provides the held-out networks and the validation metrics

    Returns:
    --------
    results : dict
        Complete results for all models including convergence information

    Raises:
    -------
    ValueError
        If no permutation is left for training
    """
    import pickle  # only needed here, to persist converged models

    # Get available permutations for training
    available_perms = get_available_permutations(permutations_dir)
    # Reserve last few for validation. The stop index is computed explicitly
    # because available_perms[:-0] would be empty instead of the full list.
    n_reserved = max(0, int(config['validation_networks']))
    training_perms = available_perms[:max(0, len(available_perms) - n_reserved)]
    training_perms = training_perms[:config['max_permutations']]

    if not training_perms:
        raise ValueError(
            f"No training permutations available in {permutations_dir} "
            f"after reserving {n_reserved} for validation"
        )

    print(f"Available training permutations: {len(training_perms)}")
    print(f"Will test up to {min(len(training_perms), config['max_permutations'])} permutations")

    # Store results for all models
    experiment_results = {}

    # Reference bin edges (computed from first permutation for consistency)
    reference_perm_dir = permutations_dir / training_perms[0]
    ref_edge_matrix, ref_source_degrees, ref_target_degrees = load_permutation_data(
        reference_perm_dir, config['edge_type']
    )
    _, ref_source_bin_edges, ref_target_bin_edges = compute_degree_based_probability_distribution(
        ref_edge_matrix, ref_source_degrees, ref_target_degrees, config['n_bins']
    )

    print(f"\nReference bins: {len(ref_source_bin_edges)-1} source x {len(ref_target_bin_edges)-1} target")

    # Each permutation is loaded and sampled once and then reused, so the training
    # sets of successive steps are nested and all models see the same samples.
    samples_cache = {}

    def _samples_for(index):
        """Return (features, labels) of training permutation ``index``, loading on first use."""
        if index not in samples_cache:
            perm_dir = permutations_dir / training_perms[index]
            edge_matrix, source_degrees, target_degrees = load_permutation_data(
                perm_dir, config['edge_type']
            )
            samples_cache[index] = extract_edge_features_and_labels(
                edge_matrix, source_degrees, target_degrees,
                config['negative_sampling_ratio']
            )
        return samples_cache[index]

    # Run experiment for each model type
    for model_type in config['models']:
        print(f"\n{'='*60}")
        print(f"Running experiment for {model_type}")
        print(f"{'='*60}")

        model_results = {
            'model_type': model_type,
            'convergence_achieved': False,
            'minimum_permutations': None,
            'training_history': [],
            'final_distribution': None,
            'final_metrics': None
        }

        # Progressive training: add one permutation at a time
        for n_perms in range(1, len(training_perms) + 1):
            print(f"\nTesting with {n_perms} permutation(s)...")

            # Collect features and labels from n_perms permutations
            all_features = []
            all_labels = []

            for i in range(n_perms):
                features, labels = _samples_for(i)
                all_features.append(features)
                all_labels.append(labels)

                print(f"  Permutation {i+1}: {len(features)} samples")

            # Combine all data
            combined_features = np.vstack(all_features)
            combined_labels = np.hstack(all_labels)

            print(f"  Total training samples: {len(combined_features)}")
            print(f"  Positive rate: {combined_labels.mean():.3f}")

            # Train model
            trainer = ModelTrainer(model_type, config['random_seed'])
            training_results = trainer.train(combined_features, combined_labels)

            print(f"  Training AUC: {training_results['metrics']['train_auc']:.3f}")
            print(f"  Test AUC: {training_results['metrics']['test_auc']:.3f}")

            # Validate model
            validation_results = validator.validate_model(
                trainer, (ref_source_bin_edges, ref_target_bin_edges), config['n_bins']
            )

            mean_mae = validation_results['aggregate_metrics']['mae_mean']
            mean_mse = validation_results['aggregate_metrics']['mse_mean']

            print(f"  Validation MAE: {mean_mae:.4f}")
            print(f"  Validation MSE: {mean_mse:.4f}")

            # Store iteration results
            iteration_results = {
                'n_permutations': n_perms,
                'training_metrics': training_results['metrics'],
                'validation_metrics': validation_results['aggregate_metrics'],
                'mean_mae': mean_mae,
                'mean_mse': mean_mse
            }
            model_results['training_history'].append(iteration_results)

            # Check convergence
            if mean_mae < config['convergence_threshold']:
                print(f"  *** CONVERGENCE ACHIEVED with {n_perms} permutations! ***")
                model_results['convergence_achieved'] = True
                model_results['minimum_permutations'] = n_perms
                model_results['final_distribution'] = validation_results['predicted_distributions']
                model_results['final_metrics'] = validation_results['aggregate_metrics']

                # Save the converged model
                model_save_path = output_dir / f'{model_type}_converged_model.pkl'
                with open(model_save_path, 'wb') as f:
                    pickle.dump({
                        'trainer': trainer,
                        'bin_edges': (ref_source_bin_edges, ref_target_bin_edges),
                        'config': config,
                        'results': model_results
                    }, f)

                print(f"  Model saved to: {model_save_path}")
                break

        # Final status
        if not model_results['convergence_achieved']:
            print(f"\n  WARNING: {model_type} did not converge within {config['max_permutations']} permutations")
            print(f"  Final MAE: {model_results['training_history'][-1]['mean_mae']:.4f}")

        experiment_results[model_type] = model_results

    return experiment_results


# Run the main experiment
print("Starting minimum permutation experiment...")
print(f"Models to test: {CONFIG['models']}")
print(f"Convergence threshold (MAE): {CONFIG['convergence_threshold']}")
print(f"Maximum permutations: {CONFIG['max_permutations']}")

# Start experiment
experiment_results = run_minimum_permutation_experiment(CONFIG, validator)

In [None]:
def plot_convergence_analysis(experiment_results: Dict[str, Any], output_dir: Path):
    """Plot convergence analysis for all models.

    Draws a 2x2 figure:
      * top-left / top-right: validation MAE and MSE versus the number of
        permutations, one line per model, with a dashed vertical marker at
        the permutation count where a model converged;
      * bottom-left: train/test AUC of each model's last recorded iteration;
      * bottom-right: minimum permutations needed by each converged model.

    Args:
        experiment_results: Mapping of model name to a result dict holding
            'training_history' (list of per-iteration dicts with
            'n_permutations', 'mean_mae', 'mean_mse' and 'training_metrics'),
            'convergence_achieved' and 'minimum_permutations'.
        output_dir: Directory in which 'convergence_analysis.png' is written.
    """
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle('Model Convergence Analysis', fontsize=16)

    metrics_to_plot = ['mean_mae', 'mean_mse']
    metric_titles = ['Mean Absolute Error', 'Mean Squared Error']

    # The two error curves occupy the top row of the grid.
    for col, (metric, title) in enumerate(zip(metrics_to_plot, metric_titles)):
        ax = axes[0, col]

        for model_type, results in experiment_results.items():
            history = results['training_history']
            if not history:
                continue

            n_perms = [h['n_permutations'] for h in history]
            values = [h[metric] for h in history]

            # Keep the line handle so the convergence marker reuses its colour.
            (line,) = ax.plot(n_perms, values, 'o-', label=model_type,
                              linewidth=2, markersize=6)

            # Mark convergence point if achieved
            if results['convergence_achieved']:
                conv_point = results['minimum_permutations']
                conv_value = next(
                    (h[metric] for h in history if h['n_permutations'] == conv_point),
                    None,
                )
                ax.axvline(x=conv_point, color=line.get_color(),
                           linestyle='--', alpha=0.7)
                # Only annotate when the converged iteration is in the history.
                if conv_value is not None:
                    ax.text(conv_point, conv_value, f'{conv_point}',
                            ha='center', va='bottom', fontweight='bold')

        # The convergence criterion is defined on MAE only.
        if metric == 'mean_mae':
            ax.axhline(y=CONFIG['convergence_threshold'], color='red',
                       linestyle='--', alpha=0.5, label='Threshold')

        ax.set_xlabel('Number of Permutations')
        ax.set_ylabel(title)
        ax.set_title(f'{title} vs Number of Permutations')
        ax.legend()
        ax.grid(True, alpha=0.3)

    # Training performance comparison
    ax = axes[1, 0]
    model_types = list(experiment_results.keys())
    final_train_aucs = []
    final_test_aucs = []

    for model_type in model_types:
        history = experiment_results[model_type]['training_history']
        if history:
            final_metrics = history[-1]['training_metrics']
            final_train_aucs.append(final_metrics['train_auc'])
            final_test_aucs.append(final_metrics['test_auc'])
        else:
            # Models without any recorded iteration are drawn as zero-height bars.
            final_train_aucs.append(0)
            final_test_aucs.append(0)

    x = np.arange(len(model_types))
    width = 0.35

    ax.bar(x - width/2, final_train_aucs, width, label='Train AUC', alpha=0.8)
    ax.bar(x + width/2, final_test_aucs, width, label='Test AUC', alpha=0.8)

    ax.set_xlabel('Model Type')
    ax.set_ylabel('AUC Score')
    ax.set_title('Final Training Performance')
    ax.set_xticks(x)
    ax.set_xticklabels(model_types)
    ax.legend()
    ax.grid(True, alpha=0.3)

    # Minimum permutations summary
    ax = axes[1, 1]
    converged = {
        model_type: results['minimum_permutations']
        for model_type, results in experiment_results.items()
        if results['convergence_achieved']
    }

    if converged:
        converged_models = list(converged.keys())
        min_perms = list(converged.values())

        bars = ax.bar(converged_models, min_perms, alpha=0.8)
        ax.set_xlabel('Model Type')
        ax.set_ylabel('Minimum Permutations')
        ax.set_title('Minimum Permutations for Convergence')

        # Add value labels on bars
        for bar, value in zip(bars, min_perms):
            ax.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.1,
                    str(value), ha='center', va='bottom', fontweight='bold')

        ax.grid(True, alpha=0.3)
    else:
        ax.text(0.5, 0.5, 'No models converged', ha='center', va='center',
                transform=ax.transAxes, fontsize=12)
        ax.set_title('Minimum Permutations for Convergence')

    plt.tight_layout()

    # Save plot
    plot_path = output_dir / 'convergence_analysis.png'
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Convergence analysis plot saved to: {plot_path}")

    plt.show()


def plot_distribution_heatmaps(experiment_results: Dict[str, Any],
                              validator: ValidationFramework,
                              output_dir: Path):
    """Plot heatmaps of predicted vs observed probability distributions.

    One row is drawn per converged model, with three panels: the observed
    distribution of the first validation network, the model's stored
    predicted distribution for that network, and their absolute difference.

    Args:
        experiment_results: Mapping of model name to a result dict holding
            'convergence_achieved' and 'final_distribution'.
        validator: Object exposing ``validation_networks``, a sequence of
            ``(edge_matrix, source_degrees, target_degrees)`` tuples.
        output_dir: Directory in which 'distribution_heatmaps.png' is written.
    """
    converged_models = {k: v for k, v in experiment_results.items()
                        if v['convergence_achieved']}

    if not converged_models:
        print("No converged models to plot distributions for.")
        return

    # Without a validation network there is nothing to compare against, so
    # avoid producing (and saving) a grid of empty axes.
    if not validator.validation_networks:
        print("No validation networks available; skipping distribution heatmaps.")
        return

    # The observed distribution does not depend on the model: compute it once
    # from the first validation network instead of once per row.
    edge_matrix, source_degrees, target_degrees = validator.validation_networks[0]
    obs_dist, _, _ = compute_degree_based_probability_distribution(
        edge_matrix, source_degrees, target_degrees, CONFIG['n_bins']
    )

    n_models = len(converged_models)
    # squeeze=False keeps `axes` two-dimensional even for a single model.
    fig, axes = plt.subplots(n_models, 3, figsize=(15, 5*n_models), squeeze=False)

    fig.suptitle('Edge Probability Distributions: Observed vs Predicted', fontsize=16)

    for row, (model_type, results) in enumerate(converged_models.items()):
        # Explicit None/length test: plain truthiness raises ValueError when
        # the stored distributions are a multi-element numpy array.
        stored = results['final_distribution']
        if stored is not None and len(stored) > 0:
            pred_dist = stored[0]  # First validation network
        else:
            # Nothing was stored for this model; fall back to an all-zero
            # placeholder so the row can still be drawn.
            print(f"No stored predicted distribution for {model_type}; "
                  f"plotting an all-zero placeholder.")
            pred_dist = np.zeros_like(obs_dist)

        panels = (
            ('Observed Distribution', obs_dist, 'viridis'),
            ('Predicted Distribution', pred_dist, 'viridis'),
            ('Absolute Difference', np.abs(obs_dist - pred_dist), 'Reds'),
        )
        for col, (label, data, cmap) in enumerate(panels):
            ax = axes[row, col]
            image = ax.imshow(data, cmap=cmap, aspect='auto')
            ax.set_title(f'{model_type}: {label}')
            ax.set_xlabel('Target Degree Bins')
            ax.set_ylabel('Source Degree Bins')
            plt.colorbar(image, ax=ax)

    plt.tight_layout()

    # Save plot
    plot_path = output_dir / 'distribution_heatmaps.png'
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    print(f"Distribution heatmaps saved to: {plot_path}")

    plt.show()


def save_results_summary(experiment_results: Dict[str, Any], output_dir: Path):
    """Save comprehensive results summary.

    Writes 'experiment_summary.json' (configuration, overall statistics and
    per-model results) and 'detailed_results.csv' (one row per model and
    training iteration) into ``output_dir``.

    Args:
        experiment_results: Mapping of model name to a result dict holding
            'training_history', 'convergence_achieved' and
            'minimum_permutations'.
        output_dir: Directory receiving both output files.

    Returns:
        Tuple ``(summary, df)``: the summary dictionary that was serialised
        and the per-iteration DataFrame that was written to CSV.
    """
    # Create summary dictionary
    summary = {
        'experiment_config': CONFIG,
        'timestamp': pd.Timestamp.now().isoformat(),
        'model_results': {}
    }

    # Summary statistics
    converged_count = sum(1 for r in experiment_results.values() if r['convergence_achieved'])
    total_models = len(experiment_results)

    summary['overall_stats'] = {
        'total_models_tested': total_models,
        'models_converged': converged_count,
        'convergence_rate': converged_count / total_models if total_models > 0 else 0
    }

    # Individual model results
    for model_type, results in experiment_results.items():
        history = results['training_history']
        # A model with no recorded iteration has no final metrics.
        last_iteration = history[-1] if history else None
        summary['model_results'][model_type] = {
            'converged': results['convergence_achieved'],
            'minimum_permutations': results['minimum_permutations'],
            'final_mae': last_iteration['mean_mae'] if last_iteration else None,
            'final_mse': last_iteration['mean_mse'] if last_iteration else None,
            'training_progression': history
        }

    # Save as JSON; default=str stringifies any value json cannot encode natively.
    summary_path = output_dir / 'experiment_summary.json'
    with open(summary_path, 'w') as f:
        json.dump(summary, f, indent=2, default=str)

    print(f"Results summary saved to: {summary_path}")

    # Create and save DataFrame for easy analysis
    columns = ['model_type', 'n_permutations', 'train_auc', 'test_auc',
               'validation_mae', 'validation_mse', 'converged']
    df_data = [
        {
            'model_type': model_type,
            'n_permutations': iteration['n_permutations'],
            'train_auc': iteration['training_metrics']['train_auc'],
            'test_auc': iteration['training_metrics']['test_auc'],
            'validation_mae': iteration['mean_mae'],
            'validation_mse': iteration['mean_mse'],
            'converged': iteration['mean_mae'] < CONFIG['convergence_threshold']
        }
        for model_type, results in experiment_results.items()
        for iteration in results['training_history']
    ]

    # Passing the column names keeps the CSV header even when there are no rows.
    df = pd.DataFrame(df_data, columns=columns)
    csv_path = output_dir / 'detailed_results.csv'
    df.to_csv(csv_path, index=False)
    print(f"Detailed results saved to: {csv_path}")

    return summary, df


# Generate visualizations and save results
print("\n" + "="*60)
print("GENERATING VISUALIZATIONS AND SAVING RESULTS")
print("="*60)

# Plot convergence analysis
plot_convergence_analysis(experiment_results, output_dir)

# Plot distribution heatmaps
plot_distribution_heatmaps(experiment_results, validator, output_dir)

# Save results summary
summary, results_df = save_results_summary(experiment_results, output_dir)

# Print final summary
print("\n" + "="*60)
print("EXPERIMENT SUMMARY")
print("="*60)

print(f"Total models tested: {summary['overall_stats']['total_models_tested']}")
print(f"Models converged: {summary['overall_stats']['models_converged']}")
print(f"Convergence rate: {summary['overall_stats']['convergence_rate']:.1%}")

print("\nIndividual Model Results:")
for model_type, model_summary in summary['model_results'].items():
    if model_summary['converged']:
        print(f"  {model_type}: CONVERGED with {model_summary['minimum_permutations']} permutations")
    else:
        print(f"  {model_type}: DID NOT CONVERGE")

    # final_mae is None when the model recorded no training iteration;
    # formatting None with ':.4f' would raise TypeError.
    final_mae = model_summary['final_mae']
    if final_mae is not None:
        print(f"    Final MAE: {final_mae:.4f}")
    else:
        print("    Final MAE: n/a (no training iterations recorded)")

print(f"\nAll results saved to: {output_dir}")
print("\nExperiment completed successfully!")

# Minimum Permutations for Edge Probability Distribution Learning

This notebook determines the minimum number of permuted networks needed to accurately learn edge probability distributions based on source and target node degrees.

## Methodology

1. **Training Loop**: Start with 1 permuted network and incrementally add more (up to 10)
2. **Models**: Train Neural Network (NN), Logistic Regression (LR), Penalized Logistic Regression (PLR), and Random Forest (RF)
3. **Features**: Source and target node degrees
4. **Target**: Edge probability prediction
5. **Validation**: Compare predicted vs observed edge probability distributions across 3 held-out networks
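6. **Convergence**: Stop when the validation mean absolute error (MAE) between the predicted and observed distributions falls below the convergence threshold (`convergence_threshold`, 0.05 by default)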

## Outputs

- Minimum number of permutations needed for each model
- Edge probability distributions for converged models
- Validation metrics and visualizations