# Model Aggregation: Weighted Ensemble of Edge Prediction Models

This notebook creates ensemble models by taking performance-weighted averages of the predictions of models trained on different permutations. The goal is to create a single robust model for each model type (Neural Network, Logistic Regression, Polynomial Logistic Regression, Random Forest) that combines the knowledge learned from multiple permutations of the heterogeneous network.

## Overview

- **Input**: Multiple trained models from different permutations
- **Process**: Weight models by their performance (AUC scores) and create ensemble predictions
- **Output**: Single aggregated model for each model type

## Methodology

1. **Model Discovery**: Scan the models directory for all trained models
2. **Performance Extraction**: Extract AUC scores for weighting
3. **Weighted Averaging**: Create ensemble predictions weighted by performance
4. **Model Persistence**: Save the aggregated models and their metadata

In [None]:
# Import required libraries
import warnings
import pathlib
import sys
import json
import pickle
import glob
from collections import defaultdict
import re

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

# Locate the repository root as the parent of the current working directory.
# NOTE(review): this assumes the notebook is started from a direct
# sub-directory of the repository -- confirm for your setup.
repo_dir = pathlib.Path.cwd().parent
src_dir = repo_dir.joinpath("src")

# Put <repo>/src at the front of the import path; the project-local `models`
# module imported on the next line is resolved through it.
sys.path.insert(0, str(src_dir))
from models import EdgePredictionNN

# Silence all warnings for the remainder of the session.
warnings.filterwarnings('ignore')
print(
    "Libraries imported successfully",
    f"Repository directory: {repo_dir}",
    sep="\n",
)

## Parameters

Configure the aggregation process:

In [None]:
# --- Aggregation settings ---------------------------------------------------
# Edge type whose models are aggregated.
edge_type = "AeG"

# How component models are weighted: 'auc' (performance-based) or
# 'equal' (every model gets the same weight).
weighting_strategy = "auc"

# A model type needs at least this many component models to be aggregated.
min_models_threshold = 2

# --- Locations --------------------------------------------------------------
models_dir = repo_dir.joinpath("models")
output_dir = repo_dir.joinpath("aggregated_models")

# Make sure the destination exists before anything is written to it.
output_dir.mkdir(exist_ok=True)

print(
    f"Models directory: {models_dir}",
    f"Output directory: {output_dir}",
    f"Edge type: {edge_type}",
    f"Weighting strategy: {weighting_strategy}",
    sep="\n",
)

## Model Discovery and Loading

Discover all trained models for the specified edge type:

In [None]:
def discover_models(models_dir, edge_type=None):
    """
    Discover all trained model files in the models directory.

    Only direct children of ``models_dir`` are considered; each model type is
    recognised by its file-name pattern.

    Args:
        models_dir: Directory to scan, given as a ``str`` or ``pathlib.Path``.
        edge_type: Optional substring. When given (and non-empty), only files
            whose name contains it are kept.

    Returns:
        dict: Maps each model type ('neural_network', 'logistic_regression',
            'polynomial_logistic', 'random_forest') to a sorted list of
            matching ``pathlib.Path`` objects. A type without matches maps to
            an empty list.
    """
    models_dir = pathlib.Path(models_dir)

    # File-name pattern identifying each model type
    patterns = {
        'neural_network': 'edge_prediction_model_*.pt',
        'logistic_regression': 'logistic_regression_model_*.pkl',
        'polynomial_logistic': 'polynomial_logistic_model_*.pkl',
        'random_forest': 'random_forest_model_*.pkl'
    }

    model_files = {}
    for model_type, pattern in patterns.items():
        # glob() yields entries in a filesystem-dependent order; sorting keeps
        # the component order (and everything derived from it, such as the
        # order of ensemble weights) reproducible between runs and machines.
        files = sorted(models_dir.glob(pattern))

        # Filter by edge type if specified
        if edge_type:
            files = [f for f in files if edge_type in f.name]

        model_files[model_type] = files

    return model_files

# Scan the models directory once and report what was found per model type.
discovered_models = discover_models(models_dir, edge_type=edge_type)

print("Discovered models:")
for model_type, files in discovered_models.items():
    report = [f"  {model_type}: {len(files)} models"]
    report.extend(f"    - {f.name}" for f in files)
    print("\n".join(report))

## Performance Extraction

Extract performance metrics for weighting models:

In [None]:
def extract_model_performance(model_file, model_type):
    """
    Load a saved model file and extract its performance metrics.

    Loading is best-effort: any error while reading the file or looking up
    the metrics is printed and reported to the caller as ``None`` so that one
    unreadable model does not abort the whole aggregation.

    Args:
        model_file: Path of the saved model. 'neural_network' files are read
            with ``torch.load``; every other model type is read with
            ``pickle``.
        model_type: Model type key; only 'neural_network' is treated
            specially.

    Returns:
        dict: ``{'auc', 'ap', 'permutation', 'model_data'}`` where
            ``model_data`` is the complete loaded payload and ``permutation``
            falls back to 'unknown' when the payload has no
            'permutation_name'. ``None`` if the file could not be processed.
    """
    is_neural_network = model_type == 'neural_network'

    try:
        if is_neural_network:
            # NOTE(review): recent PyTorch releases default torch.load to
            # weights_only=True, which may reject checkpoints that contain
            # arbitrary pickled objects -- confirm these checkpoints load.
            model_data = torch.load(model_file, map_location='cpu')
        else:
            # SECURITY: unpickling executes code embedded in the file. Only
            # load model files that come from a trusted source.
            with open(model_file, 'rb') as f:
                model_data = pickle.load(f)

        metrics = model_data['test_metrics']

        # Neural-network checkpoints store average precision under
        # 'average_precision', the scikit-learn payloads under 'ap'. Prefer
        # the key that belongs to the model type, but accept the other
        # spelling instead of discarding an otherwise usable model.
        if is_neural_network:
            ap_key, alt_ap_key = 'average_precision', 'ap'
        else:
            ap_key, alt_ap_key = 'ap', 'average_precision'
        if ap_key not in metrics and alt_ap_key in metrics:
            ap_key = alt_ap_key

        return {
            'auc': metrics['auc'],
            'ap': metrics[ap_key],
            'permutation': model_data.get('permutation_name', 'unknown'),
            'model_data': model_data
        }
    except Exception as e:
        # Deliberately broad: a single broken file is reported and skipped.
        print(f"Error loading {model_file}: {e}")
        return None

# Load every discovered model and keep those whose metrics could be read.
model_performance = {}

for model_type, files in discovered_models.items():
    loaded = model_performance[model_type] = []

    for model_file in files:
        perf = extract_model_performance(model_file, model_type)
        if not perf:
            # Nothing usable came back for this file; leave it out.
            continue
        # Remember where the model came from alongside its metrics.
        perf['file'] = model_file
        loaded.append(perf)

# Per model type: count, AUC range and mean, then one line per component.
print("\nModel Performance Summary:")
for model_type, perfs in model_performance.items():
    if not perfs:
        continue

    aucs = [p['auc'] for p in perfs]
    print(
        f"\n{model_type.replace('_', ' ').title()}:",
        f"  Models: {len(perfs)}",
        f"  AUC range: {min(aucs):.4f} - {max(aucs):.4f}",
        f"  Mean AUC: {np.mean(aucs):.4f}",
        sep="\n",
    )
    for p in perfs:
        print(f"    {p['permutation']}: AUC={p['auc']:.4f}, AP={p['ap']:.4f}")

## Weighted Ensemble Models

Create ensemble models using performance-based weighting:

In [None]:
class WeightedEnsembleClassifier(BaseEstimator, ClassifierMixin):
    """
    A weighted ensemble classifier that combines predictions from multiple models.

    Every component model predicts class probabilities for the same input and
    the ensemble returns the weighted average of those predictions.

    Args:
        models: Sequence of already trained component models.
        weights: One number per model. They are normalised to sum to 1 and
            stored in ``self.weights`` as a NumPy array.
        model_type: 'neural_network' selects the PyTorch prediction path;
            any other value uses the component's ``predict_proba`` method.
        scalers: Optional sequence with one entry per model; each entry is
            either ``None`` (input used as is) or an object whose
            ``transform`` method is applied to the input first.

    Raises:
        ValueError: If ``weights`` or ``scalers`` do not have one entry per
            model, or if the weights do not add up to a positive, finite
            number (normalising them would otherwise silently produce NaN
            predictions).
    """

    def __init__(self, models, weights, model_type, scalers=None):
        self.models = models

        raw_weights = np.asarray(weights, dtype=float)
        if raw_weights.ndim != 1 or len(raw_weights) != len(models):
            raise ValueError(
                f"Expected one weight per model ({len(models)}), "
                f"got weights of shape {raw_weights.shape}"
            )
        total = raw_weights.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError(
                f"Weights must sum to a positive, finite value, got {total}"
            )
        self.weights = raw_weights / total  # Normalize weights

        self.model_type = model_type
        self.scalers = scalers or [None] * len(models)
        if len(self.scalers) != len(models):
            # zip() in predict_proba would silently drop the surplus models.
            raise ValueError(
                f"Expected one scaler entry per model ({len(models)}), "
                f"got {len(self.scalers)}"
            )

    def predict_proba(self, X):
        """
        Predict class probabilities using weighted ensemble.

        Args:
            X: Feature matrix passed to every component model (after that
                model's scaler, if it has one).

        Returns:
            Weighted average of the component predictions. For the neural
            network path each component contributes two columns,
            ``[1 - p, p]``.
        """
        predictions = []

        for model, scaler in zip(self.models, self.scalers):
            X_scaled = X if scaler is None else scaler.transform(X)

            if self.model_type == 'neural_network':
                model.eval()
                with torch.no_grad():
                    X_tensor = torch.FloatTensor(X_scaled)
                    pred = model(X_tensor).cpu().numpy()
                # Convert single output to probability format.
                # NOTE(review): assumes the network outputs the probability
                # of the positive class (not a logit) -- confirm against
                # EdgePredictionNN.
                pred_proba = np.column_stack([1 - pred, pred])
            else:
                pred_proba = model.predict_proba(X_scaled)

            predictions.append(pred_proba)

        # Weighted average of predictions across the component axis
        return np.average(predictions, axis=0, weights=self.weights)

    def predict(self, X):
        """
        Predict class labels.

        Returns:
            Integer array holding 1 where the averaged probability in
            column 1 is strictly greater than 0.5, else 0.
        """
        proba = self.predict_proba(X)
        return (proba[:, 1] > 0.5).astype(int)

    def get_model_info(self):
        """
        Get information about the ensemble.

        Returns:
            dict: Model type, number of component models, the normalised
                weights as a plain list, and the ensemble type label.
        """
        return {
            'model_type': self.model_type,
            'num_models': len(self.models),
            'weights': self.weights.tolist(),
            'ensemble_type': 'weighted_average'
        }

print("Weighted ensemble classifier defined")

In [None]:
def create_ensemble_models(model_performance, weighting_strategy='auc', min_models=2):
    """
    Create ensemble models for each model type.

    Args:
        model_performance: Maps model type to a list of performance dicts.
            Each dict needs 'auc' and 'model_data'; for 'neural_network'
            ``model_data`` must hold 'model_state_dict', for every other
            type it must hold 'model' and may hold 'scaler'.
        weighting_strategy: 'auc' weights each component by its AUC,
            'equal' gives every component the same weight.
        min_models: Model types with fewer component models are skipped.

    Returns:
        dict: Maps model type to a dict with keys 'ensemble' (the
            ``WeightedEnsembleClassifier``), 'component_performance',
            'weights' (normalised) and 'metadata'. The metadata records
            the module-level ``edge_type``, which this function reads as
            a global.

    Raises:
        ValueError: If ``weighting_strategy`` is not 'auc' or 'equal'.
    """
    # A misspelled strategy (e.g. 'AUC') used to fall through to equal
    # weighting without any notice; reject it instead.
    valid_strategies = ('auc', 'equal')
    if weighting_strategy not in valid_strategies:
        raise ValueError(
            f"Unknown weighting_strategy {weighting_strategy!r}; "
            f"expected one of {valid_strategies}"
        )

    ensemble_models = {}

    for model_type, perfs in model_performance.items():
        if len(perfs) < min_models:
            print(f"Skipping {model_type}: insufficient models ({len(perfs)} < {min_models})")
            continue

        print(f"\nCreating ensemble for {model_type}...")

        # Rebuild the component models (and their scalers, where present)
        models = []
        scalers = []

        for perf in perfs:
            if model_type == 'neural_network':
                # NOTE(review): the architecture is hard-coded here and must
                # match the one used when the checkpoints were trained.
                model = EdgePredictionNN(input_size=2, hidden_size=64, dropout_rate=0.3)
                model.load_state_dict(perf['model_data']['model_state_dict'])
                model.eval()
                models.append(model)

                # No separate scaler is applied for neural networks
                scalers.append(None)
            else:
                models.append(perf['model_data']['model'])
                scalers.append(perf['model_data'].get('scaler'))

        # Plain Python floats keep the metadata JSON-serialisable even when
        # the stored metrics are NumPy scalars.
        component_aucs = [float(perf['auc']) for perf in perfs]

        if weighting_strategy == 'auc':
            weights = list(component_aucs)
        else:  # equal weighting
            weights = [1.0] * len(perfs)

        # Create ensemble (it normalises the weights internally)
        ensemble = WeightedEnsembleClassifier(
            models=models,
            weights=weights,
            model_type=model_type,
            scalers=scalers
        )

        ensemble_models[model_type] = {
            'ensemble': ensemble,
            'component_performance': perfs,
            'weights': ensemble.weights,
            'metadata': {
                'edge_type': edge_type,
                'weighting_strategy': weighting_strategy,
                'num_components': len(models),
                'component_aucs': component_aucs,
                # Average of the component AUCs under the ensemble weights;
                # this is not an AUC measured on the ensemble's predictions.
                'weighted_mean_auc': float(np.average(component_aucs, weights=weights))
            }
        }

        print(f"  Created ensemble with {len(models)} models")
        print(f"  Weights: {ensemble.weights}")
        print(f"  Weighted mean AUC: {ensemble_models[model_type]['metadata']['weighted_mean_auc']:.4f}")

    return ensemble_models

# Build one ensemble per model type from the collected component models,
# using the strategy and minimum size configured in the parameters cell.
ensemble_models = create_ensemble_models(
    model_performance,
    weighting_strategy,
    min_models_threshold,
)

print(f"\nCreated {len(ensemble_models)} ensemble models")

## Save Ensemble Models

Persist the ensemble models and their metadata:

In [None]:
def save_ensemble_models(ensemble_models, output_dir, edge_type):
    """
    Save ensemble models and their metadata.

    For every model type two files are written into ``output_dir``:
    ``ensemble_<model_type>_<edge_type>.pkl`` (the pickled ensemble) and
    ``ensemble_<model_type>_<edge_type>_metadata.json``.

    Args:
        ensemble_models: Maps model type to a dict with the keys
            'ensemble', 'metadata', 'weights' and 'component_performance'.
            Each component entry needs 'file' (a ``pathlib.Path``) and
            'permutation'.
        output_dir: Destination directory (``str`` or ``pathlib.Path``).
            It is created, including missing parents, when there is
            something to save.
        edge_type: Edge type label used in the file names.

    Returns:
        dict: Maps model type to ``{'model_path', 'metadata_path',
            'metadata'}``, where ``metadata`` is the dict written to JSON.
    """
    output_dir = pathlib.Path(output_dir)
    if ensemble_models:
        # Writing into a missing directory would fail with FileNotFoundError.
        output_dir.mkdir(parents=True, exist_ok=True)

    saved_models = {}

    for model_type, ensemble_data in ensemble_models.items():
        # Define filenames
        model_filename = f"ensemble_{model_type}_{edge_type}.pkl"
        metadata_filename = f"ensemble_{model_type}_{edge_type}_metadata.json"

        model_path = output_dir / model_filename
        metadata_path = output_dir / metadata_filename

        # Save ensemble model.
        # NOTE: pickle stores a reference to the ensemble's class, not its
        # code, so the class definition must be available when loading.
        with open(model_path, 'wb') as f:
            pickle.dump(ensemble_data['ensemble'], f)

        # Prepare metadata for JSON serialization. The shallow copy keeps the
        # caller's metadata dict free of the keys added below.
        components = ensemble_data['component_performance']
        metadata = ensemble_data['metadata'].copy()
        # asarray() lets plain sequences through as well as NumPy arrays.
        metadata['weights'] = np.asarray(ensemble_data['weights']).tolist()
        metadata['component_files'] = [str(p['file'].name) for p in components]
        metadata['component_permutations'] = [p['permutation'] for p in components]

        # Save metadata
        with open(metadata_path, 'w') as f:
            json.dump(metadata, f, indent=2)

        saved_models[model_type] = {
            'model_path': model_path,
            'metadata_path': metadata_path,
            'metadata': metadata
        }

        print(f"Saved {model_type} ensemble:")
        print(f"  Model: {model_path}")
        print(f"  Metadata: {metadata_path}")

    return saved_models

# Persist each ensemble (pickle) together with its JSON metadata.
saved_models = save_ensemble_models(
    ensemble_models=ensemble_models,
    output_dir=output_dir,
    edge_type=edge_type,
)

print(f"\nAll ensemble models saved to: {output_dir}")

## Ensemble Performance Summary

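Create a comprehensive summary of the ensemble models. Note that the "weighted mean AUC" reported here is the weight-averaged AUC of the component models, not an AUC measured on the ensemble's own predictions: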

In [None]:
# Create summary visualization: a two-panel figure with the component AUCs
# (left) and the weighted mean AUC per model type (right). Skipped entirely
# when no ensemble was created.
if ensemble_models:
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Plot 1: Component AUCs and weights
    model_types = list(ensemble_models.keys())
    # One colour per model type, sampled evenly from the Set3 colormap
    colors = plt.cm.Set3(np.linspace(0, 1, len(model_types)))

    ax1 = axes[0]
    # Row at which the current model type's group of bars starts
    y_pos = 0

    for i, (model_type, ensemble_data) in enumerate(ensemble_models.items()):
        aucs = ensemble_data['metadata']['component_aucs']
        weights = ensemble_data['weights']

        # Plot AUCs as bars with weight-based alpha
        # (alpha stays within [0.3, 1.0] provided each weight lies in [0, 1])
        for j, (auc, weight) in enumerate(zip(aucs, weights)):
            # Only the first bar of a group carries the label, so each model
            # type is labelled once.
            ax1.barh(y_pos + j, auc, alpha=0.3 + 0.7 * weight, 
                    color=colors[i], label=model_type if j == 0 else "")
            # Print the component's weight just right of its bar
            ax1.text(auc + 0.01, y_pos + j, f'{weight:.3f}', 
                    va='center', fontsize=8)

        # Advance past this group and leave one empty row before the next
        y_pos += len(aucs) + 1

    ax1.set_xlabel('AUC Score')
    ax1.set_title('Component Model AUCs\n(Bar opacity = weight)')
    ax1.legend()
    ax1.grid(True, alpha=0.3)

    # Plot 2: Weighted mean AUCs
    # NOTE: the plotted value is metadata['weighted_mean_auc'], read here as
    # stored; no ensemble prediction is evaluated in this cell.
    ax2 = axes[1]
    model_names = [mt.replace('_', ' ').title() for mt in model_types]
    weighted_aucs = [ensemble_data['metadata']['weighted_mean_auc'] 
                    for ensemble_data in ensemble_models.values()]

    bars = ax2.bar(model_names, weighted_aucs, color=colors[:len(model_types)])
    ax2.set_ylabel('Weighted Mean AUC')
    ax2.set_title('Ensemble Model Performance')
    ax2.tick_params(axis='x', rotation=45)

    # Add value labels on bars (centred, slightly above each bar's top)
    for bar, auc in zip(bars, weighted_aucs):
        height = bar.get_height()
        ax2.text(bar.get_x() + bar.get_width()/2., height + 0.01,
                f'{auc:.4f}', ha='center', va='bottom')

    plt.tight_layout()

    # Save plot (written to disk before it is displayed)
    plot_path = output_dir / f"ensemble_summary_{edge_type}.png"
    plt.savefig(plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print(f"Summary plot saved to: {plot_path}")

# Assemble the run-level report: the settings used plus the metadata of
# every ensemble, keyed by model type.
summary_report = {
    'edge_type': edge_type,
    'weighting_strategy': weighting_strategy,
    'total_ensembles_created': len(ensemble_models),
    'ensemble_details': {
        model_type: ensemble_data['metadata']
        for model_type, ensemble_data in ensemble_models.items()
    }
}

# Write the report next to the saved ensembles.
summary_path = output_dir / f"ensemble_summary_{edge_type}.json"
with summary_path.open('w') as f:
    json.dump(summary_report, f, indent=2)

print(
    f"\nSummary report saved to: {summary_path}",
    "\nEnsemble model creation completed successfully!",
    sep="\n",
)

## Test Ensemble Models

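Quick smoke test on random inputs to verify that the ensemble models run and return outputs of the expected shape (it does not measure predictive accuracy):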

In [None]:
# Smoke-test every ensemble on random inputs. This only checks that
# prediction runs and shows the output shapes; it says nothing about accuracy.
if ensemble_models:
    print("Testing ensemble models with dummy data...")

    # 10 random samples with 2 features each (the input width the models expect)
    X_test = np.random.rand(10, 2)

    for model_type, ensemble_data in ensemble_models.items():
        try:
            ensemble = ensemble_data['ensemble']

            # Run both prediction entry points on the same data
            proba = ensemble.predict_proba(X_test)
            pred = ensemble.predict(X_test)

            print(
                f"\n{model_type} ensemble test:",
                f"  Probability shape: {proba.shape}",
                f"  Prediction shape: {pred.shape}",
                f"  Sample probabilities: {proba[0]}",
                f"  Sample prediction: {pred[0]}",
                sep="\n",
            )

        except Exception as e:
            # Report the failure and carry on with the remaining ensembles.
            print(f"Error testing {model_type}: {e}")

print("\nTesting completed!")