# Metapath Probability Analysis and Anomaly Detection

This notebook tests whether metapath probabilities in Hetionet are compositional (a product of independent edge probabilities) and implements anomaly detection for the paths actually observed in the graph.

## Research Questions:
1. **Compositionality**: Are metapath probabilities compositional (independent edges) or conditional (dependent edges)?
2. **Degree Dependency**: How do source/target degrees affect path probabilities?
3. **Anomaly Detection**: Can we identify unusual paths based on predicted vs actual probabilities?

## Workflow:
1. **Data Loading & Preparation**
2. **Metapath Extraction**
3. **Probability Calculation Methods**
4. **Compositionality Testing**
5. **Conditional Probability Modeling**
6. **Anomaly Detection Implementation**
7. **Validation & Results**

In [None]:
# Papermill parameters
# Keep these as plain literal assignments so they can be overridden per run.
metapath_pattern = "CbGpPWpG"  # Metapath abbreviation; used in log output and output file names
edge_types = ["CbG", "GpPW", "GpPW"]  # One edge type per hop, in traversal order; the last hop walks GpPW pathway -> gene
anomaly_threshold = 0.05  # Fraction of paths flagged: scores above the (1 - threshold) quantile are anomalies

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from collections import defaultdict, Counter
import warnings
import time
import json
from itertools import combinations
from scipy import stats
from sklearn.metrics import roc_auc_score, precision_recall_curve
import scipy.sparse as sp
# Silence every warning raised by the libraries used in this notebook.
warnings.filterwarnings('ignore')

# Directory layout, resolved relative to the parent of the working directory.
repo_dir = Path.cwd().parent
data_dir = repo_dir.joinpath('data')
results_dir = repo_dir.joinpath('results', 'metapath_analysis')
prediction_dir = repo_dir.joinpath('results', 'model_comparison')

# Only the output directory is created; the input directories must already exist.
results_dir.mkdir(parents=True, exist_ok=True)

print(
    f"Analyzing metapath: {metapath_pattern}",
    f"Edge types: {edge_types}",
    f"Results directory: {results_dir}",
    sep="\n",
)

## 1. Data Loading & Preparation

In [None]:
class MetapathAnalyzer:
    """Extract three-hop metapaths and attach per-edge predicted probabilities.

    Per edge type the analyzer keeps a prediction table (``edge_predictions``)
    and a sparse adjacency matrix (``edge_matrices``).

    Edge types are expected to follow the ``<SOURCE><kind><TARGET>``
    abbreviation style of the defaults (``"CbG"``, ``"GpPW"``): upper-case node
    abbreviations around a lower-case edge kind. A hop whose stored source type
    does not continue the path but whose stored target type does (the second
    ``GpPW`` in ``CbGpPWpG``) is traversed in reverse, i.e. its matrix is
    transposed and its prediction keys are swapped.
    """

    # Columns of the frame returned by ``extract_metapaths``.
    _PATH_COLUMNS = ['compound_idx', 'gene1_idx', 'pathway_idx', 'gene2_idx', 'path_id']
    # Node-index columns in traversal order; hop ``i`` joins entries ``i`` and ``i + 1``.
    _NODE_COLUMNS = ['compound_idx', 'gene1_idx', 'pathway_idx', 'gene2_idx']

    def __init__(self, metapath_pattern, edge_types):
        """Store the metapath description and create empty per-edge-type stores.

        Args:
            metapath_pattern: Metapath abbreviation, used for log output only.
            edge_types: Edge type abbreviation for each hop, in traversal order.
        """
        self.metapath_pattern = metapath_pattern
        self.edge_types = edge_types
        self.edge_predictions = {}  # Store predictions for each edge type
        self.edge_matrices = {}     # Store actual edge matrices
        self.node_mappings = {}     # Store node type mappings

    @staticmethod
    def _split_edge_type(edge_type):
        """Return ``(source, target)`` node abbreviations, or ``None`` if unparseable.

        The leading and trailing runs of upper-case characters are taken as the
        node abbreviations, so ``"GpPW"`` yields ``("G", "PW")``.
        """
        n = len(edge_type)
        head = 0
        while head < n and edge_type[head].isupper():
            head += 1
        tail = n
        while tail > head and edge_type[tail - 1].isupper():
            tail -= 1
        if head == 0 or tail == n or head == tail:
            return None
        return edge_type[:head], edge_type[tail:]

    def _edge_orientations(self):
        """Return one flag per hop: ``True`` when the hop is walked target -> source.

        The first hop is always taken as stored. A later hop is reversed only
        when its stored source type differs from the node type the path has
        reached while its stored target type matches it. Unparseable edge
        types are treated as forward, which is the pre-existing behaviour.
        """
        orientations = []
        current = None
        for edge_type in self.edge_types:
            ends = self._split_edge_type(edge_type)
            if ends is None:
                orientations.append(False)
                current = None
                continue
            source, target = ends
            reverse = current is not None and source != current and target == current
            orientations.append(reverse)
            current = source if reverse else target
        return orientations

    def _oriented_csr(self, edge_type, reverse):
        """Return the edge matrix in CSR form, transposed when the hop is reversed."""
        matrix = self.edge_matrices[edge_type]
        if reverse:
            matrix = matrix.T
        return matrix.tocsr()

    def load_edge_predictions(self, prediction_dir):
        """Load prediction tables and edge matrices for the metapath's edge types.

        Prediction tables are read from
        ``prediction_dir/<type>_results/<type>_all_model_predictions.csv.gz``.
        Edge matrices are read from the module-level ``data_dir`` under
        ``permutations/000.hetmat/edges/<type>.sparse.npz``. Missing files are
        reported and skipped.

        Args:
            prediction_dir: ``pathlib.Path`` of the model comparison results.

        Returns:
            Number of hops in ``edge_types`` for which predictions are
            available. An edge type used by several hops counts once per hop,
            so the value is comparable with ``len(edge_types)``.
        """
        print("Loading edge prediction files...")

        # dict.fromkeys de-duplicates while keeping order, so an edge type that
        # is used for several hops is only read from disk once.
        for edge_type in dict.fromkeys(self.edge_types):
            # Load predictions
            pred_file = prediction_dir / f"{edge_type}_results" / f"{edge_type}_all_model_predictions.csv.gz"

            if pred_file.exists():
                print(f"  Loading {edge_type}: {pred_file}")
                df = pd.read_csv(pred_file)
                self.edge_predictions[edge_type] = df
                print(f"    Shape: {df.shape}, Edges: {df['edge_exists'].sum():,}")
            else:
                print(f"  ⚠ Missing prediction file: {pred_file}")

            # Load actual edge matrix
            # NOTE(review): matrices come from permutations/000.hetmat - confirm
            # that this directory holds the graph the analysis is meant to use.
            matrix_file = data_dir / 'permutations' / '000.hetmat' / 'edges' / f"{edge_type}.sparse.npz"
            if matrix_file.exists():
                self.edge_matrices[edge_type] = sp.load_npz(str(matrix_file))
                print(f"    Matrix shape: {self.edge_matrices[edge_type].shape}")
            else:
                print(f"  ⚠ Missing edge matrix: {matrix_file}")

        return sum(edge_type in self.edge_predictions for edge_type in self.edge_types)

    def extract_metapaths(self, max_paths=10000):
        """Enumerate concrete paths that follow the three-hop metapath.

        The non-zero entries of the first edge matrix are walked in the order
        returned by ``nonzero()``. Each is extended through the second and
        third matrices, each oriented to the traversal direction. Paths that
        return to the gene they left from are skipped.

        Args:
            max_paths: Stop after this many paths have been collected.

        Returns:
            DataFrame with the columns ``compound_idx``, ``gene1_idx``,
            ``pathway_idx``, ``gene2_idx`` and ``path_id``. The columns are
            present even when no path is found.

        Raises:
            ValueError: If ``edge_types`` does not contain exactly three entries.
            KeyError: If an edge matrix has not been loaded for some edge type.
        """
        print(f"\nExtracting metapaths for pattern: {self.metapath_pattern}")

        if len(self.edge_types) != 3:
            raise ValueError("Currently supports 3-edge metapaths only")

        missing = sorted({t for t in self.edge_types if t not in self.edge_matrices})
        if missing:
            raise KeyError(f"No edge matrix loaded for edge type(s): {missing}")

        edge1_type = self.edge_types[0]
        orientations = self._edge_orientations()
        # The first matrix is only iterated, so it is left in its stored format
        # to keep the enumeration order unchanged.
        matrix1 = self.edge_matrices[edge1_type]                                 # C -> G
        matrix2 = self._oriented_csr(self.edge_types[1], orientations[1])       # G -> PW
        matrix3 = self._oriented_csr(self.edge_types[2], orientations[2])       # PW -> G

        metapaths = []

        print(f"  Searching through {matrix1.nnz:,} edges of type {edge1_type}")

        # Find 3-edge paths: C-G-PW-G
        for c_idx, g1_idx in zip(*matrix1.nonzero()):
            if len(metapaths) >= max_paths:
                break

            # Pathways connected to this gene
            for pw_idx in matrix2.getrow(g1_idx).nonzero()[1]:
                # Genes connected to this pathway
                for g2_idx in matrix3.getrow(pw_idx).nonzero()[1]:
                    if g2_idx == g1_idx:  # Require a different target gene
                        continue
                    metapaths.append({
                        'compound_idx': c_idx,
                        'gene1_idx': g1_idx,
                        'pathway_idx': pw_idx,
                        'gene2_idx': g2_idx,
                        'path_id': f"{c_idx}-{g1_idx}-{pw_idx}-{g2_idx}"
                    })
                    if len(metapaths) >= max_paths:
                        break
                if len(metapaths) >= max_paths:
                    break

        print(f"  Found {len(metapaths):,} metapaths")
        # Explicit columns keep the schema stable when nothing was found.
        return pd.DataFrame(metapaths, columns=self._PATH_COLUMNS)

    def calculate_edge_probabilities(self, metapaths_df, model_name='random_forest'):
        """Add ``edge1_prob``..``edge3_prob`` columns with the model's predictions.

        Each prediction table must provide ``source_index``, ``target_index``
        and ``<model_name>_prediction`` columns. Lookups are keyed
        ``(source_index, target_index)``; for a hop that is traversed in
        reverse the path's node indices are swapped before the lookup. Pairs
        without a prediction get ``0.0``. Hops whose edge type has no
        prediction table are reported and left without a column.

        Args:
            metapaths_df: Frame produced by ``extract_metapaths``; modified in place.
            model_name: Prefix of the prediction column to read.

        Returns:
            The same ``metapaths_df`` object with the probability columns added.
        """
        print(f"\nCalculating edge probabilities using {model_name} model...")

        # Add prediction columns
        pred_col = f'{model_name}_prediction'
        orientations = self._edge_orientations()

        for i, edge_type in enumerate(self.edge_types[:3]):
            edge_name = f'edge{i + 1}'
            print(f"  Processing {edge_name} ({edge_type})...")

            if edge_type not in self.edge_predictions:
                print(f"    ⚠ No predictions available for {edge_type}")
                continue

            pred_df = self.edge_predictions[edge_type]

            # Create lookup dictionary for fast access
            lookup = dict(zip(
                zip(pred_df['source_index'], pred_df['target_index']),
                pred_df[pred_col]
            ))

            src_col, dst_col = self._NODE_COLUMNS[i], self._NODE_COLUMNS[i + 1]
            if orientations[i]:
                # Predictions are stored in the edge type's own direction.
                src_col, dst_col = dst_col, src_col

            keys = zip(metapaths_df[src_col], metapaths_df[dst_col])
            metapaths_df[f'{edge_name}_prob'] = [lookup.get(key, 0.0) for key in keys]

        return metapaths_df

In [None]:
# Build the analyzer for the configured metapath.
analyzer = MetapathAnalyzer(metapath_pattern=metapath_pattern, edge_types=edge_types)

# Read the per-edge-type prediction tables (and edge matrices) from disk.
loaded_edges = analyzer.load_edge_predictions(prediction_dir=prediction_dir)
print(f"\nLoaded predictions for {loaded_edges}/{len(edge_types)} edge types")

## 2. Metapath Extraction

In [None]:
# Enumerate concrete paths in the graph, capped at 50,000.
metapaths_df = analyzer.extract_metapaths(max_paths=50000)

# Distinct node indices seen at each position of the path.
unique_counts = metapaths_df[['compound_idx', 'gene1_idx', 'pathway_idx', 'gene2_idx']].nunique()

print(f"\nMetapath extraction results:")
print(f"  Total metapaths found: {len(metapaths_df):,}")
print(f"  Unique compounds: {unique_counts['compound_idx']:,}")
print(f"  Unique gene1s: {unique_counts['gene1_idx']:,}")
print(f"  Unique pathways: {unique_counts['pathway_idx']:,}")
print(f"  Unique gene2s: {unique_counts['gene2_idx']:,}")

# First few rows as a sanity check.
print(f"\nSample metapaths:")
print(metapaths_df.head(5))

## 3. Probability Calculation Methods

In [None]:
# Attach the random-forest probability of every edge to every path.
metapaths_df = analyzer.calculate_edge_probabilities(metapaths_df, model_name='random_forest')

# Compositional estimate: the three edge probabilities multiplied together,
# i.e. the path probability if the edges were independent.
metapaths_df['compositional_prob'] = (
    metapaths_df['edge1_prob']
    .mul(metapaths_df['edge2_prob'])
    .mul(metapaths_df['edge3_prob'])
)

# Keep only paths where every edge has a strictly positive probability.
_all_edges_positive = (metapaths_df[['edge1_prob', 'edge2_prob', 'edge3_prob']] > 0).all(axis=1)
valid_paths = metapaths_df.loc[_all_edges_positive].copy()

_valid_comp = valid_paths['compositional_prob']
print(f"\nProbability calculation results:")
print(f"  Valid paths with all probabilities > 0: {len(valid_paths):,}")
print(f"  Compositional probability range: {_valid_comp.min():.2e} - {_valid_comp.max():.2e}")
print(f"  Mean compositional probability: {_valid_comp.mean():.2e}")

# Per-edge summary statistics over the valid paths.
for edge in ('edge1_prob', 'edge2_prob', 'edge3_prob'):
    _values = valid_paths[edge]
    print(f"  {edge}: mean={_values.mean():.4f}, std={_values.std():.4f}")

## 4. Compositionality Testing

In [None]:
class CompositionalityTester:
    """Test whether metapath probabilities are compositional or conditional.

    Operates on a frame with the path index columns (``compound_idx``,
    ``gene1_idx``, ``pathway_idx``, ``gene2_idx``), the three ``edgeN_prob``
    columns and, for the comparison step, ``compositional_prob``.
    """

    # Columns that together identify one concrete path.
    _PATH_KEYS = ['compound_idx', 'gene1_idx', 'pathway_idx', 'gene2_idx']

    def __init__(self, metapaths_df):
        """Keep a reference to the frame of paths to analyse."""
        self.metapaths_df = metapaths_df

    def test_edge_independence(self):
        """Compute pairwise Spearman correlations between the edge probabilities.

        A pair is reported as independent when ``p > 0.05`` and ``|r| < 0.1``.

        Returns:
            Dict keyed ``"<edgeA>_vs_<edgeB>"`` with ``correlation``,
            ``p_value`` (floats) and ``independent`` (bool).
        """
        print("Testing edge independence...")

        edges = ['edge1_prob', 'edge2_prob', 'edge3_prob']
        independence_results = {}

        for edge1, edge2 in combinations(edges, 2):
            # Spearman correlation (rank-based, robust to non-linearity)
            corr, p_value = stats.spearmanr(
                self.metapaths_df[edge1],
                self.metapaths_df[edge2]
            )
            # Built-in types serialise as JSON numbers/booleans; NumPy booleans
            # would otherwise be written out as the strings "True"/"False".
            corr, p_value = float(corr), float(p_value)
            independent = bool(p_value > 0.05 and abs(corr) < 0.1)

            independence_results[f"{edge1}_vs_{edge2}"] = {
                'correlation': corr,
                'p_value': p_value,
                'independent': independent
            }

            print(f"  {edge1} vs {edge2}: r={corr:.4f}, p={p_value:.2e}, independent={independent}")

        return independence_results

    def calculate_empirical_path_probability(self):
        """Add ``empirical_prob``: each path's share of all rows in the frame.

        Rows are grouped by the full path tuple, and the group size is divided
        by the total number of rows. The stored frame is replaced by a merged
        copy with a fresh ``RangeIndex``; the frame passed to ``__init__`` is
        not modified. An existing ``empirical_prob`` column is recomputed, so
        the method can be called again.

        NOTE(review): when every path tuple occurs exactly once, all counts
        are 1 and ``empirical_prob`` is the constant ``1 / len(frame)``.
        Correlations against a constant are undefined (NaN), so confirm this
        is the intended definition of the empirical probability.

        Returns:
            The merged frame including ``empirical_prob``.
        """
        print("\nCalculating empirical path probabilities...")

        keys = self._PATH_KEYS
        # Drop a stale column first; merging it again would create _x/_y duplicates.
        base = self.metapaths_df.drop(columns=['empirical_prob'], errors='ignore')

        # Count how often each exact path tuple occurs.
        path_counts = base.groupby(keys).size().reset_index(name='path_count')

        # Calculate empirical probability as normalized frequency
        total_possible_paths = len(base)
        path_counts['empirical_prob'] = path_counts['path_count'] / total_possible_paths

        # Merge back to main dataframe
        self.metapaths_df = base.merge(
            path_counts[keys + ['empirical_prob']],
            on=keys,
            how='left'
        )

        print(f"  Empirical probability range: {self.metapaths_df['empirical_prob'].min():.2e} - {self.metapaths_df['empirical_prob'].max():.2e}")
        return self.metapaths_df

    def compare_compositional_vs_empirical(self):
        """Compare ``compositional_prob`` with ``empirical_prob``.

        Requires ``calculate_empirical_path_probability`` to have been run.

        Returns:
            Dict with the Spearman and Pearson coefficients and p-values, the
            R² score of the compositional values against the empirical ones,
            and ``compositional_valid`` (Spearman r > 0.7 with p < 0.01).
        """
        print("\nComparing compositional vs empirical probabilities...")

        compositional = self.metapaths_df['compositional_prob']
        empirical = self.metapaths_df['empirical_prob']

        # Calculate correlation
        corr_spearman, p_spearman = stats.spearmanr(compositional, empirical)
        corr_pearson, p_pearson = stats.pearsonr(compositional, empirical)
        corr_spearman, p_spearman = float(corr_spearman), float(p_spearman)
        corr_pearson, p_pearson = float(corr_pearson), float(p_pearson)

        print(f"  Spearman correlation: r={corr_spearman:.4f}, p={p_spearman:.2e}")
        print(f"  Pearson correlation: r={corr_pearson:.4f}, p={p_pearson:.2e}")

        # Calculate R² for explained variance
        from sklearn.metrics import r2_score
        r2 = float(r2_score(empirical, compositional))
        print(f"  R² (explained variance): {r2:.4f}")

        return {
            'spearman_r': corr_spearman,
            'spearman_p': p_spearman,
            'pearson_r': corr_pearson,
            'pearson_p': p_pearson,
            'r2_score': r2,
            # NaN correlations compare False, so an undefined result is "not valid".
            'compositional_valid': bool(corr_spearman > 0.7 and p_spearman < 0.01)
        }

# Compositionality checks on the valid paths. The order matters: the empirical
# probabilities must exist before they can be compared.
tester = CompositionalityTester(metapaths_df=valid_paths)
independence_results = tester.test_edge_independence()
metapaths_with_empirical = tester.calculate_empirical_path_probability()
compositional_comparison = tester.compare_compositional_vs_empirical()

In [None]:
# Visualize compositionality analysis (2x2 panel figure, saved as PNG)
fig, axes = plt.subplots(2, 2, figsize=(15, 12))

comp_prob = metapaths_with_empirical['compositional_prob']
emp_prob = metapaths_with_empirical['empirical_prob']

# 1. Edge probability correlations (pandas default correlation, i.e. Pearson)
edge_corr_data = metapaths_with_empirical[['edge1_prob', 'edge2_prob', 'edge3_prob']].corr()
sns.heatmap(edge_corr_data, annot=True, cmap='coolwarm', center=0, ax=axes[0,0])
axes[0,0].set_title('Edge Probability Correlations')

# 2. Compositional vs Empirical scatter with the y = x reference line
axes[0,1].scatter(comp_prob, emp_prob, alpha=0.6, s=20)
axes[0,1].plot([0, comp_prob.max()], [0, comp_prob.max()], 'r--', alpha=0.8)
axes[0,1].set_xlabel('Compositional Probability')
axes[0,1].set_ylabel('Empirical Probability')
axes[0,1].set_title(f'Compositional vs Empirical\n(r={compositional_comparison["spearman_r"]:.3f})')
axes[0,1].set_xscale('log')
axes[0,1].set_yscale('log')

# 3. Probability distributions
axes[1,0].hist(comp_prob, bins=50, alpha=0.7, label='Compositional', density=True)
axes[1,0].hist(emp_prob, bins=50, alpha=0.7, label='Empirical', density=True)
axes[1,0].set_xlabel('Probability')
axes[1,0].set_ylabel('Density')
axes[1,0].set_title('Probability Distributions')
axes[1,0].set_xscale('log')
axes[1,0].legend()

# 4. Residuals analysis
residuals = emp_prob - comp_prob
axes[1,1].scatter(comp_prob, residuals, alpha=0.6, s=20)
axes[1,1].axhline(y=0, color='r', linestyle='--', alpha=0.8)
axes[1,1].set_xlabel('Compositional Probability')
axes[1,1].set_ylabel('Residuals (Empirical - Compositional)')
axes[1,1].set_title('Residuals Analysis')
axes[1,1].set_xscale('log')

plt.tight_layout()
plt.savefig(results_dir / f'{metapath_pattern}_compositionality_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

n_independent_pairs = sum(1 for r in independence_results.values() if r['independent'])
print(f"\nCompositionality Analysis Summary:")
print(f"  Edge independence: {n_independent_pairs}/{len(independence_results)} pairs independent")
print(f"  Compositional validity: {compositional_comparison['compositional_valid']}")
print(f"  Explained variance (R²): {compositional_comparison['r2_score']:.4f}")

## 5. Conditional Probability Modeling

In [None]:
class ConditionalProbabilityModel:
    """Estimate bin-conditioned edge probabilities for metapaths.

    Works on a frame that has the three ``edgeN_prob`` columns and, for the
    comparison step, ``compositional_prob`` and ``empirical_prob``.
    """

    def __init__(self, metapaths_df):
        """Keep a reference to the frame; it is extended in place later."""
        self.metapaths_df = metapaths_df

    def calculate_conditional_probabilities(self):
        """Add bin-conditioned edge estimates and their product.

        Each edge probability is cut into up to 10 quantile bins. The
        "conditional" values are means within bins, not probabilities
        estimated from edge co-occurrence:

        * ``edge2_prob_conditional``: mean ``edge2_prob`` of the rows sharing
          the row's ``edge1_prob`` bin.
        * ``edge3_prob_conditional``: mean ``edge3_prob`` of the rows sharing
          both the ``edge1_prob`` and ``edge2_prob`` bins, falling back to the
          row's own ``edge3_prob`` when no group mean is available.
        * ``conditional_prob``: ``edge1_prob`` times the two values above.

        The columns are added to the stored frame in place.

        Returns:
            The stored frame with the new columns.
        """
        print("Calculating conditional probabilities...")

        df = self.metapaths_df

        # Discretize edge probabilities for conditional analysis
        n_bins = 10
        for edge in ['edge1_prob', 'edge2_prob', 'edge3_prob']:
            df[f'{edge}_bin'] = pd.qcut(
                df[edge],
                q=n_bins,
                labels=False,
                duplicates='drop'
            )

        # Mean edge2 probability within each edge1 bin
        edge2_given_edge1 = df.groupby('edge1_prob_bin')['edge2_prob'].mean()
        df['edge2_prob_conditional'] = df['edge1_prob_bin'].map(edge2_given_edge1)

        # Mean edge3 probability within each (edge1 bin, edge2 bin) cell. The
        # transform broadcasts the group mean to every row in one pass; rows
        # without a group (missing bin) keep their own edge3 probability.
        edge3_given_edge12 = df.groupby(['edge1_prob_bin', 'edge2_prob_bin'])['edge3_prob'].transform('mean')
        df['edge3_prob_conditional'] = edge3_given_edge12.fillna(df['edge3_prob'])

        # Calculate conditional path probability
        df['conditional_prob'] = (
            df['edge1_prob'] *
            df['edge2_prob_conditional'] *
            df['edge3_prob_conditional']
        )

        print(f"  Conditional probability range: {df['conditional_prob'].min():.2e} - {df['conditional_prob'].max():.2e}")
        return df

    def compare_probability_models(self):
        """Spearman-correlate the compositional, conditional and empirical columns.

        Returns:
            Dict keyed ``"<a>_vs_<b>"`` for every ordered pair of distinct
            models, each with ``correlation`` and ``p_value``. Both orderings
            of a pair carry the same values.
        """
        print("\nComparing probability models...")

        models = {
            'compositional': 'compositional_prob',
            'conditional': 'conditional_prob',
            'empirical': 'empirical_prob'
        }

        results = {}
        # The correlation is symmetric, so each unordered pair is computed once
        # and reused for the mirrored key.
        computed = {}

        for model1_name, model1_col in models.items():
            for model2_name, model2_col in models.items():
                if model1_name == model2_name:
                    continue

                pair = frozenset((model1_name, model2_name))
                if pair not in computed:
                    corr, p_val = stats.spearmanr(
                        self.metapaths_df[model1_col],
                        self.metapaths_df[model2_col]
                    )
                    computed[pair] = (float(corr), float(p_val))
                corr, p_val = computed[pair]

                results[f"{model1_name}_vs_{model2_name}"] = {
                    'correlation': corr,
                    'p_value': p_val
                }

                print(f"  {model1_name} vs {model2_name}: r={corr:.4f}, p={p_val:.2e}")

        return results

# Conditional modelling on top of the frame that already carries the
# empirical probabilities; the comparison needs the conditional column first.
conditional_model = ConditionalProbabilityModel(metapaths_df=metapaths_with_empirical)
metapaths_final = conditional_model.calculate_conditional_probabilities()
probability_model_comparison = conditional_model.compare_probability_models()

## 6. Anomaly Detection Implementation

In [None]:
class MetapathAnomalyDetector:
    """Detect anomalous metapaths based on probability discrepancies.

    Expects a frame with ``edge1_prob``..``edge3_prob``, ``compositional_prob``
    and ``empirical_prob``. All score columns are added to that frame in place.
    """

    def __init__(self, metapaths_df, threshold=0.05):
        """Store the frame and the fraction of paths to flag as anomalous."""
        self.metapaths_df = metapaths_df
        self.threshold = threshold

    def calculate_anomaly_scores(self):
        """Add the three component scores and the combined ``anomaly_score``.

        * ``residual_score``: ``|empirical - compositional| / compositional``.
        * ``compositional_zscore``: absolute z-score of ``compositional_prob``;
          set to 0 when the standard deviation is zero or undefined, so the
          combined score stays finite.
        * ``edge_consistency_score``: coefficient of variation of the three
          edge probabilities of a path.
        * ``anomaly_score``: 0.4 / 0.3 / 0.3 weighted sum of the min-max
          normalised components.

        Returns:
            The stored frame with the score columns added.
        """
        print(f"Calculating anomaly scores with threshold {self.threshold}...")

        df = self.metapaths_df
        compositional = df['compositional_prob']

        # 1. Probability residual score (empirical vs predicted)
        df['residual_score'] = (df['empirical_prob'] - compositional).abs() / (compositional + 1e-10)

        # 2. Z-score based on compositional probability distribution
        comp_mean = compositional.mean()
        comp_std = compositional.std()
        if comp_std > 0:
            df['compositional_zscore'] = ((compositional - comp_mean) / comp_std).abs()
        else:
            # Constant values (std == 0) or fewer than two rows (std is NaN):
            # no path deviates from the mean.
            df['compositional_zscore'] = 0.0

        # 3. Edge probability consistency score
        edge_probs = df[['edge1_prob', 'edge2_prob', 'edge3_prob']]
        df['edge_consistency_score'] = edge_probs.std(axis=1) / (edge_probs.mean(axis=1) + 1e-10)

        # 4. Combined anomaly score
        df['anomaly_score'] = (
            0.4 * self.normalize_score(df['residual_score']) +
            0.3 * self.normalize_score(df['compositional_zscore']) +
            0.3 * self.normalize_score(df['edge_consistency_score'])
        )

        return df

    def normalize_score(self, scores):
        """Min-max scale ``scores``; the small epsilon avoids division by zero."""
        return (scores - scores.min()) / (scores.max() - scores.min() + 1e-10)

    def detect_anomalies(self):
        """Flag paths whose score exceeds the ``1 - threshold`` quantile.

        Adds the boolean ``is_anomaly`` column to the stored frame.

        Returns:
            Tuple ``(frame, anomalies, normals)`` where the last two are the
            flagged and unflagged rows of the frame.
        """
        print("\nDetecting anomalies...")

        df = self.metapaths_df

        # Define anomalies based on percentile threshold
        anomaly_threshold_value = df['anomaly_score'].quantile(1 - self.threshold)
        df['is_anomaly'] = df['anomaly_score'] > anomaly_threshold_value

        n_anomalies = df['is_anomaly'].sum()
        anomaly_rate = n_anomalies / len(df) if len(df) else float('nan')

        print(f"  Anomaly threshold (score): {anomaly_threshold_value:.4f}")
        print(f"  Detected anomalies: {n_anomalies:,} ({anomaly_rate:.1%})")

        # Analyze anomaly characteristics
        anomalies = df[df['is_anomaly']]
        normals = df[~df['is_anomaly']]

        print(f"\nAnomaly characteristics:")
        print(f"  Mean compositional prob - Anomalies: {anomalies['compositional_prob'].mean():.2e}")
        print(f"  Mean compositional prob - Normal: {normals['compositional_prob'].mean():.2e}")
        print(f"  Mean empirical prob - Anomalies: {anomalies['empirical_prob'].mean():.2e}")
        print(f"  Mean empirical prob - Normal: {normals['empirical_prob'].mean():.2e}")

        return df, anomalies, normals

    def validate_anomaly_detection(self):
        """Compare anomalies and normal paths with two-sided Mann-Whitney U tests.

        Requires ``detect_anomalies`` to have been run. When either group is
        empty the tests cannot be computed, and NaN statistics are returned
        instead of raising.

        Returns:
            Dict with ``compositional_test`` and ``empirical_test``, each
            holding ``statistic`` and ``p_value`` as floats.
        """
        print("\nValidating anomaly detection...")

        anomalies = self.metapaths_df[self.metapaths_df['is_anomaly']]
        normals = self.metapaths_df[~self.metapaths_df['is_anomaly']]

        if anomalies.empty or normals.empty:
            print("  Skipping Mann-Whitney U tests: one of the groups is empty")
            nan = float('nan')
            return {
                'compositional_test': {'statistic': nan, 'p_value': nan},
                'empirical_test': {'statistic': nan, 'p_value': nan}
            }

        # Statistical tests
        from scipy.stats import mannwhitneyu

        # Test difference in compositional probabilities
        stat_comp, p_comp = mannwhitneyu(
            anomalies['compositional_prob'],
            normals['compositional_prob'],
            alternative='two-sided'
        )

        # Test difference in empirical probabilities
        stat_emp, p_emp = mannwhitneyu(
            anomalies['empirical_prob'],
            normals['empirical_prob'],
            alternative='two-sided'
        )

        # Built-in floats keep the results JSON-serialisable as numbers.
        stat_comp, p_comp = float(stat_comp), float(p_comp)
        stat_emp, p_emp = float(stat_emp), float(p_emp)

        print(f"  Compositional prob difference: U={stat_comp:.0f}, p={p_comp:.2e}")
        print(f"  Empirical prob difference: U={stat_emp:.0f}, p={p_emp:.2e}")

        return {
            'compositional_test': {'statistic': stat_comp, 'p_value': p_comp},
            'empirical_test': {'statistic': stat_emp, 'p_value': p_emp}
        }

# Anomaly detection pipeline: score every path, flag the top fraction, then
# test whether the flagged paths differ from the rest.
detector = MetapathAnomalyDetector(metapaths_df=metapaths_final, threshold=anomaly_threshold)
metapaths_with_scores = detector.calculate_anomaly_scores()
metapaths_final, anomalies, normals = detector.detect_anomalies()
validation_results = detector.validate_anomaly_detection()

In [None]:
# Anomaly detection figure: 2 rows x 3 columns of diagnostic panels.
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(18, 12))

# Panel 1 - distribution of the combined score with the cut-off marked.
anomaly_threshold_line = metapaths_final['anomaly_score'].quantile(1 - anomaly_threshold)
ax = axes[0, 0]
ax.hist(metapaths_final['anomaly_score'], bins=50, alpha=0.7, edgecolor='black')
ax.axvline(
    anomaly_threshold_line,
    color='red',
    linestyle='--',
    label=f'Threshold ({anomaly_threshold_line:.3f})',
)
ax.set(xlabel='Anomaly Score', ylabel='Frequency', title='Anomaly Score Distribution')
ax.legend()

# Panel 2 - compositional vs empirical probability, anomalies drawn on top.
ax = axes[0, 1]
ax.scatter(normals['compositional_prob'], normals['empirical_prob'],
           alpha=0.6, s=20, label='Normal', color='blue')
ax.scatter(anomalies['compositional_prob'], anomalies['empirical_prob'],
           alpha=0.8, s=30, label='Anomaly', color='red')
diagonal_end = metapaths_final['compositional_prob'].max()
ax.plot([0, diagonal_end], [0, diagonal_end], 'k--', alpha=0.5)
ax.set(
    xlabel='Compositional Probability',
    ylabel='Empirical Probability',
    title='Anomalies in Probability Space',
    xscale='log',
    yscale='log',
)
ax.legend()

# Panel 3 - mean probability per edge position, normal vs anomalous paths.
edge_cols = ['edge1_prob', 'edge2_prob', 'edge3_prob']
anomaly_means = [anomalies[col].mean() for col in edge_cols]
normal_means = [normals[col].mean() for col in edge_cols]

x_pos = np.arange(len(edge_cols))
width = 0.35

ax = axes[0, 2]
ax.bar(x_pos - width/2, normal_means, width, label='Normal', alpha=0.7)
ax.bar(x_pos + width/2, anomaly_means, width, label='Anomaly', alpha=0.7)
ax.set(xlabel='Edge Position', ylabel='Mean Probability', title='Edge Probability Patterns')
ax.set_xticks(x_pos)
ax.set_xticklabels(['Edge 1', 'Edge 2', 'Edge 3'])
ax.legend()

# Panels 4-6 - one density histogram per component score.
score_components = ['residual_score', 'compositional_zscore', 'edge_consistency_score']
for i, component in enumerate(score_components):
    pretty_name = component.replace('_', ' ').title()
    ax = axes[1, i]
    ax.hist(normals[component], bins=30, alpha=0.7, label='Normal', density=True)
    ax.hist(anomalies[component], bins=30, alpha=0.7, label='Anomaly', density=True)
    ax.set(xlabel=pretty_name, ylabel='Density', title=f'{pretty_name} Distribution')
    ax.legend()

plt.tight_layout()
plt.savefig(results_dir / f'{metapath_pattern}_anomaly_detection.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. Validation & Results

In [None]:
# Save comprehensive results
print("Saving analysis results...")


def _json_default(value):
    """Serialise NumPy scalars/arrays as native JSON values; stringify anything else."""
    if isinstance(value, np.generic):
        return value.item()
    if isinstance(value, np.ndarray):
        return value.tolist()
    return str(value)


# Save metapaths with all probability calculations and anomaly scores
metapaths_output_file = results_dir / f'{metapath_pattern}_metapath_analysis.csv'
metapaths_final.to_csv(metapaths_output_file, index=False)
print(f"✓ Saved metapath analysis: {metapaths_output_file}")

# Save anomalies separately for detailed inspection
anomalies_file = results_dir / f'{metapath_pattern}_anomalies.csv'
anomalies.to_csv(anomalies_file, index=False)
print(f"✓ Saved anomalies: {anomalies_file}")

# Save summary statistics
summary_stats = {
    'metapath_pattern': metapath_pattern,
    'edge_types': edge_types,
    # All extracted paths vs. the subset with every edge probability > 0.
    'total_metapaths': len(metapaths_df),
    'valid_metapaths': len(valid_paths),
    'anomaly_threshold': anomaly_threshold,
    'detected_anomalies': int(metapaths_final['is_anomaly'].sum()),
    # Rate over all scored paths; the anomalies-only frame would always give 1.0.
    'anomaly_rate': float(metapaths_final['is_anomaly'].mean()),

    # Compositionality results
    'compositionality': {
        'edge_independence_pairs': independence_results,
        'compositional_vs_empirical': compositional_comparison,
        'probability_model_comparison': probability_model_comparison
    },

    # Anomaly detection validation
    'anomaly_validation': validation_results,

    # Summary statistics
    'probability_statistics': {
        'compositional_mean': float(metapaths_final['compositional_prob'].mean()),
        'compositional_std': float(metapaths_final['compositional_prob'].std()),
        'empirical_mean': float(metapaths_final['empirical_prob'].mean()),
        'empirical_std': float(metapaths_final['empirical_prob'].std()),
        'conditional_mean': float(metapaths_final['conditional_prob'].mean()),
        'conditional_std': float(metapaths_final['conditional_prob'].std())
    }
}

# Save summary
summary_file = results_dir / f'{metapath_pattern}_analysis_summary.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary_stats, f, indent=2, default=_json_default)
print(f"✓ Saved summary: {summary_file}")

print(f"\nAnalysis complete! Results saved to: {results_dir}")

In [None]:
# Final comprehensive summary
print("=" * 80)
print(f"METAPATH PROBABILITY ANALYSIS SUMMARY")
print("=" * 80)

print(f"\nMetapath: {metapath_pattern}")
print(f"Edge types: {' → '.join(edge_types)}")
print(f"Total metapaths analyzed: {len(metapaths_final):,}")

print(f"\n🔍 COMPOSITIONALITY ANALYSIS:")
print(f"  Edge independence: {sum(1 for r in independence_results.values() if r['independent'])}/{len(independence_results)} pairs independent")
print(f"  Compositional validity: {compositional_comparison['compositional_valid']}")
print(f"  Compositional vs Empirical correlation: {compositional_comparison['spearman_r']:.4f}")
print(f"  Explained variance (R²): {compositional_comparison['r2_score']:.4f}")

# Rank only the "<model>_vs_empirical" comparisons with a defined correlation.
# Scoring other pairs as 0 would let an irrelevant pair win whenever the
# empirical correlations are negative or NaN.
empirical_correlations = {
    name: result['correlation']
    for name, result in probability_model_comparison.items()
    if name.endswith('_vs_empirical') and not np.isnan(result['correlation'])
}
if empirical_correlations:
    best_name = max(empirical_correlations, key=empirical_correlations.get)
    best_model = (best_name, probability_model_comparison[best_name])
    print(f"  Best probability model: {best_model[0]} (r={best_model[1]['correlation']:.4f})")
else:
    best_model = None
    print("  Best probability model: undetermined (no finite correlation with empirical probabilities)")

print(f"\n🎯 ANOMALY DETECTION:")
print(f"  Anomaly threshold: {anomaly_threshold:.1%}")
anomaly_share = len(anomalies) / len(metapaths_final) if len(metapaths_final) else float('nan')
print(f"  Detected anomalies: {len(anomalies):,} ({anomaly_share:.1%})")
print(f"  Anomaly score range: {metapaths_final['anomaly_score'].min():.4f} - {metapaths_final['anomaly_score'].max():.4f}")
print(f"  Statistical significance: p < 0.01" if validation_results['compositional_test']['p_value'] < 0.01 else "  Not statistically significant")

print(f"\n📊 PROBABILITY SUMMARY:")
for prob_type in ['compositional', 'conditional', 'empirical']:
    col_name = f'{prob_type}_prob'
    mean_val = metapaths_final[col_name].mean()
    print(f"  {prob_type.capitalize()} probability: {mean_val:.2e} (±{metapaths_final[col_name].std():.2e})")

print(f"\n📁 OUTPUT FILES:")
output_files = list(results_dir.glob(f'{metapath_pattern}_*'))
for file_path in sorted(output_files):
    file_size = file_path.stat().st_size / (1024*1024)
    print(f"  - {file_path.name} ({file_size:.1f} MB)")

print("=" * 80)

## Conclusions

This analysis provides insights into:

1. **Compositionality**: Whether metapath probabilities can be calculated as independent edge products
2. **Conditional Dependencies**: How edge probabilities depend on previous edges in the path
3. **Anomaly Detection**: Identification of unusual metapaths for further investigation
4. **Model Validation**: Comparison of different probability calculation approaches

The results can be used for:
- **Drug Discovery**: Identifying unusual compound-gene-pathway-gene relationships
- **Network Analysis**: Understanding path probability patterns in biological networks
- **Quality Control**: Detecting potential data quality issues or interesting biological phenomena
- **Predictive Modeling**: Improving metapath-based prediction algorithms