# Degree-Aware Compositional Model

## Research Questions

1. **Does degree stratification improve compositional model fit?**
2. **How much of the conditional structure is explained by degree heterogeneity alone?**
3. **Can degree-aware priors reduce PMI and improve predictions?**

## Approach

### Naive Compositional Model
$$P(C \to P) = \sum_{g} P(C \to g) \times P(g \to P)$$

Assumes the two edges of the metapath are independent of each other, regardless of node degrees.

### Degree-Aware Compositional Model
$$P(C \to P | \text{deg}(C)=u, \text{deg}(P)=v) = \sum_{w} \sum_{g: \text{deg}(g)=w} P(C \to g | u, w) \times P(g \to P | w, v)$$

Stratifies by node degrees before applying compositional assumptions.

## Key Hypothesis from Notebook 11

Since Hetionet and null networks have statistically indistinguishable PMI distributions (p=0.715), the conditional structure appears to be **degree-driven**, not biology-driven. Therefore:

✓ Degree-aware models should improve fit in **both** Hetionet and null networks  
✓ Improvement should be **similar** in magnitude  
✓ Remaining residuals may reflect biological dependencies

In [None]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.sparse as sp
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_squared_error, mean_absolute_error
from collections import defaultdict
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Setup paths
# The repository root is taken to be the parent of the current working
# directory, so this cell presumably expects to be run from a sub-directory
# of the repo (e.g. a notebooks folder) — confirm before running elsewhere.
repo_dir = Path.cwd().parent
src_dir = repo_dir / 'src'
data_dir = repo_dir / 'data'
results_dir = repo_dir / 'results' / 'degree_aware_compositionality'
# Create the output directory (and any missing parents); no error if it exists.
results_dir.mkdir(parents=True, exist_ok=True)

# Make modules under src/ importable from this notebook.
sys.path.append(str(src_dir))

print(f"Repository: {repo_dir}")
print(f"Results: {results_dir}")

# Set plot style
sns.set_style('whitegrid')
plt.rcParams['figure.dpi'] = 100

## Configuration

In [None]:
# Test metapath: CbGpPW (same as notebook 11)
# `metapath` lists the two edge types composed in order; each name is used as
# the file stem when the edge matrix is loaded.
metapath = ['CbG', 'GpPW']
metapath_name = 'CbGpPW'

# Permutations to analyze
PERMUTATION_IDS = [1, 2, 3, 4, 5]  # First 5 null networks
HETIONET_ID = 0  # Real network

# Degree bins (same as notebook 11)
# Consecutive values are bin edges; there is one label per pair of edges.
# NOTE: assign_degree_bin() tests `degree < threshold`, i.e. it treats each
# bin as the left-closed interval [lower, upper).
DEGREE_BINS = [0, 5, 20, 100, np.inf]
DEGREE_LABELS = ['Very Low (0-5)', 'Low (5-20)', 'Medium (20-100)', 'High (>100)']

print(f"Testing metapath: {metapath_name}")
print(f"  Edge 1: {metapath[0]} (Compound → Gene)")
print(f"  Edge 2: {metapath[1]} (Gene → Pathway)")
print(f"\nAnalyzing Hetionet ({HETIONET_ID:03d}) + {len(PERMUTATION_IDS)} permutations")
print(f"Degree bins: {DEGREE_LABELS}")

## Helper Functions

In [None]:
def load_edge_matrix(edge_type: str, perm_id: int = 0) -> sp.csr_matrix:
    """Read the sparse matrix stored for one edge type of one permutation.

    Args:
        edge_type: Edge abbreviation used as the file stem (e.g. ``'CbG'``).
        perm_id: Permutation number; it is zero-padded to three digits to
            form the ``<perm_id>.hetmat`` directory name.

    Returns:
        The object ``scipy.sparse.load_npz`` yields for
        ``data_dir/permutations/<perm_id>.hetmat/edges/<edge_type>.sparse.npz``.
    """
    hetmat_dir = data_dir / 'permutations' / f'{perm_id:03d}.hetmat'
    npz_path = hetmat_dir / 'edges' / f'{edge_type}.sparse.npz'
    return sp.load_npz(npz_path)

def analytical_prior(u: float, v: float, m: float) -> float:
    """Evaluate the closed-form edge prior ``uv / sqrt((uv)^2 + (m - u - v + 1)^2)``.

    Args:
        u: Degree of the source node.
        v: Degree of the target node.
        m: Edge count used for normalization.

    Returns:
        The prior value, or ``0.0`` when the denominator is not strictly
        positive (which also covers a NaN denominator).
    """
    degree_product = u * v
    remainder = m - u - v + 1
    norm = np.sqrt(degree_product**2 + remainder**2)
    if norm > 0:
        return degree_product / norm
    return 0.0

def assign_degree_bin(degree: int, bins=DEGREE_BINS) -> int:
    """Map a degree to the index of the first bin whose upper edge exceeds it.

    Args:
        degree: Node degree to classify.
        bins: Ascending bin edges; ``bins[0]`` is the lowest edge and every
            later entry is the (exclusive) upper edge of one bin.

    Returns:
        Zero-based bin index. When no upper edge is greater than ``degree``
        the index of the last bin, ``len(bins) - 2``, is returned.
    """
    upper_edges = bins[1:]
    last_bin = len(bins) - 2
    return next(
        (idx for idx, upper in enumerate(upper_edges) if degree < upper),
        last_bin,
    )

def compute_observed_metapath_frequencies(edge1: sp.csr_matrix, edge2: sp.csr_matrix) -> tuple:
    """Compute, for every connected (compound, pathway) pair, the fraction of
    the compound's genes that also belong to the pathway.

    Args:
        edge1: Compound x gene sparse matrix (first edge of the metapath).
        edge2: Gene x pathway sparse matrix (second edge of the metapath).

    Returns:
        A 5-tuple ``(observed_freq, edge1_filt, edge2_filt, compound_nonzero,
        pathway_nonzero)`` where

        * ``observed_freq`` maps ``(compound_index, pathway_index)`` — indices
          into the ORIGINAL matrices — to ``shared_genes / compound_genes``;
        * ``edge1_filt`` is ``edge1`` restricted to rows with positive degree;
        * ``edge2_filt`` is ``edge2`` restricted to columns with positive degree;
        * ``compound_nonzero`` / ``pathway_nonzero`` are the kept row / column
          indices.
    """
    # Filter zero-degree nodes
    compound_degrees = np.array(edge1.sum(axis=1)).flatten()
    pathway_degrees = np.array(edge2.sum(axis=0)).flatten()

    compound_nonzero = np.where(compound_degrees > 0)[0]
    pathway_nonzero = np.where(pathway_degrees > 0)[0]

    edge1_filt = edge1[compound_nonzero, :]
    edge2_filt = edge2[:, pathway_nonzero]

    # Compute metapath matrix; its nonzero pattern lists the connected pairs
    metapath_matrix = edge1_filt @ edge2_filt

    # Column access is cheap on CSC; convert once instead of slicing the
    # row-oriented matrix for every pair.
    edge2_cols = edge2_filt.tocsc()

    # Each row/column shows up in many pairs, so build its gene set once.
    compound_gene_sets = {}
    pathway_gene_sets = {}

    # Compute observed frequencies
    observed_freq = {}
    for i, j in zip(*metapath_matrix.nonzero()):
        compound_genes = compound_gene_sets.get(i)
        if compound_genes is None:
            compound_genes = set(edge1_filt.getrow(i).nonzero()[1])
            compound_gene_sets[i] = compound_genes

        pathway_genes = pathway_gene_sets.get(j)
        if pathway_genes is None:
            pathway_genes = set(edge2_cols.getcol(j).nonzero()[0])
            pathway_gene_sets[j] = pathway_genes

        n_paths = len(compound_genes & pathway_genes)
        n_possible = len(compound_genes)
        if n_possible > 0:
            # Key by the indices of the unfiltered matrices
            observed_freq[(compound_nonzero[i], pathway_nonzero[j])] = n_paths / n_possible

    return observed_freq, edge1_filt, edge2_filt, compound_nonzero, pathway_nonzero

print("Helper functions defined")

## 1. Naive Compositional Model

In [None]:
def compute_naive_compositional_model(edge1: sp.csr_matrix, edge2: sp.csr_matrix) -> dict:
    """
    Compute naive compositional predictions.
    P(C→P) = sum_g P(C→g) × P(g→P)

    Edge priors come from ``analytical_prior`` using the node degrees and the
    total edge count of each matrix. Priors are only defined for edges present
    in the matrices, so the sum runs over genes adjacent to both the compound
    and the pathway.

    Args:
        edge1: Compound x gene sparse matrix.
        edge2: Gene x pathway sparse matrix.

    Returns:
        Dict mapping ``(compound_index, pathway_index)`` to the summed prior
        product; pairs whose sum is not positive are omitted.
    """
    # Get degrees
    compound_degrees = np.array(edge1.sum(axis=1)).flatten()
    pathway_degrees = np.array(edge2.sum(axis=0)).flatten()
    gene_degrees_1 = np.array(edge1.sum(axis=0)).flatten()
    gene_degrees_2 = np.array(edge2.sum(axis=1)).flatten()

    m1 = edge1.nnz
    m2 = edge2.nnz

    # Compute edge priors
    edge1_priors = {}
    for i, j in zip(*edge1.nonzero()):
        u, v = compound_degrees[i], gene_degrees_1[j]
        if u > 0 and v > 0:
            edge1_priors[(i, j)] = analytical_prior(u, v, m1)

    edge2_priors = {}
    for i, j in zip(*edge2.nonzero()):
        u, v = gene_degrees_2[i], pathway_degrees[j]
        if u > 0 and v > 0:
            edge2_priors[(i, j)] = analytical_prior(u, v, m2)

    n_compounds = edge1.shape[0]
    n_pathways = edge2.shape[1]

    # A pathway's gene set does not depend on the compound, so extract every
    # column once (from a CSC copy, where column access is cheap) instead of
    # once per (compound, pathway) pair. Zero-degree pathways are skipped.
    edge2_cols = edge2.tocsc()
    pathway_gene_sets = {
        j: set(edge2_cols.getcol(j).nonzero()[0])
        for j in range(n_pathways)
        if pathway_degrees[j] != 0
    }

    # Compute compositional probabilities
    compositional_prob = {}
    for i in range(n_compounds):
        if compound_degrees[i] == 0:
            continue
        # Built once per compound rather than once per pathway
        compound_genes = set(edge1.getrow(i).nonzero()[1])

        for j, pathway_genes in pathway_gene_sets.items():
            total_prob = 0.0
            for gene in compound_genes & pathway_genes:
                p1 = edge1_priors.get((i, gene), 0.0)
                p2 = edge2_priors.get((gene, j), 0.0)
                total_prob += p1 * p2

            if total_prob > 0:
                compositional_prob[(i, j)] = total_prob

    return compositional_prob

print("Naive compositional model function defined")

## 2. Degree-Aware Compositional Model

In [None]:
def compute_degree_aware_compositional_model(edge1: sp.csr_matrix, edge2: sp.csr_matrix) -> dict:
    """
    Compute degree-aware compositional predictions.

    Key difference from naive model:
    - Edge priors are computed WITHIN degree bins
    - Each bin has its own normalization (total edges in that bin)

    Concretely, the ``m`` passed to ``analytical_prior`` for an edge is the
    number of edges whose endpoints fall in the same (source bin, target bin)
    pair, with bins given by ``assign_degree_bin``.

    Args:
        edge1: Compound x gene sparse matrix.
        edge2: Gene x pathway sparse matrix.

    Returns:
        Dict mapping ``(compound_index, pathway_index)`` to the summed prior
        product; pairs whose sum is not positive are omitted.
    """
    # Get degrees
    compound_degrees = np.array(edge1.sum(axis=1)).flatten()
    pathway_degrees = np.array(edge2.sum(axis=0)).flatten()
    gene_degrees_1 = np.array(edge1.sum(axis=0)).flatten()
    gene_degrees_2 = np.array(edge2.sum(axis=1)).flatten()

    # Assign degree bins
    compound_bins = np.array([assign_degree_bin(d) for d in compound_degrees])
    pathway_bins = np.array([assign_degree_bin(d) for d in pathway_degrees])
    gene_bins_1 = np.array([assign_degree_bin(d) for d in gene_degrees_1])
    gene_bins_2 = np.array([assign_degree_bin(d) for d in gene_degrees_2])

    # Count edges per degree bin combination for normalization
    edge1_counts = defaultdict(int)
    edge2_counts = defaultdict(int)

    for i, j in zip(*edge1.nonzero()):
        edge1_counts[(compound_bins[i], gene_bins_1[j])] += 1

    for i, j in zip(*edge2.nonzero()):
        edge2_counts[(gene_bins_2[i], pathway_bins[j])] += 1

    # Compute degree-aware edge priors
    edge1_priors = {}
    for i, j in zip(*edge1.nonzero()):
        u, v = compound_degrees[i], gene_degrees_1[j]
        m_bin = edge1_counts[(compound_bins[i], gene_bins_1[j])]
        if u > 0 and v > 0 and m_bin > 0:
            edge1_priors[(i, j)] = analytical_prior(u, v, m_bin)

    edge2_priors = {}
    for i, j in zip(*edge2.nonzero()):
        u, v = gene_degrees_2[i], pathway_degrees[j]
        m_bin = edge2_counts[(gene_bins_2[i], pathway_bins[j])]
        if u > 0 and v > 0 and m_bin > 0:
            edge2_priors[(i, j)] = analytical_prior(u, v, m_bin)

    n_compounds = edge1.shape[0]
    n_pathways = edge2.shape[1]

    # A pathway's gene set does not depend on the compound, so extract every
    # column once (from a CSC copy, where column access is cheap) instead of
    # once per (compound, pathway) pair. Zero-degree pathways are skipped.
    edge2_cols = edge2.tocsc()
    pathway_gene_sets = {
        j: set(edge2_cols.getcol(j).nonzero()[0])
        for j in range(n_pathways)
        if pathway_degrees[j] != 0
    }

    # Compute compositional probabilities (same as naive)
    compositional_prob = {}
    for i in range(n_compounds):
        if compound_degrees[i] == 0:
            continue
        # Built once per compound rather than once per pathway
        compound_genes = set(edge1.getrow(i).nonzero()[1])

        for j, pathway_genes in pathway_gene_sets.items():
            total_prob = 0.0
            for gene in compound_genes & pathway_genes:
                p1 = edge1_priors.get((i, gene), 0.0)
                p2 = edge2_priors.get((gene, j), 0.0)
                total_prob += p1 * p2

            if total_prob > 0:
                compositional_prob[(i, j)] = total_prob

    return compositional_prob

print("Degree-aware compositional model function defined")

## 3. Analyze Hetionet

In [None]:
# Run the observed-frequency computation and both compositional models on the
# real network (permutation id HETIONET_ID). The resulting dicts are keyed by
# (compound index, pathway index) and are consumed by the later cells.
print("Analyzing Hetionet...\n")

# Load edges
edge1_het = load_edge_matrix(metapath[0], HETIONET_ID)
edge2_het = load_edge_matrix(metapath[1], HETIONET_ID)

print(f"Edge matrices loaded:")
print(f"  {metapath[0]}: {edge1_het.shape}, {edge1_het.nnz} edges")
print(f"  {metapath[1]}: {edge2_het.shape}, {edge2_het.nnz} edges")

# Compute observed frequencies
# Only the frequency dict is needed here; the filtered matrices and index
# arrays that are also returned are discarded.
print("\nComputing observed metapath frequencies...")
observed_freq, _, _, _, _ = compute_observed_metapath_frequencies(edge1_het, edge2_het)
print(f"  Found {len(observed_freq)} metapath pairs")

# Compute naive compositional predictions
print("\nComputing naive compositional model...")
naive_pred = compute_naive_compositional_model(edge1_het, edge2_het)
print(f"  Generated {len(naive_pred)} predictions")

# Compute degree-aware compositional predictions
print("\nComputing degree-aware compositional model...")
degree_aware_pred = compute_degree_aware_compositional_model(edge1_het, edge2_het)
print(f"  Generated {len(degree_aware_pred)} predictions")

## 4. Compare Model Performance

In [None]:
# Find common pairs for fair comparison
# Only pairs present in the observed frequencies AND in both prediction dicts
# are scored, so all metrics below are computed over the same pairs.
common_pairs = set(observed_freq.keys()) & set(naive_pred.keys()) & set(degree_aware_pred.keys())
print(f"Common pairs across all three: {len(common_pairs)}")

# Extract values for common pairs
# The set is iterated three times without being modified in between, so the
# three arrays are aligned element by element.
y_true = np.array([observed_freq[pair] for pair in common_pairs])
y_naive = np.array([naive_pred[pair] for pair in common_pairs])
y_degree_aware = np.array([degree_aware_pred[pair] for pair in common_pairs])

# Compute metrics
# pearsonr returns (statistic, p-value); only the correlation is kept.
naive_corr = pearsonr(y_true, y_naive)[0]
degree_aware_corr = pearsonr(y_true, y_degree_aware)[0]

naive_rmse = np.sqrt(mean_squared_error(y_true, y_naive))
degree_aware_rmse = np.sqrt(mean_squared_error(y_true, y_degree_aware))

naive_mae = mean_absolute_error(y_true, y_naive)
degree_aware_mae = mean_absolute_error(y_true, y_degree_aware)

print("\n" + "="*80)
print("MODEL COMPARISON - HETIONET")
print("="*80)

print(f"\nNaive Compositional Model:")
print(f"  Correlation: {naive_corr:.4f}")
print(f"  RMSE:        {naive_rmse:.4f}")
print(f"  MAE:         {naive_mae:.4f}")

print(f"\nDegree-Aware Compositional Model:")
print(f"  Correlation: {degree_aware_corr:.4f}")
print(f"  RMSE:        {degree_aware_rmse:.4f}")
print(f"  MAE:         {degree_aware_mae:.4f}")

# Deltas are degree-aware minus naive; percentages are relative to the naive
# value (its absolute value for the correlation). For RMSE and MAE a negative
# delta means the degree-aware model has the smaller error.
print(f"\nImprovement:")
print(f"  Δ Correlation: {degree_aware_corr - naive_corr:+.4f} ({(degree_aware_corr - naive_corr) / abs(naive_corr) * 100:+.1f}%)")
print(f"  Δ RMSE:        {degree_aware_rmse - naive_rmse:+.4f} ({(degree_aware_rmse - naive_rmse) / naive_rmse * 100:+.1f}%)")
print(f"  Δ MAE:         {degree_aware_mae - naive_mae:+.4f} ({(degree_aware_mae - naive_mae) / naive_mae * 100:+.1f}%)")

# The verdict is based on the correlation alone, not on RMSE or MAE.
if degree_aware_corr > naive_corr:
    print(f"\n✓ DEGREE-AWARE MODEL IMPROVES FIT")
else:
    print(f"\n✗ No improvement from degree stratification")

## 5. PMI Analysis

In [None]:
# Compute PMI for both models
def compute_pmi(observed, predicted):
    """Return log2(observed / predicted) for every pair where both are positive.

    Pairs in which either value is not strictly positive are dropped, so the
    result may be shorter than the inputs.
    """
    return np.array([
        np.log2(obs_val / pred_val)
        for obs_val, pred_val in zip(observed, predicted)
        if obs_val > 0 and pred_val > 0
    ])

# PMI of the observed frequencies against each model's predictions, over the
# common pairs extracted in the comparison cell.
pmi_naive = compute_pmi(y_true, y_naive)
pmi_degree_aware = compute_pmi(y_true, y_degree_aware)

print("\nPMI Analysis:")
print(f"\nNaive Model:")
print(f"  Mean PMI:   {pmi_naive.mean():.4f}")
print(f"  Median PMI: {np.median(pmi_naive):.4f}")
print(f"  Std PMI:    {pmi_naive.std():.4f}")

print(f"\nDegree-Aware Model:")
print(f"  Mean PMI:   {pmi_degree_aware.mean():.4f}")
print(f"  Median PMI: {np.median(pmi_degree_aware):.4f}")
print(f"  Std PMI:    {pmi_degree_aware.std():.4f}")

# The printed delta is degree-aware minus naive (signed).
print(f"\nPMI Reduction:")
print(f"  Δ Mean PMI: {pmi_degree_aware.mean() - pmi_naive.mean():.4f}")

# "Reduction" is judged on the magnitude of the mean PMI, i.e. its distance
# from zero, not on the signed delta printed above.
if abs(pmi_degree_aware.mean()) < abs(pmi_naive.mean()):
    print(f"\n✓ DEGREE-AWARE MODEL REDUCES PMI (closer to 0 = better compositional fit)")
else:
    print(f"\n→ No PMI reduction")

## 6. Stratified Analysis by Degree Bins

In [None]:
# Per-degree-bin breakdown of model fit on Hetionet, over the common pairs.

# Get degrees for stratification
compound_degrees = np.array(edge1_het.sum(axis=1)).flatten()
pathway_degrees = np.array(edge2_het.sum(axis=0)).flatten()

# Create results dataframe
# common_pairs is iterated in the same order that produced y_true / y_naive /
# y_degree_aware, so the index columns line up with the value columns.
results_df = pd.DataFrame({
    'compound_idx': [pair[0] for pair in common_pairs],
    'pathway_idx': [pair[1] for pair in common_pairs],
    'observed': y_true,
    'naive_pred': y_naive,
    'degree_aware_pred': y_degree_aware
})

results_df['compound_degree'] = results_df['compound_idx'].map(lambda i: compound_degrees[i])
results_df['pathway_degree'] = results_df['pathway_idx'].map(lambda i: pathway_degrees[i])

# right=False makes the bins left-closed, [lower, upper), which is the
# convention assign_degree_bin() uses (`degree < threshold`). With pandas'
# default right-closed bins, nodes whose degree equals a bin edge (5, 20, 100)
# would be reported in a different bin from the one the degree-aware model
# normalized them in.
results_df['compound_bin'] = pd.cut(results_df['compound_degree'], bins=DEGREE_BINS, labels=DEGREE_LABELS, right=False)
results_df['pathway_bin'] = pd.cut(results_df['pathway_degree'], bins=DEGREE_BINS, labels=DEGREE_LABELS, right=False)

results_df['pmi_naive'] = np.log2(results_df['observed'] / results_df['naive_pred'])
results_df['pmi_degree_aware'] = np.log2(results_df['observed'] / results_df['degree_aware_pred'])

print("\nPerformance by Compound Degree Bin:")
print("="*80)

for bin_label in DEGREE_LABELS:
    subset = results_df[results_df['compound_bin'] == bin_label]
    # A correlation needs at least two points; smaller bins are not reported.
    if len(subset) > 1:
        naive_corr_bin = pearsonr(subset['observed'], subset['naive_pred'])[0]
        da_corr_bin = pearsonr(subset['observed'], subset['degree_aware_pred'])[0]

        print(f"\n{bin_label} (n={len(subset)}):")
        print(f"  Naive correlation:        {naive_corr_bin:.4f}")
        print(f"  Degree-aware correlation: {da_corr_bin:.4f}")
        print(f"  Improvement:              {da_corr_bin - naive_corr_bin:+.4f}")
        print(f"  Mean PMI (naive):         {subset['pmi_naive'].mean():.4f}")
        print(f"  Mean PMI (degree-aware):  {subset['pmi_degree_aware'].mean():.4f}")

## 7. Null Network Comparison

In [None]:
# Repeat the naive vs degree-aware comparison on each permuted (null) network
# and compare the change in correlation with the one measured on Hetionet.
print(f"\nAnalyzing {len(PERMUTATION_IDS)} null networks...\n")

null_results = []

for perm_id in tqdm(PERMUTATION_IDS, desc="Processing permutations"):
    # Load edges
    edge1_perm = load_edge_matrix(metapath[0], perm_id)
    edge2_perm = load_edge_matrix(metapath[1], perm_id)

    # Compute observed
    obs_freq, _, _, _, _ = compute_observed_metapath_frequencies(edge1_perm, edge2_perm)

    # Compute predictions
    naive_perm = compute_naive_compositional_model(edge1_perm, edge2_perm)
    da_perm = compute_degree_aware_compositional_model(edge1_perm, edge2_perm)

    # Common pairs
    common = set(obs_freq.keys()) & set(naive_perm.keys()) & set(da_perm.keys())

    # Permutations with fewer than two common pairs contribute no row.
    if len(common) > 1:
        y_obs = np.array([obs_freq[p] for p in common])
        y_naive_p = np.array([naive_perm[p] for p in common])
        y_da_p = np.array([da_perm[p] for p in common])

        naive_corr_p = pearsonr(y_obs, y_naive_p)[0]
        da_corr_p = pearsonr(y_obs, y_da_p)[0]

        null_results.append({
            'perm_id': perm_id,
            'naive_corr': naive_corr_p,
            'degree_aware_corr': da_corr_p,
            'improvement': da_corr_p - naive_corr_p,
            'n_pairs': len(common)
        })

# One row per permutation that passed the check above.
# NOTE(review): if no permutation passes, null_results is empty and the column
# lookups below would presumably fail — confirm whether that case can occur.
null_df = pd.DataFrame(null_results)

print("\n" + "="*80)
print("NULL NETWORK RESULTS")
print("="*80)

print(f"\nNull Networks (n={len(null_df)}):")
print(f"  Naive correlation:        {null_df['naive_corr'].mean():.4f} ± {null_df['naive_corr'].std():.4f}")
print(f"  Degree-aware correlation: {null_df['degree_aware_corr'].mean():.4f} ± {null_df['degree_aware_corr'].std():.4f}")
print(f"  Mean improvement:         {null_df['improvement'].mean():+.4f} ± {null_df['improvement'].std():.4f}")

print(f"\nHetionet vs Null:")
# het_improvement and null_improvement are reused by the plotting, saving and
# conclusion cells.
het_improvement = degree_aware_corr - naive_corr
null_improvement = null_df['improvement'].mean()
print(f"  Hetionet improvement:     {het_improvement:+.4f}")
print(f"  Null improvement:         {null_improvement:+.4f}")
print(f"  Difference:               {het_improvement - null_improvement:+.4f}")

# "Similar" is a fixed cut-off of 0.01 on the difference in Δr, not a
# statistical test.
if abs(het_improvement - null_improvement) < 0.01:
    print(f"\n✓ SIMILAR IMPROVEMENT IN HETIONET AND NULL")
    print(f"  → Confirms degree structure is main driver of conditional dependencies")
    print(f"  → Biology adds minimal signal beyond degrees")
else:
    print(f"\n→ DIFFERENT IMPROVEMENT PATTERNS")
    print(f"  → May indicate biological signal beyond degree structure")

## 8. Visualizations

In [None]:
# Create comprehensive visualization
# A 2x3 grid of panels built from the Hetionet arrays (y_true, y_naive,
# y_degree_aware, pmi_*), the per-pair results_df and the null-network null_df.
fig, axes = plt.subplots(2, 3, figsize=(20, 13))

# 1. Scatter: Observed vs Naive
ax = axes[0, 0]
ax.scatter(y_true, y_naive, alpha=0.5, s=10, edgecolors='none')
# Dashed red diagonal marks prediction == observation.
ax.plot([0, 1], [0, 1], 'r--', alpha=0.8, linewidth=2)
ax.set_xlabel('Observed Frequency', fontsize=12)
ax.set_ylabel('Naive Compositional Prediction', fontsize=12)
ax.set_title(f'Naive Model (r={naive_corr:.3f})', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
ax.set_xlim(0, 1)
# The y-axis spans at least [0, 1] and grows if a prediction exceeds 1.
ax.set_ylim(0, max(y_naive.max(), 1))

# 2. Scatter: Observed vs Degree-Aware
ax = axes[0, 1]
ax.scatter(y_true, y_degree_aware, alpha=0.5, s=10, edgecolors='none', color='green')
ax.plot([0, 1], [0, 1], 'r--', alpha=0.8, linewidth=2)
ax.set_xlabel('Observed Frequency', fontsize=12)
ax.set_ylabel('Degree-Aware Prediction', fontsize=12)
ax.set_title(f'Degree-Aware Model (r={degree_aware_corr:.3f})', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
ax.set_xlim(0, 1)
ax.set_ylim(0, max(y_degree_aware.max(), 1))

# 3. PMI distributions
ax = axes[0, 2]
ax.hist(pmi_naive, bins=50, alpha=0.6, label='Naive', edgecolor='black')
ax.hist(pmi_degree_aware, bins=50, alpha=0.6, label='Degree-Aware', edgecolor='black', color='green')
ax.axvline(0, color='red', linestyle='--', linewidth=2, label='Perfect Fit (PMI=0)')
ax.set_xlabel('PMI', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('PMI Distribution', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# 4. Correlation by compound degree
ax = axes[1, 0]
corr_by_compound = []
for bin_label in DEGREE_LABELS:
    subset = results_df[results_df['compound_bin'] == bin_label]
    # Bins with fewer than two pairs are left out of the bar chart.
    if len(subset) > 1:
        naive_c = pearsonr(subset['observed'], subset['naive_pred'])[0]
        da_c = pearsonr(subset['observed'], subset['degree_aware_pred'])[0]
        corr_by_compound.append({'bin': bin_label, 'naive': naive_c, 'degree_aware': da_c})

# The panel stays empty when no bin had enough pairs.
if corr_by_compound:
    corr_compound_df = pd.DataFrame(corr_by_compound)
    x = np.arange(len(corr_compound_df))
    width = 0.35
    # Side-by-side bars: naive to the left of each tick, degree-aware to the right.
    ax.bar(x - width/2, corr_compound_df['naive'], width, label='Naive', alpha=0.7)
    ax.bar(x + width/2, corr_compound_df['degree_aware'], width, label='Degree-Aware', alpha=0.7, color='green')
    ax.set_xlabel('Compound Degree Bin', fontsize=12)
    ax.set_ylabel('Correlation', fontsize=12)
    ax.set_title('Correlation by Compound Degree', fontsize=14, fontweight='bold')
    ax.set_xticks(x)
    ax.set_xticklabels(corr_compound_df['bin'], rotation=45, ha='right')
    ax.legend()
    ax.grid(alpha=0.3)

# 5. Null network improvement distribution
ax = axes[1, 1]
ax.hist(null_df['improvement'], bins=15, alpha=0.7, edgecolor='black', color='blue')
ax.axvline(het_improvement, color='red', linestyle='--', linewidth=3, label=f'Hetionet ({het_improvement:+.3f})')
ax.axvline(null_improvement, color='blue', linestyle='--', linewidth=2, label=f'Null mean ({null_improvement:+.3f})')
ax.set_xlabel('Improvement in Correlation', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
ax.set_title('Degree-Aware Improvement: Hetionet vs Null', fontsize=14, fontweight='bold')
ax.legend()
ax.grid(alpha=0.3)

# 6. Residual comparison
ax = axes[1, 2]
# Residuals are observed minus predicted.
residual_naive = y_true - y_naive
residual_da = y_true - y_degree_aware
ax.scatter(residual_naive, residual_da, alpha=0.5, s=10, edgecolors='none')
# Identity line drawn over the range of the naive residuals; points on it have
# equal residuals under both models.
ax.plot([residual_naive.min(), residual_naive.max()], [residual_naive.min(), residual_naive.max()], 'r--', alpha=0.8)
ax.set_xlabel('Naive Residuals', fontsize=12)
ax.set_ylabel('Degree-Aware Residuals', fontsize=12)
ax.set_title(f'Residual Comparison', fontsize=14, fontweight='bold')
ax.grid(alpha=0.3)
ax.axhline(0, color='black', linestyle='-', alpha=0.3)
ax.axvline(0, color='black', linestyle='-', alpha=0.3)

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig(results_dir / f'{metapath_name}_degree_aware_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"\nVisualization saved to: {results_dir / f'{metapath_name}_degree_aware_analysis.png'}")

## 9. Save Results

In [None]:
# Write three CSV files to results_dir, each prefixed with the metapath name:
# the per-pair Hetionet table, the per-permutation null table and a one-row
# summary.

# Save Hetionet results
results_df.to_csv(results_dir / f'{metapath_name}_hetionet_results.csv', index=False)

# Save null results
null_df.to_csv(results_dir / f'{metapath_name}_null_results.csv', index=False)

# Save summary
# Each key becomes a column of the single-row summary CSV.
summary = {
    'metapath': metapath_name,
    'hetionet_naive_corr': naive_corr,
    'hetionet_degree_aware_corr': degree_aware_corr,
    'hetionet_improvement': het_improvement,
    'hetionet_naive_rmse': naive_rmse,
    'hetionet_degree_aware_rmse': degree_aware_rmse,
    'hetionet_naive_pmi_mean': pmi_naive.mean(),
    'hetionet_degree_aware_pmi_mean': pmi_degree_aware.mean(),
    'null_naive_corr_mean': null_df['naive_corr'].mean(),
    'null_degree_aware_corr_mean': null_df['degree_aware_corr'].mean(),
    'null_improvement_mean': null_improvement,
    'n_hetionet_pairs': len(common_pairs),
    'n_null_networks': len(null_df)
}

summary_df = pd.DataFrame([summary])
summary_df.to_csv(results_dir / f'{metapath_name}_summary.csv', index=False)

print(f"\nResults saved to:")
print(f"  - {results_dir / f'{metapath_name}_hetionet_results.csv'}")
print(f"  - {results_dir / f'{metapath_name}_null_results.csv'}")
print(f"  - {results_dir / f'{metapath_name}_summary.csv'}")

## 10. Conclusions

In [None]:
# Print a textual summary of the metrics computed in the earlier cells.
# Every verdict below comes from a fixed numeric cut-off on Δr or mean PMI;
# no statistical test is performed in this cell.
print("\n" + "="*80)
print("CONCLUSIONS: DEGREE-AWARE COMPOSITIONAL MODELING")
print("="*80)

print(f"\n1. MODEL PERFORMANCE:")
print(f"   Naive compositional:      r = {naive_corr:.4f}")
print(f"   Degree-aware:             r = {degree_aware_corr:.4f}")
print(f"   Improvement:              Δr = {het_improvement:+.4f} ({het_improvement / abs(naive_corr) * 100:+.1f}%)")

# Δr above 0.01 is reported as a significant improvement, any other positive
# Δr as modest, and zero or negative Δr as no improvement.
if het_improvement > 0.01:
    print(f"\n   ✓ DEGREE-AWARE MODEL SIGNIFICANTLY IMPROVES FIT")
elif het_improvement > 0:
    print(f"\n   → Modest improvement from degree stratification")
else:
    print(f"\n   ✗ No improvement or degradation")

print(f"\n2. PMI REDUCTION:")
print(f"   Naive PMI:                {pmi_naive.mean():.4f}")
print(f"   Degree-aware PMI:         {pmi_degree_aware.mean():.4f}")
# Here the reduction is naive minus degree-aware (signed).
print(f"   Reduction:                {pmi_naive.mean() - pmi_degree_aware.mean():.4f}")

# Nothing is printed for this section when the mean PMI did not move closer to 0.
if abs(pmi_degree_aware.mean()) < abs(pmi_naive.mean()):
    print(f"\n   ✓ PMI closer to 0 → better compositional fit")

print(f"\n3. HETIONET VS NULL:")
print(f"   Hetionet improvement:     {het_improvement:+.4f}")
print(f"   Null improvement (mean):  {null_improvement:+.4f}")
print(f"   Difference:               {abs(het_improvement - null_improvement):.4f}")

if abs(het_improvement - null_improvement) < 0.01:
    print(f"\n   ✓ SIMILAR IMPROVEMENT (difference < 0.01)")
    print(f"     → Degree structure is primary driver")
    print(f"     → Biological dependencies are minimal")
else:
    print(f"\n   → DIFFERENT IMPROVEMENT PATTERNS")
    print(f"     → Potential biological signal beyond degrees")

print(f"\n4. RECOMMENDATIONS:")

# NOTE(review): section 3 calls the improvements "similar" below a difference
# of 0.01, while the recommendations use 0.02 as the cut-off — confirm that
# the two thresholds are meant to differ.
if het_improvement > 0.05 and abs(het_improvement - null_improvement) < 0.02:
    print(f"   → USE DEGREE-AWARE COMPOSITIONAL MODELS")
    print(f"   → Substantial improvement from degree stratification")
    print(f"   → Apply to both Hetionet and null networks")
    print(f"   → Stratify by {len(DEGREE_BINS)-1} degree bins: {DEGREE_LABELS}")
elif het_improvement > 0:
    print(f"   → DEGREE-AWARE MODELS PROVIDE MODEST BENEFIT")
    print(f"   → Consider simpler degree-correction methods")
    print(f"   → May not be worth added complexity")
else:
    print(f"   → DEGREE STRATIFICATION NOT HELPFUL")
    print(f"   → Use naive compositional or ML approaches")

# Printed in addition to whichever recommendation branch ran above.
if abs(het_improvement - null_improvement) > 0.02:
    print(f"\n   → BIOLOGICAL SIGNAL DETECTED")
    print(f"   → Consider hybrid: degree-aware + biological corrections")
    print(f"   → ML models may capture additional structure")

print(f"\n{'='*80}")
print("ANALYSIS COMPLETE")
print(f"{'='*80}")