# NB05: Summary Figures

Publication-quality figures summarizing the Metabolic Capability vs Dependency analysis.

**Runs locally** — no Spark required.

### Figures
1. **Overview heatmap**: Organisms × pathways colored by classification
2. **Amino acid vs carbon**: Comparative proportions
3. **Fitness violin**: Gene fitness distributions by pathway classification
4. **Black Queen scatter**: Pangenome openness vs latent capabilities
5. **Species-level hexbin**: Pathway heterogeneity vs openness (27K species)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
from matplotlib.patches import Patch
import seaborn as sns
from scipy import stats
from pathlib import Path

# Matplotlib defaults shared by every figure produced in this notebook.
for _rc_key, _rc_value in (
    ('font.size', 10),
    ('axes.titlesize', 12),
    ('axes.labelsize', 11),
    ('figure.dpi', 150),
):
    plt.rcParams[_rc_key] = _rc_value

# Input tables are read from DATA_DIR; rendered figures are written to FIG_DIR.
DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(exist_ok=True)


def _read_table(filename):
    """Read one result CSV located in DATA_DIR and return it as a DataFrame."""
    return pd.read_csv(DATA_DIR / filename)


# Result tables consumed by the figure cells below.
classifications = _read_table('pathway_classifications.csv')
org_summary = _read_table('organism_classification_summary.csv')
pgf = _read_table('pathway_gene_fitness.csv')
cross_species = _read_table('cross_species_analysis.csv')
org_mapping = _read_table('organism_mapping.csv')
pangenome = _read_table('pangenome_stats.csv')

# Row counts as a quick sanity check that the tables loaded.
print(
    f"Classifications: {len(classifications)}",
    f"Pathway-gene-fitness: {len(pgf):,}",
    f"Cross-species: {len(cross_species):,}",
    sep='\n',
)

## Figure 1: Overview Heatmap — Capability vs Dependency Landscape

In [None]:
from matplotlib.colors import BoundaryNorm, ListedColormap

# Figure 1: organisms (rows) x pathways (columns) heatmap, one color per
# classification. Writes fig1_classification_heatmap.png into FIG_DIR.

# Color scheme: single source of truth for the heatmap colormap and the legend.
CLASS_COLORS = {
    'active_dependency': '#d62728',   # Red
    'latent_capability': '#2ca02c',   # Green
    'partial': '#ff7f0e',             # Orange
    'absent': '#e0e0e0',              # Light gray
    'unmapped': '#bdbdbd'             # Gray
}

# Numeric code stored in the heatmap matrix for each classification label.
CLASS_NUMERIC = {
    'absent': 0,
    'unmapped': 0.5,
    'partial': 1,
    'latent_capability': 2,
    'active_dependency': 3
}

# Build matrix
classifications['class_num'] = classifications['classification'].map(CLASS_NUMERIC)
pivot = classifications.pivot_table(
    index='orgId',
    columns='gapmind_pathway',
    values='class_num',
    aggfunc='first'
).fillna(0)  # organism/pathway pairs without a value get the 'absent' code

# Sort organisms by total active dependencies
org_order = (
    (pivot == CLASS_NUMERIC['active_dependency'])
    .sum(axis=1)
    .sort_values(ascending=False)
    .index
)

# Sort pathways: amino acid first, then carbon, then by active frequency
pathway_info = classifications.drop_duplicates('gapmind_pathway')[['gapmind_pathway', 'metabolic_category']]
pathway_active_freq = (classifications[classifications['classification'] == 'active_dependency']
                       .groupby('gapmind_pathway').size())
pathway_info = pathway_info.merge(
    pathway_active_freq.rename('n_active').reset_index(),
    on='gapmind_pathway', how='left'
)
# Fill only the count column: a blanket fillna(0) would also turn a missing
# metabolic_category into the integer 0 and break the string sort below.
pathway_info['n_active'] = pathway_info['n_active'].fillna(0)
pathway_info = pathway_info.sort_values(['metabolic_category', 'n_active'], ascending=[True, False])

# Keep only the pathways that actually became heatmap columns.
plotted_pathways = pathway_info[pathway_info['gapmind_pathway'].isin(pivot.columns)]
pathway_order = plotted_pathways['gapmind_pathway'].tolist()

pivot = pivot.reindex(index=org_order, columns=pathway_order)

# Plot
fig, ax = plt.subplots(figsize=(22, 8))

# Discrete colormap derived from CLASS_NUMERIC / CLASS_COLORS: one bin per
# class, with bin edges halfway between neighbouring numeric codes.
class_order = sorted(CLASS_NUMERIC, key=CLASS_NUMERIC.get)
class_levels = [CLASS_NUMERIC[name] for name in class_order]
cmap = ListedColormap([CLASS_COLORS[name] for name in class_order])
bounds = (
    [class_levels[0] - 0.25]
    + [(lo + hi) / 2 for lo, hi in zip(class_levels, class_levels[1:])]
    + [class_levels[-1] + 0.5]
)
norm = BoundaryNorm(bounds, cmap.N)

ax.imshow(pivot.values, aspect='auto', cmap=cmap, norm=norm, interpolation='none')

ax.set_yticks(range(len(pivot.index)))
ax.set_yticklabels(pivot.index, fontsize=7)
ax.set_xticks(range(len(pivot.columns)))
ax.set_xticklabels(pivot.columns, rotation=90, fontsize=6)

# Add category divider line between amino acid and carbon pathways.
# 'amino_acid' sorts before 'carbon', so the amino-acid columns come first.
aa_count = int((plotted_pathways['metabolic_category'] == 'amino_acid').sum())
ax.axvline(x=aa_count - 0.5, color='black', linewidth=2, linestyle='-')
ax.text(aa_count / 2, -1.5, 'Amino Acid Biosynthesis', ha='center', fontsize=9, fontweight='bold')
ax.text(aa_count + (len(pathway_order) - aa_count) / 2, -1.5, 'Carbon Source Utilization',
        ha='center', fontsize=9, fontweight='bold')

# Legend: one entry per class drawn by the colormap (including 'unmapped'),
# listed from the highest numeric code to the lowest.
legend_elements = [
    Patch(facecolor=CLASS_COLORS[name], label=name.replace('_', ' ').title())
    for name in reversed(class_order)
]
ax.legend(handles=legend_elements, loc='upper right', fontsize=9,
          framealpha=0.9, edgecolor='black')

ax.set_title('Metabolic Pathway Classification: Capability vs Dependency', fontsize=14, pad=20)

plt.tight_layout()
plt.savefig(FIG_DIR / 'fig1_classification_heatmap.png', dpi=200, bbox_inches='tight')
plt.show()
print("Saved: figures/fig1_classification_heatmap.png")

## Figure 2: Amino Acid vs Carbon Source Classification Proportions

In [None]:
# Figure 2: one pie chart per metabolic category showing how its
# organism/pathway pairs split across classifications.
category_titles = {
    'amino_acid': 'Amino Acid Biosynthesis',
    'carbon': 'Carbon Source Utilization',
}

fig, axes = plt.subplots(1, 2, figsize=(12, 5))

for panel, (category, panel_title) in zip(axes, category_titles.items()):
    in_category = classifications['metabolic_category'] == category
    class_counts = classifications.loc[in_category, 'classification'].value_counts()

    # Slice color and display label for each classification, in count order.
    slice_colors = []
    slice_labels = []
    for class_name in class_counts.index:
        slice_colors.append(CLASS_COLORS.get(class_name, '#999'))
        slice_labels.append(class_name.replace('_', ' ').title())

    _, _, pct_texts = panel.pie(
        class_counts.values,
        labels=slice_labels,
        colors=slice_colors,
        autopct='%1.1f%%',
        pctdistance=0.85,
        textprops={'fontsize': 9},
    )
    # Percentage annotations are drawn slightly smaller than the slice labels.
    for pct_text in pct_texts:
        pct_text.set_fontsize(8)
    panel.set_title(panel_title, fontsize=12)

plt.suptitle('Pathway Classification by Metabolic Category', fontsize=14)
plt.tight_layout()
plt.savefig(FIG_DIR / 'fig2_aa_vs_carbon_proportions.png', dpi=150, bbox_inches='tight')
plt.show()
print("Saved: figures/fig2_aa_vs_carbon_proportions.png")

## Figure 3: Fitness Distributions by Pathway Classification

In [None]:
# Figure 3: per-gene fitness compared between active-dependency and
# latent-capability pathways. Writes fig3_fitness_by_classification.png.

# Display order and tick text for the two classes. Both panels use this
# order so a tick label can never be attached to the wrong class.
fitness_class_order = ['active_dependency', 'latent_capability']
fitness_tick_text = {
    'active_dependency': 'Active\nDependency',
    'latent_capability': 'Latent\nCapability',
}

# Get per-gene fitness data with pathway classification
gene_class = pgf.merge(
    classifications[['orgId', 'gapmind_pathway', 'classification']],
    on=['orgId', 'gapmind_pathway'],
    how='left'
)

# Filter to genes with fitness data in active/latent pathways
plot_data = gene_class[
    gene_class['classification'].isin(fitness_class_order) &
    gene_class['mean_abs_fit'].notna()
].copy()

if not plot_data.empty:
    # Only label the classes that actually have rows; supplying two labels
    # for a single tick would not match the tick positions.
    observed = set(plot_data['classification'])
    present_classes = [c for c in fitness_class_order if c in observed]
    tick_labels = [fitness_tick_text[c] for c in present_classes]
    tick_positions = range(len(present_classes))

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # Panel A: Violin plot of mean absolute fitness.
    # `order` is explicit: without it seaborn infers the category order from
    # the data, which need not match the tick labels set below.
    sns.violinplot(
        data=plot_data, x='classification', y='mean_abs_fit',
        order=present_classes,
        palette={c: CLASS_COLORS[c] for c in present_classes},
        ax=axes[0], cut=0
    )
    axes[0].set_xlabel('')
    axes[0].set_ylabel('Mean |Fitness Effect|')
    axes[0].set_title('Gene Fitness Importance')
    axes[0].set_xticks(tick_positions)
    axes[0].set_xticklabels(tick_labels)

    # Panel B: Fraction of genes with significant fitness effects
    if 'n_sig_important' in plot_data.columns:
        plot_data['has_sig_effect'] = plot_data['n_sig_important'].fillna(0) > 0
        sig_rates = (
            plot_data.groupby('classification')['has_sig_effect'].mean()
            .reindex(present_classes) * 100
        )

        bars = axes[1].bar(
            tick_positions,
            sig_rates.values,
            color=[CLASS_COLORS[c] for c in sig_rates.index],
            edgecolor='black', linewidth=0.5
        )
        axes[1].set_xticks(tick_positions)
        axes[1].set_xticklabels(tick_labels)
        axes[1].set_ylabel('% Genes with Significant Fitness Effect')
        axes[1].set_title('Fitness Significance Rate')

        # Print the percentage just above each bar.
        for bar, val in zip(bars, sig_rates.values):
            axes[1].text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 1,
                         f'{val:.1f}%', ha='center', fontsize=10)

    plt.tight_layout()
    plt.savefig(FIG_DIR / 'fig3_fitness_by_classification.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Saved: figures/fig3_fitness_by_classification.png")
else:
    print("No fitness data available for visualization")

## Figure 4: Black Queen Hypothesis — Openness vs Latent Capabilities

In [None]:
# Figure 4: per-organism scatter of % latent capabilities against pangenome
# openness, with a linear trend line and Spearman correlation.
# Writes fig4_black_queen.png.

# Merge organism classifications with pangenome data
fb_plot = org_summary.merge(
    org_mapping[['orgId', 'gtdb_species_clade_id']],
    on='orgId'
).merge(
    pangenome[['gtdb_species_clade_id', 'openness', 'no_genomes']],
    on='gtdb_species_clade_id'
)

# Reduce to rows usable for plotting BEFORE applying the size threshold, so
# the guard counts points that will actually be drawn rather than rows that
# are about to be dropped.
bq_points = None
if 'pct_latent' in fb_plot.columns:
    bq_points = fb_plot.dropna(subset=['pct_latent', 'openness']).copy()
    bq_points['pct_latent'] = bq_points['pct_latent'].astype(float)
    bq_points['openness'] = bq_points['openness'].astype(float)
    bq_points = bq_points[
        np.isfinite(bq_points['pct_latent']) & np.isfinite(bq_points['openness'])
    ]

if bq_points is not None and len(bq_points) > 5:
    fb_plot = bq_points
    x = fb_plot['pct_latent'].to_numpy()
    y = fb_plot['openness'].to_numpy()

    fig, ax = plt.subplots(figsize=(8, 6))

    ax.scatter(
        x, y,
        s=80, c='steelblue', alpha=0.7, edgecolors='black', linewidth=0.5
    )

    # Label every point with its organism id, offset slightly above the marker.
    for org_id, x_val, y_val in zip(fb_plot['orgId'], x, y):
        ax.annotate(
            org_id, (x_val, y_val),
            fontsize=7, alpha=0.7, ha='center', va='bottom',
            xytext=(0, 4), textcoords='offset points'
        )

    # Trend line: first-degree least-squares fit across the observed x range.
    trend = np.poly1d(np.polyfit(x, y, 1))
    x_line = np.linspace(x.min(), x.max(), 100)
    ax.plot(x_line, trend(x_line), 'r--', alpha=0.5, linewidth=2)

    # Significant-digit formatting keeps very small p-values from being
    # displayed as "0.0000".
    rho, pval = stats.spearmanr(x, y)
    ax.text(0.05, 0.95, f'Spearman rho = {rho:.3f}\np = {pval:.3g}',
            transform=ax.transAxes, fontsize=10, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    ax.set_xlabel('% Latent Capabilities\n(complete pathways with fitness-neutral genes)', fontsize=11)
    ax.set_ylabel('Pangenome Openness\n(accessory / total gene clusters)', fontsize=11)
    ax.set_title('H2: Black Queen Hypothesis\nMore Latent Capabilities → More Open Pangenome?', fontsize=13)

    plt.tight_layout()
    plt.savefig(FIG_DIR / 'fig4_black_queen.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Saved: figures/fig4_black_queen.png")
else:
    print("Insufficient data for Black Queen figure")

## Figure 5: Species-Level Pathway Heterogeneity vs Openness

In [None]:
# Figure 5: hexbin density of within-species pathway heterogeneity against
# pangenome openness. Writes fig5_species_heterogeneity.png.

# Use one cleaned frame for the size guard, the hexbin, the correlation and
# the reported n, so the annotation describes the same rows that rho was
# computed on (rows missing either value are excluded everywhere).
species_xy = cross_species[['pathway_cv', 'openness']].dropna()

if len(species_xy) > 100:
    fig, ax = plt.subplots(figsize=(8, 6))

    hb = ax.hexbin(
        species_xy['pathway_cv'],
        species_xy['openness'],
        gridsize=40, cmap='YlOrRd', mincnt=1,
        edgecolors='gray', linewidths=0.2
    )

    plt.colorbar(hb, ax=ax, label='Number of species')

    # Off-diagonal entry of the 2x2 Spearman correlation matrix.
    rho = species_xy.corr(method='spearman').iloc[0, 1]
    ax.text(0.05, 0.95, f'Spearman rho = {rho:.3f}\nn = {len(species_xy):,} species',
            transform=ax.transAxes, fontsize=10, verticalalignment='top',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    ax.set_xlabel('Within-Species Pathway Completeness CV\n(metabolic heterogeneity)', fontsize=11)
    ax.set_ylabel('Pangenome Openness', fontsize=11)
    ax.set_title('H3: Metabolic Ecotypes\nPathway Heterogeneity vs Pangenome Openness (27K species)', fontsize=13)

    plt.tight_layout()
    plt.savefig(FIG_DIR / 'fig5_species_heterogeneity.png', dpi=150, bbox_inches='tight')
    plt.show()
    print("Saved: figures/fig5_species_heterogeneity.png")
else:
    print("Insufficient cross-species data for figure")

## Summary Statistics for Report

In [None]:
# Plain-text summary of the headline numbers: table scale, classification
# breakdown, active/latent split among complete pathways, species count.
banner = "=" * 60
print(banner)
print("SUMMARY STATISTICS FOR REPORT")
print(banner)

total_pairs = len(classifications)
n_orgs = classifications['orgId'].nunique()
n_pathways = classifications['gapmind_pathway'].nunique()
print(f"\nScale: {n_orgs} organisms × {n_pathways} pathways = {total_pairs} pairs")

print("\nClassification breakdown:")
class_counts = classifications['classification'].value_counts()
for class_name, n_pairs in class_counts.items():
    share = n_pairs / total_pairs * 100
    print(f"  {class_name:25s}: {n_pairs:5d} ({share:.1f}%)")

# Complete pathways only: rows classified as active dependency or latent capability.
is_active = classifications['classification'] == 'active_dependency'
is_latent = classifications['classification'] == 'latent_capability'
complete = classifications[is_active | is_latent]
if len(complete) > 0:
    n_active = is_active.sum()
    n_latent = is_latent.sum()
    n_complete = n_active + n_latent
    print("\nAmong complete pathways:")
    print(f"  Active dependencies: {n_active} ({n_active/n_complete*100:.1f}%)")
    print(f"  Latent capabilities: {n_latent} ({n_latent/n_complete*100:.1f}%)")

    # Same split per metabolic category; categories with no complete pathways are skipped.
    for category in ('amino_acid', 'carbon'):
        in_category = complete[complete['metabolic_category'] == category]
        cat_total = len(in_category)
        if cat_total == 0:
            continue
        cat_active = (in_category['classification'] == 'active_dependency').sum()
        print(f"  {category}: {cat_active}/{cat_total} active ({cat_active/cat_total*100:.1f}%)")

print("\nCross-species analysis:")
print(f"  Species analyzed: {len(cross_species):,}")

print("\n" + banner)
print("All figures saved to figures/")
print("Next: Run /synthesize to create REPORT.md")
print(banner)