# NB 07: Summary Figures

Generate publication-quality summary figures combining results
from NB01-NB06.

**Runs locally**.

**Outputs**: `figures/summary_*.png`

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

# Project layout. Path('..') is resolved against the kernel's working
# directory, so this assumes the notebook is run from a folder one level
# below the project root (the folder that holds data/ and figures/).
PROJECT_DIR = Path('..').resolve()
DATA_DIR = PROJECT_DIR / 'data'
FIGURES_DIR = PROJECT_DIR / 'figures'

# The figure cells below call fig.savefig() into FIGURES_DIR; savefig does
# not create missing directories, so make sure it exists up front.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

# Required inputs: pd.read_csv raises FileNotFoundError if any is missing.
metal_exps = pd.read_csv(DATA_DIR / 'metal_experiments.csv')
metal_fitness = pd.read_csv(DATA_DIR / 'metal_fitness_scores.csv')
metal_cons = pd.read_csv(DATA_DIR / 'metal_conservation_stats.csv')
org_cons = pd.read_csv(DATA_DIR / 'organism_conservation_stats.csv')
conserved_fam = pd.read_csv(DATA_DIR / 'conserved_metal_families.csv')

# Optional inputs: fall back to an empty DataFrame (len() == 0) when the
# file is absent, so downstream len()-based counts still work.
novel_path = DATA_DIR / 'novel_metal_candidates.csv'
novel = pd.read_csv(novel_path) if novel_path.exists() else pd.DataFrame()
pred_path = DATA_DIR / 'metal_tolerance_predictions_fb.csv'
predictions = pd.read_csv(pred_path) if pred_path.exists() else pd.DataFrame()

print('Data loaded.')

Data loaded.


## Figure 1: Atlas Overview — Scale and Scope

In [2]:
# Figure 1: three-panel overview, written to
# FIGURES_DIR / 'summary_atlas_overview.png'.
#   A. experiments per metal, B. % metal-important genes per metal,
#   C. per-metal conservation delta with significance markers.
from matplotlib.patches import Patch

fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# Panel A: Experiments per metal
ax = axes[0]
exclude = ['Platinum', 'Metal_limitation']
exps_clean = metal_exps[~metal_exps['metal_element'].isin(exclude)]
metal_counts = exps_clean.groupby('metal_element').size().sort_values(ascending=True)
# Metals in this tuple are drawn blue ("Essential" in the legend);
# every other metal is drawn red ("Toxic").
ESSENTIAL_METALS = ('Iron', 'Molybdenum', 'Tungsten', 'Selenium', 'Manganese')
colors = ['#3498db' if m in ESSENTIAL_METALS else '#e74c3c'
          for m in metal_counts.index]
metal_counts.plot(kind='barh', ax=ax, color=colors, edgecolor='black', linewidth=0.5)
ax.set_xlabel('Number of Experiments')
ax.set_title('A. Metal Experiments in Fitness Browser')
ax.legend(handles=[
    Patch(color='#e74c3c', label='Toxic'),
    Patch(color='#3498db', label='Essential'),
], loc='lower right', fontsize=9)

# Panel B: Metal-important genes per metal (% of rows flagged important)
ax = axes[1]
# Select the column before aggregating instead of groupby().apply(lambda ...):
# apply() over the grouping column is deprecated in recent pandas.
# rename(None) keeps the result unnamed, so pandas attaches no label to the
# bars and ax.legend() below lists only the 'Overall mean' line.
imp_pct = (
    metal_fitness.groupby('metal_element')['is_metal_important']
    .mean()
    .mul(100)
    .rename(None)
    .sort_values(ascending=True)
)
imp_pct.plot(kind='barh', ax=ax, color='#e67e22', edgecolor='black', linewidth=0.5)
ax.set_xlabel('% Genes with Metal Fitness Defect')
ax.set_title('B. Metal Impact on Gene Fitness')
# Reference line: mean over all rows of metal_fitness (not the mean of the
# per-metal percentages).
ax.axvline(metal_fitness['is_metal_important'].mean()*100, color='gray',
           linestyle='--', alpha=0.7, label='Overall mean')
ax.legend(fontsize=9)

# Panel C: Conservation delta (core fraction: important - baseline) per metal
ax = axes[2]
cons_sorted = metal_cons.sort_values('delta')
# Rows whose category is 'toxic' are red; any other category is blue.
colors_c = ['#e74c3c' if c == 'toxic' else '#3498db' for c in cons_sorted['category']]
ax.barh(range(len(cons_sorted)), cons_sorted['delta'], color=colors_c,
        edgecolor='black', linewidth=0.5)
ax.set_yticks(range(len(cons_sorted)))
ax.set_yticklabels(cons_sorted['metal'])
ax.axvline(0, color='black', linewidth=0.5)
ax.set_xlabel('Δ Core Fraction (important - baseline)')
ax.set_title('C. Metal Genes: Core Enrichment')
# Significance markers: a '*' just beyond the bar tip (on the same side as
# the bar) for every metal with p_value < 0.05.
for i, (delta, p_value) in enumerate(zip(cons_sorted['delta'], cons_sorted['p_value'])):
    if p_value < 0.05:
        ax.text(delta + 0.002 * np.sign(delta), i, '*',
                ha='center', va='center', fontsize=12, fontweight='bold')

fig.suptitle('Pan-Bacterial Metal Fitness Atlas: Overview', fontsize=16, y=1.05)
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'summary_atlas_overview.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: figures/summary_atlas_overview.png')

Saved: figures/summary_atlas_overview.png


## Figure 2: Cross-Species Conservation of Metal Gene Families

In [3]:
# Figure 2: two-panel summary of conserved metal gene families, written to
# FIGURES_DIR / 'summary_metal_families.png'.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Panel A: Distribution of family breadth
ax = axes[0]
breadth = conserved_fam['n_organisms_any']
# One unit-wide bin per integer value from 1 to max; align='left' centres
# each bar on its integer. int() lets range() accept a float-typed column.
ax.hist(breadth, bins=range(1, int(breadth.max()) + 2), color='#2ecc71',
        alpha=0.8, edgecolor='black', linewidth=0.5, align='left')
ax.set_xlabel('# Organisms with Metal Phenotype')
ax.set_ylabel('# Gene Families')
ax.set_title('A. Metal Gene Family Breadth')
n_conserved = (breadth >= 3).sum()
ax.text(0.95, 0.95, f'{n_conserved} families in ≥3 organisms',
        transform=ax.transAxes, ha='right', va='top', fontsize=10,
        bbox=dict(boxstyle='round', facecolor='lightyellow'))

# Panel B: Novel vs annotated
ax = axes[1]
n_novel_val = len(novel)
# NOTE(review): "annotated" is computed as all conserved families minus the
# novel candidates, which presumes every row of `novel` is also a row of
# `conserved_fam` -- confirm against the notebook that writes both files.
n_annotated = len(conserved_fam) - n_novel_val
total = n_annotated + n_novel_val
bar_counts = [n_annotated, n_novel_val]
ax.bar(['Annotated', 'Hypothetical\n(novel candidates)'],
       bar_counts,
       color=['#3498db', '#e74c3c'], edgecolor='black', linewidth=0.5)
ax.set_ylabel('# Conserved Metal Gene Families')
ax.set_title('B. Annotated vs Novel Metal Gene Families')

# Count/percentage labels above each bar. The gap is proportional to the
# tallest bar (not a fixed number of data units) and the y-axis gets extra
# headroom so the two-line label stays inside the axes, below the title.
tallest = max(bar_counts)
if tallest > 0:
    ax.set_ylim(top=tallest * 1.18)
label_gap = 0.01 * tallest
for x_pos, count in enumerate(bar_counts):
    # Guard the percentage against an empty input (total == 0).
    pct = 100 * count / total if total else 0
    ax.text(x_pos, count + label_gap, f'{count}\n({pct:.0f}%)',
            ha='center', va='bottom', fontsize=11)

fig.suptitle('Conserved Metal Fitness Gene Families', fontsize=14, y=1.02)
fig.tight_layout()
fig.savefig(FIGURES_DIR / 'summary_metal_families.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved: figures/summary_metal_families.png')

Saved: figures/summary_metal_families.png


In [4]:
# Closing inventory: list every PNG in FIGURES_DIR and every CSV in DATA_DIR
# (with its row count, obtained by loading the file), in name order.
banner = '=' * 80

print(banner)
print('NB07 SUMMARY: All Figures Generated')
print(banner)

print(f'\nAll figures in {FIGURES_DIR}:')
png_paths = sorted(FIGURES_DIR.glob('*.png'))
for png_path in png_paths:
    print(f'  {png_path.name}')

print(f'\nAll data files in {DATA_DIR}:')
csv_paths = sorted(DATA_DIR.glob('*.csv'))
for csv_path in csv_paths:
    # Each file is parsed in full just to count its rows.
    row_count = len(pd.read_csv(csv_path))
    print(f'  {csv_path.name:45s} {row_count:>8,} rows')

print(banner)

NB07 SUMMARY: All Figures Generated

All figures in /home/psdehal/pangenome_science/BERIL-research-observatory/projects/metal_fitness_atlas/figures:
  bioleaching_species_scores.png
  core_fraction_by_metal.png
  metal_conservation_by_organism.png
  metal_family_conservation_heatmap.png
  metal_fitness_distributions.png
  metal_important_genes_by_organism.png
  metal_module_activity_heatmap.png
  organism_metal_matrix.png
  species_metal_score_distribution.png
  summary_atlas_overview.png
  summary_metal_families.png

All data files in /home/psdehal/pangenome_science/BERIL-research-observatory/projects/metal_fitness_atlas/data:
  conserved_metal_families.csv                     1,182 rows
  metal_conservation_stats.csv                        14 rows
  metal_experiments.csv                              559 rows
  metal_experiments_analysis.csv                     379 rows


  metal_fitness_scores.csv                       383,349 rows
  metal_functional_signature.csv                   1,287 rows
  metal_important_genes.csv                       12,838 rows
  metal_module_conservation.csv                      183 rows
  metal_modules.csv                               19,453 rows
  novel_metal_candidates.csv                         149 rows
  organism_conservation_stats.csv                     22 rows
  sensitivity_analysis.csv                             2 rows
  species_metal_scores.csv                        27,702 rows
