# NB05: NMDC Environmental Gradient Analysis

**Project**: Prophage Ecology Across Bacterial Phylogeny and Environmental Gradients

**Goal**: Test H1d — infer per-sample prophage burden from NMDC metagenomic taxonomic profiles, weighted by pangenome-derived prophage prevalence per genus. Correlate with abiotic environmental features.

**Dependencies**: NB01 outputs (`data/species_module_summary.tsv`)

**Environment**: Requires BERDL JupyterHub (Spark SQL) for NMDC data access

**Outputs**:
- `data/nmdc_prophage_prevalence.tsv` — per-sample prophage inference scores
- `data/nmdc_module_by_environment.tsv` — per-module abiotic correlations
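- `figures/nmdc_prophage_vs_abiotic.png` — score distribution, matching coverage, and the top abiotic correlations
- `figures/nmdc_module_abiotic_heatmap.png` — per-module correlation heatmap (written only when at least one module–abiotic correlation is FDR-significant)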

In [None]:
import sys
import os
import pandas as pd
import numpy as np
from scipy import stats

# Spark session used by every spark.sql(...) query in this notebook.
spark = get_spark_session()

# Project helpers live in ../src; put that directory first on the import path.
sys.path.insert(0, '../src')
from prophage_utils import MODULES

# Create the output directories up front so later cells can write into them.
for output_dir in ('../data', '../figures'):
    os.makedirs(output_dir, exist_ok=True)

# Per-species module summary produced by NB01.
species_summary = pd.read_csv('../data/species_module_summary.tsv', sep='\t')
n_species_loaded = len(species_summary)
print(f'Species with prophage data: {n_species_loaded:,}')

## 1. NMDC Data Overview

NMDC has no per-sample gene-level functional annotations — we use taxonomy-based inference (same approach as PHB NB04).

In [None]:
# Pull the four NMDC tables this notebook works with into pandas.
def _to_pandas(query):
    """Run a Spark SQL query and collect the full result as a pandas DataFrame."""
    return spark.sql(query).toPandas()


tax_features = _to_pandas("SELECT * FROM nmdc_arkin.taxonomy_features")
studies = _to_pandas("SELECT * FROM nmdc_arkin.study_table")
abiotic_all = _to_pandas("SELECT * FROM nmdc_arkin.abiotic_features")
tax_dim = _to_pandas("SELECT * FROM nmdc_arkin.taxonomy_dim")

n_tax_samples = len(tax_features)
n_abiotic_samples = len(abiotic_all)
print(f'NMDC samples with taxonomy: {n_tax_samples:,}')
print(f'NMDC samples with abiotic data: {n_abiotic_samples:,}')
print(f'NMDC studies: {len(studies)}')
print(f'Taxonomy dimension table: {len(tax_dim):,} taxa')

# Every column except the sample identifier is treated as a taxon column.
taxon_cols = [column for column in tax_features.columns if column != 'sample_id']
print(f'Taxon columns: {len(taxon_cols)}')

In [None]:
# One line per study: its id, ecosystem category/type, and a truncated name.
ecosystem_fields = ['study_id', 'name', 'ecosystem', 'ecosystem_category',
                    'ecosystem_type', 'ecosystem_subtype']
study_eco = studies[ecosystem_fields].copy()

print('NMDC studies by ecosystem:')
study_rows = zip(
    study_eco['study_id'],
    study_eco['name'],
    study_eco['ecosystem_category'],
    study_eco['ecosystem_type'],
)
for study_id, study_name, category, eco_type in study_rows:
    # Falsy category/type values (e.g. None or '') are shown as '?'.
    eco = f"{category or '?'}/{eco_type or '?'}"
    short_name = str(study_name)[:60]
    print(f"  {study_id}: {eco} — {short_name}")

## 2. Build Genus-Level Prophage Burden Scores

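From the pangenome, compute for each genus the mean prophage cluster count per species (the overall prophage burden score), the mean number of modules present, and the fraction of species carrying each module. These genus-level values are the reference weights applied to the NMDC taxonomic profiles.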

In [None]:
# Species clade → GTDB genus name (the 'g__' prefix is stripped in SQL).
taxonomy = spark.sql("""
    SELECT DISTINCT g.gtdb_species_clade_id,
           REGEXP_EXTRACT(t.genus, 'g__(.*)', 1) AS gtdb_genus_name
    FROM kbase_ke_pangenome.genome g
    JOIN kbase_ke_pangenome.gtdb_taxonomy_r214v1 t ON g.gtdb_taxonomy_id = t.gtdb_taxonomy_id
    WHERE t.genus IS NOT NULL
""").toPandas()

# Attach the genus to every species in the NB01 summary (species without a
# taxonomy row keep a missing genus and fall out of the groupby below).
species_with_genus = species_summary.merge(
    taxonomy, on='gtdb_species_clade_id', how='left'
)

module_ids = sorted(MODULES.keys())

# Named aggregations: species count, mean cluster/module counts, and the
# per-module fraction of species flagged has_<module>.
genus_agg_spec = {
    'n_species': ('gtdb_species_clade_id', 'count'),
    'mean_prophage_clusters': ('n_prophage_clusters', 'mean'),
    'mean_modules': ('n_modules_present', 'mean'),
}
for module_id in module_ids:
    genus_agg_spec[f'pct_{module_id}'] = (f'has_{module_id}', 'mean')

genus_scores = (
    species_with_genus
    .groupby('gtdb_genus_name')
    .agg(**genus_agg_spec)
    .reset_index()
)

# The overall burden score is simply the mean prophage cluster count per species.
genus_scores['prophage_burden'] = genus_scores['mean_prophage_clusters']

print(f'Genera with prophage burden scores: {len(genus_scores):,}')
print('\nTop 10 genera by prophage burden:')
top_genera = genus_scores.nlargest(10, 'prophage_burden')
report_columns = ['gtdb_genus_name', 'n_species', 'prophage_burden', 'mean_modules']
print(top_genera[report_columns].to_string(index=False))

## 3. Two-Tier Taxonomy Mapping

Map NMDC taxon IDs to pangenome genera via:
- **Tier 1**: gtdb_metadata NCBI taxid → GTDB genus (handles renames)
- **Tier 2**: Direct genus name matching via taxonomy_dim

In [None]:
# Tier 1: gtdb_metadata bridge (NCBI taxid → GTDB genus)
print('=== Tier 1: Mapping via gtdb_metadata ===')
# Every gtdb_metadata row contributes its species-level taxid and, when it
# differs, its own taxid. Rows are COUNTED per (taxid, genus) pair so that the
# "most common genus" choice below has real frequencies to work with; a plain
# SELECT DISTINCT would flatten every pair to a single row. The result still
# has exactly one row per distinct (taxid, genus) pair.
ncbi_to_gtdb = spark.sql("""
    SELECT ncbi_taxid, gtdb_genus, COUNT(*) AS n_genomes
    FROM (
        SELECT
            CAST(m.ncbi_species_taxid AS INT) AS ncbi_taxid,
            REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) AS gtdb_genus
        FROM kbase_ke_pangenome.gtdb_metadata m
        WHERE m.gtdb_taxonomy IS NOT NULL
          AND REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) != ''
        UNION ALL
        SELECT
            CAST(m.ncbi_taxid AS INT) AS ncbi_taxid,
            REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) AS gtdb_genus
        FROM kbase_ke_pangenome.gtdb_metadata m
        WHERE m.gtdb_taxonomy IS NOT NULL
          AND REGEXP_EXTRACT(m.gtdb_taxonomy, 'g__([^;]+)', 1) != ''
          AND NOT (CAST(m.ncbi_taxid AS INT) <=> CAST(m.ncbi_species_taxid AS INT))
    ) taxid_genus
    GROUP BY ncbi_taxid, gtdb_genus
""").toPandas()

print(f'gtdb_metadata mappings: {len(ncbi_to_gtdb):,}')

# For taxids mapping to multiple genera, take the most common one.
# Ties are broken alphabetically so the choice is reproducible between runs.
ranked_pairs = (
    ncbi_to_gtdb
    .assign(ncbi_taxid=pd.to_numeric(ncbi_to_gtdb['ncbi_taxid'], errors='coerce'))
    .dropna(subset=['ncbi_taxid'])
    .sort_values(['n_genomes', 'gtdb_genus'], ascending=[False, True])
    .drop_duplicates('ncbi_taxid', keep='first')
)
ncbi_genus_map = dict(zip(
    ranked_pairs['ncbi_taxid'].astype('int64').tolist(),
    ranked_pairs['gtdb_genus'],
))

# Taxon columns whose name parses as an integer taxid; anything else is skipped
# by both tiers.
numeric_taxon_cols = []
for col_id in taxon_cols:
    try:
        numeric_taxon_cols.append((col_id, int(col_id)))
    except (ValueError, TypeError):
        continue

# Build Tier 1 mapping
taxid_to_genus = {}
tier1_hits = 0
for col_id, tid in numeric_taxon_cols:
    if tid in ncbi_genus_map:
        taxid_to_genus[col_id] = ncbi_genus_map[tid]
        tier1_hits += 1

print(f'Tier 1 matches: {tier1_hits}/{len(taxon_cols)}')

# Tier 2: Direct genus name matching via taxonomy_dim
print('\n=== Tier 2: Fallback via taxonomy_dim ===')
gtdb_genus_set = set(genus_scores['gtdb_genus_name'].dropna().str.strip().str.lower())

# Index taxonomy_dim once (first row per taxid wins) instead of scanning the
# whole table for every taxon column. Taxids are normalised to int so the
# lookup works whether the column arrives as int, float, or string.
tax_dim_first = (
    tax_dim
    .assign(_taxid=pd.to_numeric(tax_dim['taxid'], errors='coerce'))
    .dropna(subset=['_taxid'])
    .drop_duplicates('_taxid', keep='first')
)
tax_dim_genus_by_taxid = dict(zip(
    tax_dim_first['_taxid'].astype('int64').tolist(),
    tax_dim_first['genus'],
))

tier2_hits = 0
for col_id, tid in numeric_taxon_cols:
    if col_id in taxid_to_genus:
        continue  # already resolved by Tier 1
    if tid not in tax_dim_genus_by_taxid:
        continue
    genus = str(tax_dim_genus_by_taxid[tid]).strip()
    if genus.lower() in ('unclassified', 'nan', ''):
        continue  # placeholder / missing genus labels
    if genus.lower() in gtdb_genus_set:
        taxid_to_genus[col_id] = genus
        tier2_hits += 1

print(f'Tier 2 matches: {tier2_hits} additional')
print(f'\nTotal mapped: {len(taxid_to_genus)}/{len(taxon_cols)} taxon columns')

In [None]:
# Lower-cased genus names are the shared key for every lookup below.
lowercase_genera = genus_scores['gtdb_genus_name'].str.lower()

# genus → overall prophage burden
genus_burden_lookup = {
    name: burden
    for name, burden in zip(lowercase_genera, genus_scores['prophage_burden'])
}

# module → (genus → fraction of species carrying that module)
genus_module_lookups = {
    module_id: dict(zip(lowercase_genera, genus_scores[f'pct_{module_id}']))
    for module_id in module_ids
}

# Keep only taxon columns that were mapped to a genus which also has a score;
# each entry is (taxon column, lower-cased genus).
mapped_pairs = ((col_id, taxid_to_genus.get(col_id)) for col_id in taxon_cols)
matched = [
    (col_id, genus.lower())
    for col_id, genus in mapped_pairs
    if genus and genus.lower() in genus_burden_lookup
]

print(f'Taxon IDs matched to pangenome genera with prophage scores: {len(matched)}/{len(taxon_cols)}')

## 4. Compute Per-Sample Prophage Inference Scores

In [None]:
# Compute per-sample prophage inference scores
# Overall score + per-module scores
#
# For each sample: score = sum over matched taxa of (abundance * genus weight),
# where only strictly positive, numeric abundances contribute. The whole
# computation is done on arrays rather than row by row.

# Parse every taxon column once; unparseable cells become NaN, and anything
# that is NaN or <= 0 is treated as "taxon absent" (abundance 0).
abundance = tax_features[taxon_cols].apply(pd.to_numeric, errors='coerce')
abundance = abundance.where(abundance > 0, 0.0)

matched_cols = [col_id for col_id, _ in matched]
matched_genera = [genus for _, genus in matched]
matched_values = abundance[matched_cols].to_numpy(dtype=float)


def _weighted_sum(weights_by_genus):
    """Return the per-sample sum of matched abundance times the genus weight.

    Genera missing from ``weights_by_genus`` get weight 0. A NaN weight only
    makes the score NaN for samples in which that taxon is actually present,
    instead of leaking into every sample through 0 * NaN.
    """
    weights = np.array(
        [weights_by_genus.get(genus, 0) for genus in matched_genera], dtype=float
    )
    nan_weight = np.isnan(weights)
    scores = matched_values @ np.where(nan_weight, 0.0, weights)
    if nan_weight.any():
        taxon_present = (matched_values[:, nan_weight] > 0).any(axis=1)
        scores = np.where(taxon_present, np.nan, scores)
    return scores


matched_abundance = matched_values.sum(axis=1)
total_abundance = abundance.to_numpy(dtype=float).sum(axis=1)

# Percentage of total abundance covered by matched taxa; 0 when a sample has
# no positive abundance at all (avoids dividing by zero).
pct_matched = np.zeros_like(total_abundance)
np.divide(matched_abundance, total_abundance, out=pct_matched,
          where=total_abundance > 0)
pct_matched *= 100

sample_score_columns = {
    'sample_id': tax_features['sample_id'].to_numpy(),
    'prophage_score': _weighted_sum(genus_burden_lookup),
    'matched_abundance': matched_abundance,
    'total_abundance': total_abundance,
    'pct_matched': pct_matched,
}
for module_id in module_ids:
    sample_score_columns[f'score_{module_id}'] = _weighted_sum(
        genus_module_lookups[module_id]
    )

sample_prophage = pd.DataFrame(sample_score_columns)
print(f'Computed prophage scores for {len(sample_prophage):,} samples')
print(f'\nOverall prophage score distribution:')
print(sample_prophage['prophage_score'].describe())
print(f'\nTaxon matching coverage:')
print(f'  Median % abundance matched: {sample_prophage["pct_matched"].median():.1f}%')

In [None]:
# Median / mean / max of each per-module score across all samples.
print('Per-module score summary:')
for module_id in module_ids:
    module_scores = sample_prophage[f'score_{module_id}']
    summary_line = (
        f'  {module_id}: median={module_scores.median():.3f}, '
        f'mean={module_scores.mean():.3f}, '
        f'max={module_scores.max():.3f}'
    )
    print(summary_line)

## 5. Correlate with Abiotic Features

In [None]:
# Merge prophage scores with abiotic features (samples present in both tables)
prophage_abiotic = sample_prophage.merge(abiotic_all, on='sample_id', how='inner')
print(f'Samples with both prophage scores and abiotic data: {len(prophage_abiotic):,}')

# Identify abiotic columns
abiotic_cols = [c for c in abiotic_all.columns if c != 'sample_id']

# Correlate overall prophage score with abiotic variables.
# A sample is used for a variable only if the value is numeric and non-zero
# (NOTE(review): zeros are excluded here — presumably they encode "not
# measured"; confirm against the abiotic_features table) and the score is
# present. Variables with fewer than 30 usable samples are skipped.
score_present = prophage_abiotic['prophage_score'].notna()
overall_corr = []
for col in abiotic_cols:
    vals = pd.to_numeric(prophage_abiotic[col], errors='coerce')
    valid = vals.notna() & (vals != 0) & score_present
    if valid.sum() >= 30:
        rho, p = stats.spearmanr(prophage_abiotic.loc[valid, 'prophage_score'], vals[valid])
        overall_corr.append({
            'abiotic_variable': col,
            'score_type': 'overall',
            'n': valid.sum(),
            'spearman_rho': rho,
            'p_value': p,
        })

# Explicit columns keep the frame well-formed (and sortable) even when no
# variable passed the sample-size threshold.
overall_corr_columns = ['abiotic_variable', 'score_type', 'n', 'spearman_rho', 'p_value']
overall_corr_df = pd.DataFrame(overall_corr, columns=overall_corr_columns).sort_values('p_value')
print('\nOverall prophage score vs abiotic variables:')
print(overall_corr_df.to_string(index=False))

In [None]:
# Per-module correlations with abiotic variables
module_corr_results = []

# Parse each abiotic column once (not once per module) and precompute which
# samples carry a usable, non-zero numeric value.
abiotic_numeric = {
    abiotic_col: pd.to_numeric(prophage_abiotic[abiotic_col], errors='coerce')
    for abiotic_col in abiotic_cols
}
abiotic_usable = {
    abiotic_col: vals.notna() & (vals != 0)
    for abiotic_col, vals in abiotic_numeric.items()
}

for module_id in module_ids:
    score_col = f'score_{module_id}'
    score_present = prophage_abiotic[score_col].notna()
    for abiotic_col in abiotic_cols:
        vals = abiotic_numeric[abiotic_col]
        valid = abiotic_usable[abiotic_col] & score_present
        if valid.sum() >= 30:
            rho, p = stats.spearmanr(prophage_abiotic.loc[valid, score_col], vals[valid])
            module_corr_results.append({
                'module': module_id,
                'module_name': MODULES[module_id]['full_name'],
                'abiotic_variable': abiotic_col,
                'n': valid.sum(),
                'spearman_rho': rho,
                'p_value': p,
            })

# Explicit columns so the frame (and everything downstream) stays well-formed
# even when no module/variable pair reached the sample-size threshold.
module_corr_columns = ['module', 'module_name', 'abiotic_variable', 'n',
                       'spearman_rho', 'p_value']
module_corr_df = pd.DataFrame(module_corr_results, columns=module_corr_columns)

# Multiple testing correction (FDR, Benjamini-Hochberg)
from statsmodels.stats.multitest import multipletests
module_corr_df['p_fdr'] = np.nan
module_corr_df['significant_fdr'] = False
# Spearman returns NaN when one of the inputs is constant. Those rows are not
# real tests: they are left out of the correction (keeping p_fdr = NaN and
# significant_fdr = False) so they can neither inflate the number of tests nor
# propagate NaN into the adjusted p-values of the valid rows.
testable = module_corr_df['p_value'].notna()
if testable.any():
    reject, pvals_corrected, _, _ = multipletests(
        module_corr_df.loc[testable, 'p_value'].to_numpy(), method='fdr_bh'
    )
    module_corr_df.loc[testable, 'p_fdr'] = pvals_corrected
    module_corr_df.loc[testable, 'significant_fdr'] = reject

# Show significant per-module correlations
sig_module = module_corr_df[module_corr_df['significant_fdr'].astype(bool)].sort_values('p_fdr')
print(f'\nSignificant module-abiotic correlations (FDR < 0.05): {len(sig_module)}')
if len(sig_module) > 0:
    print(sig_module[['module_name', 'abiotic_variable', 'spearman_rho', 'p_fdr']].head(30).to_string(index=False))

## 6. Study-Level Analysis

Compare prophage burden across NMDC study ecosystem types.

In [None]:
# Link samples to studies. biosample_set is tried first; if that query raises,
# omics_processing_set is tried; if both raise, the study-level analysis is
# skipped (best-effort: failures are reported, not re-raised).
try:
    sample_study = spark.sql("""
        SELECT DISTINCT sample_id, study_id
        FROM nmdc_arkin.biosample_set
    """).toPandas()
    print(f'Sample-study links from biosample_set: {len(sample_study):,}')
except Exception as primary_err:
    print(f'biosample_set query failed: {primary_err}')
    # Fall back to the second candidate table
    try:
        sample_study = spark.sql("""
            SELECT DISTINCT sample_id, study_id
            FROM nmdc_arkin.omics_processing_set
        """).toPandas()
        print(f'Sample-study links from omics_processing_set: {len(sample_study):,}')
    except Exception as fallback_err:
        print(f'omics_processing_set also failed: {fallback_err}')
        sample_study = None

if sample_study is None or len(sample_study) == 0:
    print('Could not link samples to studies — skipping study-level analysis')
    study_prophage = None
else:
    # Scores → study id → study metadata
    study_metadata = studies[['study_id', 'name', 'ecosystem_category', 'ecosystem_type']]
    scored_with_study = sample_prophage.merge(sample_study, on='sample_id', how='inner')
    study_prophage = scored_with_study.merge(study_metadata, on='study_id', how='left')

    print(f'\nSamples linked to studies: {len(study_prophage):,}')

    # Sample count plus mean/median score per ecosystem type, highest median first
    eco_agg_spec = {
        'n_samples': ('sample_id', 'count'),
        'mean_prophage_score': ('prophage_score', 'mean'),
        'median_prophage_score': ('prophage_score', 'median'),
    }
    eco_stats = (
        study_prophage
        .groupby('ecosystem_type')
        .agg(**eco_agg_spec)
        .sort_values('median_prophage_score', ascending=False)
    )

    print('\nProphage score by ecosystem type:')
    print(eco_stats.to_string())

## 7. Save Outputs

In [None]:
# Shared options: tab-separated, no index column.
tsv_options = {'sep': '\t', 'index': False}

# Per-sample prophage scores
sample_prophage.to_csv('../data/nmdc_prophage_prevalence.tsv', **tsv_options)
n_score_rows = len(sample_prophage)
print(f'Saved data/nmdc_prophage_prevalence.tsv: {n_score_rows:,} rows')

# Module × abiotic correlations
module_corr_df.to_csv('../data/nmdc_module_by_environment.tsv', **tsv_options)
n_corr_rows = len(module_corr_df)
print(f'Saved data/nmdc_module_by_environment.tsv: {n_corr_rows:,} rows')

## 8. Figures

In [None]:
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns

# Figure: multi-panel prophage vs abiotic
# Layout (2 x 3): score distribution, matching coverage, then up to four
# scatter panels for the abiotic variables with the smallest p-values.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# Panel 1: Distribution of prophage scores (dashed line = median)
ax = axes[0, 0]
ax.hist(sample_prophage['prophage_score'], bins=50, color='#E91E63', alpha=0.8, edgecolor='white')
ax.set_xlabel('Prophage inference score')
ax.set_ylabel('Number of samples')
ax.set_title('Distribution of Prophage Scores')
ax.axvline(sample_prophage['prophage_score'].median(), color='black', linestyle='--', alpha=0.5)

# Panel 2: Matching coverage
ax = axes[0, 1]
ax.hist(sample_prophage['pct_matched'], bins=50, color='#4CAF50', alpha=0.8, edgecolor='white')
ax.set_xlabel('% abundance matched')
ax.set_ylabel('Number of samples')
ax.set_title('Pangenome Matching Coverage')

# Panels 3-6: Top abiotic correlations (overall_corr_df is sorted by p-value)
correlation_axes = [axes[0, 2], axes[1, 0], axes[1, 1], axes[1, 2]]
top_corr = overall_corr_df.head(len(correlation_axes))

for ax, (_, corr_row) in zip(correlation_axes, top_corr.iterrows()):
    col = corr_row['abiotic_variable']
    # Same sample filter as the correlation itself: numeric, non-zero value
    # and a score that is present.
    vals = pd.to_numeric(prophage_abiotic[col], errors='coerce')
    valid = vals.notna() & (vals != 0) & prophage_abiotic['prophage_score'].notna()

    ax.scatter(vals[valid], prophage_abiotic.loc[valid, 'prophage_score'],
               alpha=0.2, s=10, color='#E91E63')
    clean_name = col.replace('annotations_', '').replace('_has_numeric_value', '')
    ax.set_xlabel(clean_name)
    ax.set_ylabel('Prophage score')
    ax.set_title(f'rho={corr_row["spearman_rho"]:.3f}, p={corr_row["p_value"]:.1e}')

# Hide panels that received no correlation so the figure does not show
# blank, unlabeled axes when fewer than four variables are available.
for unused_ax in correlation_axes[len(top_corr):]:
    unused_ax.set_visible(False)

plt.suptitle('NMDC: Prophage Burden vs Environmental Variables', fontsize=13, y=1.02)
plt.tight_layout()
plt.savefig('../figures/nmdc_prophage_vs_abiotic.png', dpi=150, bbox_inches='tight')
plt.show()
print('Saved figures/nmdc_prophage_vs_abiotic.png')

In [None]:
# Figure: module × abiotic-variable heatmap of Spearman rho.
# Drawn only when at least one correlation is FDR-significant; the heatmap
# itself shows every tested pair, not just the significant ones.
if len(sig_module) == 0:
    print('No significant module-abiotic correlations to visualize')
else:
    # Rows = module names, columns = abiotic variables, cells = rho
    pivot = module_corr_df.pivot_table(
        index='module_name',
        columns='abiotic_variable',
        values='spearman_rho',
    )

    # Strip the boilerplate prefix/suffix from the variable names
    tidy_labels = []
    for raw_label in pivot.columns:
        tidy_labels.append(
            raw_label.replace('annotations_', '').replace('_has_numeric_value', '')
        )
    pivot.columns = tidy_labels

    fig, ax = plt.subplots(figsize=(14, 6))
    sns.heatmap(
        pivot,
        cmap='RdBu_r',
        center=0,
        annot=True,
        fmt='.2f',
        ax=ax,
        cbar_kws={'label': 'Spearman rho'},
    )
    ax.set_title('Per-Module Correlation with Abiotic Variables')
    ax.set_ylabel('')
    plt.tight_layout()
    plt.savefig('../figures/nmdc_module_abiotic_heatmap.png', dpi=150, bbox_inches='tight')
    plt.show()
    print('Saved figures/nmdc_module_abiotic_heatmap.png')

In [None]:
# Summary: headline numbers from the cells above
banner = '=' * 60
print(banner)
print('NB05 SUMMARY')
print(banner)
print(f'NMDC samples scored: {len(sample_prophage):,}')
print(f'Median matching coverage: {sample_prophage["pct_matched"].median():.1f}%')
# `matched` holds one entry per taxon COLUMN (several columns can share a
# genus), so the count is reported as taxon columns rather than genera.
print(f'Taxon columns matched to pangenome genera: {len(matched)}/{len(taxon_cols)}')

# Overall-score correlations: count is based on raw (uncorrected) p-values.
n_sig_overall = (overall_corr_df['p_value'] < 0.05).sum() if len(overall_corr_df) > 0 else 0
print(f'\nOverall prophage score significant correlations (p<0.05): {n_sig_overall}')
if len(overall_corr_df) > 0:
    # overall_corr_df is already sorted by p-value, so head(5) is the top five.
    for _, row in overall_corr_df.head(5).iterrows():
        clean = row['abiotic_variable'].replace('annotations_', '').replace('_has_numeric_value', '')
        print(f'  {clean}: rho={row["spearman_rho"]:.3f}, p={row["p_value"]:.2e}')

n_sig_module = len(sig_module) if len(module_corr_df) > 0 else 0
print(f'\nPer-module significant correlations (FDR<0.05): {n_sig_module}')

print(f'\nFiles saved:')
print(f'  data/nmdc_prophage_prevalence.tsv ({len(sample_prophage):,} rows)')
print(f'  data/nmdc_module_by_environment.tsv ({len(module_corr_df):,} rows)')
print(f'  figures/nmdc_prophage_vs_abiotic.png')