# NB03: Contamination vs Functional Potential Models

Compute sample-level functional potential scores and test association with contamination.

Inputs:
- `../data/geochemistry_sample_matrix.tsv`
- `../data/community_taxon_counts.tsv`
- `../data/taxon_bridge.tsv`
- `../data/taxon_functional_features.tsv`

Outputs:
- `../data/site_functional_scores.tsv`
- `../data/model_results.tsv`
- `../figures/contamination_vs_functional_score.png`


In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from scipy.stats import linregress
import matplotlib.pyplot as plt

DATA_DIR = Path('../data')
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(parents=True, exist_ok=True)

geo = pd.read_csv(DATA_DIR / 'geochemistry_sample_matrix.tsv', sep='	')
community = pd.read_csv(DATA_DIR / 'community_taxon_counts.tsv', sep='	')
bridge = pd.read_csv(DATA_DIR / 'taxon_bridge.tsv', sep='	')
features = pd.read_csv(DATA_DIR / 'taxon_functional_features.tsv', sep='	')

print('geo:', geo.shape)
print('community:', community.shape)
print('bridge:', bridge.shape)
print('features:', features.shape)


geo: (108, 49)
community: (41711, 5)
bridge: (8242, 5)
features: (1590, 4)


In [2]:
metal_keywords = ['uranium', 'chromium', 'nickel', 'zinc', 'copper', 'cadmium', 'lead', 'arsenic', 'mercury']
metal_cols = [c for c in geo.columns if any(k in c.lower() for k in metal_keywords)]
if not metal_cols:
    raise RuntimeError('No contamination columns found in geochemistry_sample_matrix.tsv')

geo_model = geo[['sdt_sample_name'] + metal_cols].copy()
for c in metal_cols:
    geo_model[c] = pd.to_numeric(geo_model[c], errors='coerce')

zparts = []
for c in metal_cols:
    s = np.log1p(geo_model[c])
    std = s.std(ddof=0)
    z = (s - s.mean()) / (std if std else 1)
    zparts.append(z.rename(c + '_z'))

zmat = pd.concat(zparts, axis=1)
geo_model['contamination_index'] = zmat.mean(axis=1, skipna=True)
geo_model = geo_model[['sdt_sample_name', 'contamination_index']].dropna()
print('Samples with contamination_index:', len(geo_model))


Samples with contamination_index: 108


In [3]:
comm = community[['sdt_sample_name', 'genus', 'read_count']].copy()
comm['read_count'] = pd.to_numeric(comm['read_count'], errors='coerce').fillna(0)
comm = comm[comm['read_count'] > 0]

genus_counts = comm.groupby(['sdt_sample_name', 'genus'], as_index=False)['read_count'].sum()
totals = genus_counts.groupby('sdt_sample_name', as_index=False)['read_count'].sum().rename(columns={'read_count':'sample_total'})
genus_counts = genus_counts.merge(totals, on='sdt_sample_name', how='left')
genus_counts['rel_abundance'] = genus_counts['read_count'] / genus_counts['sample_total']
print('Sample-genus abundance rows:', len(genus_counts))


Sample-genus abundance rows: 28001


In [4]:
bridge_ok = bridge[bridge['mapping_tier'] == 'genus_exact'][['genus', 'genus_norm']].drop_duplicates()
feat_wide = features.pivot_table(index='genus_norm', columns='feature_name', values='feature_value', aggfunc='mean').reset_index()

genus_feat = genus_counts.merge(bridge_ok, on='genus', how='left').merge(feat_wide, on='genus_norm', how='left')

for c in ['cog_defense_fraction', 'cog_mobilome_fraction', 'cog_metabolism_fraction']:
    if c not in genus_feat.columns:
        genus_feat[c] = 0.0
    genus_feat[c] = pd.to_numeric(genus_feat[c], errors='coerce').fillna(0.0)

genus_feat['stress_function_score'] = 0.5 * genus_feat['cog_defense_fraction'] + 0.5 * genus_feat['cog_mobilome_fraction']


In [5]:
for c in ['cog_defense_fraction', 'cog_mobilome_fraction', 'cog_metabolism_fraction', 'stress_function_score']:
    genus_feat[c + '_weighted'] = genus_feat['rel_abundance'] * genus_feat[c]

site_scores = genus_feat.groupby('sdt_sample_name', as_index=False)[
    ['cog_defense_fraction_weighted', 'cog_mobilome_fraction_weighted', 'cog_metabolism_fraction_weighted', 'stress_function_score_weighted']
].sum()
site_scores = site_scores.rename(columns={
    'cog_defense_fraction_weighted': 'site_defense_score',
    'cog_mobilome_fraction_weighted': 'site_mobilome_score',
    'cog_metabolism_fraction_weighted': 'site_metabolism_score',
    'stress_function_score_weighted': 'site_stress_score'
})

model_df = site_scores.merge(geo_model, on='sdt_sample_name', how='inner').dropna()
print('Model dataframe shape:', model_df.shape)


Model dataframe shape: (108, 6)


In [6]:
rows = []

for y in ['site_defense_score', 'site_mobilome_score', 'site_metabolism_score', 'site_stress_score']:
    d = model_df[['contamination_index', y]].dropna()
    if len(d) < 10:
        continue

    rho, p_spear = spearmanr(d['contamination_index'], d[y])

    lr = linregress(d['contamination_index'], d[y])
    beta = float(lr.slope)
    p_ols = float(lr.pvalue)

    obs = abs(rho)
    perms = 500
    gt = 0
    arr_x = d['contamination_index'].to_numpy()
    arr_y = d[y].to_numpy()
    rng = np.random.default_rng(42)
    for _ in range(perms):
        r, _ = spearmanr(arr_x, rng.permutation(arr_y))
        if abs(r) >= obs:
            gt += 1
    p_perm = (gt + 1) / (perms + 1)

    rows.append({
        'outcome': y,
        'n_samples': len(d),
        'spearman_rho': rho,
        'spearman_p': p_spear,
        'permutation_p': p_perm,
        'ols_beta_contamination': beta,
        'ols_p_contamination': p_ols
    })

model_results = pd.DataFrame(rows).sort_values('spearman_p') if rows else pd.DataFrame()
model_results


  rho, p_spear = spearmanr(d['contamination_index'], d[y])
  r, _ = spearmanr(arr_x, rng.permutation(arr_y))


Unnamed: 0,outcome,n_samples,spearman_rho,spearman_p,permutation_p,ols_beta_contamination,ols_p_contamination
0,site_defense_score,108,0.058685,0.546311,0.530938,0.000946,0.068083
3,site_stress_score,108,0.058685,0.546311,0.530938,0.000473,0.068083
2,site_metabolism_score,108,-0.00645,0.947181,0.944112,0.002382,0.761656
1,site_mobilome_score,108,,,0.001996,0.0,


In [7]:
site_scores.to_csv(DATA_DIR / 'site_functional_scores.tsv', sep='	', index=False)
model_results.to_csv(DATA_DIR / 'model_results.tsv', sep='	', index=False)

plot_df = model_df[['contamination_index', 'site_stress_score']].dropna()
plt.figure(figsize=(6, 4))
plt.scatter(plot_df['contamination_index'], plot_df['site_stress_score'], s=16, alpha=0.7)
if len(plot_df) >= 3:
    m, b = np.polyfit(plot_df['contamination_index'], plot_df['site_stress_score'], 1)
    xs = np.linspace(plot_df['contamination_index'].min(), plot_df['contamination_index'].max(), 100)
    plt.plot(xs, m * xs + b)
plt.xlabel('Contamination index (z-score composite)')
plt.ylabel('Site stress functional score')
plt.title('Contamination vs inferred stress functional potential')
plt.tight_layout()
plt.savefig(FIG_DIR / 'contamination_vs_functional_score.png', dpi=160)
plt.close()

print('Saved:')
print(' -', (DATA_DIR / 'site_functional_scores.tsv').resolve())
print(' -', (DATA_DIR / 'model_results.tsv').resolve())
print(' -', (FIG_DIR / 'contamination_vs_functional_score.png').resolve())
print('\nTop model rows:')
print(model_results.head(10).to_string(index=False) if len(model_results) else 'No model rows produced')


Saved:
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/data/site_functional_scores.tsv
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/data/model_results.tsv
 - /home/psdehal/pangenome_science/BERIL-research-observatory/projects/enigma_contamination_functional_potential/figures/contamination_vs_functional_score.png

Top model rows:
              outcome  n_samples  spearman_rho  spearman_p  permutation_p  ols_beta_contamination  ols_p_contamination
   site_defense_score        108      0.058685    0.546311       0.530938                0.000946             0.068083
    site_stress_score        108      0.058685    0.546311       0.530938                0.000473             0.068083
site_metabolism_score        108     -0.006450    0.947181       0.944112                0.002382             0.761656
  site_mobilome_score        108           NaN         NaN  