# NB01: ENIGMA Extraction and QC

Extract geochemistry, community (ASV counts with taxonomy), and sample/location metadata tables for downstream modeling, restricted to samples that have both geochemistry and community data.

Requires BERDL JupyterHub with built-in `get_spark_session()`.

Outputs:
- `../data/geochemistry_sample_matrix.tsv`
- `../data/community_taxon_counts.tsv`
- `../data/sample_location_metadata.tsv`


In [None]:
from pathlib import Path
import re
import pandas as pd

# Directory that receives the TSV tables written at the end of the notebook.
# Created up front (parents included, no error if it already exists) so the
# final write step cannot fail on a missing path.
DATA_DIR = Path('../data')
DATA_DIR.mkdir(parents=True, exist_ok=True)

# `get_spark_session()` is not imported in this notebook; per the header it is
# expected to be supplied as a built-in by the BERDL JupyterHub environment.
spark = get_spark_session()
print('Spark session ready')


In [None]:
# List the tables in the ENIGMA schema so the choice of community-count brick
# below is driven by what is actually present.
tables = spark.sql('SHOW TABLES IN enigma_coral').toPandas()
table_names = set(tables['tableName'])
print(f"ENIGMA tables discovered: {len(table_names)}")

# Brick 476 is used when it exists; otherwise brick 459 is used.
if 'ddt_brick0000476' in table_names:
    community_brick = 'ddt_brick0000476'
else:
    community_brick = 'ddt_brick0000459'
print('Using community count table:', community_brick)

# Print the schema of every table queried by the later cells so column names
# can be checked by eye before the extraction queries run.
tables_to_describe = (
    'ddt_brick0000010',
    community_brick,
    'ddt_brick0000454',
    'sdt_sample',
    'sdt_community',
    'sdt_location',
)
for table_name in tables_to_describe:
    print(f"\n=== {table_name} ===")
    schema_frame = spark.sql(f'DESCRIBE enigma_coral.{table_name}')
    schema_frame.show(200, truncate=False)


In [None]:
# Long-format geochemistry: one row per (sample, molecule) measurement with a
# non-null concentration.
geochem_query = """
SELECT
  sdt_sample_name,
  molecule_from_list_sys_oterm_name AS molecule,
  CAST(concentration_micromolar AS DOUBLE) AS concentration_micromolar
FROM enigma_coral.ddt_brick0000010
WHERE concentration_micromolar IS NOT NULL
"""
geochem_raw = spark.sql(geochem_query).toPandas()

# Rows without a sample name or molecule cannot be placed in the matrix.
geochem_raw = geochem_raw.dropna(subset=['sdt_sample_name', 'molecule'])
n_geochem_samples = geochem_raw['sdt_sample_name'].nunique()
n_geochem_molecules = geochem_raw['molecule'].nunique()
print(f"Geochemistry raw rows: {len(geochem_raw):,}")
print(f"Samples: {n_geochem_samples:,}")
print(f"Molecules: {n_geochem_molecules:,}")


def _molecule_to_column(label):
    """Turn a molecule label into a lowercase snake_case column name.

    Runs of characters other than a-z / 0-9 collapse to a single underscore,
    leading/trailing underscores are trimmed, and every '_atom' is removed.
    """
    slug = re.sub(r'[^a-z0-9]+', '_', label.lower())
    return slug.strip('_').replace('_atom', '')


# Wide sample x molecule matrix; repeated measurements of the same molecule in
# the same sample are averaged.
geochem = geochem_raw.pivot_table(
    values='concentration_micromolar',
    index='sdt_sample_name',
    columns='molecule',
    aggfunc='mean',
)
geochem.columns = list(map(_molecule_to_column, geochem.columns))
geochem = geochem.reset_index()
print('Geochemistry matrix shape:', geochem.shape)

# Columns whose name contains one of the contaminant keywords (substring match).
contaminant_keywords = (
    'uranium', 'chromium', 'nickel', 'zinc', 'copper',
    'cadmium', 'lead', 'arsenic', 'mercury',
)
target_cols = [
    column
    for column in geochem.columns
    if any(keyword in column for keyword in contaminant_keywords)
]
print('Contaminant columns:', target_cols[:20])


In [None]:
# Register the distinct geochemistry sample names as a temp view so the
# community lookup can be restricted to them inside Spark.
geochem_samples = geochem[['sdt_sample_name']].drop_duplicates()
geochem_samples_sdf = spark.createDataFrame(geochem_samples)
geochem_samples_sdf.createOrReplaceTempView('geochem_samples_tmp')

# Communities whose sample also has geochemistry measurements.
overlap_query = """
SELECT DISTINCT
  c.sdt_community_name,
  c.sdt_sample_name
FROM enigma_coral.sdt_community c
JOIN geochem_samples_tmp g ON c.sdt_sample_name = g.sdt_sample_name
WHERE c.sdt_community_name IS NOT NULL
"""
overlap_communities = spark.sql(overlap_query).toPandas()
print('Overlap communities:', len(overlap_communities))
print('Overlap samples:', overlap_communities['sdt_sample_name'].nunique())

overlap_sdf = spark.createDataFrame(overlap_communities)
overlap_sdf.createOrReplaceTempView('overlap_comms_tmp')

# ASV counts for the overlapping communities. The count column is cast to a
# string and stripped of everything except digits, '.' and '-' before being
# cast to DOUBLE.
asv_counts_query = f"""
SELECT
  b.sdt_asv_name,
  b.sdt_community_name,
  o.sdt_sample_name,
  CAST(regexp_replace(CAST(b.count_count_unit AS STRING), '[^0-9.-]', '') AS DOUBLE) AS read_count
FROM enigma_coral.{community_brick} b
JOIN overlap_comms_tmp o ON b.sdt_community_name = o.sdt_community_name
WHERE b.count_count_unit IS NOT NULL
"""
asv_counts = spark.sql(asv_counts_query).toPandas()

# Cleanup: drop incomplete rows, keep strictly positive counts, then sum any
# repeated (ASV, community, sample) rows into a single count.
asv_key_cols = ['sdt_asv_name', 'sdt_community_name', 'sdt_sample_name']
asv_counts = asv_counts.dropna(subset=asv_key_cols + ['read_count'])
has_reads = asv_counts['read_count'] > 0
asv_counts = asv_counts.loc[has_reads]
asv_counts = asv_counts.groupby(asv_key_cols, as_index=False)['read_count'].sum()
print(f"ASV count rows after cleanup: {len(asv_counts):,}")
print(f"Unique ASVs: {asv_counts['sdt_asv_name'].nunique():,}")


In [None]:
def _one_label_per_asv(frame, label_col):
    """Reduce *frame* to exactly one `label_col` value per ASV.

    Rows missing the ASV name or the label are dropped. If an ASV still has
    several labels, the alphabetically first one is kept so the result does
    not depend on the row order returned by Spark, and the number of discarded
    rows is printed for QC. Uniqueness per ASV is what keeps the later merge
    onto the count table many-to-one (no duplicated read-count rows).
    """
    labelled = frame.dropna(subset=['sdt_asv_name', label_col])
    n_extra = int(labelled['sdt_asv_name'].duplicated().sum())
    if n_extra:
        print(f"Warning: dropping {n_extra:,} extra {label_col} rows for ASVs with multiple {label_col} labels")
    return (
        labelled
        .sort_values(['sdt_asv_name', label_col])
        .drop_duplicates(subset='sdt_asv_name', keep='first')
    )


# Genus-level label per ASV.
asv_genus = spark.sql("""
SELECT DISTINCT
  sdt_asv_name,
  sdt_taxon_name AS genus
FROM enigma_coral.ddt_brick0000454
WHERE taxonomic_level_sys_oterm_name = 'Genus'
""").toPandas()
asv_genus = _one_label_per_asv(asv_genus, 'genus')

# Phylum-level label per ASV.
asv_phylum = spark.sql("""
SELECT DISTINCT
  sdt_asv_name,
  sdt_taxon_name AS phylum
FROM enigma_coral.ddt_brick0000454
WHERE taxonomic_level_sys_oterm_name = 'Phylum'
""").toPandas()
asv_phylum = _one_label_per_asv(asv_phylum, 'phylum')

# Outer merge: an ASV that has a phylum row but no genus row must keep its
# phylum. A left merge from the genus table would drop it and the ASV would
# end up with phylum 'unclassified' below.
asv_tax = asv_genus.merge(asv_phylum, on='sdt_asv_name', how='outer')

# Attach taxonomy to every count row; ASVs without taxonomy stay in the table.
community_taxon = asv_counts.merge(asv_tax, on='sdt_asv_name', how='left')

# Missing labels at either rank are reported as 'unclassified'.
community_taxon['genus'] = community_taxon['genus'].fillna('unclassified')
community_taxon['phylum'] = community_taxon['phylum'].fillna('unclassified')
print('Community+taxonomy rows:', len(community_taxon))
print('Unique genera:', community_taxon['genus'].nunique())


In [None]:
# Sample metadata with location attributes attached; samples whose location
# has no match in sdt_location are kept (LEFT JOIN).
sample_meta_query = """
SELECT DISTINCT
  s.sdt_sample_name,
  s.sdt_location_name,
  s.date,
  s.depth_meter,
  l.latitude_degree,
  l.longitude_degree,
  l.region
FROM enigma_coral.sdt_sample s
LEFT JOIN enigma_coral.sdt_location l
  ON s.sdt_location_name = l.sdt_location_name
"""
sample_meta = spark.sql(sample_meta_query).toPandas()

# Samples that appear in the community+taxonomy table define the final set.
valid_samples = set(community_taxon['sdt_sample_name'])


def _restrict_to_valid_samples(frame):
    """Return an independent copy of *frame* limited to `valid_samples`."""
    in_final_set = frame['sdt_sample_name'].isin(valid_samples)
    return frame[in_final_set].copy()


geochem_f = _restrict_to_valid_samples(geochem)
community_f = _restrict_to_valid_samples(community_taxon)
sample_meta_f = _restrict_to_valid_samples(sample_meta)

print('Filtered geochemistry shape:', geochem_f.shape)
print('Filtered community rows:', len(community_f))
print('Filtered sample metadata rows:', len(sample_meta_f))


In [None]:
# Output file name -> table to write, in the order they are reported.
output_tables = {
    'geochemistry_sample_matrix.tsv': geochem_f,
    'community_taxon_counts.tsv': community_f,
    'sample_location_metadata.tsv': sample_meta_f,
}

# Write every table as tab-separated text without the pandas index.
for file_name, table in output_tables.items():
    table.to_csv(DATA_DIR / file_name, sep='\t', index=False)

print('Saved outputs:')
for file_name in output_tables:
    print(' -', (DATA_DIR / file_name).resolve())

# Final QC counts over the filtered tables.
n_overlap_samples = geochem_f['sdt_sample_name'].nunique()
n_communities = community_f['sdt_community_name'].nunique()
n_asvs = community_f['sdt_asv_name'].nunique()
n_genera = community_f['genus'].nunique()

print('\nSummary')
print('Samples with both geochem+community:', n_overlap_samples)
print('Communities:', n_communities)
print('ASVs:', n_asvs)
print('Genera:', n_genera)
