# NB02: Interactive Exploration -- AlphaEarth Embeddings, Geography & Environment

**Runs locally** (or on JupyterHub). Requires `pip install -r ../requirements.txt`.

Loads the extracted data from NB01 and creates interactive visualizations:
1. Coverage overview (UpSet plot)
2. Coordinate QC (flag institutional addresses)
3. Environment label harmonization
4. UMAP of 64-dim embedding space
5. Geographic map
6. Embedding vs geography distance
7. Environment label vs embedding cluster cross-tabulation

In [None]:
import os
import re
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import umap
from plotly.subplots import make_subplots
from scipy.spatial.distance import cosine
from sklearn.cluster import DBSCAN
from upsetplot import UpSet
# Suppress FutureWarning messages for the rest of the notebook.
warnings.filterwarnings('ignore', category=FutureWarning)

# Relative locations of the input tables and of the saved figures;
# the figure directory is created if it does not exist yet.
DATA_DIR = '../data'
FIG_DIR = '../figures'
os.makedirs(FIG_DIR, exist_ok=True)

# Embedding column names: 'A00' through 'A63'.
EMB_COLS = ['A{:02d}'.format(i) for i in range(64)]

In [None]:
# Read the four CSV tables from DATA_DIR, in this fixed order.
df, coverage, attr_counts, iso_counts = (
    pd.read_csv(os.path.join(DATA_DIR, filename))
    for filename in (
        'alphaearth_with_env.csv',
        'coverage_stats.csv',
        'ncbi_env_attribute_counts.csv',
        'isolation_source_raw_counts.csv',
    )
)

# Quick size report for each loaded table.
print(f'Loaded {df.shape[0]:,} genomes with {df.shape[1]} columns')
print(f'Embedding dimensions: {len(EMB_COLS)}')
print(f'Environment attributes inventoried: {len(attr_counts)}')
print(f'Unique isolation_source values: {len(iso_counts):,}')

---
## 1. Coverage Overview

UpSet plot showing which combinations of metadata attributes are available.

In [None]:
# Boolean availability matrix for the UpSet plot: one column per metadata
# attribute, True where the genome has a non-null value.
# 'Lat/Lon' requires both coordinates to be present.
upset_cols = {
    'Lat/Lon': df[['cleaned_lat', 'cleaned_lon']].notna().all(axis=1),
}
upset_cols.update({
    label: df[column].notna()
    for label, column in [
        ('Isolation Source', 'isolation_source'),
        ('Env Broad Scale', 'env_broad_scale'),
        ('Env Local Scale', 'env_local_scale'),
        ('Host', 'host'),
        ('Geo Location', 'geo_loc_name'),
    ]
})

upset_df = pd.DataFrame(upset_cols)

# Per-attribute coverage: count and percentage of all rows in df.
print('Attribute coverage (of all AlphaEarth genomes):')
for col, n in upset_df.sum().items():
    print(f'  {col}: {n:,} ({100*n/len(df):.1f}%)')

In [None]:
# UpSet plot
# Collapse the boolean matrix into one count per combination of attributes:
# the result is a Series indexed by a boolean MultiIndex (one level per attribute).
upset_index = upset_df.set_index(list(upset_df.columns))
upset_series = upset_index.groupby(level=list(upset_df.columns)).size()

upset = UpSet(
    upset_series,
    subset_size='auto',
    show_counts=True,
    sort_by='cardinality',
    min_subset_size=100,  # hide combinations with fewer than 100 genomes
)

# Draw onto a dedicated, initially empty figure. The previous
# `plt.subplots()` call created an extra Axes that was never used and ended
# up as a stray empty plot; passing `fig=` explicitly also makes the title
# and the saved file refer to the same figure regardless of pyplot state.
fig_upset = plt.figure()
upset.plot(fig=fig_upset)
fig_upset.suptitle('Metadata Availability for AlphaEarth Genomes', y=1.02, fontsize=14)
fig_upset.savefig(os.path.join(FIG_DIR, 'coverage_upset.png'), dpi=150, bbox_inches='tight')
plt.show()
print('Saved figures/coverage_upset.png')

In [None]:
# Horizontal bar chart of attribute population rates, smallest rate first
# in the sorted frame; bars are annotated with the genome count.
coverage_sorted = coverage.sort_values('pct_of_alphaearth', ascending=True)

fig_cov = px.bar(
    coverage_sorted,
    x='pct_of_alphaearth',
    y='attribute',
    orientation='h',
    text='n_genomes',
    title='NCBI Environment Attribute Population Rates (AlphaEarth genomes)',
    labels={'pct_of_alphaearth': '% of AlphaEarth genomes', 'attribute': ''},
)
fig_cov.update_traces(texttemplate='%{text:,}', textposition='outside')
fig_cov.update_layout(width=700, height=400)
fig_cov.show()

---
## 2. Coordinate QC

Flag potentially suspicious lat/lon entries:
- Exact-duplicate coordinates (many genomes at same spot = likely institutional address)
- Integer-degree coordinates (low precision, possibly approximate)
- Coordinates that seem unreasonable (note: the cells below implement only the first two checks; this one is not yet automated)

In [None]:
# Restrict to genomes where both latitude and longitude are present.
has_coords = df[['cleaned_lat', 'cleaned_lon']].notna().all(axis=1)
coords = df.loc[has_coords].copy()
print(f'Genomes with lat/lon: {len(coords):,}')

# Round to 4 decimal places for duplicate detection, then combine the
# rounded pair into a single "lat,lon" string key per genome.
coords = coords.assign(
    lat_round=lambda d: d['cleaned_lat'].round(4),
    lon_round=lambda d: d['cleaned_lon'].round(4),
    coord_key=lambda d: d['lat_round'].astype(str) + ',' + d['lon_round'].astype(str),
)

In [None]:
# Number of genomes sharing each coordinate key, most-shared first.
coord_counts = (
    coords['coord_key']
    .value_counts()
    .rename_axis('coord_key')
    .reset_index(name='n_genomes')
)

print(f'Unique coordinate locations: {len(coord_counts):,}')
print(f'\nTop 20 most-shared coordinates:')

# For the 20 most-shared locations, report how many species and distinct
# isolation sources occur there, plus up to three example sources.
top_coords = coord_counts.head(20)
for coord_key, n_genomes in top_coords.itertuples(index=False):
    subset = coords[coords['coord_key'] == coord_key]
    if 'species' in subset.columns:
        n_species = subset['species'].nunique()
    else:
        n_species = '?'
    iso_values = subset['isolation_source'].dropna().unique()
    n_iso = len(iso_values)
    iso_sample = 'N/A' if n_iso == 0 else ', '.join(iso_values[:3])
    lat, _, lon = coord_key.partition(',')
    print(f'  ({lat}, {lon}): {n_genomes:,} genomes, '
          f'{n_species} species, {n_iso} isolation sources [{iso_sample}]')

In [None]:
# Flag suspicious coordinates.
# Criterion 1: a location shared by many genomes from many different species
# (treated here as a likely institutional address).
by_location = coords.groupby('coord_key')
coord_genome_count = by_location.size()
coord_species_count = by_location['species'].nunique()

# Suspicious if: >50 genomes at the same spot AND >10 different species
is_shared_and_diverse = (coord_genome_count > 50) & (coord_species_count > 10)
suspicious_coords = set(is_shared_and_diverse.index[is_shared_and_diverse.to_numpy()])

# Criterion 2: both latitude and longitude are whole degrees (low precision)
is_integer_lat = coords['cleaned_lat'].mod(1).eq(0)
is_integer_lon = coords['cleaned_lon'].mod(1).eq(0)
is_integer_coords = is_integer_lat & is_integer_lon

# Quality flag; 'suspicious_cluster' takes precedence over 'low_precision'.
coords['coord_suspicious_cluster'] = coords['coord_key'].isin(suspicious_coords)
coords['coord_integer_degrees'] = is_integer_coords
coords['coord_quality'] = np.select(
    [coords['coord_suspicious_cluster'], coords['coord_integer_degrees']],
    ['suspicious_cluster', 'low_precision'],
    default='good',
)

print('Coordinate quality flags:')
print(coords['coord_quality'].value_counts().to_string())
print(f'\nSuspicious cluster coordinates: {len(suspicious_coords)} locations')
print(f'Integer-degree coordinates: {is_integer_coords.sum():,} genomes')

In [None]:
# World map of all genomes with coordinates, colored by their quality flag.
QC_COLORS = {
    'good': 'green',
    'low_precision': 'orange',
    'suspicious_cluster': 'red'
}

fig_qc = px.scatter_geo(
    data_frame=coords,
    lat='cleaned_lat',
    lon='cleaned_lon',
    color='coord_quality',
    color_discrete_map=QC_COLORS,
    hover_data=['genome_id', 'species', 'isolation_source', 'coord_key'],
    title='Coordinate Quality Assessment',
    opacity=0.5,
    size_max=5,  # NOTE(review): no `size` column is passed, so this likely has no effect
)
fig_qc.update_layout(width=1000, height=600)
fig_qc.show()

In [None]:
# Copy the quality flag back onto the full table; rows that had no
# coordinates (and so are absent from `coords`) get 'no_coords'.
df['coord_quality'] = coords['coord_quality'].reindex(df.index, fill_value='no_coords')

print('Coordinate quality distribution (all genomes):')
print(df['coord_quality'].value_counts().to_string())

---
## 3. Environment Label Harmonization

Map free-text `isolation_source` values to broad categories using keyword matching.

In [None]:
# Keyword-based harmonization of isolation_source
# Order matters: categories are tried top to bottom and the first match wins,
# so specific categories must precede general ones ('Human gut' before
# 'Human other'; 'Wastewater' before 'Plant', otherwise the 'plant' keyword
# captures every "... treatment plant" value first).
# Keywords are matched at the START of a word only (see
# harmonize_isolation_source), so stems like 'ferment' or 'human fec' still
# match their inflected forms, while 'ant' no longer matches "plant".
ENV_CATEGORIES = [
    # Marine / Ocean
    ('Marine', ['ocean', 'marine', 'sea water', 'seawater', 'deep sea',
                'coastal water', 'marine sediment', 'coral', 'sponge',
                'hydrothermal', 'estuary', 'brackish', 'saline lake',
                'salt lake', 'salt marsh', 'mangrove']),
    # Freshwater
    ('Freshwater', ['freshwater', 'fresh water', 'river', 'lake', 'stream',
                    'pond', 'spring water', 'groundwater', 'aquifer',
                    'drinking water', 'tap water', 'well water']),
    # Soil
    ('Soil', ['soil', 'rhizosphere', 'root', 'compost', 'peat',
              'permafrost', 'sediment', 'mud', 'clay', 'sand',
              'agricultural', 'farmland', 'forest soil', 'grassland']),
    # Human gut
    ('Human gut', ['human gut', 'human feces', 'human fecal', 'human stool',
                   'human faeces', 'human faecal', 'human intestin',
                   'human colon', 'human cecum', 'human rectal',
                   'meconium', 'infant fec', 'infant gut']),
    # Human other
    ('Human other', ['human', 'homo sapiens', 'patient', 'clinical',
                     'blood', 'sputum', 'urine', 'wound', 'abscess',
                     'skin', 'oral', 'saliva', 'nasal', 'vaginal',
                     'respiratory', 'csf', 'biopsy', 'bronch']),
    # Animal
    ('Animal', ['chicken', 'cattle', 'cow', 'pig', 'swine', 'sheep',
                'goat', 'horse', 'dog', 'cat', 'mouse', 'rat',
                'fish', 'shrimp', 'insect', 'bee', 'ant', 'termite',
                'bird', 'poultry', 'animal', 'bovine', 'porcine',
                'feline', 'canine', 'avian', 'tick', 'mosquito',
                'nematode', 'worm']),
    # Wastewater / Industrial (must come before Plant: 'treatment plant')
    ('Wastewater', ['wastewater', 'waste water', 'sewage', 'sludge',
                    'activated sludge', 'bioreactor', 'biogas',
                    'anaerobic digest', 'treatment plant']),
    # Plant
    ('Plant', ['plant', 'leaf', 'stem', 'flower', 'fruit', 'seed',
               'phyllosphere', 'endophyte', 'epiphyte', 'bark',
               'wood', 'crop', 'rice', 'wheat', 'maize', 'corn',
               'soybean', 'potato', 'tomato', 'lettuce', 'grape']),
    # Food / Fermented
    ('Food', ['food', 'cheese', 'milk', 'dairy', 'yogurt', 'ferment',
              'kimchi', 'sauerkraut', 'wine', 'beer', 'kefir',
              'meat', 'sausage', 'bread', 'dough', 'pickle']),
    # Hot spring / Extreme
    ('Extreme', ['hot spring', 'thermal', 'geothermal', 'volcanic',
                 'hypersaline', 'alkaline', 'acidic', 'acid mine',
                 'mine drainage', 'glacier', 'ice', 'polar',
                 'desert', 'cave']),
    # Air
    ('Air', ['air', 'atmosphere', 'aerosol', 'dust', 'indoor air']),
]

# Placeholder strings that carry no information; treated like missing values.
_UNINFORMATIVE_SOURCES = frozenset({
    '', 'missing', 'not collected', 'not applicable',
    'not available', 'unknown', 'na', 'n/a', 'none',
})


def _compile_category_patterns(categories):
    """Build one compiled regex per category from its keyword list.

    Each pattern matches any of the category's keywords, but only where the
    keyword begins at a word start (not preceded by a lowercase letter or a
    digit). Anything may follow the keyword, so prefixes/stems keep working.
    """
    return [
        (
            category,
            re.compile(
                r'(?<![a-z0-9])(?:' + '|'.join(re.escape(kw) for kw in keywords) + r')'
            ),
        )
        for category, keywords in categories
    ]


# Compiled once per cell run; re-run this cell after editing ENV_CATEGORIES.
_ENV_PATTERNS = _compile_category_patterns(ENV_CATEGORIES)


def harmonize_isolation_source(value):
    """Map a raw isolation_source string to a broad category.

    Parameters
    ----------
    value : object
        Raw isolation_source entry; may be missing (NaN/None).

    Returns
    -------
    str
        'Unknown' for missing or placeholder values, the name of the first
        category in ENV_CATEGORIES with a keyword that starts a word in the
        lowercased value, or 'Other' when no keyword matches.

    Notes
    -----
    Matching used to be a raw substring test, which misfiled common values
    (e.g. 'ant' inside "plant", 'rat' inside "laboratory", 'stem' inside
    "system"). Requiring a word start removes those hits. Prefix collisions
    between real words (e.g. 'bee' / "beer") and compound words such as
    "earthworm" are not handled; review the "Other" listing and extend the
    keyword lists where needed.
    """
    if pd.isna(value):
        return 'Unknown'
    value_lower = str(value).lower().strip()
    if value_lower in _UNINFORMATIVE_SOURCES:
        return 'Unknown'
    for category, pattern in _ENV_PATTERNS:
        if pattern.search(value_lower):
            return category
    return 'Other'


# Assign one harmonized category per genome, then report the distribution
# as counts and percentages of all rows in df.
df['env_category'] = df['isolation_source'].map(harmonize_isolation_source)

print('Harmonized environment categories:')
cat_counts = df['env_category'].value_counts()
n_total = len(df)
for cat, count in cat_counts.items():
    print(f'  {cat}: {count:,} ({100 * count / n_total:.1f}%)')

In [None]:
# List the most frequent raw values that ended up in "Other" -- candidates
# for additional keywords.
other_mask = df['env_category'].eq('Other')
other_sources = (
    df.loc[other_mask, 'isolation_source']
    .value_counts()
    .head(30)
)
print(f'Top 30 "Other" isolation sources ({other_mask.sum():,} genomes):')
print(other_sources.to_string())

In [None]:
# Check env_broad_scale / env_local_scale -- are they cleaner than isolation_source?
def _report_scale_column(column, heading):
    """Print coverage and the 20 most frequent values of `column`; return the non-null count."""
    print(heading)
    n_present = df[column].notna().sum()
    print(f'  Non-null: {n_present:,} ({100*n_present/len(df):.1f}%)')
    if n_present > 0:
        print(f'\nTop 20 {column} values:')
        print(df[column].value_counts().head(20).to_string())
    return n_present


n_broad = _report_scale_column('env_broad_scale', 'env_broad_scale coverage and top values:')
n_local = _report_scale_column('env_local_scale', '\nenv_local_scale coverage and top values:')

In [None]:
# Horizontal bar chart of the harmonized categories; 'Unknown' is left out
# for a cleaner visualization.
cat_df = cat_counts.rename_axis('category').reset_index(name='count')
cat_df = cat_df.loc[cat_df['category'].ne('Unknown')]

cat_df_sorted = cat_df.sort_values('count', ascending=True)
fig_cat = px.bar(
    cat_df_sorted,
    x='count',
    y='category',
    orientation='h',
    title='Harmonized Environment Categories (excluding Unknown)',
    labels={'count': 'Number of genomes', 'category': ''},
    text='count',
)
fig_cat.update_traces(texttemplate='%{text:,}', textposition='outside')
fig_cat.update_layout(width=700, height=500)
fig_cat.show()

---
## 4. UMAP of Embedding Space

Reduce 64-dimensional AlphaEarth embeddings to 2D for visualization.

In [None]:
# Pull the embedding columns out as a 2-D array (rows = genomes).
embeddings = df[EMB_COLS].to_numpy()
print(f'Embedding matrix shape: {embeddings.shape}')

# A row is unusable if any of its embedding values is NaN.
row_has_nan = np.isnan(embeddings).any(axis=1)
n_nan = row_has_nan.sum()
print(f'Rows with NaN embeddings: {n_nan}')

# Keep only complete rows, in both the array and a matching DataFrame copy.
valid_mask = ~row_has_nan
embeddings_clean = embeddings[valid_mask]
df_clean = df[valid_mask].copy()
print(f'Clean embedding matrix: {embeddings_clean.shape}')

In [None]:
# Project the clean embeddings to 2-D with UMAP (cosine metric, fixed seed)
# and store the two coordinates on df_clean.
print('Running UMAP (this may take a minute for 83K points)...')
umap_params = {
    'n_components': 2,
    'n_neighbors': 30,
    'min_dist': 0.1,
    'metric': 'cosine',
    'random_state': 42,
}
reducer = umap.UMAP(**umap_params)
umap_coords = reducer.fit_transform(embeddings_clean)

df_clean['umap_x'], df_clean['umap_y'] = umap_coords[:, 0], umap_coords[:, 1]

print(f'UMAP complete: {umap_coords.shape}')

In [None]:
# UMAP scatter with one color per harmonized environment category.
fig_umap_env = px.scatter(
    data_frame=df_clean,
    x='umap_x',
    y='umap_y',
    color='env_category',
    hover_data=['genome_id', 'species', 'isolation_source', 'cleaned_lat', 'cleaned_lon'],
    title='UMAP of AlphaEarth Embeddings -- colored by Environment Category',
    opacity=0.4,
    labels={'umap_x': 'UMAP 1', 'umap_y': 'UMAP 2'},
)
fig_umap_env.update_traces(marker={'size': 3})
fig_umap_env.update_layout(width=1000, height=700)
fig_umap_env.show()

# Also write a static PNG next to the other figures.
umap_env_png = os.path.join(FIG_DIR, 'umap_by_env_category.png')
fig_umap_env.write_image(umap_env_png, scale=2)
print('Saved figures/umap_by_env_category.png')

In [None]:
# UMAP scatter colored by phylum.
# Only the 10 most frequent phyla keep their own color; every other value
# (including missing) is shown as "Other" for readability.
top_phyla = df_clean['phylum'].value_counts().index[:10].tolist()
is_top_phylum = df_clean['phylum'].isin(top_phyla)
df_clean['phylum_display'] = df_clean['phylum'].mask(~is_top_phylum, 'Other')

fig_umap_phylum = px.scatter(
    data_frame=df_clean,
    x='umap_x',
    y='umap_y',
    color='phylum_display',
    hover_data=['genome_id', 'species', 'isolation_source'],
    title='UMAP of AlphaEarth Embeddings -- colored by Phylum',
    opacity=0.4,
    labels={'umap_x': 'UMAP 1', 'umap_y': 'UMAP 2'},
)
fig_umap_phylum.update_traces(marker={'size': 3})
fig_umap_phylum.update_layout(width=1000, height=700)
fig_umap_phylum.show()

umap_phylum_png = os.path.join(FIG_DIR, 'umap_by_phylum.png')
fig_umap_phylum.write_image(umap_phylum_png, scale=2)
print('Saved figures/umap_by_phylum.png')

In [None]:
# UMAP scatter colored by the coordinate-quality flag; genomes without
# coordinates are drawn in light gray.
UMAP_QC_COLORS = {
    'good': 'green',
    'low_precision': 'orange',
    'suspicious_cluster': 'red',
    'no_coords': 'lightgray'
}

fig_umap_qc = px.scatter(
    data_frame=df_clean,
    x='umap_x',
    y='umap_y',
    color='coord_quality',
    color_discrete_map=UMAP_QC_COLORS,
    hover_data=['genome_id', 'species', 'cleaned_lat', 'cleaned_lon'],
    title='UMAP of AlphaEarth Embeddings -- colored by Coordinate Quality',
    opacity=0.4,
    labels={'umap_x': 'UMAP 1', 'umap_y': 'UMAP 2'},
)
fig_umap_qc.update_traces(marker={'size': 3})
fig_umap_qc.update_layout(width=1000, height=700)
fig_umap_qc.show()

---
## 5. Geographic Map

Plot genomes on a world map using plotly scattergeo.

In [None]:
# World map of every genome that has a latitude, colored by environment category.
geo_df = df_clean.loc[df_clean['cleaned_lat'].notna()].copy()
print(f'Genomes with coordinates for mapping: {len(geo_df):,}')

fig_map = px.scatter_geo(
    data_frame=geo_df,
    lat='cleaned_lat',
    lon='cleaned_lon',
    color='env_category',
    hover_data=['genome_id', 'species', 'isolation_source', 'phylum'],
    title='Global Distribution of AlphaEarth Genomes -- by Environment Category',
    opacity=0.5,
)
fig_map.update_traces(marker={'size': 3})
fig_map.update_layout(width=1100, height=600)
fig_map.show()

# Static PNG export of the same map.
map_png = os.path.join(FIG_DIR, 'global_map_by_env.png')
fig_map.write_image(map_png, scale=2)
print('Saved figures/global_map_by_env.png')

In [None]:
# World map colored by phylum, using the same top-phyla grouping as the
# UMAP phylum plot (everything outside `top_phyla` becomes "Other").
geo_is_top_phylum = geo_df['phylum'].isin(top_phyla)
geo_df['phylum_display'] = geo_df['phylum'].mask(~geo_is_top_phylum, 'Other')

fig_map_phylum = px.scatter_geo(
    data_frame=geo_df,
    lat='cleaned_lat',
    lon='cleaned_lon',
    color='phylum_display',
    hover_data=['genome_id', 'species', 'isolation_source'],
    title='Global Distribution of AlphaEarth Genomes -- by Phylum',
    opacity=0.5,
)
fig_map_phylum.update_traces(marker={'size': 3})
fig_map_phylum.update_layout(width=1100, height=600)
fig_map_phylum.show()

In [None]:
# Split the mapped genomes into 'good' coordinates and everything else
# (low-precision or suspicious-cluster), and report both counts.
is_good_coord = geo_df['coord_quality'].eq('good')
good_coords = geo_df[is_good_coord]
suspicious_coords_df = geo_df[~is_good_coord]

print(f'Good coordinates: {len(good_coords):,}')
print(f'Suspicious/low-precision: {len(suspicious_coords_df):,}')

---
## 6. Embedding vs Geography

Do geographically nearby genomes have similar AlphaEarth embeddings?

In [None]:
from scipy.spatial.distance import cdist

# Restrict the pair analysis to genomes flagged 'good' that have a latitude.
keep_for_pairs = df_clean['coord_quality'].eq('good') & df_clean['cleaned_lat'].notna()
good = df_clean.loc[keep_for_pairs].copy()

print(f'Genomes with good coordinates: {len(good):,}')

# Draw random index pairs instead of the full pairwise matrix (too large).
# The seed and the order of the two draws fix the sampled pairs.
np.random.seed(42)
N_PAIRS = 50_000
n = len(good)
idx1 = np.random.randint(0, n, size=N_PAIRS)
idx2 = np.random.randint(0, n, size=N_PAIRS)

# Drop pairs that picked the same genome twice.
mask = np.not_equal(idx1, idx2)
idx1 = idx1[mask]
idx2 = idx2[mask]

print(f'Sampled {len(idx1):,} random pairs')

In [None]:
# Compute geographic distance (haversine)
def haversine_km(lat1, lon1, lat2, lon2):
    """Haversine (great-circle) distance in km between two lat/lon points.

    Parameters
    ----------
    lat1, lon1, lat2, lon2 : float or array-like
        Coordinates in decimal degrees. Arrays are processed element-wise
        through numpy operations.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in kilometres, using an Earth radius of 6371 km.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
    # points, which would make arcsin return NaN; clamp before the root.
    a = np.clip(a, 0.0, 1.0)
    return 2 * R * np.arcsin(np.sqrt(a))

# Coordinates and embeddings of the 'good' genomes as plain arrays, so the
# sampled index pairs (idx1, idx2) can be applied positionally.
lats = good['cleaned_lat'].values
lons = good['cleaned_lon'].values
emb_matrix = good[EMB_COLS].values

geo_dist = haversine_km(lats[idx1], lons[idx1], lats[idx2], lons[idx2])

# Compute embedding cosine distance for all pairs at once.
# This replaces a Python loop with one scipy call per pair; the values are
# the same up to floating-point rounding.
emb_a = emb_matrix[idx1]
emb_b = emb_matrix[idx2]
pair_dot = np.einsum('ij,ij->i', emb_a, emb_b)
pair_norm = np.linalg.norm(emb_a, axis=1) * np.linalg.norm(emb_b, axis=1)
with np.errstate(divide='ignore', invalid='ignore'):
    # A zero-length embedding yields NaN here rather than raising.
    emb_dist = 1.0 - pair_dot / pair_norm
# Rounding can leave values like -1e-16; cosine distance lies in [0, 2].
emb_dist = np.clip(emb_dist, 0.0, 2.0)

# Same species flag
species_vals = good['species'].values
same_species = species_vals[idx1] == species_vals[idx2]

pairs_df = pd.DataFrame({
    'geo_dist_km': geo_dist,
    'emb_cosine_dist': emb_dist,
    'same_species': same_species,
})

print(f'Geographic distance range: {geo_dist.min():.0f} - {geo_dist.max():.0f} km')
print(f'Embedding cosine distance range: {emb_dist.min():.4f} - {emb_dist.max():.4f}')
print(f'Same-species pairs: {same_species.sum():,} ({100*same_species.mean():.1f}%)')

In [None]:
# Scatter of geographic vs embedding distance on a reproducible subsample
# of at most 10,000 pairs, colored by whether the pair shares a species.
pairs_sample = pairs_df.sample(min(10_000, len(pairs_df)), random_state=42)

fig_dist = px.scatter(
    data_frame=pairs_sample,
    x='geo_dist_km',
    y='emb_cosine_dist',
    color='same_species',
    color_discrete_map={True: 'blue', False: 'gray'},
    title='Geographic Distance vs AlphaEarth Embedding Distance',
    labels={
        'geo_dist_km': 'Geographic Distance (km)',
        'emb_cosine_dist': 'Embedding Cosine Distance',
        'same_species': 'Same Species'
    },
    opacity=0.3,
)
fig_dist.update_traces(marker={'size': 3})
fig_dist.update_layout(width=900, height=600)
fig_dist.show()

dist_png = os.path.join(FIG_DIR, 'geo_vs_embedding_distance.png')
fig_dist.write_image(dist_png, scale=2)
print('Saved figures/geo_vs_embedding_distance.png')

In [None]:
# Binned analysis: mean embedding distance by geographic distance bin
# - include_lowest=True keeps pairs at exactly 0 km (genomes sharing the same
#   coordinates); with the default right-closed bins they fell outside
#   (0, 100] and were silently dropped.
# - The last edge is open-ended because haversine distances can slightly
#   exceed 20,000 km (half the Earth's circumference is ~20,015 km with
#   R = 6371 km), so those pairs were dropped as well.
pairs_df['geo_bin'] = pd.cut(
    pairs_df['geo_dist_km'],
    bins=[0, 100, 500, 1000, 2000, 5000, 10000, np.inf],
    labels=['<100', '100-500', '500-1K', '1K-2K', '2K-5K', '5K-10K', '10K-20K'],
    include_lowest=True,
)

# Mean / median embedding distance and pair count per distance bin
# (observed=True skips bins that received no pairs).
binned = pairs_df.groupby('geo_bin', observed=True).agg(
    mean_emb_dist=('emb_cosine_dist', 'mean'),
    median_emb_dist=('emb_cosine_dist', 'median'),
    n_pairs=('emb_cosine_dist', 'count'),
).reset_index()

fig_binned = px.bar(
    binned,
    x='geo_bin',
    y='mean_emb_dist',
    text='n_pairs',
    title='Mean Embedding Distance by Geographic Distance Bin',
    labels={
        'geo_bin': 'Geographic Distance (km)',
        'mean_emb_dist': 'Mean Cosine Distance',
        'n_pairs': 'N pairs'
    },
)
fig_binned.update_traces(texttemplate='n=%{text:,}', textposition='outside')
fig_binned.update_layout(width=800, height=500)
fig_binned.show()

---
## 7. Environment Label vs Embedding Cluster Cross-Tabulation

Cluster UMAP space, then compare clusters to environment categories.

In [None]:
# Density-based clustering (DBSCAN) of the 2-D UMAP coordinates.
# Label -1 marks points DBSCAN did not assign to any cluster (noise).
from sklearn.cluster import DBSCAN

umap_xy = df_clean[['umap_x', 'umap_y']].to_numpy()

clustering = DBSCAN(eps=0.5, min_samples=50).fit(umap_xy)
df_clean['umap_cluster'] = clustering.labels_

# Tally every label once, then separate real clusters from noise.
cluster_ids, cluster_sizes = np.unique(clustering.labels_, return_counts=True)
is_noise_label = cluster_ids == -1
n_clusters = int((~is_noise_label).sum())
n_noise = cluster_sizes[is_noise_label].sum()
print(f'DBSCAN found {n_clusters} clusters + {n_noise:,} noise points')
print(f'\nCluster sizes:')
for label, n in zip(cluster_ids, cluster_sizes):
    if label != -1:
        print(f'  Cluster {label}: {n:,} genomes')

In [None]:
# UMAP scatter colored by DBSCAN cluster. Labels are converted to strings so
# they are treated as discrete categories; -1 is displayed as 'noise'.
cluster_as_text = df_clean['umap_cluster'].astype(str)
df_clean['umap_cluster_str'] = cluster_as_text.mask(df_clean['umap_cluster'].eq(-1), 'noise')

fig_umap_cluster = px.scatter(
    data_frame=df_clean,
    x='umap_x',
    y='umap_y',
    color='umap_cluster_str',
    hover_data=['genome_id', 'species', 'env_category', 'isolation_source'],
    title='UMAP of AlphaEarth Embeddings -- DBSCAN Clusters',
    opacity=0.4,
    labels={'umap_x': 'UMAP 1', 'umap_y': 'UMAP 2', 'umap_cluster_str': 'Cluster'},
)
fig_umap_cluster.update_traces(marker={'size': 3})
fig_umap_cluster.update_layout(width=1000, height=700)
fig_umap_cluster.show()

In [None]:
# Cross-tabulate environment category against UMAP cluster, leaving out
# DBSCAN noise points and genomes whose category is 'Unknown'.
in_cluster = df_clean['umap_cluster'].ge(0)
has_category = df_clean['env_category'].ne('Unknown')
xtab_df = df_clean.loc[in_cluster & has_category].copy()

# Column-normalized: each cluster's column sums to 1.
xtab = pd.crosstab(
    index=xtab_df['env_category'],
    columns=xtab_df['umap_cluster'],
    normalize='columns',
)

fig_heat = px.imshow(
    xtab,
    title='Environment Category Composition of UMAP Clusters (column-normalized)',
    labels={'x': 'UMAP Cluster', 'y': 'Environment Category', 'color': 'Fraction'},
    aspect='auto',
    color_continuous_scale='Blues',
)
fig_heat.update_layout(width=900, height=500)
fig_heat.show()

heat_png = os.path.join(FIG_DIR, 'cluster_env_heatmap.png')
fig_heat.write_image(heat_png, scale=2)
print('Saved figures/cluster_env_heatmap.png')

In [None]:
# Row-normalized view: each environment category's row sums to 1, showing
# how that category's genomes are spread across the clusters.
xtab_row = pd.crosstab(
    index=xtab_df['env_category'],
    columns=xtab_df['umap_cluster'],
    normalize='index',
)

fig_heat_row = px.imshow(
    xtab_row,
    title='Cluster Distribution of Each Environment Category (row-normalized)',
    labels={'x': 'UMAP Cluster', 'y': 'Environment Category', 'color': 'Fraction'},
    aspect='auto',
    color_continuous_scale='Oranges',
)
fig_heat_row.update_layout(width=900, height=500)
fig_heat_row.show()

---
## Summary

In [None]:
# Final recap: dataset sizes, QC and category distributions, cluster count,
# and the files currently present in the figure directory.
n_good_coords = df['coord_quality'].eq('good').sum()

print('=== Analysis Summary ===')
print(f'\nTotal genomes: {len(df):,}')
print(f'With valid embeddings: {len(df_clean):,}')
print(f'With good coordinates: {n_good_coords:,}')
print(f'\nCoordinate quality:')
print(df['coord_quality'].value_counts().to_string())
print(f'\nEnvironment categories:')
print(df['env_category'].value_counts().to_string())
print(f'\nUMAP clusters: {n_clusters}')
print(f'\nFigures saved to {os.path.abspath(FIG_DIR)}:')
figure_files = sorted(os.listdir(FIG_DIR))
for f in figure_files:
    print(f'  {f}')