# Notebook 02: COG Enrichment by Openness Quartile

**Project**: Openness vs Functional Composition

**Goal**: Query COG functional category distributions for ~40 target species (10 per openness quartile) and compute enrichment scores.

**Input**: `../data/species_openness_quartiles.csv` (from Notebook 01)

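**Output**: `../data/cog_enrichment_by_openness.csv` (per-species enrichment), plus `../data/cog_enrichment_quartile_summary.csv` (quartile summary) and `../data/cog_raw_counts.csv` (raw COG counts for Notebook 03)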

In [None]:
import pandas as pd

# Spark session handle. get_spark_session is not defined or imported in this
# notebook -- presumably it is injected by the notebook environment.
spark = get_spark_session()

# Target species table written by Notebook 01.
TARGET_SPECIES_CSV = '../data/species_openness_quartiles.csv'
target_df = pd.read_csv(TARGET_SPECIES_CSV)

# Report the total number of rows loaded, then the row count per quartile.
print(f"Target species: {len(target_df)}")
species_per_quartile = target_df.groupby('openness_quartile').size()
print(species_per_quartile)

## 1. Query COG Distributions

Reuses the proven 3-way join from the `cog_analysis` project.
Expected runtime: ~6-8 min for 40 species.

In [None]:
# Build the species list for the SQL IN clause. Missing IDs are dropped and
# duplicates collapsed so the reported species count matches what is queried.
species_list = (
    target_df['gtdb_species_clade_id'].dropna().drop_duplicates().tolist()
)
if not species_list:
    # An empty list would render as "IN ()", which is not valid SQL; fail
    # early with a readable message instead of a parser error.
    raise ValueError(
        "No target species IDs found in target_df; cannot build the COG query"
    )

# NOTE(review): IDs are interpolated verbatim into the query text; this assumes
# they contain no single quotes -- confirm against the Notebook 01 output.
species_sql = ", ".join(f"'{s}'" for s in species_list)

print(f"Querying COG distributions for {len(species_list)} species...")

# One row per (species, core/auxiliary/singleton flags, COG_category) with the
# number of joined gene annotations; unannotated ('-' or NULL) genes are excluded.
cog_raw = spark.sql(f"""
    SELECT
        gc.gtdb_species_clade_id,
        gc.is_core,
        gc.is_auxiliary,
        gc.is_singleton,
        ann.COG_category,
        COUNT(*) as gene_count
    FROM kbase_ke_pangenome.gene_cluster gc
    JOIN kbase_ke_pangenome.gene_genecluster_junction j
        ON gc.gene_cluster_id = j.gene_cluster_id
    JOIN kbase_ke_pangenome.eggnog_mapper_annotations ann
        ON j.gene_id = ann.query_name
    WHERE gc.gtdb_species_clade_id IN ({species_sql})
        AND ann.COG_category IS NOT NULL
        AND ann.COG_category != '-'
    GROUP BY
        gc.gtdb_species_clade_id,
        gc.is_core, gc.is_auxiliary, gc.is_singleton,
        ann.COG_category
    ORDER BY gc.gtdb_species_clade_id, gc.is_core DESC, gene_count DESC
""")

# Collect the aggregated result to the driver as a pandas DataFrame.
cog_pdf = cog_raw.toPandas()
print(f"Query returned {len(cog_pdf):,} rows")
print(f"Species returned: {cog_pdf['gtdb_species_clade_id'].nunique()}")

# Surface target species that produced no rows: they would otherwise vanish
# silently from the enrichment table and unbalance the quartile comparison.
returned_species = set(cog_pdf['gtdb_species_clade_id'])
missing_species = [s for s in species_list if s not in returned_species]
if missing_species:
    print(
        f"WARNING: {len(missing_species)} target species returned no COG rows: "
        f"{missing_species}"
    )

cog_pdf.head(10)

## 2. Classify Gene Classes & Split Multi-letter COGs

In [None]:
# Assign gene class labels
def classify_gene(row):
    """Return the gene-class label for one row of COG counts.

    ``row`` must support key lookup for ``'is_core'`` and ``'is_singleton'``.
    Core takes precedence over singleton; a row flagged as neither is
    labelled ``'Auxiliary'`` (the ``is_auxiliary`` flag itself is not read).
    """
    if row['is_core'] == 1:
        return 'Core'
    return 'Singleton' if row['is_singleton'] == 1 else 'Auxiliary'

# Label every row as Core / Singleton / Auxiliary.
cog_pdf['gene_class'] = cog_pdf.apply(classify_gene, axis=1)

# Split multi-letter COG categories (e.g., 'LV' -> 'L' and 'V').
# Each letter gets the full gene count (a gene with 'LV' is both L and V).
# Non-alphabetic characters in the category string are ignored.
#
# Done with explode() rather than a row-by-row iterrows() loop: it avoids a
# Python-level copy per letter, keeps the column dtypes of cog_pdf (iterrows
# does not preserve dtypes across rows), and still yields a COG_letter column
# when there are no rows at all.
cog_letters = cog_pdf['COG_category'].map(
    lambda category: [ch for ch in category if ch.isalpha()]
)
cog_split = (
    cog_pdf.assign(COG_letter=cog_letters)
    .explode('COG_letter')
    # A category with no alphabetic characters explodes to a single NaN row;
    # drop it so such rows contribute nothing, as before.
    .dropna(subset=['COG_letter'])
)
print(f"After splitting multi-letter COGs: {len(cog_split):,} rows")
print(f"Unique COG letters: {sorted(cog_split['COG_letter'].unique())}")

## 3. Compute Per-Species Enrichment

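Enrichment = (proportion among singleton genes) - (proportion among core genes), where each proportion is a COG letter's share of all COG-letter counts within one species and gene class.

Positive enrichment means the COG category is more common among singleton (novel) genes than among core genes; negative enrichment means it is more common in the core.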

In [None]:
# Sum gene counts for every (species, gene class, COG letter) combination.
count_keys = ['gtdb_species_clade_id', 'gene_class', 'COG_letter']
agg = cog_split.groupby(count_keys)['gene_count'].sum().reset_index()

# Express each COG letter as a share of its species + gene class total.
class_keys = ['gtdb_species_clade_id', 'gene_class']
totals = agg.groupby(class_keys)['gene_count'].transform('sum')
agg['proportion'] = agg['gene_count'] / totals

# Pull the Core and Singleton proportions out into separate tables so they
# can be placed side by side (Auxiliary rows are not used for enrichment).
proportion_columns = ['gtdb_species_clade_id', 'COG_letter', 'proportion']
core_mask = agg['gene_class'] == 'Core'
singleton_mask = agg['gene_class'] == 'Singleton'

core_props = agg[core_mask][proportion_columns]
core_props = core_props.rename(columns={'proportion': 'core_proportion'})

singleton_props = agg[singleton_mask][proportion_columns]
singleton_props = singleton_props.rename(columns={'proportion': 'singleton_proportion'})

# Outer merge keeps COG letters present in only one of the two classes;
# the absent side is treated as a proportion of 0.
pair_keys = ['gtdb_species_clade_id', 'COG_letter']
enrichment = core_props.merge(singleton_props, on=pair_keys, how='outer')
enrichment = enrichment.fillna(0)
enrichment['enrichment'] = enrichment['singleton_proportion'] - enrichment['core_proportion']

# Attach the openness quartile and species metadata from the target table.
species_metadata = target_df[
    ['gtdb_species_clade_id', 'openness_quartile', 'openness', 'phylum', 'GTDB_species', 'no_genomes']
]
enrichment = enrichment.merge(species_metadata, on='gtdb_species_clade_id', how='left')

print(f"Enrichment table: {len(enrichment)} rows")
enrichment.head(10)

## 4. Summarize by Quartile

In [None]:
# Summary statistics of per-species enrichment for each (quartile, COG letter).
summary_spec = {
    'mean_enrichment': ('enrichment', 'mean'),
    'std_enrichment': ('enrichment', 'std'),
    'n_species': ('enrichment', 'count'),
    'median_enrichment': ('enrichment', 'median'),
}
quartile_summary = (
    enrichment
    .groupby(['openness_quartile', 'COG_letter'])
    .agg(**summary_spec)
    .reset_index()
)

# Print the quartile trend for a hand-picked set of COG categories.
report_columns = ['openness_quartile', 'mean_enrichment', 'std_enrichment', 'n_species']
for cog in ('L', 'V', 'S', 'J', 'E', 'C', 'G'):
    print(f"\n=== COG {cog} ===")
    is_this_cog = quartile_summary['COG_letter'] == cog
    subset = quartile_summary[is_this_cog].sort_values('openness_quartile')
    print(subset[report_columns].to_string(index=False))

## 5. Save Results

In [None]:
# Per-species enrichment table, including the quartile labels.
enrichment_path = '../data/cog_enrichment_by_openness.csv'
enrichment.to_csv(enrichment_path, index=False)
print(f"Saved {len(enrichment)} rows to {enrichment_path}")

# Quartile-level summary statistics.
summary_path = '../data/cog_enrichment_quartile_summary.csv'
quartile_summary.to_csv(summary_path, index=False)
print(f"Saved quartile summary to {summary_path}")

# Raw COG counts as returned by the query, for use in notebook 03.
raw_counts_path = '../data/cog_raw_counts.csv'
cog_pdf.to_csv(raw_counts_path, index=False)
print(f"Saved raw counts to {raw_counts_path}")

## Findings

Record after running:
- Does L enrichment increase from Q1 (closed) to Q4 (open)? ___
- Does V enrichment increase with openness? ___
- Do metabolic categories (E, C, G) show stronger core enrichment in closed pangenomes? ___
- Any unexpected patterns? ___