# Scientific Queries for CDM Ontology Data

This notebook contains scientifically relevant queries for exploring the CDM ontology and genomic data.

**Data Overview:**
- `statements`: 42.4M ontology statements (RDF triples)
- `entailed_edge`: 117.5M inferred relationships
- `feature_annotation`: 237K genomic features from 50 E. coli genomes
- `term_association`: 3.3K enzyme-reaction mappings

**Note**: Run these queries on your remote JupyterHub to work with the full dataset. This notebook includes both Spark and pandas versions.

In [None]:
# For remote execution with Spark
from spark.utils import get_spark_session
import time
# Spark session shared by every query cell below (each one calls spark.sql).
spark = get_spark_session()

# Set the namespace: this prefix is interpolated into every table reference
# in the SQL below (e.g. {namespace}.feature_annotation).
namespace = 'ontology_data'

# Helper function to time queries
def time_query(query_name, query_func):
    """Run a query callable, printing a banner and its wall-clock duration.

    Args:
        query_name: Label printed in the banner before the query runs.
        query_func: Zero-argument callable that executes the query.

    Returns:
        Whatever ``query_func`` returns.

    Raises:
        Any exception raised by ``query_func`` propagates unchanged; the
        elapsed time is still printed before it does.
    """
    banner = '=' * 60
    print(f"\n{banner}")
    print(f"Executing: {query_name}")
    print(banner)
    # perf_counter is monotonic, so the measurement cannot be skewed by
    # system clock adjustments the way time.time() can.
    start_time = time.perf_counter()
    try:
        return query_func()
    finally:
        # Report the duration even when the query fails, so slow failures
        # are still visible in the notebook output.
        execution_time = time.perf_counter() - start_time
        print(f"\nQuery execution time: {execution_time:.2f} seconds")

## 0. Genome Feature to Reaction Mapping

Map genomic features to their associated SEED reactions through RAST role annotations

In [None]:
# Query 0: Map E. coli genome features to SEED reactions via RAST roles
def query_genome_reactions():
    """Map features of genome 562.61239 to SEED reactions via RAST roles.

    Features with a non-null ``rast`` value are joined to ``term_association``
    on ``rast = subject``; only associations whose object starts with
    ``seed.reaction:`` are kept, and each reaction is left-joined to its
    ``rdfs:label`` from ``statements``. The result is capped at 100 rows.

    Returns:
        pandas.DataFrame with columns genome_id, feature_id, rast,
        seed_reaction and reaction_name (at most 100 rows).
    """
    query = f"""
    WITH genome_features AS (
        -- Get all features for genome 562.61239 with RAST annotations
        SELECT
            f.genome_id,
            f.feature_id,
            f.rast
        FROM {namespace}.feature_annotation f
        WHERE f.genome_id = '562.61239'
        AND f.rast IS NOT NULL
    ),
    feature_reactions AS (
        -- Map RAST roles to SEED reactions through term_association
        SELECT DISTINCT
            gf.genome_id,
            gf.feature_id,
            gf.rast,
            ta.object as seed_reaction
        FROM genome_features gf
        INNER JOIN {namespace}.term_association ta
            ON gf.rast = ta.subject
        WHERE ta.object LIKE 'seed.reaction:%'
    ),
    reaction_names AS (
        -- Get reaction names from statements table
        SELECT
            subject as reaction_id,
            value as reaction_name
        FROM {namespace}.statements
        WHERE predicate = 'rdfs:label'
        AND subject LIKE 'seed.reaction:%'
    )
    SELECT
        fr.genome_id,
        fr.feature_id,
        fr.rast,
        fr.seed_reaction,
        rn.reaction_name
    FROM feature_reactions fr
    LEFT JOIN reaction_names rn ON fr.seed_reaction = rn.reaction_id
    ORDER BY fr.genome_id, fr.feature_id
    LIMIT 100
    """

    df = spark.sql(query).toPandas()
    # `display` is assumed to be supplied by the notebook environment.
    display(df.head(20))
    # The original used "\\n", which printed a literal backslash-n, and called
    # this a total even though the query is capped by LIMIT 100 and each row
    # is a feature-reaction pair rather than a distinct feature.
    print(f"\nFeature-reaction rows returned (LIMIT 100): {len(df)}")
    return df

# Execute the query with timing
time_query("Genome Feature to Reaction Mapping", query_genome_reactions)

## 1. Metabolic Pathway Analysis

Find all biochemical reactions associated with E. coli enzymes

In [None]:
# Query 1.1: Find all EC numbers in E. coli genomes and their associated reactions
def query_ec_reactions():
    """List the most widespread EC numbers and their RHEA cross-references.

    EC numbers from ``feature_annotation.bakta_ec`` are counted per genome and
    per feature, then left-joined (after prefixing with ``EC:``) to
    ``statements`` rows whose predicate is ``skos:exactMatch`` or
    ``oio:hasDbXref`` and whose object starts with ``RHEA:``.

    Returns:
        Spark DataFrame with ec_number, genome_count, feature_count,
        reaction_id and predicate (top 20 rows).
    """
    query = f"""
    WITH ec_numbers AS (
        -- GROUP BY already yields one row per EC number, so no DISTINCT needed
        SELECT
            bakta_ec as ec_number,
            COUNT(DISTINCT genome_id) as genome_count,
            COUNT(*) as feature_count
        FROM {namespace}.feature_annotation
        WHERE bakta_ec IS NOT NULL
        GROUP BY bakta_ec
    ),
    ec_reactions AS (
        SELECT
            s.subject as ec_id,
            s.object as reaction_id,
            s.predicate
        FROM {namespace}.statements s
        WHERE s.subject LIKE 'EC:%'
        AND s.predicate IN ('skos:exactMatch', 'oio:hasDbXref')
        AND s.object LIKE 'RHEA:%'
    )
    SELECT
        e.ec_number,
        e.genome_count,
        e.feature_count,
        r.reaction_id,
        r.predicate
    FROM ec_numbers e
    LEFT JOIN ec_reactions r ON CONCAT('EC:', e.ec_number) = r.ec_id
    ORDER BY e.genome_count DESC, e.feature_count DESC
    LIMIT 20
    """

    # DataFrame.show() returns None, so keep the DataFrame in its own variable
    # instead of returning the result of show().
    df = spark.sql(query)
    df.show(truncate=False)
    return df

# Execute the query with timing
time_query("EC Numbers and RHEA Reactions", query_ec_reactions)

In [None]:
# Query 1.2: Find metabolic subsystems through ModelSEED roles
def query_subsystems():
    """List role-to-reaction associations with labels and per-role counts.

    Takes ``term_association`` rows with predicate ``RO:0002327``, attaches
    the ``rdfs:label`` of the role (``seed.role:*``) and of the reaction
    (``seed.reaction:*``) from ``statements``, and adds a window count of
    rows per role.

    Returns:
        Spark DataFrame with role_id, role_name, reaction_id, reaction_name
        and reactions_per_role (top 30 rows by reactions_per_role).
    """
    query = f"""
    WITH seed_roles AS (
        SELECT DISTINCT
            subject as role_id,
            object as reaction_id
        FROM {namespace}.term_association
        WHERE predicate = 'RO:0002327'  -- enables
    ),
    role_names AS (
        SELECT
            subject,
            value as role_name
        FROM {namespace}.statements
        WHERE subject LIKE 'seed.role:%'
        AND predicate = 'rdfs:label'
    ),
    reaction_names AS (
        SELECT
            subject,
            value as reaction_name
        FROM {namespace}.statements
        WHERE subject LIKE 'seed.reaction:%'
        AND predicate = 'rdfs:label'
    )
    SELECT
        r.role_id,
        rn.role_name,
        r.reaction_id,
        rxn.reaction_name,
        COUNT(*) OVER (PARTITION BY r.role_id) as reactions_per_role
    FROM seed_roles r
    LEFT JOIN role_names rn ON r.role_id = rn.subject
    LEFT JOIN reaction_names rxn ON r.reaction_id = rxn.subject
    ORDER BY reactions_per_role DESC, r.role_id
    LIMIT 30
    """

    # DataFrame.show() returns None, so keep the DataFrame in its own variable
    # instead of returning the result of show().
    df = spark.sql(query)
    df.show(truncate=False)
    return df

# Execute the query with timing
time_query("ModelSEED Roles and Reactions", query_subsystems)

## 2. Taxonomic Distribution Analysis

Analyze the distribution of metabolic capabilities across taxa

In [None]:
# Query 2.1: E. coli strain diversity analysis
def query_strain_diversity():
    """Summarise annotation coverage per genome.

    For every (genome_id, genome_taxa) group in ``feature_annotation`` the
    query counts distinct features, EC numbers, GO terms and COG ids, plus the
    number and percentage of features carrying an EC number or GO term, and
    attaches the ``rdfs:label`` of the NCBITaxon id. The 20 genomes with the
    most distinct EC numbers are shown and returned as a Spark DataFrame.

    NOTE(review): the ``unique_cog_categories`` column counts distinct
    ``bakta_cog_id`` values, not ``bakta_cog_category`` -- confirm intent.
    """
    sql = f"""
    WITH strain_features AS (
        SELECT 
            genome_id,
            genome_taxa,
            COUNT(DISTINCT feature_id) as total_features,
            COUNT(DISTINCT bakta_ec) as unique_ec_numbers,
            COUNT(DISTINCT bakta_go) as unique_go_terms,
            COUNT(DISTINCT bakta_cog_id) as unique_cog_categories,
            SUM(CASE WHEN bakta_ec IS NOT NULL THEN 1 ELSE 0 END) as features_with_ec,
            SUM(CASE WHEN bakta_go IS NOT NULL THEN 1 ELSE 0 END) as features_with_go
        FROM {namespace}.feature_annotation
        GROUP BY genome_id, genome_taxa
    ),
    taxa_labels AS (
        SELECT 
            subject,
            value as organism_name
        FROM {namespace}.statements
        WHERE predicate = 'rdfs:label'
        AND subject LIKE 'NCBITaxon:%'
    )
    SELECT 
        s.*,
        t.organism_name,
        ROUND(s.features_with_ec * 100.0 / s.total_features, 2) as pct_with_ec,
        ROUND(s.features_with_go * 100.0 / s.total_features, 2) as pct_with_go
    FROM strain_features s
    LEFT JOIN taxa_labels t ON s.genome_taxa = t.subject
    ORDER BY s.unique_ec_numbers DESC
    LIMIT 20
    """

    strain_stats = spark.sql(sql)
    strain_stats.show(truncate=False)
    return strain_stats

# Execute the query with timing
time_query("E. coli Strain Functional Diversity", query_strain_diversity)

In [None]:
# Query 2.2: Taxonomic hierarchy exploration
def query_taxonomy():
    """Show labelled NCBITaxon ancestors of the taxa found in the genomes.

    Selects ``rdfs:subClassOf`` rows from ``entailed_edge`` whose subject is a
    ``genome_taxa`` value present in ``feature_annotation`` and whose object
    starts with ``NCBITaxon:``, attaches ``rdfs:label`` values for both ends,
    drops rows whose parent has no label, and returns the first 50 rows
    ordered by subject as a Spark DataFrame.

    NOTE(review): entailed_edge presumably holds inferred (transitive) edges,
    so "parent_taxon" may be any ancestor rather than the direct parent.
    """
    sql = f"""
    WITH ecoli_taxa AS (
        SELECT DISTINCT genome_taxa 
        FROM {namespace}.feature_annotation
    ),
    taxonomic_hierarchy AS (
        SELECT 
            e.subject,
            e.object as parent_taxon,
            s1.value as subject_name,
            s2.value as parent_name
        FROM {namespace}.entailed_edge e
        INNER JOIN ecoli_taxa t ON e.subject = t.genome_taxa
        LEFT JOIN {namespace}.statements s1 
            ON e.subject = s1.subject AND s1.predicate = 'rdfs:label'
        LEFT JOIN {namespace}.statements s2 
            ON e.object = s2.subject AND s2.predicate = 'rdfs:label'
        WHERE e.predicate = 'rdfs:subClassOf'
        AND e.object LIKE 'NCBITaxon:%'
    )
    SELECT * FROM taxonomic_hierarchy
    WHERE parent_name IS NOT NULL
    ORDER BY subject
    LIMIT 50
    """

    hierarchy = spark.sql(sql)
    hierarchy.show(truncate=False)
    return hierarchy

# Execute the query with timing
time_query("Taxonomic Hierarchy for E. coli Strains", query_taxonomy)

## 3. Chemical Transformation Networks

Analyze biochemical reactions and compound transformations

In [None]:
# Query 3.1: Find metabolic reactions with their substrates and products
def query_reactions():
    """List labelled RHEA reactions together with their CHEBI participants.

    RHEA subjects with an ``rdfs:label`` are inner-joined to ``entailed_edge``
    rows linking a ``RHEA:*`` subject to a ``CHEBI:*`` object through
    ``RO:0000057`` or ``BFO:0000051``; compound labels are left-joined from
    ``statements``. Shows and returns the first 50 rows, ordered by reaction
    and predicate, as a Spark DataFrame.
    """
    sql = f"""
    WITH rhea_reactions AS (
        SELECT DISTINCT
            subject as reaction_id,
            value as reaction_name
        FROM {namespace}.statements
        WHERE subject LIKE 'RHEA:%'
        AND predicate = 'rdfs:label'
    ),
    reaction_participants AS (
        SELECT 
            e.subject as reaction_id,
            e.object as compound_id,
            e.predicate as role
        FROM {namespace}.entailed_edge e
        WHERE e.subject LIKE 'RHEA:%'
        AND e.object LIKE 'CHEBI:%'
        AND e.predicate IN ('RO:0000057', 'BFO:0000051')  -- has participant, has part
    ),
    compound_names AS (
        SELECT 
            subject as compound_id,
            value as compound_name
        FROM {namespace}.statements
        WHERE subject LIKE 'CHEBI:%'
        AND predicate = 'rdfs:label'
    )
    SELECT 
        r.reaction_id,
        r.reaction_name,
        p.compound_id,
        c.compound_name,
        p.role
    FROM rhea_reactions r
    JOIN reaction_participants p ON r.reaction_id = p.reaction_id
    LEFT JOIN compound_names c ON p.compound_id = c.compound_id
    WHERE r.reaction_name IS NOT NULL
    ORDER BY r.reaction_id, p.role
    LIMIT 50
    """

    participants = spark.sql(sql)
    participants.show(truncate=False)
    return participants

# Execute the query with timing
time_query("RHEA Reactions and Chemical Participants", query_reactions)

In [None]:
# Query 3.2: ModelSEED compound analysis
def query_seed_compounds():
    """List ModelSEED compounds with their CHEBI/KEGG cross-references.

    Labelled ``seed.compound:*`` subjects are left-joined to their
    ``oio:hasDbXref`` / ``skos:exactMatch`` objects that start with ``CHEBI:``
    or ``KEGG:``. ``xref_count`` is the number of such cross-references per
    compound (0 when there are none).

    Returns:
        Spark DataFrame with compound_id, compound_name, external_id and
        xref_count (top 30 rows by xref_count).
    """
    query = f"""
    WITH seed_compounds AS (
        SELECT
            subject as compound_id,
            value as compound_name
        FROM {namespace}.statements
        WHERE subject LIKE 'seed.compound:%'
        AND predicate = 'rdfs:label'
    ),
    compound_xrefs AS (
        SELECT
            subject as compound_id,
            object as external_id
        FROM {namespace}.statements
        WHERE subject LIKE 'seed.compound:%'
        AND predicate IN ('oio:hasDbXref', 'skos:exactMatch')
        AND (object LIKE 'CHEBI:%' OR object LIKE 'KEGG:%')
    )
    SELECT
        c.compound_id,
        c.compound_name,
        x.external_id,
        -- Count the joined column, not rows: after the LEFT JOIN a compound
        -- without any xref still has one row, which COUNT(*) reported as 1.
        COUNT(x.external_id) OVER (PARTITION BY c.compound_id) as xref_count
    FROM seed_compounds c
    LEFT JOIN compound_xrefs x ON c.compound_id = x.compound_id
    ORDER BY xref_count DESC, c.compound_id
    LIMIT 30
    """

    df = spark.sql(query)
    df.show(truncate=False)
    return df

# Execute the query with timing
time_query("ModelSEED Compounds and Cross-References", query_seed_compounds)

## 4. Functional Annotation Analysis

Analyze GO terms and functional categories

In [None]:
# Query 4.1: GO term enrichment across genomes
# SQL ranking GO terms by how many genomes carry them. The percentage uses the
# number of distinct genomes actually present in feature_annotation instead of
# a hard-coded 50, so it stays correct if the table gains or loses genomes.
query_go_enrichment = f"""
WITH go_counts AS (
    SELECT
        bakta_go as go_term,
        COUNT(DISTINCT genome_id) as genome_count,
        COUNT(*) as annotation_count
    FROM {namespace}.feature_annotation
    WHERE bakta_go IS NOT NULL
    GROUP BY bakta_go
),
genome_total AS (
    -- Denominator for pct_genomes: every genome in the table
    SELECT COUNT(DISTINCT genome_id) as total_genomes
    FROM {namespace}.feature_annotation
),
go_labels AS (
    SELECT
        subject as go_id,
        value as go_name
    FROM {namespace}.statements
    WHERE subject LIKE 'GO:%'
    AND predicate = 'rdfs:label'
),
go_definitions AS (
    SELECT
        subject as go_id,
        value as go_definition
    FROM {namespace}.statements
    WHERE subject LIKE 'GO:%'
    AND predicate = 'oio:hasDefinition'
)
SELECT
    g.go_term,
    l.go_name,
    d.go_definition,
    g.genome_count,
    g.annotation_count,
    ROUND(g.genome_count * 100.0 / t.total_genomes, 2) as pct_genomes
FROM go_counts g
CROSS JOIN genome_total t
LEFT JOIN go_labels l ON g.go_term = l.go_id
LEFT JOIN go_definitions d ON g.go_term = d.go_id
ORDER BY g.genome_count DESC, g.annotation_count DESC
LIMIT 25
"""

print("Most common GO terms across E. coli genomes:")
spark.sql(query_go_enrichment).show(truncate=False)

In [None]:
# Query 4.2: EC to GO mapping through shared functions
# SQL counting how often each (bakta_ec, bakta_go) pair occurs on the same
# feature_annotation row, keeping pairs seen more than 5 times and attaching
# the rdfs:label of the EC number (after prefixing it with 'EC:') and of the
# GO term. The 30 most frequent pairs are returned.
query_ec_go_mapping = f"""
WITH ec_go_features AS (
    SELECT 
        bakta_ec as ec_number,
        bakta_go as go_term,
        COUNT(*) as co_occurrence_count
    FROM {namespace}.feature_annotation
    WHERE bakta_ec IS NOT NULL 
    AND bakta_go IS NOT NULL
    GROUP BY bakta_ec, bakta_go
),
ec_labels AS (
    SELECT 
        subject,
        value as ec_name
    FROM {namespace}.statements
    WHERE subject LIKE 'EC:%'
    AND predicate = 'rdfs:label'
),
go_labels AS (
    SELECT 
        subject,
        value as go_name
    FROM {namespace}.statements
    WHERE subject LIKE 'GO:%'
    AND predicate = 'rdfs:label'
)
SELECT 
    f.ec_number,
    e.ec_name,
    f.go_term,
    g.go_name,
    f.co_occurrence_count
FROM ec_go_features f
LEFT JOIN ec_labels e ON CONCAT('EC:', f.ec_number) = e.subject
LEFT JOIN go_labels g ON f.go_term = g.subject
WHERE f.co_occurrence_count > 5
ORDER BY f.co_occurrence_count DESC
LIMIT 30
"""

# Run the query and print the result table; nothing is kept in a variable.
print("EC number to GO term co-occurrence mapping:")
spark.sql(query_ec_go_mapping).show(truncate=False)

## 5. Protein Family and Domain Analysis

In [None]:
# Query 5.1: InterPro domain distribution
def query_interpro():
    """Rank InterPro annotations by how many genomes carry them.

    Groups ``feature_annotation`` by ``bakta_interpro`` (non-null only),
    counting distinct genomes, distinct ``protein_hash`` values and total
    rows, then left-joins the ``rdfs:label`` of subjects starting with
    ``IPR`` and derives occurrences per distinct protein. Shows and returns
    the top 30 rows as a Spark DataFrame.
    """
    sql = f"""
    WITH interpro_counts AS (
        SELECT 
            bakta_interpro as interpro_id,
            COUNT(DISTINCT genome_id) as genome_count,
            COUNT(DISTINCT protein_hash) as unique_proteins,
            COUNT(*) as total_occurrences
        FROM {namespace}.feature_annotation
        WHERE bakta_interpro IS NOT NULL
        GROUP BY bakta_interpro
    ),
    interpro_info AS (
        SELECT 
            subject,
            value as interpro_name
        FROM {namespace}.statements
        WHERE subject LIKE 'IPR%'
        AND predicate = 'rdfs:label'
    )
    SELECT 
        i.interpro_id,
        info.interpro_name,
        i.genome_count,
        i.unique_proteins,
        i.total_occurrences,
        ROUND(i.total_occurrences * 1.0 / i.unique_proteins, 2) as avg_copies_per_protein
    FROM interpro_counts i
    LEFT JOIN interpro_info info ON i.interpro_id = info.subject
    ORDER BY i.genome_count DESC, i.total_occurrences DESC
    LIMIT 30
    """

    domain_stats = spark.sql(sql)
    domain_stats.show(truncate=False)
    return domain_stats

# Execute the query with timing
time_query("InterPro Domain Distribution", query_interpro)

In [None]:
# Query 5.2: UniRef cluster analysis
def query_uniref():
    """Find UniRef clusters present in every genome of the dataset.

    Groups ``feature_annotation`` by ``bakta_uniref`` and keeps clusters whose
    distinct-genome count equals the number of distinct genomes in the table
    (computed, not hard-coded). One example gene name and product per cluster
    are picked with ``MIN``.

    Returns:
        Spark DataFrame with uniref_cluster, genome_count, unique_sequences,
        total_genes, example_gene_name and example_product (top 20 rows by
        total_genes).
    """
    query = f"""
    WITH genome_total AS (
        -- Number of genomes a cluster must appear in to count as "core"
        SELECT COUNT(DISTINCT genome_id) as total_genomes
        FROM {namespace}.feature_annotation
    ),
    cluster_stats AS (
        SELECT
            bakta_uniref as uniref_cluster,
            COUNT(DISTINCT genome_id) as genome_count,
            COUNT(DISTINCT protein_hash) as unique_sequences,
            COUNT(*) as total_genes,
            -- MIN gives a deterministic example and yields NULL when every
            -- value is NULL; COLLECT_SET(...)[0] picked an arbitrary element
            -- and indexed into a possibly empty array.
            MIN(bakta_gene) as example_gene_name,
            MIN(bakta_product) as example_product
        FROM {namespace}.feature_annotation
        WHERE bakta_uniref IS NOT NULL
        GROUP BY bakta_uniref
    )
    SELECT
        c.uniref_cluster,
        c.genome_count,
        c.unique_sequences,
        c.total_genes,
        c.example_gene_name,
        c.example_product
    FROM cluster_stats c
    CROSS JOIN genome_total t
    WHERE c.genome_count = t.total_genomes  -- Core genes present in all genomes
    ORDER BY c.total_genes DESC
    LIMIT 20
    """

    df = spark.sql(query)
    df.show(truncate=False)
    return df

# Execute the query with timing
time_query("Core Genes (UniRef Clusters)", query_uniref)

## 6. Complex Network Queries

Advanced queries combining multiple data sources

In [None]:
# Query 6.1: Complete metabolic pathway reconstruction
def query_pathway():
    """Trace genome EC numbers to reactions involving glucose or pyruvate.

    EC numbers from ``feature_annotation`` are linked through
    ``skos:exactMatch`` statements and ``RO:0002327`` term associations to
    reaction ids; those are joined to ``seed.reaction:*`` ->
    ``seed.compound:*`` ``RO:0000057`` edges with compound labels. Only
    compounds whose label contains "glucose" or "pyruvate" (case-insensitive)
    are kept.

    Returns:
        Spark DataFrame with genome_id, ec_number, reaction_id, compound_id
        and compound_name (first 50 rows by genome and reaction).
    """
    query = f"""
    WITH enzyme_reactions AS (
        -- Get EC numbers from genomes and their reactions
        SELECT DISTINCT
            f.bakta_ec as ec_number,
            f.genome_id,
            ta.object as reaction_id
        FROM {namespace}.feature_annotation f
        JOIN {namespace}.statements s
            ON CONCAT('EC:', f.bakta_ec) = s.subject
        JOIN {namespace}.term_association ta
            ON s.object = ta.subject
        WHERE f.bakta_ec IS NOT NULL
        AND s.predicate = 'skos:exactMatch'
        AND ta.predicate = 'RO:0002327'
    ),
    reaction_compounds AS (
        -- Get compounds involved in reactions
        SELECT
            e.subject as reaction_id,
            e.object as compound_id,
            s.value as compound_name
        FROM {namespace}.entailed_edge e
        JOIN {namespace}.statements s
            ON e.object = s.subject AND s.predicate = 'rdfs:label'
        WHERE e.subject LIKE 'seed.reaction:%'
        AND e.object LIKE 'seed.compound:%'
        AND e.predicate = 'RO:0000057'  -- has participant
    )
    SELECT
        er.genome_id,
        er.ec_number,
        er.reaction_id,
        rc.compound_id,
        rc.compound_name
    FROM enzyme_reactions er
    JOIN reaction_compounds rc ON er.reaction_id = rc.reaction_id
    -- LIKE is case-sensitive in Spark SQL, so compare against the lower-cased
    -- label; otherwise capitalised names such as "D-Glucose" are missed.
    WHERE (LOWER(rc.compound_name) LIKE '%glucose%'
           OR LOWER(rc.compound_name) LIKE '%pyruvate%')
    ORDER BY er.genome_id, er.reaction_id
    LIMIT 50
    """

    df = spark.sql(query)
    df.show(truncate=False)
    return df

# Execute the query with timing
time_query("Metabolic Pathway Reconstruction - Glucose/Pyruvate", query_pathway)

In [None]:
# Query 6.2: Functional comparison between strains
def query_strain_comparison():
    """Compare genomes pairwise by shared EC numbers and GO terms.

    Collects the set of EC numbers and GO terms per (genome_id, genome_taxa),
    pairs every two distinct genomes once (``genome1 < genome2``) and reports
    the overlap plus a similarity percentage computed as
    ``2 * shared / (size1 + size2) * 100``.

    Returns:
        Spark DataFrame with genome1, genome2, shared_ec, ec_similarity_pct,
        shared_go and go_similarity_pct (top 20 rows by ec_similarity_pct).
        A similarity is NULL when both genomes have no annotations of that
        kind.
    """
    query = f"""
    WITH genome_functions AS (
        -- The original also collected bakta_cog_category into a set that no
        -- later step read; it is dropped to avoid the wasted aggregation.
        SELECT
            genome_id,
            genome_taxa,
            COLLECT_SET(bakta_ec) as ec_set,
            COLLECT_SET(bakta_go) as go_set
        FROM {namespace}.feature_annotation
        GROUP BY genome_id, genome_taxa
    ),
    genome_pairs AS (
        SELECT
            g1.genome_id as genome1,
            g2.genome_id as genome2,
            SIZE(array_intersect(g1.ec_set, g2.ec_set)) as shared_ec,
            SIZE(g1.ec_set) as ec_count1,
            SIZE(g2.ec_set) as ec_count2,
            SIZE(array_intersect(g1.go_set, g2.go_set)) as shared_go,
            SIZE(g1.go_set) as go_count1,
            SIZE(g2.go_set) as go_count2
        FROM genome_functions g1
        CROSS JOIN genome_functions g2
        WHERE g1.genome_id < g2.genome_id
    )
    SELECT
        genome1,
        genome2,
        shared_ec,
        -- NULLIF keeps a pair with two empty sets from dividing by zero
        ROUND(shared_ec * 200.0 / NULLIF(ec_count1 + ec_count2, 0), 2) as ec_similarity_pct,
        shared_go,
        ROUND(shared_go * 200.0 / NULLIF(go_count1 + go_count2, 0), 2) as go_similarity_pct
    FROM genome_pairs
    ORDER BY ec_similarity_pct DESC
    LIMIT 20
    """

    df = spark.sql(query)
    df.show(truncate=False)
    return df

# Execute the query with timing
time_query("Functional Similarity Between E. coli Strains", query_strain_comparison)

## 7. Data Export Queries

Queries to export specific datasets for further analysis

In [None]:
# Export query 1: Create a gene-to-function mapping table
def export_gene_functions():
    """Build the gene-to-function export table without writing it out.

    Selects identifier and annotation columns from ``feature_annotation`` for
    rows that have an EC number or a GO term, ordered by genome and feature.
    Prints the row count (which runs the query) and returns the Spark
    DataFrame; the CSV write is left commented out.
    """
    sql = f"""
    SELECT 
        feature_id,
        genome_id,
        bakta_gene as gene_name,
        bakta_product as product,
        bakta_ec as ec_number,
        bakta_go as go_term,
        bakta_cog_id as cog_id,
        bakta_cog_category as cog_category,
        bakta_interpro as interpro_domain,
        bakta_uniref as uniref_cluster,
        protein_hash
    FROM {namespace}.feature_annotation
    WHERE bakta_ec IS NOT NULL OR bakta_go IS NOT NULL
    ORDER BY genome_id, feature_id
    """

    gene_functions = spark.sql(sql)
    # Uncomment to save:
    # gene_functions.coalesce(1).write.mode('overwrite').option('header', 'true').csv('gene_functions_export')
    print(f"Export query would return {gene_functions.count()} rows")
    return gene_functions

# Execute the query with timing
time_query("Gene Function Export Query", export_gene_functions)

In [None]:
# Export query 2: Create reaction network for visualization
def export_reaction_network():
    """Build reaction-reaction edges linked by a shared participant.

    Two ``seed.reaction:*`` subjects are connected when both have an
    ``RO:0000057`` edge in ``entailed_edge`` to the same object. Each output
    row names one shared compound of a reaction pair, its label, and
    ``shared_compounds``: the number of distinct compounds that pair shares.

    Returns:
        Spark DataFrame with reaction1, reaction2, compound, compound_name
        and shared_compounds (top 1000 rows by shared_compounds). The row
        count is printed, which runs the query.
    """
    query = f"""
    WITH reaction_edges AS (
        SELECT DISTINCT
            r1.object as compound,
            r1.subject as reaction1,
            r2.subject as reaction2
        FROM {namespace}.entailed_edge r1
        JOIN {namespace}.entailed_edge r2
            ON r1.object = r2.object
        WHERE r1.subject LIKE 'seed.reaction:%'
        AND r2.subject LIKE 'seed.reaction:%'
        AND r1.subject < r2.subject
        AND r1.predicate = 'RO:0000057'
        AND r2.predicate = 'RO:0000057'
    ),
    pair_counts AS (
        -- Count per reaction pair. The original grouped by the compound as
        -- well, so its COUNT(*) was 1 for every row.
        SELECT
            reaction1,
            reaction2,
            COUNT(DISTINCT compound) as shared_compounds
        FROM reaction_edges
        GROUP BY reaction1, reaction2
    ),
    compound_names AS (
        SELECT subject, value as name
        FROM {namespace}.statements
        WHERE predicate = 'rdfs:label'
        AND subject LIKE 'seed.compound:%'
    )
    SELECT DISTINCT
        e.reaction1,
        e.reaction2,
        e.compound,
        c.name as compound_name,
        p.shared_compounds
    FROM reaction_edges e
    JOIN pair_counts p
        ON e.reaction1 = p.reaction1 AND e.reaction2 = p.reaction2
    LEFT JOIN compound_names c ON e.compound = c.subject
    -- Tie-breakers make the LIMIT cut-off reproducible between runs
    ORDER BY shared_compounds DESC, reaction1, reaction2, compound
    LIMIT 1000
    """

    df = spark.sql(query)
    print(f"Reaction network export would contain {df.count()} edges")
    return df

# Execute the query with timing
time_query("Reaction Network Export Query", export_reaction_network)

## Summary Statistics

In [None]:
# Generate summary statistics
def query_summary_stats():
    """Print headline counts for the dataset as (metric, value) rows.

    Stacks eight single-row aggregates with UNION ALL: genome, feature, EC and
    GO counts from ``feature_annotation`` plus row counts of ``statements``
    and ``entailed_edge``. Shows the rows ordered by metric name and returns
    the Spark DataFrame.
    """
    sql = f"""
    WITH stats AS (
        SELECT 
            'Total genomes' as metric,
            COUNT(DISTINCT genome_id) as value
        FROM {namespace}.feature_annotation
        UNION ALL
        SELECT 
            'Total features' as metric,
            COUNT(*) as value
        FROM {namespace}.feature_annotation
        UNION ALL
        SELECT 
            'Features with EC numbers' as metric,
            COUNT(*) as value
        FROM {namespace}.feature_annotation
        WHERE bakta_ec IS NOT NULL
        UNION ALL
        SELECT 
            'Features with GO terms' as metric,
            COUNT(*) as value
        FROM {namespace}.feature_annotation
        WHERE bakta_go IS NOT NULL
        UNION ALL
        SELECT 
            'Unique EC numbers' as metric,
            COUNT(DISTINCT bakta_ec) as value
        FROM {namespace}.feature_annotation
        UNION ALL
        SELECT 
            'Unique GO terms' as metric,
            COUNT(DISTINCT bakta_go) as value
        FROM {namespace}.feature_annotation
        UNION ALL
        SELECT 
            'Total statements' as metric,
            COUNT(*) as value
        FROM {namespace}.statements
        UNION ALL
        SELECT 
            'Total entailed edges' as metric,
            COUNT(*) as value
        FROM {namespace}.entailed_edge
    )
    SELECT * FROM stats
    ORDER BY metric
    """

    summary = spark.sql(sql)
    summary.show(truncate=False)
    return summary

# Execute the query with timing
time_query("Dataset Summary Statistics", query_summary_stats)