# Interactive Data Search Tool (Remote Version)

This notebook provides an interactive way to search through the CDM ontology data tables using Spark.

**Note**: This version is designed to work on the remote JupyterHub with Spark tables.

In [1]:
from spark.utils import get_spark_session
from IPython.display import display, HTML
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Create Spark session
spark = get_spark_session()
namespace = 'ontology_data'

print("Spark session created successfully")

25/07/08 15:52:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/08 15:52:48 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/07/08 15:52:48 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.
25/07/08 15:52:49 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/07/08 15:52:51 WARN S3ABlockOutputStream: Application invoked the Syncable API against stream writing to spark-job-logs/jplfaria/app-20250708155249-0013.inprogress. This is unsupported
25/07/08 15:52:51 WARN Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update your configs.


Spark session created successfully


## Check Available Tables

In [2]:
# List all tables in the namespace
tables_df = spark.sql(f"SHOW TABLES IN {namespace}")
tables = [row.tableName for row in tables_df.collect()]

print(f"Available tables in {namespace}:")
for table in tables:
    count = spark.sql(f"SELECT COUNT(*) as cnt FROM {namespace}.{table}").collect()[0]['cnt']
    print(f"  - {table}: {count:,} rows")

Available tables in ontology_data:


                                                                                

  - entailed_edge: 117,545,336 rows


                                                                                

  - feature_annotation: 236,843 rows


                                                                                

  - prefix: 1,221 rows


                                                                                

  - statements: 42,373,349 rows




  - term_association: 3,271 rows


                                                                                

## Interactive Search Functions

These functions allow searching through Spark tables efficiently:

In [3]:
def search_table(table_name, search_term, column=None, limit=100):
    """Search for a term in a specific table"""
    
    if column:
        # Search specific column
        query = f"""
        SELECT * 
        FROM {namespace}.{table_name}
        WHERE LOWER({column}) LIKE LOWER('%{search_term}%')
        LIMIT {limit}
        """
    else:
        # Get columns for this table
        cols = spark.sql(f"SELECT * FROM {namespace}.{table_name} LIMIT 1").columns
        
        # Build WHERE clause for all string columns
        where_clauses = []
        for col in cols:
            where_clauses.append(f"LOWER(CAST({col} AS STRING)) LIKE LOWER('%{search_term}%')")
        
        where_condition = " OR ".join(where_clauses)
        
        query = f"""
        SELECT * 
        FROM {namespace}.{table_name}
        WHERE {where_condition}
        LIMIT {limit}
        """
    
    try:
        results_df = spark.sql(query)
        pandas_df = results_df.toPandas()
        return pandas_df
    except Exception as e:
        print(f"Error searching {table_name}: {e}")
        return pd.DataFrame()

def search_all_tables(search_term, limit_per_table=50):
    """Search across all tables"""
    all_results = {}
    
    for table in tables:
        if table == 'prefix':  # Skip prefix table as requested
            continue
            
        print(f"Searching {table}...")
        results = search_table(table, search_term, limit=limit_per_table)
        
        if not results.empty:
            all_results[table] = results
            print(f"  Found {len(results)} matches")
    
    return all_results

## Example Searches

In [4]:
# Search for a specific term
search_term = "glucose"  # Change this to search for different terms

print(f"Searching for '{search_term}' across all tables...\n")
results = search_all_tables(search_term, limit_per_table=20)

# Display results
for table, df in results.items():
    print(f"\n{'='*60}")
    print(f"Results from {table}: {len(df)} matches")
    print(f"{'='*60}")
    display(df.head(5))

Searching for 'glucose' across all tables...

Searching entailed_edge...


                                                                                

  Found 20 matches
Searching feature_annotation...


                                                                                

  Found 20 matches
Searching statements...


                                                                                

  Found 20 matches
Searching term_association...


                                                                                

  Found 20 matches

Results from entailed_edge: 20 matches


Unnamed: 0,subject,predicate,object
0,https://metacyc.org/reaction?orgid=META&id=GLU...,rdfs:subClassOf,BFO:0000001
1,metacyc.pathway:GLUCOSE1PMETAB-PWY,rdfs:subClassOf,BFO:0000001
2,https://metacyc.org/reaction?orgid=META&id=DTD...,rdfs:subClassOf,BFO:0000001
3,https://metacyc.org/reaction?orgid=META&id=GLU...,rdfs:subClassOf,BFO:0000001
4,https://metacyc.org/reaction?orgid=META&id=KET...,rdfs:subClassOf,BFO:0000001



Results from feature_annotation: 20 matches


Unnamed: 0,feature_id,genome_id,genome_ref,genome_taxa,protein_hash,protein_seq,rast,bakta_ec,bakta_gene,bakta_product,bakta_go,bakta_cog,bakta_refseq,bakta_uniparc,bakta_uniref
0,562.61239_169,562.61239,219790/2/1,cellular organisms:Bacteria:Proteobacteria:Gam...,af7fb5a02e572b9a92072d5de59ab17ec8e5c6fba188c5...,MGLFDKLKSLVSDDKKDTGTIEIIAPLSGEIVNIEDVPDVVFAEKI...,"PTS system, glucose-specific IIA component (EC...",2.7.1.199,crr,PTS glucose transporter subunit IIA,GO:0009401,,,,"UniRef50_P45338,UniRef90_A0A8S0FP79"
1,562.61239_211,562.61239,219790/2/1,cellular organisms:Bacteria:Proteobacteria:Gam...,8b1bc56d82bf5d6840a8ac7ccad63fe2d73be0370d3a73...,MIKKIFALPVIEQISPVLSRRKLDELDLIVVDHPQVKASFALQGAH...,Aldose 1-epimerase family protein YeaD,,,Putative glucose-6-phosphate 1-epimerase,,,,,UniRef50_P39173
2,562.61239_264,562.61239,219790/2/1,cellular organisms:Bacteria:Proteobacteria:Gam...,e961e899c59a2a9dd76e1dfe0764f96ac01ed664fbd90b...,MSTPRQILAAIFDMDGLLIDSEPLWDRAELDVMASLGVDISRRNEL...,2-deoxyglucose-6-phosphate hydrolase (EC 3.1.3...,"3.1.3.22,3.1.3.50,3.1.3.68",hxpB,hexitol phosphatase HxpB,,,,,"UniRef50_A0A264VSC4,UniRef90_A0A797HCN7"
3,562.61239_376,562.61239,219790/2/1,cellular organisms:Bacteria:Proteobacteria:Gam...,9851040991cda5e91c5821a837896b04a3b6b621863574...,MTAKTAPKVTLWEFFQQLGKTFMLPVALLSFCGIMLGIGSSLSSHD...,"PTS system, maltose and glucose-specific IIC c...",,malX,PTS maltose transporter subunit IICB,"GO:0005363,GO:0005886,GO:0008982,GO:0009401,GO...","COG1264,G",,,"UniRef50_P19642,UniRef90_P19642"
4,562.61239_833,562.61239,219790/2/1,cellular organisms:Bacteria:Proteobacteria:Gam...,187e2488edebc628f3b8b8e985caa851d01f9afdbb0141...,MQVLHVCSEMFPLLKTGGLADVIGALPAAQIADGVDARVLLPAFPD...,"Glycogen synthase, ADP-glucose transglucosylas...",2.4.1.21,glgA,glycogen synthase GlgA,"GO:0004373,GO:0005829,GO:0005978,GO:0009011","COG0297,G",,,"UniRef50_P0A6U8,UniRef90_P0A6U8"



Results from statements: 20 matches


Unnamed: 0,stanza,subject,predicate,object,value,datatype,language,graph
0,EC:3.1.1.33,EC:3.1.1.33,rdfs:label,,6-acetylglucose deacetylase,,,
1,EC:3.1.3.10,EC:3.1.3.10,rdfs:label,,glucose-1-phosphatase,,,
2,EC:3.1.3.68,EC:3.1.3.68,rdfs:label,,2-deoxyglucose-6-phosphatase,,,
3,EC:3.1.3.68,EC:3.1.3.68,oio:hasExactSynonym,,2-deoxyglucose-6-phosphate phosphatase,,,
4,EC:3.1.3.9,EC:3.1.3.9,rdfs:label,,glucose-6-phosphatase,,,



Results from term_association: 20 matches


Unnamed: 0,id,subject,predicate,object,evidence_type,publication,source
0,seed.role:0000000008648,UDP-glucose:(glucosyl)lipopolysaccharide alpha...,RO:0002327,seed.reaction:rxn08620,ECO:0000501,ModelSEED_template,ModelSEED:GramNegModelTemplateV6
1,seed.role:0000000007531,"Similar to CDP-glucose 4,6-dehydratase (EC 4.2...",RO:0002327,seed.reaction:rxn01750,ECO:0000501,ModelSEED_template,ModelSEED:GramNegModelTemplateV6
2,seed.role:0000000003125,Glucose-1-phosphate cytidylyltransferase (EC 2...,RO:0002327,seed.reaction:rxn00702,ECO:0000501,ModelSEED_template,ModelSEED:GramNegModelTemplateV6
3,seed.role:0000000003129,Glucose-6-phosphate isomerase (EC 5.3.1.9),RO:0002327,seed.reaction:rxn00558,ECO:0000501,ModelSEED_template,ModelSEED:GramNegModelTemplateV6
4,seed.role:0000000003131,"Glucose-6-phosphate isomerase, archaeal II (EC...",RO:0002327,seed.reaction:rxn00558,ECO:0000501,ModelSEED_template,ModelSEED:GramNegModelTemplateV6


## Specialized Search Functions

In [5]:
def find_seed_compounds(compound_pattern=None, limit=100):
    """Find SEED compounds with their details"""
    
    if compound_pattern:
        pattern_filter = f"AND s1.subject LIKE '%{compound_pattern}%'"
    else:
        pattern_filter = ""
    
    query = f"""
    SELECT DISTINCT
        s1.subject as compound_id,
        s1.value as compound_name,
        s2.object as cross_reference,
        s2.predicate as ref_type
    FROM {namespace}.statements s1
    LEFT JOIN {namespace}.statements s2 
        ON s1.subject = s2.subject 
        AND s2.predicate IN ('oio:hasDbXref', 'skos:exactMatch')
    WHERE s1.subject LIKE 'seed.compound:%'
    AND s1.predicate = 'rdfs:label'
    {pattern_filter}
    LIMIT {limit}
    """
    
    return spark.sql(query).toPandas()

# Example: Find glucose-related compounds
glucose_compounds = find_seed_compounds('glucose')
print(f"Found {len(glucose_compounds)} SEED compounds related to glucose:")
display(glucose_compounds.head(10))



Found 0 SEED compounds related to glucose:


Unnamed: 0,compound_id,compound_name,cross_reference,ref_type


In [6]:
def find_ec_numbers_with_details(ec_pattern=None, limit=100):
    """Find EC numbers with their names and associated reactions"""
    
    ec_filter = f"WHERE f.bakta_ec LIKE '{ec_pattern}%'" if ec_pattern else ""
    
    query = f"""
    WITH ec_in_genomes AS (
        SELECT DISTINCT
            bakta_ec as ec_number,
            COUNT(DISTINCT genome_id) as genome_count,
            COUNT(*) as feature_count,
            FIRST(bakta_product) as example_product
        FROM {namespace}.feature_annotation
        WHERE bakta_ec IS NOT NULL
        GROUP BY bakta_ec
    ),
    ec_details AS (
        SELECT 
            subject as ec_id,
            value as ec_name
        FROM {namespace}.statements
        WHERE predicate = 'rdfs:label'
        AND subject LIKE 'EC:%'
    )
    SELECT 
        e.ec_number,
        d.ec_name,
        e.genome_count,
        e.feature_count,
        e.example_product
    FROM ec_in_genomes e
    LEFT JOIN ec_details d ON CONCAT('EC:', e.ec_number) = d.ec_id
    {ec_filter.replace('f.', 'e.')}
    ORDER BY e.genome_count DESC, e.feature_count DESC
    LIMIT {limit}
    """
    
    return spark.sql(query).toPandas()

# Example: Find all transferases (EC 2.*)
transferases = find_ec_numbers_with_details('2')
print(f"Found {len(transferases)} transferase EC numbers:")
display(transferases.head(10))



AnalysisException: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `e`.`bakta_ec` cannot be resolved. Did you mean one of the following? [`d`.`ec_id`, `d`.`ec_name`, `e`.`ec_number`, `e`.`feature_count`, `e`.`genome_count`].; line 28 pos 10;
'WithCTE
:- CTERelationDef 0, false
:  +- SubqueryAlias ec_in_genomes
:     +- Distinct
:        +- Aggregate [bakta_ec#2484], [bakta_ec#2484 AS ec_number#2470, count(distinct genome_id#2478) AS genome_count#2471L, count(1) AS feature_count#2472L, first(bakta_product#2486, false) AS example_product#2474]
:           +- Filter isnotnull(bakta_ec#2484)
:              +- SubqueryAlias spark_catalog.ontology_data.feature_annotation
:                 +- Relation spark_catalog.ontology_data.feature_annotation[feature_id#2477,genome_id#2478,genome_ref#2479,genome_taxa#2480,protein_hash#2481,protein_seq#2482,rast#2483,bakta_ec#2484,bakta_gene#2485,bakta_product#2486,bakta_go#2487,bakta_cog#2488,bakta_refseq#2489,bakta_uniparc#2490,bakta_uniref#2491] parquet
:- CTERelationDef 1, false
:  +- SubqueryAlias ec_details
:     +- Project [subject#2493 AS ec_id#2475, value#2496 AS ec_name#2476]
:        +- Filter ((predicate#2494 = rdfs:label) AND subject#2493 LIKE EC:%)
:           +- SubqueryAlias spark_catalog.ontology_data.statements
:              +- Relation spark_catalog.ontology_data.statements[stanza#2492,subject#2493,predicate#2494,object#2495,value#2496,datatype#2497,language#2498,graph#2499] parquet
+- 'GlobalLimit 100
   +- 'LocalLimit 100
      +- 'Sort ['e.genome_count DESC NULLS LAST, 'e.feature_count DESC NULLS LAST], true
         +- 'Project ['e.ec_number, 'd.ec_name, 'e.genome_count, 'e.feature_count, 'e.example_product]
            +- 'Filter 'e.bakta_ec LIKE 2%
               +- Join LeftOuter, (concat(EC:, ec_number#2470) = ec_id#2475)
                  :- SubqueryAlias e
                  :  +- SubqueryAlias ec_in_genomes
                  :     +- CTERelationRef 0, true, [ec_number#2470, genome_count#2471L, feature_count#2472L, example_product#2474], false
                  +- SubqueryAlias d
                     +- SubqueryAlias ec_details
                        +- CTERelationRef 1, true, [ec_id#2475, ec_name#2476], false


In [None]:
def find_go_terms(go_pattern=None, limit=100):
    """Find GO terms with their usage in genomes"""
    
    go_filter = f"WHERE f.bakta_go LIKE '%{go_pattern}%'" if go_pattern else ""
    
    query = f"""
    WITH go_usage AS (
        SELECT 
            bakta_go as go_term,
            COUNT(DISTINCT genome_id) as genome_count,
            COUNT(*) as annotation_count,
            COLLECT_SET(bakta_product)[0] as example_product
        FROM {namespace}.feature_annotation f
        WHERE bakta_go IS NOT NULL
        {go_filter}
        GROUP BY bakta_go
    ),
    go_details AS (
        SELECT 
            subject as go_id,
            value as go_name
        FROM {namespace}.statements
        WHERE predicate = 'rdfs:label'
        AND subject LIKE 'GO:%'
    )
    SELECT 
        g.go_term,
        d.go_name,
        g.genome_count,
        g.annotation_count,
        g.example_product,
        ROUND(g.genome_count * 100.0 / 50, 2) as pct_genomes
    FROM go_usage g
    LEFT JOIN go_details d ON g.go_term = d.go_id
    ORDER BY g.genome_count DESC, g.annotation_count DESC
    LIMIT {limit}
    """
    
    return spark.sql(query).toPandas()

# Example: Find binding-related GO terms
binding_terms = find_go_terms('binding')
print(f"Found {len(binding_terms)} GO terms related to binding:")
display(binding_terms.head(10))

## Export Search Results

In [None]:
def export_search_to_csv(search_term, output_path='search_results'):
    """Export search results to CSV files"""
    
    results = search_all_tables(search_term, limit_per_table=1000)
    
    for table, df in results.items():
        filename = f"{output_path}_{table}_{search_term.replace(' ', '_')}.csv"
        df.to_csv(filename, index=False)
        print(f"Exported {len(df)} results from {table} to {filename}")

# Example: Export all glucose-related entries
# export_search_to_csv('glucose', 'glucose_search')

## Complex Search: Find Metabolic Pathways

In [None]:
def find_metabolic_pathway(compound_name, limit=50):
    """Find reactions and enzymes related to a specific compound"""
    
    query = f"""
    WITH compound_reactions AS (
        -- Find reactions involving the compound
        SELECT DISTINCT
            e.subject as reaction_id,
            e.object as compound_id,
            s.value as compound_name
        FROM {namespace}.entailed_edge e
        JOIN {namespace}.statements s
            ON e.object = s.subject 
            AND s.predicate = 'rdfs:label'
        WHERE e.predicate = 'RO:0000057'  -- has participant
        AND e.subject LIKE 'seed.reaction:%'
        AND LOWER(s.value) LIKE LOWER('%{compound_name}%')
    ),
    reaction_enzymes AS (
        -- Find enzymes that catalyze these reactions
        SELECT 
            ta.object as reaction_id,
            ta.subject as role_id,
            s.value as role_name
        FROM {namespace}.term_association ta
        JOIN {namespace}.statements s
            ON ta.subject = s.subject
            AND s.predicate = 'rdfs:label'
        WHERE ta.predicate = 'RO:0002327'  -- enables
    )
    SELECT DISTINCT
        cr.reaction_id,
        cr.compound_name,
        re.role_name as enzyme_role,
        rs.value as reaction_name
    FROM compound_reactions cr
    LEFT JOIN reaction_enzymes re ON cr.reaction_id = re.reaction_id
    LEFT JOIN {namespace}.statements rs 
        ON cr.reaction_id = rs.subject 
        AND rs.predicate = 'rdfs:label'
    LIMIT {limit}
    """
    
    return spark.sql(query).toPandas()

# Example: Find pathways involving pyruvate
pyruvate_pathways = find_metabolic_pathway('pyruvate', limit=30)
print(f"Found {len(pyruvate_pathways)} reactions involving pyruvate:")
display(pyruvate_pathways)

## Summary Statistics

In [None]:
# Get summary statistics for all tables
def get_table_statistics():
    """Get detailed statistics for each table"""
    
    for table in tables:
        print(f"\n{'='*60}")
        print(f"Table: {namespace}.{table}")
        print(f"{'='*60}")
        
        # Get row count
        count = spark.sql(f"SELECT COUNT(*) as cnt FROM {namespace}.{table}").collect()[0]['cnt']
        print(f"Total rows: {count:,}")
        
        # Get schema
        schema = spark.sql(f"SELECT * FROM {namespace}.{table} LIMIT 1").schema
        print(f"\nColumns ({len(schema.fields)}):")
        for field in schema.fields:
            print(f"  - {field.name}: {field.dataType}")
        
        # Show sample
        print(f"\nSample data:")
        sample_df = spark.sql(f"SELECT * FROM {namespace}.{table} LIMIT 3").toPandas()
        display(sample_df)

# Run statistics
get_table_statistics()