# Interactive Data Search Tool (Remote Version)

This notebook provides an interactive way to search through the CDM ontology data tables using Spark.

**Note**: This version is designed to work on the remote JupyterHub with Spark tables.

In [None]:
from spark.utils import get_spark_session
from IPython.display import display, HTML
import pandas as pd
import warnings
# Silence every Python warning for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so deprecation and data-quality
# warnings raised by later cells are hidden as well.
warnings.filterwarnings('ignore')

# Create the Spark session that every query in this notebook runs against
spark = get_spark_session()
# Namespace (database) used to qualify all table names in the cells below
namespace = 'ontology_data'

print("Spark session created successfully")

## Check Available Tables

In [None]:
# Ask Spark which tables exist in the namespace and keep only their names.
# `tables` is reused by the search and statistics cells further down.
tables_df = spark.sql(f"SHOW TABLES IN {namespace}")
tables = [entry.tableName for entry in tables_df.collect()]

# Report one row count per table (each count is a separate Spark query)
print(f"Available tables in {namespace}:")
for table in tables:
    count_rows = spark.sql(f"SELECT COUNT(*) as cnt FROM {namespace}.{table}").collect()
    count = count_rows[0]['cnt']
    print(f"  - {table}: {count:,} rows")

## Interactive Search Functions

These functions search the Spark tables with case-insensitive SQL `LIKE` filters and return the matching rows as pandas DataFrames, capped by a row limit:

In [None]:
def search_table(table_name, search_term, column=None, limit=100):
    """Search one table for rows containing a term (case-insensitive).

    Builds a Spark SQL ``LIKE '%term%'`` filter and returns the matching rows
    as a pandas DataFrame.

    Args:
        table_name: Name of a table inside the module-level ``namespace``.
        search_term: Text to look for. It is embedded in a LIKE pattern, so
            ``%`` and ``_`` typed by the caller still act as wildcards.
        column: Optional column (or SQL expression) to restrict the search
            to; it is interpolated into the query exactly as given. When
            omitted, every column of the table is cast to STRING and searched.
        limit: Maximum number of rows to return.

    Returns:
        pandas.DataFrame with the matching rows, or an empty DataFrame when
        any Spark call fails (the error is printed rather than raised).
    """
    # Escape backslashes and single quotes so terms such as "5'-phosphate"
    # stay inside the SQL string literal instead of terminating it.
    # Assumes Spark's default backslash-escaped string literals.
    safe_term = str(search_term).replace("\\", "\\\\").replace("'", "\\'")
    like_clause = f"LIKE LOWER('%{safe_term}%')"

    try:
        if column:
            # Search a single, caller-chosen column
            where_condition = f"LOWER({column}) {like_clause}"
        else:
            # Discover the table's columns; done inside the try block so an
            # unknown table yields the same printed error + empty frame as
            # any other query failure.
            cols = spark.sql(f"SELECT * FROM {namespace}.{table_name} LIMIT 1").columns

            # Backtick-quote schema names so reserved words or names with
            # spaces/punctuation do not break the generated SQL.
            quoted = ["`" + name.replace("`", "``") + "`" for name in cols]
            where_condition = " OR ".join(
                f"LOWER(CAST({q} AS STRING)) {like_clause}" for q in quoted
            )

        query = f"""
        SELECT *
        FROM {namespace}.{table_name}
        WHERE {where_condition}
        LIMIT {limit}
        """

        return spark.sql(query).toPandas()
    except Exception as e:
        print(f"Error searching {table_name}: {e}")
        return pd.DataFrame()

def search_all_tables(search_term, limit_per_table=50):
    """Run ``search_table`` over every known table and collect the hits.

    Iterates the module-level ``tables`` list (skipping the ``prefix``
    table), printing progress as it goes.

    Args:
        search_term: Text handed to ``search_table`` for each table.
        limit_per_table: Row cap passed to ``search_table`` as ``limit``.

    Returns:
        dict mapping table name to the non-empty result returned by
        ``search_table``; tables without matches are left out.
    """
    hits_by_table = {}

    # The prefix table is deliberately excluded from the search
    searchable = (name for name in tables if name != 'prefix')

    for name in searchable:
        print(f"Searching {name}...")
        matches = search_table(name, search_term, limit=limit_per_table)

        if matches.empty:
            continue

        hits_by_table[name] = matches
        print(f"  Found {len(matches)} matches")

    return hits_by_table

## Example Searches

In [None]:
# Term to look for across the namespace
search_term = "glucose"  # Change this to search for different terms

print(f"Searching for '{search_term}' across all tables...\n")
results = search_all_tables(search_term, limit_per_table=20)

# For every table with matches: print a banner, then preview five rows
for table in results:
    df = results[table]
    print(f"\n{'='*60}")
    print(f"Results from {table}: {len(df)} matches")
    print(f"{'='*60}")
    display(df.head(5))

## Specialized Search Functions

In [None]:
def find_seed_compounds(compound_pattern=None, limit=100):
    """Find SEED compounds together with their cross-references.

    Selects ``rdfs:label`` statements whose subject starts with
    ``seed.compound:`` and left-joins the ``oio:hasDbXref`` /
    ``skos:exactMatch`` statements of the same subject.

    Args:
        compound_pattern: Optional text to filter on. A compound is kept when
            the text occurs (case-insensitively) in its ID *or* in its label,
            so both ``'cpd00027'``-style IDs and names such as ``'glucose'``
            work. ``%`` and ``_`` act as SQL LIKE wildcards.
        limit: Maximum number of rows to return.

    Returns:
        pandas.DataFrame with columns ``compound_id``, ``compound_name``,
        ``cross_reference`` and ``ref_type``.
    """
    if compound_pattern:
        # Escape backslashes and quotes so names like "5'-phosphate" do not
        # terminate the SQL string literal (Spark's default backslash escapes).
        safe_pattern = str(compound_pattern).replace("\\", "\\\\").replace("'", "\\'")
        # Match the label as well as the ID: subjects are compound IDs, so a
        # name pattern would never match if only the subject were checked.
        pattern_filter = (
            f"AND (LOWER(s1.subject) LIKE LOWER('%{safe_pattern}%') "
            f"OR LOWER(s1.value) LIKE LOWER('%{safe_pattern}%'))"
        )
    else:
        pattern_filter = ""

    query = f"""
    SELECT DISTINCT
        s1.subject as compound_id,
        s1.value as compound_name,
        s2.object as cross_reference,
        s2.predicate as ref_type
    FROM {namespace}.statements s1
    LEFT JOIN {namespace}.statements s2 
        ON s1.subject = s2.subject 
        AND s2.predicate IN ('oio:hasDbXref', 'skos:exactMatch')
    WHERE s1.subject LIKE 'seed.compound:%'
    AND s1.predicate = 'rdfs:label'
    {pattern_filter}
    LIMIT {limit}
    """

    return spark.sql(query).toPandas()

# Example: look up SEED compounds using the pattern 'glucose'
# (find_seed_compounds applies its default limit of 100 rows)
glucose_compounds = find_seed_compounds('glucose')
print(f"Found {len(glucose_compounds)} SEED compounds related to glucose:")
# Preview only the first ten rows
display(glucose_compounds.head(10))

In [None]:
def find_ec_numbers_with_details(ec_pattern=None, limit=100):
    """Find EC numbers used in genome annotations, with names and usage counts.

    Aggregates ``feature_annotation.bakta_ec`` per EC number and left-joins
    the ``rdfs:label`` of the matching ``EC:<number>`` subject from
    ``statements``.

    Args:
        ec_pattern: Optional prefix of the EC number (e.g. ``'2'`` or
            ``'2.7'``); only EC numbers starting with it are returned.
        limit: Maximum number of rows to return.

    Returns:
        pandas.DataFrame with columns ``ec_number``, ``ec_name``,
        ``genome_count``, ``feature_count`` and ``example_product``, ordered
        by genome count, then feature count (both descending).
    """
    if ec_pattern:
        # Escape backslashes and quotes so the pattern stays inside the SQL
        # string literal (Spark's default backslash escapes).
        safe_pattern = str(ec_pattern).replace("\\", "\\\\").replace("'", "\\'")
        # The outer query only sees the CTE's alias `ec_number`; the raw
        # column name `bakta_ec` is not visible there.
        ec_filter = f"WHERE e.ec_number LIKE '{safe_pattern}%'"
    else:
        ec_filter = ""

    # GROUP BY already yields one row per EC number, so no DISTINCT is needed.
    query = f"""
    WITH ec_in_genomes AS (
        SELECT
            bakta_ec as ec_number,
            COUNT(DISTINCT genome_id) as genome_count,
            COUNT(*) as feature_count,
            FIRST(bakta_product) as example_product
        FROM {namespace}.feature_annotation
        WHERE bakta_ec IS NOT NULL
        GROUP BY bakta_ec
    ),
    ec_details AS (
        SELECT 
            subject as ec_id,
            value as ec_name
        FROM {namespace}.statements
        WHERE predicate = 'rdfs:label'
        AND subject LIKE 'EC:%'
    )
    SELECT 
        e.ec_number,
        d.ec_name,
        e.genome_count,
        e.feature_count,
        e.example_product
    FROM ec_in_genomes e
    LEFT JOIN ec_details d ON CONCAT('EC:', e.ec_number) = d.ec_id
    {ec_filter}
    ORDER BY e.genome_count DESC, e.feature_count DESC
    LIMIT {limit}
    """

    return spark.sql(query).toPandas()

# Example: Find all transferases (EC 2.*) by passing the leading class digit
# as the pattern
transferases = find_ec_numbers_with_details('2')
print(f"Found {len(transferases)} transferase EC numbers:")
# Preview only the first ten rows
display(transferases.head(10))

In [None]:
def find_go_terms(go_pattern=None, limit=100, total_genomes=50):
    """Find GO terms with their usage in genomes.

    Aggregates ``feature_annotation.bakta_go`` per GO term and left-joins the
    ``rdfs:label`` of the matching ``GO:`` subject from ``statements``.

    Args:
        go_pattern: Optional text to filter on. A term is kept when the text
            occurs (case-insensitively) in the GO term itself *or* in its
            label, so both ``'GO:0005524'`` and ``'binding'`` work. ``%`` and
            ``_`` act as SQL LIKE wildcards.
        limit: Maximum number of rows to return.
        total_genomes: Denominator for ``pct_genomes``. The default of 50
            keeps the value that was previously hard-coded in the query —
            presumably the number of genomes loaded; verify for your data.

    Returns:
        pandas.DataFrame with columns ``go_term``, ``go_name``,
        ``genome_count``, ``annotation_count``, ``example_product`` and
        ``pct_genomes``, ordered by genome count, then annotation count
        (both descending).

    Raises:
        ValueError: If ``total_genomes`` is not positive.
    """
    if total_genomes <= 0:
        raise ValueError("total_genomes must be a positive number")

    if go_pattern:
        # Escape backslashes and quotes so the pattern stays inside the SQL
        # string literal (Spark's default backslash escapes).
        safe_pattern = str(go_pattern).replace("\\", "\\\\").replace("'", "\\'")
        # Applied in the outer query: the usage CTE already has its own WHERE
        # clause (a second WHERE there is a syntax error), and the label is
        # only available after the join.
        go_filter = (
            f"WHERE (LOWER(g.go_term) LIKE LOWER('%{safe_pattern}%') "
            f"OR LOWER(d.go_name) LIKE LOWER('%{safe_pattern}%'))"
        )
    else:
        go_filter = ""

    query = f"""
    WITH go_usage AS (
        SELECT 
            bakta_go as go_term,
            COUNT(DISTINCT genome_id) as genome_count,
            COUNT(*) as annotation_count,
            COLLECT_SET(bakta_product)[0] as example_product
        FROM {namespace}.feature_annotation f
        WHERE bakta_go IS NOT NULL
        GROUP BY bakta_go
    ),
    go_details AS (
        SELECT 
            subject as go_id,
            value as go_name
        FROM {namespace}.statements
        WHERE predicate = 'rdfs:label'
        AND subject LIKE 'GO:%'
    )
    SELECT 
        g.go_term,
        d.go_name,
        g.genome_count,
        g.annotation_count,
        g.example_product,
        ROUND(g.genome_count * 100.0 / {total_genomes}, 2) as pct_genomes
    FROM go_usage g
    LEFT JOIN go_details d ON g.go_term = d.go_id
    {go_filter}
    ORDER BY g.genome_count DESC, g.annotation_count DESC
    LIMIT {limit}
    """

    return spark.sql(query).toPandas()

# Example: call find_go_terms with the pattern 'binding'
# (default limit of 100 rows)
binding_terms = find_go_terms('binding')
print(f"Found {len(binding_terms)} GO terms related to binding:")
# Preview only the first ten rows
display(binding_terms.head(10))

## Export Search Results

In [None]:
def export_search_to_csv(search_term, output_path='search_results'):
    """Export search results to CSV files, one file per matching table.

    Runs ``search_all_tables`` with a cap of 1000 rows per table and writes
    each non-empty result to ``{output_path}_{table}_{term}.csv``.

    Args:
        search_term: Text to search for in every table.
        output_path: File-name prefix (may include a directory) for the
            generated CSV files.

    Returns:
        None. The written file names are printed.
    """
    results = search_all_tables(search_term, limit_per_table=1000)

    # Make the term safe for use in a file name: keep letters, digits and
    # '_', '.', '-'; replace everything else (spaces, '/', ':', quotes, ...)
    # with '_' so terms such as "EC:2.7/x" cannot create odd or nested paths.
    safe_term = "".join(
        ch if (ch.isalnum() or ch in "_.-") else "_" for ch in str(search_term)
    )

    for table, df in results.items():
        filename = f"{output_path}_{table}_{safe_term}.csv"
        df.to_csv(filename, index=False)
        print(f"Exported {len(df)} results from {table} to {filename}")

# Example: Export all glucose-related entries
# export_search_to_csv('glucose', 'glucose_search')

## Complex Search: Find Metabolic Pathways

In [None]:
def find_metabolic_pathway(compound_name, limit=50):
    """Find reactions and enzyme roles related to a specific compound.

    The query works in three steps:
      1. ``compound_reactions``: ``seed.reaction:`` subjects in
         ``entailed_edge`` that have a participant (``RO:0000057``) whose
         ``rdfs:label`` contains ``compound_name`` (case-insensitive).
      2. ``reaction_enzymes``: ``term_association`` rows with predicate
         ``RO:0002327`` (enables), labelled via ``statements``.
      3. Both are left-joined, together with the reaction's own label.

    Args:
        compound_name: Text to look for in compound labels. ``%`` and ``_``
            act as SQL LIKE wildcards.
        limit: Maximum number of rows to return.

    Returns:
        pandas.DataFrame with columns ``reaction_id``, ``compound_name``,
        ``enzyme_role`` and ``reaction_name``.
    """
    # Compound names frequently contain primes (e.g. "adenosine 5'-phosphate");
    # escape backslashes and quotes so the name stays inside the SQL string
    # literal (Spark's default backslash escapes).
    safe_name = str(compound_name).replace("\\", "\\\\").replace("'", "\\'")

    query = f"""
    WITH compound_reactions AS (
        -- Find reactions involving the compound
        SELECT DISTINCT
            e.subject as reaction_id,
            e.object as compound_id,
            s.value as compound_name
        FROM {namespace}.entailed_edge e
        JOIN {namespace}.statements s
            ON e.object = s.subject 
            AND s.predicate = 'rdfs:label'
        WHERE e.predicate = 'RO:0000057'  -- has participant
        AND e.subject LIKE 'seed.reaction:%'
        AND LOWER(s.value) LIKE LOWER('%{safe_name}%')
    ),
    reaction_enzymes AS (
        -- Find enzymes that catalyze these reactions
        SELECT 
            ta.object as reaction_id,
            ta.subject as role_id,
            s.value as role_name
        FROM {namespace}.term_association ta
        JOIN {namespace}.statements s
            ON ta.subject = s.subject
            AND s.predicate = 'rdfs:label'
        WHERE ta.predicate = 'RO:0002327'  -- enables
    )
    SELECT DISTINCT
        cr.reaction_id,
        cr.compound_name,
        re.role_name as enzyme_role,
        rs.value as reaction_name
    FROM compound_reactions cr
    LEFT JOIN reaction_enzymes re ON cr.reaction_id = re.reaction_id
    LEFT JOIN {namespace}.statements rs 
        ON cr.reaction_id = rs.subject 
        AND rs.predicate = 'rdfs:label'
    LIMIT {limit}
    """

    return spark.sql(query).toPandas()

# Example: Find pathways involving pyruvate (at most 30 rows)
pyruvate_pathways = find_metabolic_pathway('pyruvate', limit=30)
print(f"Found {len(pyruvate_pathways)} reactions involving pyruvate:")
# The result is capped at 30 rows, so the whole frame is displayed
display(pyruvate_pathways)

## Summary Statistics

In [None]:
# Get summary statistics for all tables
def get_table_statistics():
    """Print a short profile of every table in the module-level ``tables`` list.

    For each table this prints a banner, the total row count and the column
    names with their Spark data types, then displays a three-row sample.
    Nothing is returned.
    """
    for table in tables:
        # Banner identifying the table
        print(f"\n{'='*60}")
        print(f"Table: {namespace}.{table}")
        print(f"{'='*60}")

        # Total number of rows
        count_rows = spark.sql(f"SELECT COUNT(*) as cnt FROM {namespace}.{table}").collect()
        count = count_rows[0]['cnt']
        print(f"Total rows: {count:,}")

        # Column names and types, read from the schema of a one-row query
        one_row_df = spark.sql(f"SELECT * FROM {namespace}.{table} LIMIT 1")
        fields = one_row_df.schema.fields
        print(f"\nColumns ({len(fields)}):")
        for field in fields:
            print(f"  - {field.name}: {field.dataType}")

        # Three example rows, rendered through pandas
        print(f"\nSample data:")
        display(spark.sql(f"SELECT * FROM {namespace}.{table} LIMIT 3").toPandas())

# Run statistics: prints a profile and displays a sample for every table in
# `tables` (issues several Spark queries per table)
get_table_statistics()