# Interactive Search for CDM Ontology Data

This notebook provides interactive search capabilities for the CDM ontology data loaded in the warehouse.
Each table has its own section with search widgets.

In [7]:
# Setup - Import required libraries and initialize Spark
from spark.utils import get_spark_session
import ipywidgets as widgets
from IPython.display import display, HTML
import pandas as pd

# Obtain the Spark session through the project helper imported above
spark = get_spark_session()
# Namespace used as the prefix of every table name queried in this notebook
namespace = 'ontology_data'

# Shared helper that renders the outcome of a search
def display_results(df, max_rows=100):
    """Render a pandas DataFrame of search hits, capped at ``max_rows`` rows.

    Prints "No results found." when the frame is empty. Otherwise prints the
    total number of rows together with the cap, then shows the first
    ``max_rows`` rows through ``display``.
    """
    if not df.empty:
        print(f"Found {len(df)} results (showing up to {max_rows}):")
        display(df.head(max_rows))
        return
    print("No results found.")

## 1. Statements Table Search

Search through 42.4M ontology statements (RDF triples)

In [8]:
# Read the statements table and report its schema
print("Loading statements table...")
statements_df = spark.table(namespace + ".statements")
print("Statements table loaded. Schema:")
statements_df.printSchema()

# Controls for the statements search form: free-text term, target column,
# row limit, trigger button and the area that receives the results
search_text = widgets.Text(
    description='Search:',
    placeholder='Enter search term',
    value='',
    style=dict(description_width='initial'),
)

search_column = widgets.Dropdown(
    description='Column:',
    options=['subject', 'predicate', 'object', 'value', 'datatype', 'language'],
    value='subject',
    style=dict(description_width='initial'),
)

limit_slider = widgets.IntSlider(
    description='Limit:',
    min=10,
    max=500,
    step=10,
    value=50,
    style=dict(description_width='initial'),
)

search_button = widgets.Button(
    icon='search',
    button_style='primary',
    description='Search Statements',
)

output = widgets.Output()

def search_statements(b):
    """Button callback: case-insensitive substring search over the statements table.

    Reads the search term, the target column and the row limit from the
    statements widgets, runs the query through ``spark.sql`` and renders the
    hits inside ``output``. When the search box is empty the output area is
    only cleared.

    The term is matched literally. LIKE metacharacters (``%``, ``_`` and the
    backslash) and single quotes are escaped before the term is placed in the
    SQL text, so a term such as ``Parkinson's`` no longer breaks or alters the
    statement.

    Args:
        b: The clicked button, passed in by ipywidgets; unused.
    """
    with output:
        output.clear_output()
        term = search_text.value
        if term:
            # Step 1: neutralise LIKE metacharacters (backslash is LIKE's
            # default escape character, so it must be doubled first).
            like_term = (
                term.replace('\\', '\\\\')
                    .replace('%', '\\%')
                    .replace('_', '\\_')
            )
            # Step 2: escape for the SQL string literal itself.
            # NOTE(review): assumes Spark's default literal parsing, where
            # backslash escapes are processed
            # (spark.sql.parser.escapedStringLiterals=false) - confirm.
            sql_term = like_term.replace('\\', '\\\\').replace("'", "\\'")

            # The column comes from a fixed dropdown and the limit from an
            # integer slider, so only the free-text term needs escaping.
            query = f"""
            SELECT *
            FROM {namespace}.statements
            WHERE LOWER({search_column.value}) LIKE LOWER('%{sql_term}%')
            LIMIT {limit_slider.value}
            """

            print(f"Searching for '{term}' in column '{search_column.value}'...")
            result_df = spark.sql(query).toPandas()
            display_results(result_df, limit_slider.value)

# Run search_statements each time the button is clicked
search_button.on_click(search_statements)

# Display widgets: the inputs in one row, then the button, then the output
# area that search_statements writes its results into
display(widgets.HBox([search_text, search_column, limit_slider]))
display(search_button)
display(output)

Loading statements table...
Statements table loaded. Schema:
root
 |-- stanza: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- predicate: string (nullable = true)
 |-- object: string (nullable = true)
 |-- value: string (nullable = true)
 |-- datatype: string (nullable = true)
 |-- language: string (nullable = true)
 |-- graph: integer (nullable = true)



HBox(children=(Text(value='', description='Search:', placeholder='Enter search term', style=TextStyle(descript…

Button(button_style='primary', description='Search Statements', icon='search', style=ButtonStyle())

Output()

## 2. Entailed Edge Table Search

Search through 117.5M inferred relationships

In [None]:
# Read the entailed_edge table and report its schema
print("Loading entailed_edge table...")
entailed_edge_df = spark.table(namespace + ".entailed_edge")
print("Entailed edge table loaded. Schema:")
entailed_edge_df.printSchema()

# Controls for the entailed_edge search form: free-text term, target column,
# row limit, trigger button and the area that receives the results
edge_search_text = widgets.Text(
    description='Search:',
    placeholder='Enter search term',
    value='',
    style=dict(description_width='initial'),
)

edge_search_column = widgets.Dropdown(
    description='Column:',
    options=['subject', 'predicate', 'object'],
    value='subject',
    style=dict(description_width='initial'),
)

edge_limit_slider = widgets.IntSlider(
    description='Limit:',
    min=10,
    max=500,
    step=10,
    value=50,
    style=dict(description_width='initial'),
)

edge_search_button = widgets.Button(
    icon='search',
    button_style='primary',
    description='Search Edges',
)

edge_output = widgets.Output()

def search_edges(b):
    """Button callback: case-insensitive substring search over the entailed_edge table.

    Reads the search term, the target column and the row limit from the edge
    widgets, runs the query through ``spark.sql`` and renders the hits inside
    ``edge_output``. When the search box is empty the output area is only
    cleared.

    The term is matched literally. LIKE metacharacters (``%``, ``_`` and the
    backslash) and single quotes are escaped before the term is placed in the
    SQL text, so a term containing a quote no longer breaks or alters the
    statement.

    Args:
        b: The clicked button, passed in by ipywidgets; unused.
    """
    with edge_output:
        edge_output.clear_output()
        term = edge_search_text.value
        if term:
            # Step 1: neutralise LIKE metacharacters (backslash is LIKE's
            # default escape character, so it must be doubled first).
            like_term = (
                term.replace('\\', '\\\\')
                    .replace('%', '\\%')
                    .replace('_', '\\_')
            )
            # Step 2: escape for the SQL string literal itself.
            # NOTE(review): assumes Spark's default literal parsing, where
            # backslash escapes are processed
            # (spark.sql.parser.escapedStringLiterals=false) - confirm.
            sql_term = like_term.replace('\\', '\\\\').replace("'", "\\'")

            # The column comes from a fixed dropdown and the limit from an
            # integer slider, so only the free-text term needs escaping.
            query = f"""
            SELECT *
            FROM {namespace}.entailed_edge
            WHERE LOWER({edge_search_column.value}) LIKE LOWER('%{sql_term}%')
            LIMIT {edge_limit_slider.value}
            """

            print(f"Searching for '{term}' in column '{edge_search_column.value}'...")
            result_df = spark.sql(query).toPandas()
            display_results(result_df, edge_limit_slider.value)

# Run search_edges each time the button is clicked
edge_search_button.on_click(search_edges)

# Display widgets: the inputs in one row, then the button, then the output
# area that search_edges writes its results into
display(widgets.HBox([edge_search_text, edge_search_column, edge_limit_slider]))
display(edge_search_button)
display(edge_output)

## 3. Feature Annotation Table Search

Search through 237K genomic features from 50 E. coli genomes

In [11]:
# Load feature_annotation table
print("Loading feature_annotation table...")
feature_annotation_df = spark.table(f"{namespace}.feature_annotation")
print(f"Feature annotation table loaded. Schema:")
feature_annotation_df.printSchema()

# Build the list of searchable columns from the table's actual schema instead
# of a hand-written list. The previous hard-coded list offered bakta_type,
# bakta_cog_id, bakta_cog_category and bakta_interpro, which are not columns of
# this table, so selecting any of them made the search fail with an
# unresolved-column error. Only string columns are offered because the search
# is a text match (LOWER(...) LIKE ...).
feature_string_columns = [
    col_name
    for col_name, col_type in feature_annotation_df.dtypes
    if col_type == 'string'
]

# Keep 'bakta_product' as the default when the table has it; otherwise fall
# back to the first string column (or no selection if there is none).
if 'bakta_product' in feature_string_columns:
    feature_default_column = 'bakta_product'
elif feature_string_columns:
    feature_default_column = feature_string_columns[0]
else:
    feature_default_column = None

# Create search widgets for feature_annotation
feature_search_text = widgets.Text(
    value='',
    placeholder='Enter search term',
    description='Search:',
    style={'description_width': 'initial'}
)

feature_search_column = widgets.Dropdown(
    options=feature_string_columns,
    value=feature_default_column,
    description='Column:',
    style={'description_width': 'initial'}
)

feature_limit_slider = widgets.IntSlider(
    value=50,
    min=10,
    max=500,
    step=10,
    description='Limit:',
    style={'description_width': 'initial'}
)

feature_search_button = widgets.Button(
    description='Search Features',
    button_style='primary',
    icon='search'
)

# Area that receives the results of each feature search
feature_output = widgets.Output()

def search_features(b):
    """Button callback: case-insensitive substring search over the feature_annotation table.

    Reads the search term, the target column and the row limit from the
    feature widgets, runs the query through ``spark.sql`` and renders the hits
    inside ``feature_output``. When the search box is empty the output area is
    only cleared.

    The term is matched literally. LIKE metacharacters (``%``, ``_`` and the
    backslash) and single quotes are escaped before the term is placed in the
    SQL text, so a term such as ``3'-phosphatase`` no longer breaks or alters
    the statement.

    Args:
        b: The clicked button, passed in by ipywidgets; unused.
    """
    with feature_output:
        feature_output.clear_output()
        term = feature_search_text.value
        if term:
            # Step 1: neutralise LIKE metacharacters (backslash is LIKE's
            # default escape character, so it must be doubled first).
            like_term = (
                term.replace('\\', '\\\\')
                    .replace('%', '\\%')
                    .replace('_', '\\_')
            )
            # Step 2: escape for the SQL string literal itself.
            # NOTE(review): assumes Spark's default literal parsing, where
            # backslash escapes are processed
            # (spark.sql.parser.escapedStringLiterals=false) - confirm.
            sql_term = like_term.replace('\\', '\\\\').replace("'", "\\'")

            # The column comes from a fixed dropdown and the limit from an
            # integer slider, so only the free-text term needs escaping.
            query = f"""
            SELECT *
            FROM {namespace}.feature_annotation
            WHERE LOWER({feature_search_column.value}) LIKE LOWER('%{sql_term}%')
            LIMIT {feature_limit_slider.value}
            """

            print(f"Searching for '{term}' in column '{feature_search_column.value}'...")
            result_df = spark.sql(query).toPandas()
            display_results(result_df, feature_limit_slider.value)

# Run search_features each time the button is clicked
feature_search_button.on_click(search_features)

# Display widgets: term and column in the first row, limit and button in the
# second, then the output area that search_features writes its results into
display(widgets.HBox([feature_search_text, feature_search_column]))
display(widgets.HBox([feature_limit_slider, feature_search_button]))
display(feature_output)

Loading feature_annotation table...
Feature annotation table loaded. Schema:
root
 |-- feature_id: string (nullable = true)
 |-- genome_id: string (nullable = true)
 |-- genome_ref: string (nullable = true)
 |-- genome_taxa: string (nullable = true)
 |-- protein_hash: string (nullable = true)
 |-- protein_seq: string (nullable = true)
 |-- rast: string (nullable = true)
 |-- bakta_ec: string (nullable = true)
 |-- bakta_gene: string (nullable = true)
 |-- bakta_product: string (nullable = true)
 |-- bakta_go: string (nullable = true)
 |-- bakta_cog: string (nullable = true)
 |-- bakta_refseq: double (nullable = true)
 |-- bakta_uniparc: double (nullable = true)
 |-- bakta_uniref: string (nullable = true)



HBox(children=(Text(value='', description='Search:', placeholder='Enter search term', style=TextStyle(descript…

HBox(children=(IntSlider(value=50, description='Limit:', max=500, min=10, step=10, style=SliderStyle(descripti…

Output()

## 4. Term Association Table Search

Search through 3.3K enzyme-reaction mappings

In [14]:
# Read the term_association table and report its schema
print("Loading term_association table...")
term_association_df = spark.table(namespace + ".term_association")
print("Term association table loaded. Schema:")
term_association_df.printSchema()

# Controls for the term_association search form: free-text term, target
# column, row limit, trigger button and the area that receives the results
term_search_text = widgets.Text(
    description='Search:',
    placeholder='Enter search term',
    value='',
    style=dict(description_width='initial'),
)

term_search_column = widgets.Dropdown(
    description='Column:',
    options=['id', 'subject', 'predicate', 'object'],
    value='subject',
    style=dict(description_width='initial'),
)

term_limit_slider = widgets.IntSlider(
    description='Limit:',
    min=10,
    max=500,
    step=10,
    value=50,
    style=dict(description_width='initial'),
)

term_search_button = widgets.Button(
    icon='search',
    button_style='primary',
    description='Search Terms',
)

term_output = widgets.Output()

def search_terms(b):
    """Button callback: case-insensitive substring search over the term_association table.

    Reads the search term, the target column and the row limit from the term
    widgets, runs the query through ``spark.sql`` and renders the hits inside
    ``term_output``. When the search box is empty the output area is only
    cleared.

    The term is matched literally. LIKE metacharacters (``%``, ``_`` and the
    backslash) and single quotes are escaped before the term is placed in the
    SQL text, so a term containing a quote no longer breaks or alters the
    statement.

    Args:
        b: The clicked button, passed in by ipywidgets; unused.
    """
    with term_output:
        term_output.clear_output()
        term = term_search_text.value
        if term:
            # Step 1: neutralise LIKE metacharacters (backslash is LIKE's
            # default escape character, so it must be doubled first).
            like_term = (
                term.replace('\\', '\\\\')
                    .replace('%', '\\%')
                    .replace('_', '\\_')
            )
            # Step 2: escape for the SQL string literal itself.
            # NOTE(review): assumes Spark's default literal parsing, where
            # backslash escapes are processed
            # (spark.sql.parser.escapedStringLiterals=false) - confirm.
            sql_term = like_term.replace('\\', '\\\\').replace("'", "\\'")

            # The column comes from a fixed dropdown and the limit from an
            # integer slider, so only the free-text term needs escaping.
            query = f"""
            SELECT *
            FROM {namespace}.term_association
            WHERE LOWER({term_search_column.value}) LIKE LOWER('%{sql_term}%')
            LIMIT {term_limit_slider.value}
            """

            print(f"Searching for '{term}' in column '{term_search_column.value}'...")
            result_df = spark.sql(query).toPandas()
            display_results(result_df, term_limit_slider.value)

# Run search_terms each time the button is clicked
term_search_button.on_click(search_terms)

# Display widgets: the inputs in one row, then the button, then the output
# area that search_terms writes its results into
display(widgets.HBox([term_search_text, term_search_column, term_limit_slider]))
display(term_search_button)
display(term_output)

Loading term_association table...
Term association table loaded. Schema:
root
 |-- id: string (nullable = true)
 |-- subject: string (nullable = true)
 |-- predicate: string (nullable = true)
 |-- object: string (nullable = true)
 |-- evidence_type: string (nullable = true)
 |-- publication: string (nullable = true)
 |-- source: string (nullable = true)



HBox(children=(Text(value='', description='Search:', placeholder='Enter search term', style=TextStyle(descript…

Button(button_style='primary', description='Search Terms', icon='search', style=ButtonStyle())

Output()

## Quick Reference

### Available Tables:
- **statements** (42.4M rows): Ontology statements (RDF triples)
- **entailed_edge** (117.5M rows): Inferred relationships  
- **feature_annotation** (237K rows): Genomic features from 50 E. coli genomes
- **term_association** (3.3K rows): Enzyme-reaction mappings

### Usage:
1. Each table has its own search section with interactive widgets
2. Select the column to search in from the dropdown
3. Enter your search term and adjust the result limit
4. Click the search button to execute the query
5. Results will appear below each search widget

## Example Searches