# Interactive Data Search Tool

This notebook provides an interactive way to search through the CDM ontology data tables.

**Note**: This notebook reads local parquet files so that searches stay fast. Run it on the remote JupyterHub to search the full dataset.

In [None]:
import html
import re
import warnings

import ipywidgets as widgets
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from IPython.display import display, HTML
from ipywidgets import interact, interactive, fixed
warnings.filterwarnings('ignore')

# For remote execution, uncomment these:
# from spark.utils import get_spark_session
# spark = get_spark_session()

## Load Data Tables

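Smaller tables are loaded into memory in full for interactive searching. Tables with more than 1,000,000 rows are truncated to their first 100,000 rows, so searches over those tables only cover that leading slice.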

In [None]:
# Tables with more rows than LARGE_TABLE_ROWS are truncated to their first
# SAMPLE_ROWS rows so the notebook stays responsive.
LARGE_TABLE_ROWS = 1_000_000
SAMPLE_ROWS = 100_000

# Define data files
data_files = {
    'statements': 'data/statements.parquet',
    'entailed_edge': 'data/entailed_edge.parquet',
    'feature_annotation': 'data/feature_annotation.parquet',
    'term_association': 'data/term_association.parquet'
    # Ignoring prefix.parquet as requested
}


def _read_parquet_head(parquet_file, n_rows):
    """Return the first ``n_rows`` rows of an open ParquetFile as a DataFrame.

    Record batches are streamed and reading stops as soon as enough rows have
    been collected, so the rest of the file is never materialised. A batch may
    hold fewer rows than requested, hence the accumulation loop.
    """
    batches = []
    rows_collected = 0
    for batch in parquet_file.iter_batches(batch_size=n_rows):
        batches.append(batch)
        rows_collected += batch.num_rows
        if rows_collected >= n_rows:
            break
    table = pa.Table.from_batches(batches, schema=parquet_file.schema_arrow)
    return table.slice(0, n_rows).to_pandas()


# Load smaller tables completely, truncate larger ones
dataframes = {}
table_info = {}

for name, path in data_files.items():
    print(f"Loading {name}...")

    # The parquet footer gives the row count without reading any data pages.
    parquet_file = pq.ParquetFile(path)
    total_rows = parquet_file.metadata.num_rows

    if total_rows > LARGE_TABLE_ROWS:
        # Leading rows only (not a random sample), read without loading the
        # whole file first.
        df = _read_parquet_head(parquet_file, SAMPLE_ROWS)
        table_info[name] = f"Sampled: {len(df):,} of {total_rows:,} rows"
    else:
        df = pd.read_parquet(path, engine='pyarrow')
        table_info[name] = f"Complete: {total_rows:,} rows"

    dataframes[name] = df
    print(f"  Loaded {table_info[name]}")
    print(f"  Columns: {list(df.columns)}")
    print()

## Interactive Search Widget

In [None]:
# Create search interface
def search_data(table, search_term, column='all', max_results=50):
    """Search one loaded table for a literal, case-insensitive substring.

    Parameters
    ----------
    table : str
        Key into the notebook-level ``dataframes`` dict.
    search_term : str
        Text to look for. It is matched literally (not as a regular
        expression), ignoring case.
    column : str, default 'all'
        ``'all'`` searches every object/string-typed column; any other value
        names the single column to search (its values are stringified first).
    max_results : int, default 50
        Maximum number of matching rows to render.

    Returns
    -------
    str or IPython.display.HTML
        A plain message when the term is empty or nothing matches, otherwise
        an HTML table of the matches with the term highlighted.
    """
    if not search_term:
        return "Enter a search term"

    df = dataframes[table]

    # Convert search term to lowercase for case-insensitive search
    search_term = search_term.lower()

    def contains_term(values):
        # regex=False keeps characters such as '.', '(' or '+' literal, which
        # matches how the highlighter below treats the term.
        hits = values.astype(str).str.lower().str.contains(search_term, regex=False, na=False)
        # astype(str) turns missing cells into text like 'nan' / 'None';
        # exclude them so they can never count as a match.
        return hits & values.notna()

    if column == 'all':
        # Build the mask on the frame's own index so it stays aligned whatever
        # index the table was loaded with.
        mask = pd.Series(False, index=df.index)
        for col in df.columns:
            values = df[col]
            # Cover both classic object columns and pandas string dtypes.
            if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
                mask |= contains_term(values)
    else:
        # Search specific column
        mask = contains_term(df[column])

    results = df[mask].head(max_results)

    if len(results) == 0:
        return f"No results found for '{search_term}' in {table}"

    header = f"<h3>Found {int(mask.sum())} results (showing first {len(results)})</h3>"
    header += f"<p><em>{table_info[table]}</em></p>"

    # to_html() escapes &, < and > in cell text, so escape the term the same
    # way before matching it against the rendered markup.
    pattern = re.compile(re.escape(html.escape(search_term, quote=False)), re.IGNORECASE)

    def mark(match):
        return f'<mark style="background-color: yellow">{match.group()}</mark>'

    # Split the markup into tags and text so highlighting only touches text;
    # substituting inside tags would corrupt the table (e.g. a search for "td").
    pieces = re.split(r'(<[^>]*>)', results.to_html())
    results_html = ''.join(
        piece if piece.startswith('<') else pattern.sub(mark, piece)
        for piece in pieces
    )

    return HTML(header + results_html)

# Create interactive widgets
table_widget = widgets.Dropdown(
    options=list(dataframes.keys()),
    value='statements',
    description='Table:'
)

search_widget = widgets.Text(
    placeholder='Enter search term...',
    description='Search:'
)

column_widget = widgets.Dropdown(
    options=['all'],
    value='all',
    description='Column:'
)

max_results_widget = widgets.IntSlider(
    value=50,
    min=10,
    max=200,
    step=10,
    description='Max Results:'
)

# Update column options when table changes
def update_columns(*args):
    """Refresh the column dropdown to 'all' plus the selected table's columns."""
    column_widget.options = ['all'] + list(dataframes[table_widget.value].columns)

table_widget.observe(update_columns, 'value')
update_columns()  # Initialize


def show_search_results(table, search_term, column='all', max_results=50):
    """Run ``search_data`` and display whatever it returns.

    ``interactive_output`` only captures what its callback prints or displays;
    the callback's return value is discarded. ``search_data`` returns its
    message / HTML rather than displaying it, so it must be displayed here or
    the output area stays empty.
    """
    display(search_data(table, search_term, column, max_results))


# Create search interface
out = widgets.interactive_output(
    show_search_results,
    {
        'table': table_widget,
        'search_term': search_widget,
        'column': column_widget,
        'max_results': max_results_widget
    }
)

display(widgets.VBox([
    widgets.HBox([table_widget, column_widget]),
    search_widget,
    max_results_widget,
    out
]))

## Quick Search Functions

Pre-built searches for common queries:

In [None]:
def find_seed_entries(compound_id=None, reaction_id=None, role_id=None):
    """Find rows that mention SEED identifiers in any loaded table.

    Each argument that is given contributes one literal, case-insensitive
    search string: ``seed.compound:<compound_id>``, ``seed.reaction:<reaction_id>``
    and ``seed.role:<role_id>``. When no argument is given, the three bare
    prefixes ``seed.compound``, ``seed.reaction`` and ``seed.role`` are used.

    Returns
    -------
    dict[str, pandas.DataFrame]
        Table name -> de-duplicated matching rows, read from the notebook-level
        ``dataframes`` dict. Tables without matches are omitted.
    """
    requested = (
        ('compound', compound_id),
        ('reaction', reaction_id),
        ('role', role_id),
    )
    patterns = [f"seed.{kind}:{identifier}" for kind, identifier in requested if identifier]
    if not patterns:
        patterns = ['seed.compound', 'seed.reaction', 'seed.role']

    matches_by_table = {}
    for pattern in patterns:
        for table_name, df in dataframes.items():
            # Index-aligned mask: stays correct whatever index the table has.
            mask = pd.Series(False, index=df.index)
            for col in df.columns:
                values = df[col]
                # Cover both classic object columns and pandas string dtypes.
                if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
                    # regex=False: the '.' in 'seed.compound' must match a
                    # literal dot, not any character.
                    mask |= values.astype(str).str.contains(
                        pattern, case=False, regex=False, na=False
                    )

            found = df[mask]
            if len(found) > 0:
                matches_by_table.setdefault(table_name, []).append(found)

    # Concatenate once per table instead of growing a frame inside the loop.
    return {
        table_name: pd.concat(frames).drop_duplicates()
        for table_name, frames in matches_by_table.items()
    }

# Demo: look up every row that references a seed.compound identifier.
print("Searching for SEED compounds...")
seed_results = find_seed_entries(compound_id='cpd')
for table in seed_results:
    df = seed_results[table]
    # Skip tables that came back without any rows.
    if len(df) == 0:
        continue
    print(f"\n{table}: {len(df)} entries")
    display(df.head())

In [None]:
def find_ec_numbers(ec_pattern=None):
    """Find Enzyme Commission (EC) numbers"""
    results = {}
    
    # EC number pattern (e.g., EC:1.1.1.1)
    if ec_pattern:
        pattern = f"EC:{ec_pattern}"
    else:
        pattern = "EC:"
    
    for table_name, df in dataframes.items():
        mask = pd.Series([False] * len(df))
        for col in df.columns:
            if df[col].dtype == 'object':
                mask |= df[col].astype(str).str.contains(pattern, na=False)
        
        found = df[mask]
        if len(found) > 0:
            results[table_name] = found
    
    return results

# Demo: narrow the EC lookup to the "2.8" class and preview each table.
print("Searching for EC numbers starting with 2.8...")
ec_results = find_ec_numbers("2.8")
for table in ec_results:
    df = ec_results[table]
    print(f"\n{table}: {len(df)} entries")
    display(df.iloc[:3])

In [None]:
def find_ncbi_taxa(taxon_id=None, search_term=None):
    """Find NCBI Taxonomy entries"""
    results = {}
    
    if taxon_id:
        pattern = f"NCBITaxon:{taxon_id}"
    elif search_term:
        pattern = search_term
    else:
        pattern = "NCBITaxon:"
    
    for table_name, df in dataframes.items():
        mask = pd.Series([False] * len(df))
        for col in df.columns:
            if df[col].dtype == 'object':
                mask |= df[col].astype(str).str.contains(pattern, na=False, case=False)
        
        found = df[mask]
        if len(found) > 0:
            results[table_name] = found
    
    return results

# Demo: report which tables mention NCBI Taxonomy identifiers.
print("Searching for NCBI Taxonomy entries...")
taxa_results = find_ncbi_taxa()
for table in taxa_results:
    df = taxa_results[table]
    print(f"\n{table}: {len(df)} entries with NCBI Taxon IDs")
    # The unique-taxa breakdown only applies to tables with a 'subject' column.
    if 'subject' not in df.columns:
        continue
    unique_taxa = df['subject'].str.extract(r'(NCBITaxon:\d+)', expand=False).dropna().unique()
    print(f"  Unique taxa: {len(unique_taxa)}")
    print(f"  Examples: {list(unique_taxa[:5])}")

## Data Statistics Dashboard

In [None]:
# Create a summary dashboard
def create_data_summary():
    """Build an HTML overview of every table in the notebook-level ``dataframes``.

    For each table the overview shows its ``table_info`` line, the column
    names, the number of unique values in each object/string-typed column, and
    a collapsible preview of the first three rows.

    Returns
    -------
    IPython.display.HTML
    """
    parts = ["<h2>Data Tables Summary</h2>"]

    for name, df in dataframes.items():
        parts.append(f"<h3>{html.escape(str(name))}</h3>")
        parts.append(f"<p><em>{html.escape(str(table_info[name]))}</em></p>")
        parts.append("<ul>")

        # Column labels may be non-strings and may contain markup characters:
        # stringify before joining and escape before embedding in HTML.
        labels = [html.escape(str(col)) for col in df.columns]
        parts.append(f"<li>Columns: {', '.join(labels)}</li>")

        # Column statistics
        for col, label in zip(df.columns, labels):
            values = df[col]
            # Cover both classic object columns and pandas string dtypes.
            if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
                parts.append(f"<li>{label}: {values.nunique():,} unique values</li>")

        parts.append("</ul>")

        # Show sample
        parts.append("<details><summary>View sample data</summary>")
        parts.append(df.head(3).to_html())
        parts.append("</details><br>")

    return HTML(''.join(parts))

# Last expression of the cell: the returned HTML object is rendered as the cell output.
create_data_summary()

## Export Search Results

In [None]:
def export_search_results(search_term, output_file='search_results.csv', regex=False):
    """Search every loaded table and write all matching rows to one CSV file.

    Parameters
    ----------
    search_term : str
        Text to look for in every object/string-typed column, ignoring case.
    output_file : str, default 'search_results.csv'
        Destination path of the CSV.
    regex : bool, default False
        Whether ``search_term`` is a regular expression. By default it is
        matched literally, so ``'seed.compound'`` only matches a real dot.

    Returns
    -------
    pandas.DataFrame or None
        The combined matches, with a ``source_table`` column naming the table
        each row came from, or ``None`` when nothing matched (no file is
        written in that case).
    """
    all_results = []

    for table_name, df in dataframes.items():
        # Index-aligned mask: stays correct whatever index the table has.
        mask = pd.Series(False, index=df.index)
        for col in df.columns:
            values = df[col]
            # Cover both classic object columns and pandas string dtypes.
            if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
                hits = values.astype(str).str.contains(
                    search_term, case=False, regex=regex, na=False
                )
                # astype(str) turns missing cells into text such as 'nan';
                # exclude them so the term cannot match a null.
                mask |= hits & values.notna()

        if mask.any():
            # assign() returns a new frame, leaving the loaded table untouched.
            all_results.append(df[mask].assign(source_table=table_name))

    if not all_results:
        print("No results found")
        return None

    combined = pd.concat(all_results, ignore_index=True)
    combined.to_csv(output_file, index=False)
    print(f"Exported {len(combined)} results to {output_file}")
    return combined

# Example: Export all SEED compound references
# export_search_results('seed.compound', 'seed_compounds.csv')