# iSamples Interactive Explorer

An interactive interface for exploring iSamples data across all sources.

**Features:**
- Map view with 6M+ samples (lonboard WebGL)
- Interactive table with filtering (ipydatagrid)
- Sample cards on selection
- **Faceted filtering**: Filter by source, material type, and time period
- **Full-text search**: Search label, description, and place name with ranked results
- **Bidirectional selection sync**: Click map → highlights table row; click table → recenters map
- **Viewport Mode**: Dynamic loading based on pan/zoom (with loading indicator)
- **Adaptive sampling**: More points when zoomed in, fewer when zoomed out

**Data:** Zenodo wide parquet (~282 MB, 20M rows)

In [None]:
# Imports
import os
import math
import threading
import duckdb
import pandas as pd
import geopandas as gpd
import numpy as np
from functools import partial
from shapely.geometry import Point

# Visualization
from lonboard import Map, ScatterplotLayer
from lonboard.colormap import apply_continuous_cmap
from ipydatagrid import DataGrid
import ipywidgets as widgets
from IPython.display import display, HTML

In [None]:
# Data paths
# LOCAL_WIDE is a copy of the wide parquet under the user's home directory;
# REMOTE_WIDE is the HTTPS URL used when no local copy exists.
LOCAL_WIDE = os.path.expanduser("~/Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet")
REMOTE_WIDE = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet"

# Use local if available
# PARQUET_PATH is interpolated into every read_parquet(...) query below.
PARQUET_PATH = LOCAL_WIDE if os.path.exists(LOCAL_WIDE) else REMOTE_WIDE
print(f"Using: {PARQUET_PATH}")

# Connect to DuckDB
# No database path is passed; this single connection is shared by all queries
# in the notebook.
con = duckdb.connect()

In [None]:
# Source color scheme (consistent across iSamples)
# Keys are source names as they appear in the `source` column; values are
# four-component [R, G, B, A] lists (consumed by get_colors_for_sources).
SOURCE_COLORS = {
    'SESAR': [51, 102, 204, 200],       # Blue
    'OPENCONTEXT': [220, 57, 18, 200],  # Red
    'GEOME': [16, 150, 24, 200],        # Green
    'SMITHSONIAN': [255, 153, 0, 200],  # Orange
}

# Fallback for any source name that is not a key of SOURCE_COLORS.
DEFAULT_COLOR = [128, 128, 128, 200]  # Gray for unknown

## Load Sample Data

We start with a balanced sample of up to 12,500 records per source (about 50K records in total) for responsive interaction.

In [None]:
def load_samples(max_per_source=12500, source_filter=None, bbox=None, search_term=None,
                 source_filters=None, material_filters=None, year_range=None):
    """
    Load samples with coordinates from the wide parquet.

    At most ``max_per_source`` rows are kept for each source. Without a search
    term the rows kept are a random pick; with a search term only matching rows
    are considered and the highest-scoring rows are kept first.

    Args:
        max_per_source: Maximum samples per source (for balanced representation)
        source_filter: Optional single source name to filter (e.g., 'OPENCONTEXT') - DEPRECATED;
            ignored when ``source_filters`` is non-empty
        bbox: Optional bounding box dict with min_lat, max_lat, min_lon, max_lon
        search_term: Optional search string to filter and rank results
            (case-insensitive substring match on label, description, place_name)
        source_filters: Set of source names to include (empty = all)
        material_filters: Set of material labels to include (empty = all)
        year_range: Tuple of (min_year, max_year) or None for no filter;
            either bound may be None for an open-ended range

    Returns:
        GeoDataFrame (EPSG:4326) with columns row_id, pid, label, description,
        latitude, longitude, source, place_name, result_time, search_score
        (search_score is 0 for every row when no search term is given)
    """
    def sql_literal(value):
        """Render ``value`` as a single-quoted SQL string literal (quotes doubled)."""
        return "'" + str(value).replace("'", "''") + "'"

    table = f"read_parquet('{PARQUET_PATH}')"
    # result_time is cast with TRY_CAST so unparseable values become NULL
    # instead of failing the whole query.
    year_expr = "EXTRACT(YEAR FROM TRY_CAST(base.result_time AS TIMESTAMP))"

    # --- Row filters (all combined with AND) --------------------------------
    conditions = [
        "base.otype = 'MaterialSampleRecord'",
        "base.latitude IS NOT NULL",
    ]

    # Multi-select source filter takes precedence over the deprecated single one.
    if source_filters:
        sources_sql = ", ".join(sql_literal(s) for s in source_filters)
        conditions.append(f"base.n IN ({sources_sql})")
    elif source_filter:
        conditions.append(f"base.n = {sql_literal(source_filter)}")

    if bbox:
        # Coerce to float so only numeric text can reach the SQL string.
        conditions.append(
            f"base.latitude BETWEEN {float(bbox['min_lat'])} AND {float(bbox['max_lat'])}"
        )
        conditions.append(
            f"base.longitude BETWEEN {float(bbox['min_lon'])} AND {float(bbox['max_lon'])}"
        )

    if year_range:
        min_year, max_year = year_range[0], year_range[1]
        if min_year is not None and max_year is not None:
            conditions.append(f"{year_expr} BETWEEN {int(min_year)} AND {int(max_year)}")
        elif min_year is not None:
            conditions.append(f"{year_expr} >= {int(min_year)}")
        elif max_year is not None:
            conditions.append(f"{year_expr} <= {int(max_year)}")

    # --- Search filtering and scoring ---------------------------------------
    term = search_term.strip() if search_term else ""
    if term:
        pattern = sql_literal(f"%{term}%")
        label_match = f"base.label ILIKE {pattern}"
        description_match = f"base.description ILIKE {pattern}"
        # place_name is cast to VARCHAR before matching.
        place_match = f"CAST(base.place_name AS VARCHAR) ILIKE {pattern}"

        # Weighted scoring: label (10) > description (5) > place_name (3)
        score_expr = (
            f"(CASE WHEN {label_match} THEN 10 ELSE 0 END + "
            f"CASE WHEN {description_match} THEN 5 ELSE 0 END + "
            f"CASE WHEN {place_match} THEN 3 ELSE 0 END) AS search_score"
        )
        # Keep only rows matching in at least one field.
        conditions.append(f"({label_match} OR {description_match} OR {place_match})")
        # Highest score first, random within the same score.
        order_by = "ORDER BY search_score DESC, RANDOM()"
    else:
        score_expr = "0 AS search_score"
        order_by = "ORDER BY RANDOM()"

    # --- Optional material filter (needs a self-join to resolve labels) -----
    material_cte = ""
    if material_filters:
        material_labels_sql = ", ".join(sql_literal(m) for m in material_filters)
        material_cte = f"""material_matches AS (
                SELECT DISTINCT msr.row_id
                FROM {table} msr
                CROSS JOIN UNNEST(msr.p__has_material_category) AS t(mat_id)
                JOIN {table} ic ON ic.row_id = mat_id
                WHERE msr.otype = 'MaterialSampleRecord'
                  AND ic.label IN ({material_labels_sql})
            ),"""
        conditions.append("base.row_id IN (SELECT row_id FROM material_matches)")

    where_sql = " AND ".join(conditions)

    # Balanced sampling: rank rows within each source, keep the first N.
    query = f"""
        WITH {material_cte}
        scored AS (
            SELECT
                base.row_id, base.pid, base.label, base.description,
                base.latitude, base.longitude, base.n AS source,
                base.place_name, base.result_time,
                {score_expr}
            FROM {table} base
            WHERE {where_sql}
        ),
        ranked AS (
            SELECT *,
                ROW_NUMBER() OVER (PARTITION BY source {order_by}) AS rn
            FROM scored
        )
        SELECT row_id, pid, label, description, latitude, longitude, source,
               place_name, result_time, search_score
        FROM ranked
        WHERE rn <= {int(max_per_source)}
        {order_by}
    """

    df = con.sql(query).df()

    # Convert to GeoDataFrame; points_from_xy builds all points in one
    # vectorised call instead of one Python-level Point per row.
    geometry = gpd.points_from_xy(df['longitude'], df['latitude'])
    return gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")


def view_state_to_bbox(view_state, buffer_factor=1.5, aspect_ratio=1.5):
    """
    Approximate the geographic extent shown by a map view.

    The extent is centred on the view's latitude/longitude. Its size starts
    from 360 degrees at zoom 0 and halves with every zoom level, then is
    enlarged by ``buffer_factor`` so slightly more than the visible area is
    covered. The longitude span is additionally widened by ``aspect_ratio``
    and by 1/cos(latitude) (capped at 10x, i.e. cos floored at 0.1).

    Args:
        view_state: object exposing ``latitude``, ``longitude`` and ``zoom``
        buffer_factor: multiplier applied to both spans (default 1.5)
        aspect_ratio: width/height ratio of the map container (default 1.5)

    Returns:
        dict with min_lat, max_lat, min_lon, max_lon; latitudes are clamped
        to [-90, 90] and longitudes to [-180, 180]
    """
    center_lat = view_state.latitude
    center_lon = view_state.longitude

    # Degrees spanned at this zoom level before buffering.
    span = 360 / (2 ** view_state.zoom)

    half_lat = span * buffer_factor / 2

    # Floor the cosine at 0.1 so the stretch stays finite near the poles.
    stretch = 1 / max(math.cos(math.radians(abs(center_lat))), 0.1)
    half_lon = span * buffer_factor * aspect_ratio * stretch / 2

    return {
        'min_lat': max(-90, center_lat - half_lat),
        'max_lat': min(90, center_lat + half_lat),
        'min_lon': max(-180, center_lon - half_lon),
        'max_lon': min(180, center_lon + half_lon),
    }


def adaptive_sample_size(zoom, base_size=50000):
    """
    Pick the per-source sample size for a zoom level.

    Each zoom band has a cap; the result is ``base_size`` limited to the cap
    of the first band the zoom falls into. From zoom 12 upward ``base_size``
    is returned unchanged.

    Args:
        zoom: Current zoom level
        base_size: Requested sample size per source

    Returns:
        Sample size to use per source
    """
    # (exclusive upper zoom bound, cap per source)
    zoom_caps = (
        (2, 10000),     # world view
        (5, 25000),     # continent view
        (8, 50000),     # country view
        (12, 100000),   # region view
    )
    for zoom_limit, cap in zoom_caps:
        if zoom < zoom_limit:
            return min(base_size, cap)
    # Local view: no cap.
    return base_size


# Load initial data
# samples_gdf is the initial GeoDataFrame used to build the first map, table
# and status label further down.
print("Loading samples...")
samples_gdf = load_samples(max_per_source=12500)
print(f"Loaded {len(samples_gdf):,} samples")
print(f"\nBy source:")
# Per-source row counts, to confirm the sample is balanced.
print(samples_gdf['source'].value_counts())

## Sample Card Renderer

In [None]:
def render_sample_card(row):
    """
    Render a sample as an HTML card.

    All record text (source, label, description, place, pid) is HTML-escaped
    before being placed in the markup. Missing scalar values (None/NaN) fall
    back to a placeholder: 'No label', an italic 'No description', 0 for
    coordinates, and an empty string for pid/place.

    Args:
        row: Mapping-like object with a ``get`` method (dict or pandas Series)
            holding sample data, or None

    Returns:
        HTML string; a "Click a point" placeholder when ``row`` is None
    """
    from html import escape

    if row is None:
        return "<div style='padding: 10px; color: #666;'>Click a point to see details</div>"

    def is_missing(value):
        # pd.isna() returns an array for list-like input, which cannot be used
        # in a boolean test, so only scalars are checked for NaN.
        return value is None or (pd.api.types.is_scalar(value) and pd.isna(value))

    source = row.get('source', 'Unknown')
    source_color = {
        'SESAR': '#3366CC',
        'OPENCONTEXT': '#DC3912',
        'GEOME': '#109618',
        'SMITHSONIAN': '#FF9900'
    }.get(source, '#808080')

    label = row.get('label', 'No label')
    label = 'No label' if is_missing(label) else str(label)

    description = row.get('description', '')
    description = '' if is_missing(description) else str(description)
    if len(description) > 200:
        description = description[:200] + '...'

    lat = row.get('latitude', 0)
    lon = row.get('longitude', 0)
    if is_missing(lat):
        lat = 0
    if is_missing(lon):
        lon = 0

    pid = row.get('pid', '')
    pid = '' if is_missing(pid) else str(pid)
    pid_display = pid[:50] + ('...' if len(pid) > 50 else '')

    # place_name may be a scalar or a sequence of names (list, tuple, ndarray);
    # sequences are joined into a "A > B > C" breadcrumb.
    place = row.get('place_name', '')
    if isinstance(place, np.ndarray):
        place = place.tolist()
    if isinstance(place, (list, tuple)):
        place = ' > '.join(str(p) for p in place if not is_missing(p) and str(p))
    elif is_missing(place):
        place = ''
    else:
        place = str(place)

    # Build place HTML only if place has content (truncate first, then escape,
    # so an entity is never cut in half).
    place_html = ''
    if place:
        place_html = f'<div style="margin-bottom: 4px;"><strong>Place:</strong> {escape(place[:100])}</div>'

    description_html = escape(description) if description else '<em>No description</em>'

    card_html = f"""
    <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
                border: 1px solid #ddd; border-radius: 8px; padding: 16px;
                max-width: 400px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
        <div style="display: flex; align-items: center; margin-bottom: 12px;">
            <span style="background: {source_color}; color: white; padding: 4px 8px;
                        border-radius: 4px; font-size: 12px; font-weight: 600;">{escape(str(source))}</span>
        </div>
        <h3 style="margin: 0 0 8px 0; font-size: 16px; color: #333;">{escape(label)}</h3>
        <p style="margin: 0 0 12px 0; font-size: 13px; color: #666; line-height: 1.4;">
            {description_html}
        </p>
        <div style="font-size: 12px; color: #888;">
            <div style="margin-bottom: 4px;"><strong>Location:</strong> {lat:.4f}, {lon:.4f}</div>
            {place_html}
            <div><strong>ID:</strong> <code style="background: #f5f5f5; padding: 2px 4px; border-radius: 3px;">{escape(pid_display)}</code></div>
        </div>
    </div>
    """
    return card_html

# Test the card
# Renders the first loaded sample as a quick visual check of the card layout.
display(HTML(render_sample_card(samples_gdf.iloc[0])))

## Map Component

In [None]:
def get_colors_for_sources(sources):
    """
    Get color array for a list of sources.

    Each source name is looked up in SOURCE_COLORS; names without an entry
    get DEFAULT_COLOR.

    Args:
        sources: pandas Series or list of source names

    Returns:
        numpy uint8 array of RGBA colors with shape (len(sources), 4);
        an empty input yields shape (0, 4) rather than (0,)
    """
    colors = np.array(
        [SOURCE_COLORS.get(s, DEFAULT_COLOR) for s in sources],
        dtype=np.uint8,
    )
    # Without the reshape an empty input (e.g. a search with no matches)
    # produces a 1-D array, losing the per-row RGBA layout.
    return colors.reshape(-1, 4)

def create_map_layer(gdf):
    """
    Build a lonboard ScatterplotLayer for the rows of ``gdf``.

    Points are filled with their source's color (from the ``source`` column)
    and the layer is created pickable with auto-highlight enabled.

    Args:
        gdf: GeoDataFrame with a ``source`` column

    Returns:
        The new ScatterplotLayer
    """
    layer_options = dict(
        get_fill_color=get_colors_for_sources(gdf['source']),
        get_radius=1000,
        radius_units='meters',
        radius_min_pixels=2,
        radius_max_pixels=10,
        pickable=True,
        auto_highlight=True,
    )
    return ScatterplotLayer.from_geopandas(gdf, **layer_options)

# Basemap options - all three are defined; pass the one you prefer to Map()
from lonboard.basemap import CartoStyle, MaplibreBasemap

BASEMAP_VOYAGER = MaplibreBasemap(style=CartoStyle.Voyager)      # Light with labels (default)
BASEMAP_POSITRON = MaplibreBasemap(style=CartoStyle.Positron)  # Light, minimal
BASEMAP_DARK = MaplibreBasemap(style=CartoStyle.DarkMatter)    # Dark theme

# Create initial map
# `layer` holds the initial sample points; `sample_map` is the map widget the
# rest of the notebook updates.
layer = create_map_layer(samples_gdf)
sample_map = Map(layers=[layer], basemap=BASEMAP_VOYAGER)
print("Map created with", len(samples_gdf), "points")

## Table Component

In [None]:
def create_table(gdf):
    """
    Build an ipydatagrid DataGrid showing source, label and coordinates.

    Coordinates are rounded to 4 decimal places on a copy, so ``gdf`` itself
    is left untouched.

    Args:
        gdf: (Geo)DataFrame with source, label, latitude, longitude columns

    Returns:
        A read-only, row-selectable DataGrid
    """
    table_df = gdf[['source', 'label', 'latitude', 'longitude']].copy()
    for coord in ('latitude', 'longitude'):
        table_df[coord] = table_df[coord].round(4)

    grid_layout = widgets.Layout(height='300px', width='100%')
    return DataGrid(
        table_df,
        base_row_size=32,
        base_column_size=120,
        selection_mode='row',
        editable=False,
        layout=grid_layout,
    )

# Create table
# `sample_table` is the grid widget whose data and selections are updated by
# the handlers defined later in the notebook.
sample_table = create_table(samples_gdf)

## Interactive Controls

In [None]:
# State management
class ExplorerState:
    """Mutable container for the explorer's UI state (selection, mode, filters)."""

    def __init__(self):
        # --- Current data and selection ---
        self.current_gdf = None        # data currently shown on map/table
        self.selected_index = None     # row position of the selected sample
        self.selected_row = None       # the selected sample's row

        # --- Mode / in-progress flags ---
        self.viewport_mode = False
        self.loading = False
        self.debounce_timer = None
        self.syncing_selection = False  # guards against map<->table sync loops

        # --- Search ---
        self.current_search = ""       # active search term ("" = none)

        # --- Facet filters ---
        self.source_filters = set()    # selected sources (empty = all)
        self.material_filters = set()  # selected material URIs (full URIs)
        self.year_range = (None, None)  # (min_year, max_year); None = unbounded

        # --- Facet counts cache ---
        self.facet_counts_cache = {}
        self.facet_cache_time = 0

# Single shared state object read and written by the handlers below; it starts
# out pointing at the initially loaded samples.
state = ExplorerState()
state.current_gdf = samples_gdf


# =============================================================================
# Facet Query Functions
# =============================================================================

def uri_to_display_name(uri):
    """
    Turn a vocabulary URI into a short display name.

    The last path segment is taken (a trailing '/' is ignored), a space is
    inserted at each lowercase-to-uppercase boundary, and the result is
    title-cased. Empty or non-string input is returned as ``str(uri)``.

    Examples:
        https://w3id.org/isample/vocabulary/material/1.0/rock -> Rock
        https://w3id.org/isample/opencontext/material/0.1/ceramicclay -> Ceramicclay
        .../material/ceramicClay -> Ceramic Clay
    """
    import re

    if not uri or not isinstance(uri, str):
        return str(uri)

    # Everything after the final '/' (the whole string when there is none).
    _, _, segment = uri.rstrip('/').rpartition('/')

    # camelCase -> "camel Case"; title() then capitalises every word.
    spaced = re.sub(r'([a-z])([A-Z])', r'\1 \2', segment)
    return spaced.title()


def get_source_counts(additional_filters=None):
    """
    Count geolocated sample records per source.

    Only the ``year_range`` entry of ``additional_filters`` is applied;
    material filters are deliberately ignored so every source stays listed.

    Args:
        additional_filters: Dict with material_filters, year_range keys

    Returns:
        Dict of {source_name: count}, built from rows ordered by count descending
    """
    # result_time is cast to TIMESTAMP before the year is extracted.
    year_expr = "EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP))"
    where_clause = "WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL"

    yr = additional_filters.get('year_range') if additional_filters else None
    if yr:
        lower, upper = yr[0], yr[1]
        if lower is not None and upper is not None:
            where_clause += f" AND {year_expr} BETWEEN {lower} AND {upper}"
        elif lower is not None:
            where_clause += f" AND {year_expr} >= {lower}"
        elif upper is not None:
            where_clause += f" AND {year_expr} <= {upper}"

    query = f"""
        SELECT n as source, COUNT(*) as count
        FROM read_parquet('{PARQUET_PATH}')
        {where_clause}
        GROUP BY n ORDER BY count DESC
    """

    counts_df = con.sql(query).df()
    return {name: total for name, total in zip(counts_df['source'], counts_df['count'])}


def get_material_counts(additional_filters=None, limit=50):
    """
    Get counts of samples by material category (requires join for labels).

    Each sample's ``p__has_material_category`` entries are unnested and joined
    back to the parquet on ``row_id`` to obtain the category label, which is
    treated as the material URI.

    Args:
        additional_filters: Dict with source_filters, year_range keys
        limit: Max number of materials to return

    Returns:
        Dict of {display_name: {'uri': full_uri, 'count': count}} in descending
        count order. When two URIs produce the same display name, the later
        (lower-count) one is keyed as "Name [uri]" so no material is dropped.
        Returns {} if the query fails (the error is printed).
    """
    filters = additional_filters or {}
    year_expr = "EXTRACT(YEAR FROM TRY_CAST(msr.result_time AS TIMESTAMP))"
    where_clause = "WHERE msr.otype = 'MaterialSampleRecord' AND msr.latitude IS NOT NULL"

    # Apply source filter if present (single quotes doubled for SQL safety)
    source_filters = filters.get('source_filters')
    if source_filters:
        sources_sql = ", ".join(
            "'" + str(s).replace("'", "''") + "'" for s in source_filters
        )
        where_clause += f" AND msr.n IN ({sources_sql})"

    # Apply year filter if present (result_time is cast to TIMESTAMP)
    yr = filters.get('year_range')
    if yr:
        if yr[0] is not None and yr[1] is not None:
            where_clause += f" AND {year_expr} BETWEEN {int(yr[0])} AND {int(yr[1])}"
        elif yr[0] is not None:
            where_clause += f" AND {year_expr} >= {int(yr[0])}"
        elif yr[1] is not None:
            where_clause += f" AND {year_expr} <= {int(yr[1])}"

    query = f"""
        WITH samples AS (
            SELECT msr.row_id, UNNEST(msr.p__has_material_category) as material_id
            FROM read_parquet('{PARQUET_PATH}') msr
            {where_clause}
        )
        SELECT ic.label as material_uri, COUNT(*) as count
        FROM samples s
        JOIN read_parquet('{PARQUET_PATH}') ic ON ic.row_id = s.material_id
        WHERE ic.label IS NOT NULL
        GROUP BY ic.label
        ORDER BY count DESC
        LIMIT {int(limit)}
    """

    # Best-effort: the facet is optional, so a failed query degrades to an
    # empty facet instead of breaking the notebook.
    try:
        result = con.sql(query).df()
    except Exception as e:
        print(f"Material count error: {e}")
        return {}

    materials = {}
    for uri, count in zip(result['material_uri'], result['count']):
        display_name = uri_to_display_name(uri)
        if display_name in materials:
            # Different URIs can share a last path segment; disambiguate
            # instead of overwriting the earlier entry.
            display_name = f"{display_name} [{uri}]"
        materials[display_name] = {'uri': uri, 'count': count}
    return materials


def get_year_range_stats():
    """
    Get min/max years and decade counts for time facet.

    Only geolocated sample records whose ``result_time`` can be cast to a
    TIMESTAMP are considered.

    Returns:
        Dict with 'min_year', 'max_year', 'decades' (dict of decade: count).
        min_year/max_year fall back to 1900/2025 when no row qualifies.
        Decade keys are the first year of the decade (e.g. 1990).
    """
    # Cast result_time to TIMESTAMP before extracting year
    year_expr = "EXTRACT(YEAR FROM TRY_CAST(result_time AS TIMESTAMP))"

    # Shared FROM/WHERE so both queries look at exactly the same rows.
    from_where = f"""
        FROM read_parquet('{PARQUET_PATH}')
        WHERE otype = 'MaterialSampleRecord'
          AND latitude IS NOT NULL
          AND result_time IS NOT NULL
          AND TRY_CAST(result_time AS TIMESTAMP) IS NOT NULL
    """

    range_query = f"""
        SELECT
            MIN({year_expr})::INT as min_year,
            MAX({year_expr})::INT as max_year
        {from_where}
    """
    stats = con.sql(range_query).df().iloc[0]

    # Get decade counts. The division is floored explicitly: in DuckDB `/` is
    # floating-point division, so `(year / 10) * 10` would just give the year
    # back and every "decade" bucket would be a single year.
    decade_query = f"""
        SELECT
            CAST(FLOOR({year_expr} / 10.0) AS INTEGER) * 10 as decade,
            COUNT(*) as count
        {from_where}
        GROUP BY decade
        ORDER BY decade
    """
    decades_df = con.sql(decade_query).df()
    decades = dict(zip(decades_df['decade'].astype(int), decades_df['count']))

    return {
        'min_year': int(stats['min_year']) if pd.notna(stats['min_year']) else 1900,
        'max_year': int(stats['max_year']) if pd.notna(stats['max_year']) else 2025,
        'decades': decades
    }


# =============================================================================
# Facet Widgets
# =============================================================================

# Get initial counts for facets
# These three queries run once, when the cell executes; their results seed the
# facet widgets built below.
print("Loading facet counts...")
initial_source_counts = get_source_counts()
initial_material_counts = get_material_counts(limit=20)  # Reduced to 20 for cleaner display
year_stats = get_year_range_stats()

print(f"Sources: {list(initial_source_counts.keys())}")
print(f"Materials: {len(initial_material_counts)} types")
print(f"Years: {year_stats['min_year']} - {year_stats['max_year']}")


# Build material options for SelectMultiple widget
# Each option is shown as "Display Name (count)"; material_uri_map translates
# that label back to the material's URI.
material_options = []
material_uri_map = {}  # Map display string -> URI
for display_name, data in initial_material_counts.items():
    count = data['count']
    uri = data['uri']
    option_label = f"{display_name} ({count:,})"
    material_options.append(option_label)
    material_uri_map[option_label] = uri


def create_source_checkboxes(counts):
    """
    Create one unchecked checkbox per source for the source facet.

    Args:
        counts: Dict of {source_name: count}

    Returns:
        List of Checkbox widgets labelled "NAME (count)", in the dict's order;
        each carries the raw source name in its ``source_name`` attribute.
    """
    def make_checkbox(name, total):
        box = widgets.Checkbox(
            value=False,
            description=f"{name} ({total:,})",
            indent=False,
            layout=widgets.Layout(width='100%', margin='2px 0'),
        )
        # Keep the plain name so handlers need not parse the description.
        box.source_name = name
        return box

    return [make_checkbox(name, total) for name, total in counts.items()]


# Create source checkboxes
# One checkbox per source, stacked in a scrollable box.
source_checkboxes = create_source_checkboxes(initial_source_counts)
source_facet_box = widgets.VBox(
    source_checkboxes, 
    layout=widgets.Layout(max_height='180px', overflow_y='auto', padding='5px')
)

# Use SelectMultiple for materials - much cleaner than checkboxes
# Options are the "Display Name (count)" labels; material_uri_map maps them
# back to URIs.
material_select = widgets.SelectMultiple(
    options=material_options,
    value=[],
    rows=10,
    description='',
    layout=widgets.Layout(width='100%', height='220px'),
    style={'description_width': '0px'}
)

material_facet_box = widgets.VBox([
    widgets.HTML("<small style='color:#666;'>Ctrl/Cmd+click to select multiple</small>"),
    material_select
], layout=widgets.Layout(padding='5px'))

# Time facet widgets
# The slider spans the min/max years found in the data and starts with the
# full range selected; continuous_update=False is set on the slider.
year_slider = widgets.IntRangeSlider(
    value=[year_stats['min_year'], year_stats['max_year']],
    min=year_stats['min_year'],
    max=year_stats['max_year'],
    step=1,
    description='Years:',
    continuous_update=False,
    layout=widgets.Layout(width='100%'),
    style={'description_width': '50px'}
)

# Enable/disable time filter
# Starts unchecked.
time_filter_enabled = widgets.Checkbox(
    value=False,
    description='Filter by time',
    indent=False,
    layout=widgets.Layout(margin='5px 0')
)

time_facet_box = widgets.VBox([
    time_filter_enabled,
    year_slider
], layout=widgets.Layout(padding='5px'))

# Create accordion
# One collapsible section per facet, in the same order as `titles`.
facet_accordion = widgets.Accordion(
    children=[source_facet_box, material_facet_box, time_facet_box],
    titles=['Sources', 'Material Type', 'Time Period'],
    layout=widgets.Layout(width='100%')
)
facet_accordion.selected_index = None  # Start collapsed

# Clear all filters button
clear_filters_btn = widgets.Button(
    description='Clear All Filters',
    button_style='warning',
    icon='times-circle',
    layout=widgets.Layout(width='100%', margin='10px 0')
)


# =============================================================================
# Original Widgets (preserved)
# =============================================================================

# Legacy single-source dropdown; kept but hidden (display='none') because the
# source facet checkboxes replace it.
source_filter = widgets.Dropdown(
    options=['All Sources', 'SESAR', 'OPENCONTEXT', 'GEOME', 'SMITHSONIAN'],
    value='All Sources',
    description='Source:',
    style={'description_width': '60px'},
    layout=widgets.Layout(display='none')  # Hidden - replaced by facet
)

# Free-text search box.
search_input = widgets.Text(
    value='',
    placeholder='Search label, description, place...',
    description='Search:',
    style={'description_width': '60px'},
    layout=widgets.Layout(width='280px')
)

# Icon-only buttons next to the search box.
search_btn = widgets.Button(
    description='',
    button_style='',
    icon='search',
    tooltip='Search (or press Enter)',
    layout=widgets.Layout(width='40px')
)

clear_search_btn = widgets.Button(
    description='',
    button_style='',
    icon='times',
    tooltip='Clear search',
    layout=widgets.Layout(width='40px')
)

# Per-source sample cap; the default matches the initial load (12,500).
sample_count = widgets.IntSlider(
    value=12500,
    min=1000,
    max=500000,  # 500K per source - plenty for 128GB RAM
    step=5000,
    description='Per source:',
    style={'description_width': '80px'}
)

viewport_toggle = widgets.ToggleButton(
    value=False,
    description='Viewport Mode',
    tooltip='When enabled, automatically loads data for current map view',
    icon='map',
    button_style=''  # 'success' when active
)

refresh_btn = widgets.Button(
    description='Refresh Data',
    button_style='primary',
    icon='refresh'
)

# Loading indicator with spinner
# Empty while idle; show_loading()/hide_loading() set and clear its HTML.
loading_indicator = widgets.HTML(value="")

# Shows how many samples are currently loaded (or matched by a search).
status_label = widgets.HTML(value=f"<b>Loaded:</b> {len(samples_gdf):,} samples")

# Detail card for the selected sample; starts with the "click a point" hint.
card_output = widgets.HTML(value=render_sample_card(None))

# Active filters display
active_filters_html = widgets.HTML(value="")


def update_active_filters_display():
    """
    Update the display of currently active filters.

    Writes one colored chip per active facet (sources, materials, years) into
    ``active_filters_html``, or clears it when no facet filter is active.
    Source and material names are sorted so the chips do not reorder between
    refreshes (the filters are stored in sets, whose order is arbitrary).
    """
    chip = "<span style='background:{color}; color:white; padding:2px 6px; border-radius:3px; margin:2px;'>{text}</span>"
    filters = []

    if state.source_filters:
        sources = ', '.join(sorted(state.source_filters))
        filters.append(chip.format(color='#3366CC', text=f"Sources: {sources}"))

    if state.material_filters:
        # Show at most two material names, then a "+N more" summary.
        mat_names = sorted(uri_to_display_name(uri) for uri in state.material_filters)
        mat_display = mat_names[:2]
        if len(mat_names) > 2:
            mat_display.append(f"+{len(mat_names)-2} more")
        filters.append(chip.format(color='#109618', text=f"Materials: {', '.join(mat_display)}"))

    min_year, max_year = state.year_range[0], state.year_range[1]
    if min_year is not None or max_year is not None:
        # Compare against None explicitly so a bound of 0 is still displayed.
        low = 'any' if min_year is None else min_year
        high = 'any' if max_year is None else max_year
        filters.append(chip.format(color='#FF9900', text=f"Years: {low} - {high}"))

    if filters:
        active_filters_html.value = f"<div style='margin:5px 0;'><b>Active:</b> {''.join(filters)}</div>"
    else:
        active_filters_html.value = ""


def show_loading(message="Loading..."):
    """
    Mark the explorer as loading and display a spinner with ``message``.

    Args:
        message: Text shown next to the spinner
    """
    state.loading = True

    # Inline SVG spinner plus the keyframes that rotate it.
    spinner_markup = f"""
    <div style="display: inline-flex; align-items: center; color: #666;">
        <svg width="20" height="20" viewBox="0 0 50 50" style="animation: spin 1s linear infinite; margin-right: 8px;">
            <circle cx="25" cy="25" r="20" fill="none" stroke="#3366CC" stroke-width="4" stroke-dasharray="80,40"/>
        </svg>
        <style>@keyframes spin {{ from {{ transform: rotate(0deg); }} to {{ transform: rotate(360deg); }} }}</style>
        <span>{message}</span>
    </div>
    """
    loading_indicator.value = spinner_markup


def hide_loading():
    """Hide loading indicator.

    Clears the ``state.loading`` flag and empties the indicator widget's HTML.
    """
    state.loading = False
    loading_indicator.value = ""


def select_sample(idx, source='map'):
    """
    Select a sample by index and sync map/table/card.

    The sample card is always refreshed. A selection coming from the map also
    highlights the matching table row; a selection coming from the table
    recenters the map on the sample (zoom is not passed, only the center).

    Args:
        idx: Row position in ``state.current_gdf``; None, negative or
            out-of-range values are ignored
        source: 'map' or 'table' - which triggered the selection
    """
    gdf = state.current_gdf
    # Reject negative positions too: iloc would otherwise silently pick a row
    # counted from the end.
    if idx is None or gdf is None or not 0 <= idx < len(gdf):
        return

    state.selected_index = idx
    state.selected_row = gdf.iloc[idx]

    # Update sample card
    card_output.value = render_sample_card(state.selected_row)

    if source == 'map':
        # Map click -> highlight table row
        # Last column index depends on whether the score column is shown
        # (5 columns while searching, 4 otherwise).
        last_col = 4 if state.current_search else 3
        sample_table.selections = [{'r1': idx, 'c1': 0, 'r2': idx, 'c2': last_col}]

    elif source == 'table':
        # Table click -> recenter map (keep current zoom)
        lat = state.selected_row['latitude']
        lon = state.selected_row['longitude']
        if not pd.isna(lat) and not pd.isna(lon):
            sample_map.set_view_state(latitude=float(lat), longitude=float(lon))


def on_map_point_click(change):
    """
    React to a change of the layer's selected index (a click on a map point).

    Does nothing while a selection sync is already in progress or when the
    new value is None. Otherwise selects the sample with ``source='map'``,
    holding ``state.syncing_selection`` for the duration of the call.

    Args:
        change: change notification; its 'new' entry is the selected index
    """
    if state.syncing_selection:
        return

    clicked = change.get('new')
    if clicked is not None:
        state.syncing_selection = True
        try:
            select_sample(clicked, source='map')
        finally:
            # Always release the guard, even if the selection raised.
            state.syncing_selection = False


def setup_layer_observer(layer):
    """Register ``on_map_point_click`` for changes to ``layer.selected_index``."""
    layer.observe(handler=on_map_point_click, names=['selected_index'])


def update_map_and_table(new_gdf, search_active=False):
    """Swap in a new dataset and refresh the map layer, table and status text.

    Args:
        new_gdf: Data to show; stored as state.current_gdf
        search_active: True when the data is the result of a text search
    """
    state.current_gdf = new_gdf
    if search_active:
        state.current_search = search_input.value.strip()
    else:
        state.current_search = ""

    # Build the layer and attach its click observer BEFORE handing it to the map
    fresh_layer = create_map_layer(new_gdf)
    setup_layer_observer(fresh_layer)
    sample_map.layers = [fresh_layer]

    # Table columns: a leading score column only for scored search results
    base_cols = ['source', 'label', 'latitude', 'longitude']
    if search_active and 'search_score' in new_gdf.columns:
        table_df = new_gdf[['search_score'] + base_cols].copy()
        table_df = table_df.rename(columns={'search_score': 'score'})
    else:
        table_df = new_gdf[base_cols].copy()

    # Round coordinates for display
    for coord in ('latitude', 'longitude'):
        table_df[coord] = table_df[coord].round(4)
    sample_table.data = table_df

    # Status line
    if search_active:
        status_label.value = f"<b>Found:</b> {len(new_gdf):,} matches for '{state.current_search}'"
    else:
        status_label.value = f"<b>Loaded:</b> {len(new_gdf):,} samples"

    # Refresh the active-filter tags
    update_active_filters_display()


def do_search():
    """Reload samples using the current search text, facet filters and mode.

    In viewport mode the query is restricted to the bounding box of the
    current map view; otherwise it runs globally. The map, table and status
    label are updated with the result. Any exception is reported in the
    status label (first 50 characters) and its traceback is printed; the
    loading indicator is always hidden afterwards.
    """
    # Local import so this cell does not depend on the notebook's import cell
    from html import escape

    show_loading("Searching...")

    try:
        search_term = search_input.value.strip()

        # Empty facet selections mean "no filter", passed on as None
        has_year_bound = state.year_range[0] is not None or state.year_range[1] is not None
        query_args = dict(
            search_term=search_term or None,
            source_filters=state.source_filters or None,
            material_filters=state.material_filters or None,
            year_range=state.year_range if has_year_bound else None,
        )

        if state.viewport_mode:
            # Search within current viewport
            view_state = sample_map.view_state
            zoom = view_state.zoom if hasattr(view_state, 'zoom') else 1
            bbox = view_state_to_bbox(view_state)

            # When searching, use slider value directly (no adaptive reduction)
            # When browsing, use adaptive sampling based on zoom
            if search_term:
                max_samples = sample_count.value
            else:
                max_samples = adaptive_sample_size(zoom, base_size=sample_count.value)

            new_gdf = load_samples(max_per_source=max_samples, bbox=bbox, **query_args)
            zoom_info = f" (zoom {zoom:.1f})"
        else:
            # Search globally
            new_gdf = load_samples(max_per_source=sample_count.value, **query_args)
            zoom_info = ""

        update_map_and_table(new_gdf, search_active=bool(search_term))

        # Overwrite the generic status with one that includes the zoom level.
        # The search term is user-typed text going into an HTML widget, so it
        # is escaped to keep characters like '<' or '&' from breaking the markup.
        if search_term:
            status_label.value = f"<b>Found:</b> {len(new_gdf):,} matches for '{escape(search_term)}'{zoom_info}"
        else:
            status_label.value = f"<b>Loaded:</b> {len(new_gdf):,} samples{zoom_info}"

    except Exception as e:
        # Truncate first, then escape, so an entity is never cut in half
        status_label.value = f"<b>Error:</b> {escape(str(e)[:50])}"
        import traceback
        traceback.print_exc()
    finally:
        hide_loading()


def on_search_click(b):
    """Search button callback: run a search (the button argument is unused)."""
    do_search()


def on_search_submit(change):
    """Search box submit callback: run a search (the argument is unused)."""
    do_search()


def on_clear_search(b):
    """Clear button callback: empty the search box, then reload the data."""
    search_input.value = ''
    # With the box empty, do_search loads data without a search term
    do_search()


# Wire the search controls to their callbacks
search_btn.on_click(on_search_click)
# NOTE(review): if search_input is an ipywidgets Text, on_submit is deprecated
# as of ipywidgets 8 - confirm the installed version before upgrading
search_input.on_submit(on_search_submit)
clear_search_btn.on_click(on_clear_search)


def load_viewport_data():
    """Load data for current viewport with adaptive sampling.

    Does nothing when a load is already in progress (state.loading).
    Otherwise queries the samples inside the bounding box of the current map
    view, applying the active facet filters and search text, and refreshes
    the map, table and status label. Any exception is reported in the status
    label (first 50 characters) and its traceback is printed; the loading
    indicator is always hidden afterwards.
    """
    if state.loading:
        return

    # Local import so this cell does not depend on the notebook's import cell
    from html import escape

    show_loading("Loading viewport data...")

    try:
        # Get current view state
        view_state = sample_map.view_state
        zoom = view_state.zoom if hasattr(view_state, 'zoom') else 1

        # Calculate bounding box
        bbox = view_state_to_bbox(view_state)

        # Get facet filters and search term; empty selections become None
        source_filters_set = state.source_filters or None
        material_filters_set = state.material_filters or None
        has_year_bound = state.year_range[0] is not None or state.year_range[1] is not None
        year_range = state.year_range if has_year_bound else None
        search_term = search_input.value.strip() or None

        # When searching, use slider value directly (no adaptive reduction)
        # When browsing, use adaptive sampling based on zoom
        if search_term:
            max_samples = sample_count.value
        else:
            max_samples = adaptive_sample_size(zoom, base_size=sample_count.value)

        # Load data
        new_gdf = load_samples(
            max_per_source=max_samples,
            bbox=bbox,
            search_term=search_term,
            source_filters=source_filters_set,
            material_filters=material_filters_set,
            year_range=year_range
        )

        update_map_and_table(new_gdf, search_active=bool(search_term))

        # Show zoom info in status. The search term is user-typed text going
        # into an HTML widget, so it is escaped to keep '<' or '&' from
        # breaking the markup.
        if search_term:
            status_label.value = f"<b>Found:</b> {len(new_gdf):,} matches for '{escape(search_term)}' (zoom {zoom:.1f})"
        else:
            status_label.value = f"<b>Loaded:</b> {len(new_gdf):,} samples (zoom {zoom:.1f}, {max_samples:,}/source max)"

    except Exception as e:
        # Truncate first, then escape, so an entity is never cut in half
        status_label.value = f"<b>Error:</b> {escape(str(e)[:50])}"
        # Keep the full traceback available, matching do_search
        import traceback
        traceback.print_exc()
    finally:
        hide_loading()


def debounced_viewport_load():
    """Schedule load_viewport_data to run 0.5 s from now, replacing any pending run."""
    # Drop the previously scheduled load, if there is one
    pending = state.debounce_timer
    if pending is not None:
        pending.cancel()

    # Arm a fresh 500ms timer and remember it so the next call can cancel it
    timer = threading.Timer(0.5, load_viewport_data)
    state.debounce_timer = timer
    timer.start()


def on_view_state_change(change):
    """Observer for map view_state: schedule a debounced viewport reload."""
    # Only react in viewport mode, and not while a load is in progress
    if not state.viewport_mode or state.loading:
        return
    debounced_viewport_load()


def on_viewport_toggle(change):
    """Observer for the viewport toggle: store the mode and restyle the button."""
    enabled = change['new']
    state.viewport_mode = enabled

    if not enabled:
        # Back to the neutral button appearance
        viewport_toggle.button_style = ''
        viewport_toggle.description = 'Viewport Mode'
        return

    viewport_toggle.button_style = 'success'
    viewport_toggle.description = 'Viewport Mode ON'
    # Load data for the current view right away instead of waiting for a pan/zoom
    load_viewport_data()


# Run on_viewport_toggle whenever the toggle's value changes
viewport_toggle.observe(on_viewport_toggle, names=['value'])


# Event handlers
def on_refresh_click(b):
    """Refresh button callback (the button argument is unused)."""
    # Refresh now uses same logic as search
    do_search()

# Run on_refresh_click when the refresh button is clicked
refresh_btn.on_click(on_refresh_click)


def on_table_selection(change):
    """Observer for table selections: select the first selected row's sample."""
    # Ignore events raised while a selection sync is already in progress
    if state.syncing_selection:
        return

    # 'new' holds a LIST of selection dicts; only the first one is used
    picked = change.get('new', [])
    if not picked:
        return

    row_idx = picked[0].get('r1')
    if row_idx is None or row_idx >= len(state.current_gdf):
        return

    state.syncing_selection = True
    try:
        select_sample(row_idx, source='table')
    finally:
        # Always release the guard, even if the selection fails
        state.syncing_selection = False

# Run on_table_selection whenever the table's selections change
sample_table.observe(on_table_selection, names=['selections'])

# Register view_state observer on the map (drives viewport-mode reloads)
sample_map.observe(on_view_state_change, names=['view_state'])

# Setup click observer on the initial layer; update_map_and_table attaches
# the observer to each replacement layer it creates
setup_layer_observer(sample_map.layers[0])


# =============================================================================
# Facet Event Handlers
# =============================================================================

def on_source_checkbox_change(change):
    """Observer for the source checkboxes: rebuild the source filter and reload."""
    # Collect the source name of every checkbox that is currently ticked
    state.source_filters = {
        box.source_name for box in source_checkboxes if box.value
    }

    # Trigger data reload
    do_search()


def on_material_select_change(change):
    """Observer for the material selector: rebuild the material filter and reload."""
    # Map each selected label to its URI; labels missing from the map are skipped
    state.material_filters = {
        material_uri_map[label]
        for label in material_select.value
        if label in material_uri_map
    }

    # Trigger data reload
    do_search()


def on_time_filter_change(change):
    """Observer for the time-filter checkbox: set or clear the year range and reload."""
    if not time_filter_enabled.value:
        # Disabled: no bounds on either end
        state.year_range = (None, None)
    else:
        bounds = year_slider.value
        state.year_range = (bounds[0], bounds[1])

    do_search()


def on_year_slider_change(change):
    """Observer for the year slider: apply the new range if the time filter is on."""
    # Slider moves are ignored while the time filter is disabled
    if not time_filter_enabled.value:
        return

    bounds = year_slider.value
    state.year_range = (bounds[0], bounds[1])
    do_search()


def on_clear_all_filters(b):
    """Clear all facet filters and the search text, then reload once.

    Resetting a facet widget fires its own change observer, and each of
    those observers calls do_search(). To avoid running one full query per
    reset widget, the facet observers are detached while the widgets are
    reset and re-attached afterwards, so only the single explicit
    do_search() at the end runs.

    Args:
        b: The clicked button (unused)
    """
    # (widget, handler) pairs whose 'value' observers would trigger a reload
    facet_bindings = [(cb, on_source_checkbox_change) for cb in source_checkboxes]
    facet_bindings += [
        (material_select, on_material_select_change),
        (time_filter_enabled, on_time_filter_change),
        (year_slider, on_year_slider_change),
    ]

    detached = []
    for widget, handler in facet_bindings:
        try:
            widget.unobserve(handler, names=['value'])
        except ValueError:
            # Handler was not attached to this widget; leave it that way
            continue
        detached.append((widget, handler))

    try:
        # Clear source checkboxes
        for cb in source_checkboxes:
            cb.value = False
        state.source_filters = set()

        # Clear material selection
        material_select.value = []
        state.material_filters = set()

        # Clear time filter
        time_filter_enabled.value = False
        year_slider.value = [year_stats['min_year'], year_stats['max_year']]
        state.year_range = (None, None)

        # Clear search
        search_input.value = ''
    finally:
        # Restore the observers even if a reset above failed
        for widget, handler in detached:
            widget.observe(handler, names=['value'])

    # Reload data (exactly once)
    do_search()


# Wire up facet event handlers
# Every source checkbox shares one handler, which rebuilds the whole filter set
for cb in source_checkboxes:
    cb.observe(on_source_checkbox_change, names=['value'])

material_select.observe(on_material_select_change, names=['value'])

# The checkbox enables/disables the time filter; the slider sets its range
time_filter_enabled.observe(on_time_filter_change, names=['value'])
year_slider.observe(on_year_slider_change, names=['value'])
clear_filters_btn.on_click(on_clear_all_filters)

print("Facet widgets ready!")

## Explorer Interface

Run this cell to launch the interactive explorer.

In [None]:
# Layout the interface
# Assembles the widgets created in earlier cells into rows and panels,
# then displays the whole explorer.

# Search box with buttons
search_box = widgets.HBox([
    search_input,
    search_btn,
    clear_search_btn
], layout=widgets.Layout(margin='0 15px 0 0'))

# Row 1: Search and viewport mode
controls_row1 = widgets.HBox([
    search_box,
    viewport_toggle,
], layout=widgets.Layout(margin='5px 0'))

# Row 2: Sample count, refresh, loading indicator, status
controls_row2 = widgets.HBox([
    sample_count,
    refresh_btn,
    loading_indicator,
    status_label
], layout=widgets.Layout(margin='5px 0', flex_wrap='wrap'))

# Row 3: Active filters display
controls_row3 = widgets.HBox([
    active_filters_html
], layout=widgets.Layout(margin='0'))

controls = widgets.VBox([controls_row1, controls_row2, controls_row3])

# Legend
# Swatch colors are hard-coded hex equivalents of the RGB values in
# SOURCE_COLORS; keep the two in sync when changing either.
legend_html = """
<div style="display: flex; gap: 15px; padding: 8px; background: #f9f9f9; border-radius: 4px; font-size: 12px;">
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #3366CC; border-radius: 50%; margin-right: 4px;"></span>SESAR</span>
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #DC3912; border-radius: 50%; margin-right: 4px;"></span>OpenContext</span>
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #109618; border-radius: 50%; margin-right: 4px;"></span>GEOME</span>
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #FF9900; border-radius: 50%; margin-right: 4px;"></span>Smithsonian</span>
</div>
"""
legend = widgets.HTML(value=legend_html)

# Facet panel header
facet_header = widgets.HTML(value="<h4 style='margin: 0 0 8px 0;'>Filters</h4>")

# Main layout with three columns: map | facets | details
# The map panel flexes at twice the weight of the details panel; the facet
# panel has a fixed width.
left_panel = widgets.VBox([
    widgets.HTML("<h4 style='margin: 0 0 8px 0;'>Map</h4>"),
    legend,
    sample_map
], layout=widgets.Layout(flex='2', margin='0 10px 0 0'))

center_panel = widgets.VBox([
    facet_header,
    facet_accordion,
    clear_filters_btn
], layout=widgets.Layout(width='320px', min_width='280px', margin='0 10px 0 0'))

right_panel = widgets.VBox([
    widgets.HTML("<h4 style='margin: 0 0 8px 0;'>Selected Sample</h4>"),
    card_output,
    widgets.HTML("<h4 style='margin: 16px 0 8px 0;'>Sample List</h4>"),
    sample_table
], layout=widgets.Layout(flex='1', min_width='350px'))

main_layout = widgets.HBox([left_panel, center_panel, right_panel])

# Display: title, subtitle, control rows, then the three-column layout
display(widgets.VBox([
    widgets.HTML("<h2 style='margin-bottom: 5px;'>iSamples Explorer</h2>"),
    widgets.HTML("<p style='color: #666; margin-top: 0;'>Interactive exploration of physical samples across scientific domains</p>"),
    controls,
    main_layout
]))

## Usage

### Faceted Filters (Center Panel)

The **Filters** panel provides multi-select faceted filtering:

**Sources** - Filter by data source (multi-select):
- Check one or more sources to show only samples from those sources
- Counts show total samples per source
- Unchecking all shows all sources

**Material Type** - Filter by material category:
- Shows top 30 material types by frequency
- Select multiple materials to show samples of any selected type
- Material labels come from the iSamples vocabulary

**Time Period** - Filter by collection/sampling date:
- Check "Filter by time" to enable
- Use the year range slider to narrow to a specific period
- Useful for finding samples from particular decades

**Clear All Filters** - Reset all facet selections and search

### Search

Search filters samples by matching text in **label**, **description**, and **place name** fields:

- **Enter a term**: Type "pottery", "basalt", "Cyprus", etc. and press Enter
- **Results are ranked**: Label matches (10 pts) > Description (5 pts) > Place name (3 pts)
- **Score column**: When searching, a "score" column appears in the table showing match quality
- **Combines with facets**: Search works together with facet filters (AND logic)
- **Viewport aware**: With Viewport Mode ON, search is limited to the current map view

### Selection Sync (Bidirectional)

Map and table selections are synchronized:

- **Click a dot on the map** → The corresponding row is highlighted in the table, and the sample card updates
- **Click a row in the table** → The map recenters on that point (zoom level is preserved), and the sample card updates

This makes it easy to explore samples visually on the map and then find them in the table, or vice versa.

### Viewport Mode (Dynamic Loading)

Enable **Viewport Mode** to automatically reload data as you pan and zoom:

- **Toggle ON**: Click the "Viewport Mode" button (turns green when active)
- **Pan/zoom**: Data reloads automatically after you stop moving (500ms debounce)
- **Loading indicator**: Spinner shows while data is being fetched
- **Adaptive sampling**: 
  - World view (zoom < 2): max 10K samples per source
  - Continent (zoom 2-5): max 25K per source
  - Country (zoom 5-8): max 50K per source
  - Region (zoom 8-12): max 100K per source
  - Local (zoom > 12): uses your slider value

### Active Filters Display

When filters are active, colored tags appear below the controls showing:
- **Blue tag**: Active source filters
- **Green tag**: Active material filters  
- **Orange tag**: Active time range

### Filter Combinations

All filters work together with AND logic:
- Source + Material: Show pottery samples from OpenContext only
- Material + Time: Show rock samples collected in the 2010s
- Source + Time + Search: Find "Cyprus" in SESAR samples from 2000-2020

### Color Legend
- **Blue**: SESAR (geological samples, IGSNs)
- **Red**: OpenContext (archaeological samples)
- **Green**: GEOME (genomic/biological samples)
- **Orange**: Smithsonian (museum collections)

## Debug: Raw Data Access

Use these cells to explore the underlying data.

In [None]:
# Current selection: print the row stored by the last map/table selection
if state.selected_row is None:
    print("No sample selected")
else:
    print("Selected sample:")
    print(state.selected_row)

In [None]:
# Query the full dataset: number of MaterialSampleRecord rows with a latitude,
# grouped by column n (aliased as source), largest first.
# PARQUET_PATH is interpolated directly into the SQL text.
# Left as a bare trailing expression so the notebook displays the DataFrame.
con.sql(f"""
    SELECT n as source, COUNT(*) as total_samples
    FROM read_parquet('{PARQUET_PATH}')
    WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL
    GROUP BY n
    ORDER BY total_samples DESC
""").df()

## Material Type Hierarchy Analysis

The iSamples material vocabulary is a SKOS hierarchy with **3 levels** below the root "Material" concept. However, samples are tagged inconsistently at different levels, and **rollup does not happen automatically**.

### Key Findings

1. **No automatic rollup** - "Earth Material" (2.2M samples) does NOT include Rock (1M), Sediment (66K), etc. They're tagged separately at different hierarchy levels.

2. **Inconsistent tagging depth** - Some samples tagged at root "Material" (664K), some at mid-level "Earth Material" (2.2M), some at leaf "Rock" (1M).

3. **Intermediate nodes often empty** - "Natural Solid Material", "Fluid Material", "Dispersed Media" have 0 direct tags.

### Implications for Faceted Search

A **hierarchical facet with rollup** would be valuable - selecting "Earth Material" should include all its children (Rock, Sediment, Soil, Mineral, Mixed). Currently the flat facet misses this relationship.

### Pre-computed Hierarchy (January 2026)

```
- Material: 664,199
  - Natural Solid Material: 0 (not used directly)
    - Earth Material: 2,251,086
      - Rock: 1,052,183
      - Sediment: 66,648
      - Soil: 32,157
      - Mineral: 300,179
      - Mixed Soil/Sediment/Rock: 838,726
    - Biogenic Non-organic: 1,090,222
  - Organic Material: 862,220
    - Plant Material: 1
    - Animal Product: 266
  - Anthropogenic Material: 44,399
    - Anthropogenic Metal: 269,981
    - Ceramic Clay: 100,501
  - Fluid Material: 0 (not used directly)
    - Liquid Water: 24,080
    - Gas: 1,154
    - Non-aqueous Liquid: 44
  - Dispersed Media: 0 (not used directly)
    - Particulate: 122
  - Any Ice: 4
```

Total samples with coordinates: ~6M

In [None]:
# Compute Material Type Hierarchy with Sample Counts
# 
# This analysis shows how samples are tagged at different levels of the 
# iSamples material vocabulary hierarchy.

def compute_material_hierarchy():
    """
    Compute material type counts and display as a hierarchy tree.

    The hierarchy used here is the root "Material" concept plus three levels
    beneath it. The function queries the parquet file for the number of
    samples tagged directly with each material, prints the tree with direct
    counts (and what a rollup over descendants would give), and then prints
    summary statistics.

    Returns:
        (counts, hierarchy): counts maps the lower-cased last URI segment of
        each material to its direct tag count; hierarchy is the nested list
        of (display name, key, level, children) tuples that was printed.
    """
    # Get all material counts from the data
    query = """
    WITH samples AS (
        SELECT UNNEST(p__has_material_category) as material_id
        FROM read_parquet(?)
        WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL
    )
    SELECT ic.label as uri, COUNT(*) as cnt
    FROM samples s
    JOIN read_parquet(?) ic ON ic.row_id = s.material_id
    WHERE ic.label IS NOT NULL
    GROUP BY ic.label
    ORDER BY cnt DESC
    """
    df = con.execute(query, [PARQUET_PATH, PARQUET_PATH]).df()

    # Build counts dict from URI last segment. Different labels can reduce to
    # the same key (trailing slash, letter case), so counts are accumulated
    # rather than overwritten.
    counts = {}
    for uri, cnt in zip(df['uri'], df['cnt']):
        key = uri.rstrip('/').split('/')[-1].lower()
        counts[key] = counts.get(key, 0) + int(cnt)

    # Define the hierarchy structure (based on iSamples vocabulary)
    # https://isamplesorg.github.io/metadata/vocabularies/material.html
    # Each node is (display name, counts key, level, children)
    hierarchy = [
        ("Material", "material", 0, [
            ("Natural Solid Material", "naturalsolidmaterial", 1, [
                ("Earth Material", "earthmaterial", 2, [
                    ("Rock", "rock", 3, []),
                    ("Sediment", "sediment", 3, []),
                    ("Soil", "soil", 3, []),
                    ("Mineral", "mineral", 3, []),
                    ("Mixed Soil/Sediment/Rock", "mixedsoilsedimentrock", 3, []),
                ]),
                ("Biogenic Non-organic", "biogenicnonorganicmaterial", 2, []),
            ]),
            ("Organic Material", "organicmaterial", 1, [
                ("Plant Material", "plantmaterial", 2, []),
                ("Animal Product", "organicanimalproduct", 2, []),
            ]),
            ("Anthropogenic Material", "anyanthropogenicmaterial", 1, [
                ("Anthropogenic Metal", "anthropogenicmetal", 2, []),
                ("Ceramic Clay", "ceramicclay", 2, []),
            ]),
            ("Fluid Material", "fluidmaterial", 1, [
                ("Liquid Water", "liquidwater", 2, []),
                ("Gas", "gas", 2, []),
                ("Non-aqueous Liquid", "nonaqueousliquid", 2, []),
            ]),
            ("Dispersed Media", "dispersedmedia", 1, [
                ("Particulate", "particulate", 2, []),
            ]),
            ("Any Ice", "anyice", 1, []),
        ])
    ]

    def descendant_total(nodes):
        """Sum the direct counts of the given nodes and all their descendants."""
        return sum(
            counts.get(key, 0) + descendant_total(children)
            for _name, key, _level, children in nodes
        )

    def print_node(nodes, indent=0):
        """Recursively print hierarchy with counts."""
        for name, key, _level, children in nodes:
            cnt = counts.get(key, 0)
            prefix = "  " * indent
            marker = "- " if indent == 0 else "└─ "

            # Calculate rollup (what count WOULD be with proper rollup)
            rollup = cnt + descendant_total(children)

            if children and rollup != cnt:
                print(f"{prefix}{marker}**{name}**: {cnt:,} (rollup would be {rollup:,})")
            else:
                print(f"{prefix}{marker}**{name}**: {cnt:,}")

            if children:
                print_node(children, indent + 1)

    print("Material Type Hierarchy with Sample Counts")
    print("=" * 50)
    print()
    print_node(hierarchy)
    print()

    # Summary statistics
    # The path is bound as a parameter, like the first query, instead of being
    # pasted into the SQL text
    total_tags = sum(counts.values())
    total_samples = con.execute("""
        SELECT COUNT(*) FROM read_parquet(?)
        WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL
    """, [PARQUET_PATH]).fetchone()[0]

    # Materials per sample distribution
    dist_query = """
        SELECT 
            LEN(p__has_material_category) as num_materials,
            COUNT(*) as num_samples
        FROM read_parquet(?)
        WHERE otype = 'MaterialSampleRecord' 
          AND latitude IS NOT NULL
          AND p__has_material_category IS NOT NULL
        GROUP BY LEN(p__has_material_category)
        ORDER BY num_materials
    """
    dist_df = con.execute(dist_query, [PARQUET_PATH]).df()

    print("Summary Statistics")
    print("-" * 30)
    print(f"Total samples with coordinates: {total_samples:,}")
    print(f"Total material tags: {total_tags:,}")
    print(f"Unique material types: {len(counts)}")
    print()
    print("Materials per sample:")
    for num_materials, num_samples in zip(dist_df['num_materials'], dist_df['num_samples']):
        print(f"  {int(num_materials)} material(s): {int(num_samples):,} samples")

    return counts, hierarchy

# Run the analysis (prints the tree and summary) and keep the returned
# counts dict and hierarchy structure for further inspection
material_counts, material_hierarchy = compute_material_hierarchy()