# iSamples Interactive Explorer

An interactive interface for exploring iSamples data across all sources.

**Features:**
- Map view with 6M+ samples (lonboard WebGL)
- Interactive table with filtering (ipydatagrid)
- Sample cards on selection
- Source filtering

**Data:** Zenodo wide parquet (~282 MB, 20M rows)

In [None]:
# Imports
import os
import duckdb
import pandas as pd
import geopandas as gpd
import numpy as np
from shapely.geometry import Point

# Visualization
from lonboard import Map, ScatterplotLayer
from lonboard.colormap import apply_continuous_cmap
from ipydatagrid import DataGrid
import ipywidgets as widgets
from IPython.display import display, HTML

In [None]:
# Two candidate locations for the wide parquet: the hosted file and a local copy.
REMOTE_WIDE = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet"
LOCAL_WIDE = os.path.expanduser(
    "~/Data/iSample/pqg_refining/zenodo_wide_2026-01-09.parquet"
)

# Read from the local copy when it exists on disk; otherwise fall back to the URL.
if os.path.exists(LOCAL_WIDE):
    PARQUET_PATH = LOCAL_WIDE
else:
    PARQUET_PATH = REMOTE_WIDE
print("Using: " + PARQUET_PATH)

# DuckDB connection (no database path given) shared by the query cells below.
con = duckdb.connect()

In [None]:
# RGBA fill color per source (kept consistent across iSamples); alpha is 200
# for every entry.
SOURCE_COLORS = {
    source: [*rgb, 200]
    for source, rgb in (
        ('SESAR', (51, 102, 204)),       # Blue
        ('OPENCONTEXT', (220, 57, 18)),  # Red
        ('GEOME', (16, 150, 24)),        # Green
        ('SMITHSONIAN', (255, 153, 0)),  # Orange
    )
}

# Gray, used for any source that has no entry above.
DEFAULT_COLOR = [128, 128, 128, 200]

## Load Sample Data

We start with a random sample of up to 12,500 records per source (about 50K in total across the four sources) to keep the interface responsive.

In [None]:
def load_samples(max_per_source=12500, source_filter=None):
    """
    Load a random, per-source-capped set of located samples from the wide parquet.

    Only rows with otype 'MaterialSampleRecord' and both coordinates present
    are considered. Rows are ranked in random order within each source, so
    every call returns a different subset.

    Args:
        max_per_source: Maximum samples per source (for balanced representation)
        source_filter: Optional source name to filter (e.g., 'OPENCONTEXT')

    Returns:
        GeoDataFrame (EPSG:4326 point geometry) with the columns row_id, pid,
        label, description, latitude, longitude, source, place_name and
        result_time.
    """
    conditions = [
        "otype = 'MaterialSampleRecord'",
        "latitude IS NOT NULL",
        # A row with only one coordinate cannot be placed on the map.
        "longitude IS NOT NULL",
    ]
    params = []
    if source_filter:
        # Bound as a query parameter so quotes in the value cannot alter the SQL.
        conditions.append("n = ?")
        params.append(source_filter)
    where_clause = "WHERE " + " AND ".join(conditions)

    # Positional parameters are bound in textual order: the source filter
    # (inside the CTE) comes before the per-source cap (outer WHERE).
    params.append(int(max_per_source))

    # PARQUET_PATH is a module-level constant, not user input, so it is
    # interpolated directly.
    query = f"""
        WITH ranked AS (
            SELECT 
                row_id, pid, label, description, latitude, longitude, n as source,
                place_name, result_time,
                ROW_NUMBER() OVER (PARTITION BY n ORDER BY RANDOM()) as rn
            FROM read_parquet('{PARQUET_PATH}')
            {where_clause}
        )
        SELECT row_id, pid, label, description, latitude, longitude, source, place_name, result_time
        FROM ranked
        WHERE rn <= ?
    """

    df = con.execute(query, params).df()

    # Vectorized point construction; avoids one Python-level Point() per row.
    geometry = gpd.points_from_xy(df['longitude'], df['latitude'])
    return gpd.GeoDataFrame(df, geometry=geometry, crs="EPSG:4326")

# Initial load: 12,500 samples per source, followed by a per-source tally
print("Loading samples...")
samples_gdf = load_samples(12500)
print("Loaded", f"{len(samples_gdf):,}", "samples")
print("\nBy source:", samples_gdf['source'].value_counts(), sep="\n")

## Sample Card Renderer

In [None]:
def _first_present(value, default):
    """Return *default* for a None/NaN scalar; pass any other value through."""
    if pd.api.types.is_list_like(value):
        # pd.isna() is element-wise on list-likes, so its result cannot be
        # used as an `if` condition; list-likes are returned untouched.
        return value
    return default if pd.isna(value) else value


def _place_text(place):
    """Flatten a place_name value (scalar or list-like) into display text."""
    if pd.api.types.is_list_like(place):
        names = (str(_first_present(part, '')) for part in place)
        return ' > '.join(name for name in names if name)
    return str(_first_present(place, ''))


def render_sample_card(row):
    """
    Render a sample as an HTML card.

    Missing (None/NaN) fields fall back to placeholders, long text is
    truncated, and every data value is HTML-escaped before being inserted
    into the markup.

    Args:
        row: DataFrame row, Series or dict with sample data (anything with
            a ``.get(key, default)`` method), or None for the placeholder card

    Returns:
        HTML string
    """
    # Local import keeps the cell self-contained.
    from html import escape

    if row is None:
        return "<div style='padding: 10px; color: #666;'>Click a point to see details</div>"

    source = str(_first_present(row.get('source', 'Unknown'), 'Unknown'))
    source_color = {
        'SESAR': '#3366CC',
        'OPENCONTEXT': '#DC3912',
        'GEOME': '#109618',
        'SMITHSONIAN': '#FF9900'
    }.get(source, '#808080')

    label = str(_first_present(row.get('label', 'No label'), 'No label'))

    # Truncate before escaping so an HTML entity is never cut in half.
    description = str(_first_present(row.get('description', ''), ''))
    if len(description) > 200:
        description = description[:200] + '...'
    description_html = escape(description) if description else '<em>No description</em>'

    lat = _first_present(row.get('latitude', 0), 0)
    lon = _first_present(row.get('longitude', 0), 0)

    pid = str(_first_present(row.get('pid', ''), ''))
    pid_text = pid[:50] + ('...' if len(pid) > 50 else '')

    # Build place HTML only if place has content
    place = _place_text(row.get('place_name', ''))
    place_html = ''
    if place:
        place_html = f'<div style="margin-bottom: 4px;"><strong>Place:</strong> {escape(place[:100])}</div>'

    card = f"""
    <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
                border: 1px solid #ddd; border-radius: 8px; padding: 16px; 
                max-width: 400px; background: white; box-shadow: 0 2px 4px rgba(0,0,0,0.1);">
        <div style="display: flex; align-items: center; margin-bottom: 12px;">
            <span style="background: {source_color}; color: white; padding: 4px 8px; 
                        border-radius: 4px; font-size: 12px; font-weight: 600;">{escape(source)}</span>
        </div>
        <h3 style="margin: 0 0 8px 0; font-size: 16px; color: #333;">{escape(label)}</h3>
        <p style="margin: 0 0 12px 0; font-size: 13px; color: #666; line-height: 1.4;">
            {description_html}
        </p>
        <div style="font-size: 12px; color: #888;">
            <div style="margin-bottom: 4px;"><strong>Location:</strong> {lat:.4f}, {lon:.4f}</div>
            {place_html}
            <div><strong>ID:</strong> <code style="background: #f5f5f5; padding: 2px 4px; border-radius: 3px;">{escape(pid_text)}</code></div>
        </div>
    </div>
    """
    return card

# Test the card: render the first loaded sample inline.
# Requires samples_gdf to contain at least one row (iloc[0]).
display(HTML(render_sample_card(samples_gdf.iloc[0])))

## Map Component

In [None]:
def get_colors_for_sources(sources):
    """
    Get color array for a list of sources.

    Each name is looked up in SOURCE_COLORS; names without an entry get
    DEFAULT_COLOR.

    Args:
        sources: pandas Series or list of source names

    Returns:
        numpy uint8 array of RGBA colors with one row per source, shaped
        (len(sources), 4) -- including (0, 4) for empty input
    """
    rgba = [SOURCE_COLORS.get(name, DEFAULT_COLOR) for name in sources]
    # The reshape keeps the 2-D (N, 4) layout when `sources` is empty, where
    # np.array([]) on its own would give a 1-D array of shape (0,).
    return np.array(rgba, dtype=np.uint8).reshape(-1, len(DEFAULT_COLOR))

def create_map_layer(gdf):
    """
    Build the lonboard point layer for the samples in a GeoDataFrame.

    Args:
        gdf: GeoDataFrame with a 'source' column, used to color each point

    Returns:
        ScatterplotLayer with per-point fill colors, picking and hover
        highlighting enabled
    """
    layer_options = dict(
        get_fill_color=get_colors_for_sources(gdf['source']),
        # 1000 m radius, clamped to 2-10 px on screen
        get_radius=1000,
        radius_units='meters',
        radius_min_pixels=2,
        radius_max_pixels=10,
        pickable=True,
        auto_highlight=True,
    )
    return ScatterplotLayer.from_geopandas(gdf, **layer_options)

# Initial map: a single scatterplot layer built from the loaded samples
layer = create_map_layer(samples_gdf)
sample_map = Map(_height=500, layers=[layer])
print(f"Map created with {len(samples_gdf)} points")

## Table Component

In [None]:
def create_table(gdf):
    """
    Build the ipydatagrid table widget for a set of samples.

    Args:
        gdf: GeoDataFrame with 'source', 'label', 'latitude' and
            'longitude' columns

    Returns:
        Read-only DataGrid in row-selection mode showing those four
        columns, with coordinates rounded to 4 decimals
    """
    # Work on a copy so rounding does not touch the caller's frame
    table_df = gdf[['source', 'label', 'latitude', 'longitude']].copy()
    for coord in ('latitude', 'longitude'):
        table_df[coord] = table_df[coord].round(4)

    grid_layout = widgets.Layout(height='300px', width='100%')
    return DataGrid(
        table_df,
        layout=grid_layout,
        selection_mode='row',
        editable=False,
        base_row_size=32,
        base_column_size=120,
    )

# Create the table widget for the initially loaded samples
sample_table = create_table(samples_gdf)

## Interactive Controls

In [None]:
# Explorer state shared by the widget callbacks
class ExplorerState:
    """Mutable holder for the current selection and the loaded data."""

    def __init__(self):
        # Nothing is selected and no data is attached until assigned later:
        # selected_index / selected_row describe the picked table row,
        # current_gdf holds the data currently shown.
        self.selected_index = self.selected_row = self.current_gdf = None

# Single shared state object; starts out pointing at the initially loaded samples
state = ExplorerState()
state.current_gdf = samples_gdf

# Control widgets

# Source picker; 'All Sources' means no source filter is applied on refresh.
source_filter = widgets.Dropdown(
    description='Source:',
    options=['All Sources', 'SESAR', 'OPENCONTEXT', 'GEOME', 'SMITHSONIAN'],
    value='All Sources',
    style={'description_width': '60px'},
)

# Per-source sample cap, from 1,000 up to 500,000.
# NOTE(review): the default 12500 is not on the min + k*step grid
# (1000, 6000, 11000, 16000, ...).
sample_count = widgets.IntSlider(
    description='Per source:',
    min=1000,
    max=500000,
    step=5000,
    value=12500,
    style={'description_width': '80px'},
)

# Triggers a reload using the two controls above.
refresh_btn = widgets.Button(
    description='Refresh Data',
    icon='refresh',
    button_style='primary',
)

# Status text, initialised with the size of the initial load.
status_label = widgets.HTML(
    value=f"<b>Loaded:</b> {len(samples_gdf):,} samples"
)

# Detail card; shows the placeholder until a sample is selected.
card_output = widgets.HTML(value=render_sample_card(None))

# Event handlers
def on_refresh_click(b):
    """
    Reload samples with the current control settings and refresh the UI.

    Reads the source dropdown and per-source slider, loads a fresh sample,
    then replaces the map layer, the table data and the status text. The
    previous selection is cleared because it refers to rows of the old data.
    If loading fails, the error is shown in the status label and re-raised.

    Args:
        b: The clicked button (unused)
    """
    # Local import keeps the cell self-contained.
    from html import escape

    status_label.value = "<b>Loading...</b>"

    source = None if source_filter.value == 'All Sources' else source_filter.value
    try:
        new_gdf = load_samples(max_per_source=sample_count.value, source_filter=source)
        new_layer = create_map_layer(new_gdf)
    except Exception as exc:
        # Without this the status would stay on "Loading..." after a failure.
        status_label.value = f"<b>Load failed:</b> {escape(str(exc))}"
        raise

    state.current_gdf = new_gdf

    # The old selection points into the previous dataset; reset it and the card.
    state.selected_index = None
    state.selected_row = None
    card_output.value = render_sample_card(None)

    # Update map
    sample_map.layers = [new_layer]

    # Update table
    display_cols = ['source', 'label', 'latitude', 'longitude']
    df_display = new_gdf[display_cols].copy()
    df_display['latitude'] = df_display['latitude'].round(4)
    df_display['longitude'] = df_display['longitude'].round(4)
    sample_table.data = df_display

    status_label.value = f"<b>Loaded:</b> {len(new_gdf):,} samples"

refresh_btn.on_click(on_refresh_click)

def on_table_selection(change):
    """
    Handle table row selection by showing the selected sample's card.

    Args:
        change: Trait-change dict from the grid; ``change['new']`` holds the
            new value of the ``selections`` trait
    """
    selection = change.get('new')

    # ipydatagrid reports `selections` as a list of rectangles (dicts with
    # r1/r2/c1/c2), so testing `'r1' in selection` on the list itself never
    # matches. A bare dict is still accepted for backward compatibility.
    if isinstance(selection, (list, tuple)):
        # presumably the most recent rectangle is last -- confirm
        selection = selection[-1] if selection else None
    if not isinstance(selection, dict) or 'r1' not in selection:
        return

    row_idx = selection['r1']
    gdf = state.current_gdf
    if gdf is None or not 0 <= row_idx < len(gdf):
        return

    # NOTE(review): r1 is used as a positional index into current_gdf; if the
    # grid is sorted or filtered in the UI, view rows may no longer line up
    # with the frame's order -- confirm.
    state.selected_index = row_idx
    state.selected_row = gdf.iloc[row_idx]
    card_output.value = render_sample_card(state.selected_row)

sample_table.observe(on_table_selection, names=['selections'])

## Explorer Interface

Run this cell to launch the interactive explorer.

In [None]:
# Assemble the interface

# Control row: source picker, per-source cap, refresh button, status text
controls = widgets.HBox(
    children=[source_filter, sample_count, refresh_btn, status_label],
    layout=widgets.Layout(margin='10px 0'),
)

# Static color legend shown above the map
legend_html = """
<div style="display: flex; gap: 15px; padding: 8px; background: #f9f9f9; border-radius: 4px; font-size: 12px;">
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #3366CC; border-radius: 50%; margin-right: 4px;"></span>SESAR</span>
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #DC3912; border-radius: 50%; margin-right: 4px;"></span>OpenContext</span>
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #109618; border-radius: 50%; margin-right: 4px;"></span>GEOME</span>
    <span><span style="display: inline-block; width: 12px; height: 12px; background: #FF9900; border-radius: 50%; margin-right: 4px;"></span>Smithsonian</span>
</div>
"""
legend = widgets.HTML(value=legend_html)

# Left column: heading, legend, map (takes two thirds of the width)
left_panel = widgets.VBox(
    children=[
        widgets.HTML("<h4 style='margin: 0 0 8px 0;'>Map</h4>"),
        legend,
        sample_map,
    ],
    layout=widgets.Layout(flex='2', margin='0 10px 0 0'),
)

# Right column: selected-sample card above the sample table
right_panel = widgets.VBox(
    children=[
        widgets.HTML("<h4 style='margin: 0 0 8px 0;'>Selected Sample</h4>"),
        card_output,
        widgets.HTML("<h4 style='margin: 16px 0 8px 0;'>Sample List</h4>"),
        sample_table,
    ],
    layout=widgets.Layout(flex='1', min_width='420px'),
)

main_layout = widgets.HBox(children=[left_panel, right_panel])

# Render: title, subtitle, controls, then the two-column body
display(
    widgets.VBox(
        children=[
            widgets.HTML("<h2 style='margin-bottom: 5px;'>iSamples Explorer</h2>"),
            widgets.HTML("<p style='color: #666; margin-top: 0;'>Interactive exploration of physical samples across scientific domains</p>"),
            controls,
            main_layout,
        ]
    )
)

## Usage

1. **Filter by source**: Use the dropdown to show only one data source
2. **Adjust sample size**: Increase/decrease points per source for performance vs. coverage
3. **Click Refresh**: Load new data after changing filters
4. **Select in table**: Click a row to see the sample card
5. **Pan/zoom map**: Explore geographic distribution

### Color Legend
- **Blue**: SESAR (geological samples, IGSNs)
- **Red**: OpenContext (archaeological samples)
- **Green**: GEOME (genomic/biological samples)
- **Orange**: Smithsonian (museum collections)

## Debug: Raw Data Access

Use these cells to explore the underlying data.

In [None]:
# Show whichever row is currently selected in the explorer, if any
if state.selected_row is None:
    print("No sample selected")
else:
    print("Selected sample:")
    print(state.selected_row)

In [None]:
# Query the full dataset: count MaterialSampleRecord rows that have a latitude,
# grouped by source and listed largest first. The resulting DataFrame is the
# cell's output.
con.sql(f"""
    SELECT n as source, COUNT(*) as total_samples
    FROM read_parquet('{PARQUET_PATH}')
    WHERE otype = 'MaterialSampleRecord' AND latitude IS NOT NULL
    GROUP BY n
    ORDER BY total_samples DESC
""").df()