In [None]:
from pathlib import Path

import requests

import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
import pandas as pd
import shapely
from palettable.colorbrewer.diverging import BrBG_10
# from sidecar import Sidecar

from lonboard import Map, ScatterplotLayer
from lonboard.colormap import apply_continuous_cmap

import ibis

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import Layout, Button, HBox, VBox, HTML
from ipywidgets import Output, HTMLMath

In [None]:
# local_path = Path("/Users/raymondyee/Data/iSample/2025_02_20_10_30_49/isamples_export_2025_02_20_10_30_49_geo.parquet")
# local_path = Path("/Users/raymondyee/Data/iSample/OPENCONTEXT.parquet")
# local_path = Path("/Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet")
# LOCAL_PATH = "isamples_export_2025_04_21_16_23_46_geo.parquet"
# Location of the iSamples GeoParquet export; fetched from Zenodo when absent.
LOCAL_PATH = "/Users/raymondyee/Data/iSample/2025_04_21_16_23_46/isamples_export_2025_04_21_16_23_46_geo.parquet"
local_path = Path(LOCAL_PATH)
if not local_path.exists():
    remote_url = "https://zenodo.org/records/15278211/files/isamples_export_2025_04_21_16_23_46_geo.parquet"
    # The target directory may not exist yet on a fresh machine.
    local_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling ".part" file and rename at the end, so an interrupted
    # download never leaves a truncated file that passes the exists() check.
    partial_path = local_path.with_name(local_path.name + ".part")
    # Stream in chunks instead of holding the whole file in memory, and fail
    # loudly on HTTP errors rather than saving an error page as parquet.
    with requests.get(remote_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(partial_path, 'wb') as file:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                file.write(chunk)
    partial_path.replace(local_path)
    
    

In [None]:
# Report where the parquet file lives and how large it is on disk
print(f"Local file: {local_path}")
size_in_mb = local_path.stat().st_size / 1024 / 1024  # bytes -> MB
print(f"File size: {size_in_mb:.2f} MB")

In [None]:
# Every column name in the export, kept for reference
all_columns = [
    'sample_identifier',
    'label',
    'description',
    'source_collection',
    'has_sample_object_type',
    'has_material_category',
    'has_context_category',
    'informal_classification',
    'keywords',
    'produced_by',
    'curation',
    'registrant',
    'related_resource',
    'sampling_purpose',
    'sample_location_longitude',
    'sample_location_latitude',
    'geometry',
]

# Subset actually loaded into the GeoDataFrame; use `all_columns` to read everything
columns = [
    'sample_identifier',
    'source_collection',
    'geometry',
]
# columns = all_columns




In [None]:
# Load only the selected columns into a GeoDataFrame.
# Fail here with a clear message instead of silently leaving `gdf` undefined,
# which would only surface later as a NameError in an unrelated cell.
if not local_path.exists():
    raise FileNotFoundError(f"Parquet file not found: {local_path}")
gdf = gpd.read_parquet(local_path, columns=columns)


In [None]:
# Open the parquet file through ibis and count the records per source collection

table = ibis.read_parquet(local_path)
source_counts = table["source_collection"].value_counts()
result = source_counts.execute()
print(result)


In [None]:
# Column names
column_names = table.columns
print(column_names)

# Schema: column names with their data types
table_schema = table.schema()
print(table_schema)

# Total number of rows
row_count = table.count().execute()
print(row_count)

# First five rows (the ibis counterpart of pandas head())
preview = table.limit(5).execute()
print(preview)

In [None]:
# Frequency tables for the categorical columns
print("Source collections:")
print(table["source_collection"].value_counts().execute())

# The remaining categorical columns are limited to ten rows of output
for heading, column_name in (
    ("Sample object types:", "has_sample_object_type"),
    ("Material categories:", "has_material_category"),
):
    print(heading)
    print(table[column_name].value_counts().limit(10).execute())

# Number of nulls in every column (one query per column)
null_counts = {}
for col in table.columns:
    null_counts[col] = table[col].isnull().sum().execute()
print("Null counts per column:")
for col, count in null_counts.items():
    print(f"{col}: {count}")

In [None]:
# Descriptive statistics for the coordinate columns
print("Latitude statistics:")
latitude = table["sample_location_latitude"]
lat_summary = table.aggregate([
    latitude.count().name('count'),
    latitude.min().name('min'),
    latitude.max().name('max'),
    latitude.mean().name('mean'),
    latitude.std().name('std'),
])
lat_stats = lat_summary.execute()
print(lat_stats)

print("Longitude statistics:")
longitude = table["sample_location_longitude"]
lon_summary = table.aggregate([
    longitude.count().name('count'),
    longitude.min().name('min'),
    longitude.max().name('max'),
    longitude.mean().name('mean'),
    longitude.std().name('std'),
])
lon_stats = lon_summary.execute()
print(lon_stats)

# Quartiles are obtained with quantile()
print("Latitude percentiles:")
lat_quartiles = table.aggregate([
    latitude.quantile(0.25).name('25%'),
    latitude.quantile(0.50).name('50%'),
    latitude.quantile(0.75).name('75%')
])
lat_percentiles = lat_quartiles.execute()
print(lat_percentiles)

In [None]:
# Record count per source collection, largest first
counts_by_collection = table.group_by("source_collection").aggregate(count=table.count())
collection_summary = counts_by_collection.order_by(ibis.desc("count")).execute()
print("Records per source collection:")
print(collection_summary)

# Per collection: how many records carry a geometry, and what share that is
has_geometry = ~table["geometry"].isnull()
geography_by_collection = table.group_by("source_collection").aggregate(
    total=table.count(),
    with_coords=has_geometry.sum(),
    coord_percentage=100 * has_geometry.mean(),
)
geography_stats = geography_by_collection.execute()
print("Geographic data availability by collection:")
print(geography_stats)

In [None]:
# Column dtypes of the loaded GeoDataFrame
gdf.dtypes

In [None]:
# Display the GeoDataFrame
gdf

In [None]:
# Names of the columns that were actually loaded
list(gdf.columns)

In [None]:
# Store source_collection as a categorical to save space and speed up plotting
source_as_category = gdf['source_collection'].astype('category')
gdf['source_collection'] = source_as_category

# Confirm the new dtype
print(gdf['source_collection'].dtype)

In [None]:
# Keep only rows whose geometry is both present and non-empty
geometry = gdf.geometry
has_usable_geometry = geometry.notna() & ~geometry.is_empty
gdf_valid = gdf[has_usable_geometry]

# Report how many rows the filter dropped
print(f"Original dataframe: {len(gdf):,} records")
print(f"After removing empty geometries: {len(gdf_valid):,} records")
print(f"Removed: {len(gdf) - len(gdf_valid):,} records ({(len(gdf) - len(gdf_valid))/len(gdf)*100:.2f}%)")


In [None]:
# reduce the size of gdf to make it easier to plot

# Europe
# gdf = gdf.cx[-11.83:25.5, 34.9:59]
# USA
# gdf = gdf.cx[-125:-66, 24:50]
# WORLD
# gdf = gdf.cx[-180:180, -90:90]

In [None]:
# Number of rows in gdf
len(gdf)

In [None]:
# Fallback RGBA (gray) for any collection without a dedicated color
default_color = [128, 128, 128, 255]

# RGBA fill color per source collection
color_map = dict(
    SESAR=[51, 102, 204, 255],        # vibrant blue (#3366CC)
    OPENCONTEXT=[220, 57, 18, 255],   # crimson red (#DC3912)
    GEOME=[16, 150, 24, 255],         # forest green (#109618)
    SMITHSONIAN=[255, 153, 0, 255],   # deep orange (#FF9900)
)

# Collections highlighted by default: every collection that has a color
selected_collections = list(color_map)

def create_color_map_0(gdf, color_map, selected_collections=None, default_color=[128, 128, 128, 255]):
    """Build one RGBA row per record with a plain Python loop.

    A record gets `color_map[source]` when its source collection is selected
    (or no selection is given) and has an entry in `color_map`; every other
    record gets `default_color`. Returns a (len(gdf), 4) uint8 array.
    """
    rgba = np.zeros((len(gdf), 4), dtype=np.uint8)
    row = 0
    for name in gdf['source_collection']:
        is_selected = selected_collections is None or name in selected_collections
        if is_selected and name in color_map:
            rgba[row] = color_map[name]
        else:
            rgba[row] = default_color
        row += 1
    return rgba


# function to create a color map with selected collections (which has default of all collections)
# use faster vectorized approach
def create_color_map(gdf, color_map, selected_collections=None, default_color=[128, 128, 128, 255]):
    """Build one RGBA row per record using the categorical codes of `source_collection`.

    Parameters
    ----------
    gdf : DataFrame or GeoDataFrame
        Must have a 'source_collection' column. A non-categorical column is
        converted to categorical on a local copy; `gdf` is not modified.
    color_map : dict
        Maps a collection name to an RGBA list.
    selected_collections : container, optional
        Collections that keep their own color. None means all of them.
    default_color : list
        RGBA for unselected collections, collections missing from `color_map`,
        and rows whose collection is missing. The default list is never mutated.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(gdf), 4) and dtype uint8.
    """
    sources = gdf['source_collection']
    if not isinstance(sources.dtype, pd.CategoricalDtype):
        sources = sources.astype('category')
    categories = sources.cat.categories

    # One palette row per category, plus a final row holding the default color.
    # Missing values carry code -1, which indexes that final row, so they get
    # the default color instead of being left as transparent black.
    palette = np.empty((len(categories) + 1, 4), dtype=np.uint8)
    palette[-1] = default_color
    for code, cat in enumerate(categories):
        is_selected = selected_collections is None or cat in selected_collections
        palette[code] = color_map.get(cat, default_color) if is_selected else default_color

    # A single fancy-indexing lookup replaces one boolean mask per category
    return palette[sources.cat.codes.to_numpy()]


In [None]:


# write this comparison as a test
# pass arguments to the function

def test_color_map(gdf_data=None, colors_by_collection=None):
    """Assert that the loop-based and vectorized color mappers agree.

    Parameters
    ----------
    gdf_data : DataFrame or GeoDataFrame, optional
        Frame with a 'source_collection' column. Defaults to the notebook-level
        `gdf_sample`, looked up at call time because it is defined in a later cell.
    colors_by_collection : dict, optional
        Collection name -> RGBA mapping. Defaults to the notebook-level `color_map`.

    Raises
    ------
    AssertionError
        If the two implementations return different arrays.
    """
    if gdf_data is None:
        gdf_data = gdf_sample
    if colors_by_collection is None:
        colors_by_collection = color_map

    # Test with full dataset (no selections)
    colors0 = create_color_map_0(gdf_data, colors_by_collection)
    colors1 = create_color_map(gdf_data, colors_by_collection)
    assert np.array_equal(colors0, colors1), "Full dataset color mapping failed"

    # Test with selected collections
    selected_collections = ['SESAR', 'OPENCONTEXT']
    colors0_selected = create_color_map_0(gdf_data, colors_by_collection, selected_collections)
    colors1_selected = create_color_map(gdf_data, colors_by_collection, selected_collections)
    assert np.array_equal(colors0_selected, colors1_selected), "Selected collections color mapping failed"

In [None]:
from lonboard import ScatterplotLayer, Map, BitmapTileLayer
import numpy as np

# First, ensure source_collection is categorical
# (create_color_map uses the `.cat` accessor on this column)
gdf['source_collection'] = gdf['source_collection'].astype('category')

# Filter out null and empty geometries
gdf_valid = gdf[~gdf.geometry.isna() & ~gdf.geometry.is_empty]

# frac=1.0 keeps every valid row; lower the fraction to plot a random subset.
# random_state is fixed so the sampling is repeatable.
gdf_sample = gdf_valid.sample(frac=1.0, random_state=42)

# Create a color map for the sample
colors = create_color_map(gdf_sample, color_map, selected_collections)


# Create a base tile layer with OpenStreetMap
base_layer = BitmapTileLayer(
        data="https://tile.openstreetmap.org/{z}/{x}/{y}.png",
        tile_size=256,
        max_requests=-1,
        min_zoom=0,
        max_zoom=19,
    )

# Alternative base layer: Esri World Imagery tiles (note the {z}/{y}/{x} order in the URL)
satellite_layer = BitmapTileLayer(
    data="https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
    tile_size=256,
    min_zoom=0,
    max_zoom=19
)

# Create a ScatterplotLayer with the pre-computed colors
layer = ScatterplotLayer.from_geopandas(
    gdf_sample,
    get_fill_color=colors,  # Pass the numpy array of colors
    get_radius=300,
    radius_units='meters',  # Radius is given in meters, not pixels
    pickable=True
)

# Create and display the map
m = Map([base_layer, layer], _height=800)
# m = Map([satellite_layer, layer], _height=800)
display(m)

# example code to manipulate the map
# layer.get_fill_color = [0, 50, 200, 200]

# example code to manipulate the map
# layer.get_fill_color = [0, 50, 200, 200]

In [None]:
# Experiment with changing the map and layer after they have been created

# A single RGBA list can be assigned instead, e.g.:
# layer.get_fill_color = [0, 50, 200, 200]
# Here the per-point color array is assigned
layer.get_fill_color = colors

# Change only the zoom: every other field is copied from the current view state
new_view_state = {
    "longitude": m.view_state.longitude,
    "latitude": m.view_state.latitude,
    "zoom": 6,  # Your new zoom level
    "pitch": m.view_state.pitch,
    "bearing": m.view_state.bearing
}

m.view_state = new_view_state


# view_state has the following attributes: longitude, latitude, zoom, pitch, bearing
# m.view_state = {"zoom": 10}

# dynamically change layers in the map
m.layers = [base_layer, layer]
# m.layers = [satellite_layer, layer]

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import math

# Minimal ipywidgets.interact demo; x is driven by the (min, max, step) spec below
@interact(x=(0,1000,1))
def f(x=20):
    """Return the factorial of x."""
    return math.factorial(x)


In [None]:
# Build one unchecked checkbox per source collection found in the sample.
# (A bare `gdf_sample['source_collection']` expression used to sit here; it was
# not the last expression in the cell, so it displayed nothing, and was removed.)
source_collections = gdf_sample['source_collection'].unique()
checkboxes = {source: widgets.Checkbox(value=False, description=source) for source in source_collections}

# Output area that the checkbox callback prints its summary into
output = widgets.Output()

# Respond to checkbox changes
def on_checkbox_change(change):
    """Print the ticked collections and their row count into `output`.

    `change` is the event passed by `observe`; it is not inspected, the state
    of every checkbox is re-read instead.
    """
    with output:
        output.clear_output()
        selected_collections = []
        for source, checkbox in checkboxes.items():
            if checkbox.value:
                selected_collections.append(source)
        # The summary is printed into the output widget rather than assigned to it
        print(f"Selected collections: {', '.join(selected_collections)}")
        print(f"Number of rows in selection: {gdf_sample['source_collection'].isin(selected_collections).sum()}")

        # TODO: update the layer and the map to reflect the selection

# Call on_checkbox_change whenever any checkbox's value changes
for checkbox in checkboxes.values():
    checkbox.observe(on_checkbox_change, names='value')

# Show the checkboxes stacked vertically, followed by the output area
checkbox_column = widgets.VBox([*checkboxes.values()])
display(checkbox_column, output)


In [None]:
# Number of sample rows whose collection is in the notebook-level `selected_collections`
gdf_sample['source_collection'].isin(selected_collections).sum()

In [None]:
# Row count per source collection in the sample
gdf_sample['source_collection'].value_counts()

## Managing Environment with `pip-tools`

`pip-tools` is used to manage Python package dependencies for reproducible environments. The typical workflow involves two main commands: `pip-compile` and `pip-sync`.

1.  **Define Direct Dependencies (`requirements.in`)**:
    *   List your project's top-level dependencies in a `requirements.in` file. You can specify version constraints if needed.
    *   Example `requirements.in`:
        ```
        pandas>=1.0
        geopandas
        lonboard
        # For local editable installs:
        # -e /path/to/local/package
        ```

2.  **Compile Dependencies (`pip-compile`)**:
    *   Run `pip-compile requirements.in` (or specify input and output files: `pip-compile requirements.in --output-file requirements.txt`).
    *   This generates a `requirements.txt` file, which pins the versions of your direct dependencies and all their sub-dependencies. This file ensures that your environment is reproducible.

3.  **Synchronize Environment (`pip-sync`)**:
    *   Run `pip-sync requirements.txt` (or just `pip-sync` if `requirements.txt` is in the current directory).
    *   This command modifies your current virtual environment to exactly match the packages and versions specified in `requirements.txt`. It will:
        *   Install any missing packages.
        *   Upgrade or downgrade existing packages to their pinned versions.
        *   Uninstall any packages in the environment that are not listed in `requirements.txt`.

**How to "Install" Packages with `pip-tools`**:

`pip-tools` doesn't have a direct `install` subcommand like `pip install <package>`. To add or update packages:
1.  Add or modify the package entry in your `requirements.in` file.
2.  Run `pip-compile requirements.in` to update `requirements.txt`.
3.  Run `pip-sync` to apply the changes to your virtual environment.

This process ensures that your `requirements.txt` always reflects the complete, pinned set of dependencies for your project, leading to more stable and predictable environments.

In [None]:
# Create functions to make the map configurable and update based on user selections

def update_layer_colors(gdf_data, selected_collections=None, radius=300, radius_units='meters'):
    """
    Build a ScatterplotLayer restricted to, and colored by, the selected collections

    Parameters:
    -----------
    gdf_data : GeoDataFrame
        The geodataframe containing the data to plot
    selected_collections : list, optional
        Collection names to show. If None or empty, all collections are shown
    radius : float, optional
        Radius of the points
    radius_units : str, optional
        Units for the radius ('meters' or 'pixels')

    Returns:
    --------
    layer : ScatterplotLayer
        New ScatterplotLayer built from the filtered data and colors
    colors : numpy.ndarray
        Array of colors for the points
    filtered_data : GeoDataFrame
        Rows of `gdf_data` in the selected collections; `gdf_data` itself when
        no row is excluded
    """
    sources = gdf_data['source_collection']

    # Treat None and any empty container as "everything". The explicit length
    # check also accepts array-like selections, whose truth value is ambiguous.
    if selected_collections is None or len(selected_collections) == 0:
        selected_collections = sources.unique()

    # Filter by membership rather than by comparing list lengths, so a selection
    # containing duplicates or unknown names cannot skip the filter by accident.
    keep = sources.isin(selected_collections)
    filtered_data = gdf_data if keep.all() else gdf_data[keep]

    # Create colors based on the selected collections
    colors = create_color_map(filtered_data, color_map, selected_collections)

    # Create the layer
    layer = ScatterplotLayer.from_geopandas(
        filtered_data,
        get_fill_color=colors,
        get_radius=radius,
        radius_units=radius_units,
        pickable=True
    )

    return layer, colors, filtered_data

def create_map(base_layer_type="osm", layer=None, height=800):
    """
    Create and return a map with the specified base layer and data layer

    Parameters:
    -----------
    base_layer_type : str, optional
        Type of base layer to use ('osm' or 'satellite', case-insensitive).
        Any other value falls back to OpenStreetMap
    layer : ScatterplotLayer, optional
        Data layer to add to the map
    height : int, optional
        Height of the map in pixels

    Returns:
    --------
    m : Map
        Map object with the specified layers
    """
    # Build only the base layer that will be used; this function runs on every
    # map refresh, so the unused tile-layer widget is no longer created.
    if base_layer_type.lower() == "satellite":
        base = BitmapTileLayer(
            data="https://server.arcgisonline.com/ArcGIS/rest/services/World_Imagery/MapServer/tile/{z}/{y}/{x}",
            tile_size=256,
            min_zoom=0,
            max_zoom=19
        )
    else:
        base = BitmapTileLayer(
            data="https://tile.openstreetmap.org/{z}/{x}/{y}.png",
            tile_size=256,
            max_requests=-1,
            min_zoom=0,
            max_zoom=19,
        )

    # Base layer first so the data layer is drawn on top of it
    layers = [base]
    if layer is not None:
        layers.append(layer)

    m = Map(layers, _height=height)
    return m

In [None]:
# Create interactive widgets for map configuration
from ipywidgets import widgets, interactive, Layout, HBox, VBox, Output

# One checkbox per source collection in the sample, all ticked initially
collection_checkboxes = {
    collection: widgets.Checkbox(
        value=True, 
        description=collection,
        layout=Layout(width='auto')
    ) for collection in gdf_sample['source_collection'].unique()
}

# Dropdown for choosing the base map
base_map_dropdown = widgets.Dropdown(
    options=['OpenStreetMap', 'Satellite'],
    value='OpenStreetMap',
    description='Base Map:',
    layout=Layout(width='200px')
)

# Slider for the point radius (100-1000 in steps of 50, starting at 300)
point_size_slider = widgets.IntSlider(
    value=300,
    min=100,
    max=1000,
    step=50,
    description='Point Size:',
    layout=Layout(width='300px')
)

# Dropdown for the units the radius is expressed in
radius_units_dropdown = widgets.Dropdown(
    options=['meters', 'pixels'],
    value='meters',
    description='Units:',
    layout=Layout(width='200px')
)

# Button that triggers a map refresh
update_button = widgets.Button(
    description='Update Map',
    button_style='primary',
    layout=Layout(width='150px')
)

# Separate output areas for the map and for the statistics text
map_output = widgets.Output()
stats_output = widgets.Output()

# Function to update the map based on widget values
def update_map(b):
    """Rebuild the map from the current widget values and refresh the statistics.

    Parameters
    ----------
    b : Button or None
        The clicked button as passed by `on_click`; unused, so None is accepted
        for the initial render.
    """
    with map_output:
        map_output.clear_output(wait=True)

        # Get selected collections
        selected_collections = [
            collection for collection, checkbox in collection_checkboxes.items()
            if checkbox.value
        ]

        # Get base map type
        base_layer_type = 'osm' if base_map_dropdown.value == 'OpenStreetMap' else 'satellite'

        # Update layer with selected collections and point size
        layer, colors, filtered_data = update_layer_colors(
            gdf_sample,
            selected_collections,
            radius=point_size_slider.value,
            radius_units=radius_units_dropdown.value
        )

        # Create and display the map
        m = create_map(base_layer_type=base_layer_type, layer=layer)
        display(m)

        # update_layer_colors shows every collection when nothing is ticked,
        # so the statistics report all of them in that case.
        shown_collections = selected_collections or list(collection_checkboxes)
        total_points = len(gdf_sample)
        shown_share = len(filtered_data) / total_points * 100 if total_points else 0.0
        # One vectorized pass instead of a Python-level sum() over every row
        counts = filtered_data['source_collection'].value_counts()

        # Update statistics
        with stats_output:
            stats_output.clear_output(wait=True)
            print(f"Selected collections: {', '.join(shown_collections)}")
            print(f"Points displayed: {len(filtered_data):,} of {total_points:,} ({shown_share:.1f}%)")
            print("Points by collection:")
            for collection in shown_collections:
                count = int(counts.get(collection, 0))
                print(f"  {collection}: {count:,} points")

# Run update_map whenever the button is clicked
update_button.on_click(update_map)

# Left column: heading plus one checkbox per collection
collection_box = VBox([widgets.HTML("<b>Data Collections:</b>")] + list(collection_checkboxes.values()))
# Right column: heading, map settings, and the update button
config_box = VBox([
    widgets.HTML("<b>Map Configuration:</b>"),
    base_map_dropdown,
    point_size_slider,
    radius_units_dropdown,
    update_button
])

# Place the two columns side by side
control_panel = HBox([collection_box, config_box], layout=Layout(width='100%'))

# Show the controls, then the statistics area, then the map area
display(control_panel)
display(stats_output)
display(map_output)

# Render the map once with the initial widget values
update_map(None)

In [None]:
# Preset view centers (longitude, latitude) and zoom levels for the region dropdown
zoom_regions = {
    region_name: {'longitude': lon, 'latitude': lat, 'zoom': zoom_level}
    for region_name, lon, lat, zoom_level in [
        ('World', 0, 0, 1),
        ('North America', -100, 40, 3),
        ('Europe', 10, 50, 4),
        ('Asia', 100, 30, 3),
        ('Africa', 20, 0, 3),
        ('South America', -60, -20, 3),
        ('Australia', 135, -25, 4),
    ]
}

# Dropdown listing the preset regions; starts on 'World'
region_dropdown = widgets.Dropdown(
    options=list(zoom_regions.keys()),
    value='World',
    description='Zoom to:',
    layout=Layout(width='200px')
)

# Function to zoom the map to a region
def zoom_to_region(change):
    """Point the stored map at the region named in `change['new']`.

    The target map is read from the function attribute
    `zoom_to_region.current_map`; while that attribute has not been set the
    call does nothing. The preset is copied before pitch and bearing are
    filled in, so the shared `zoom_regions` entry is left untouched.
    """
    if hasattr(zoom_to_region, 'current_map'):
        view_state = dict(zoom_regions[change['new']])
        # Presets carry only longitude/latitude/zoom; default the rest to 0
        view_state.setdefault('pitch', 0)
        view_state.setdefault('bearing', 0)
        zoom_to_region.current_map.view_state = view_state

# The button still holds the handler registered by the earlier cell, and
# redefining the name below does not replace it. Detach whichever handler is
# currently bound to `update_map` so only the new version fires (this also
# keeps the cell safe to re-run).
update_button.on_click(update_map, remove=True)

# Function to update the map based on widget values (updated version)
def update_map(b):
    """Rebuild the map from the current widget values and refresh the statistics.

    The new map is also stored on `zoom_to_region.current_map` so the region
    dropdown acts on the map that is currently displayed.

    Parameters
    ----------
    b : Button or None
        The clicked button as passed by `on_click`; unused, so None is accepted
        for the initial render.
    """
    with map_output:
        map_output.clear_output(wait=True)

        # Get selected collections
        selected_collections = [
            collection for collection, checkbox in collection_checkboxes.items()
            if checkbox.value
        ]

        # Get base map type
        base_layer_type = 'osm' if base_map_dropdown.value == 'OpenStreetMap' else 'satellite'

        # Update layer with selected collections and point size
        layer, colors, filtered_data = update_layer_colors(
            gdf_sample,
            selected_collections,
            radius=point_size_slider.value,
            radius_units=radius_units_dropdown.value
        )

        # Create and display the map
        m = create_map(base_layer_type=base_layer_type, layer=layer)
        display(m)

        # Store the map for later zoom operations
        zoom_to_region.current_map = m

        # update_layer_colors shows every collection when nothing is ticked,
        # so the statistics report all of them in that case.
        shown_collections = selected_collections or list(collection_checkboxes)
        total_points = len(gdf_sample)
        shown_points = len(filtered_data)
        shown_share = shown_points / total_points * 100 if total_points else 0.0
        # One vectorized pass instead of a Python-level sum() over every row
        counts = filtered_data['source_collection'].value_counts()

        # Update statistics
        with stats_output:
            stats_output.clear_output(wait=True)
            print(f"Selected collections: {', '.join(shown_collections)}")
            print(f"Points displayed: {shown_points:,} of {total_points:,} ({shown_share:.1f}%)")
            print("Points by collection:")
            for collection in shown_collections:
                count = int(counts.get(collection, 0))
                share = count / shown_points * 100 if shown_points else 0.0
                print(f"  {collection}: {count:,} points ({share:.1f}%)")

# Attach the redefined handler so button clicks run this version
update_button.on_click(update_map)

# Call zoom_to_region whenever the dropdown's value changes
region_dropdown.observe(zoom_to_region, names='value')

# Rebuild the configuration column, now including the region dropdown
config_box = VBox([
    widgets.HTML("<b>Map Configuration:</b>"),
    base_map_dropdown,
    point_size_slider,
    radius_units_dropdown,
    region_dropdown,
    update_button
])

# Recreate the control panel with the new configuration column
control_panel = HBox([collection_box, config_box], layout=Layout(width='100%'))

# Show the controls, then the statistics area, then the map area
display(control_panel)
display(stats_output)
display(map_output)

# Render the map once so zoom_to_region.current_map is set
update_map(None)

## Interactive iSamples Map

This interactive map allows you to explore the iSamples dataset with the following features:

1. **Collection Selection**: Choose which data collections to display
2. **Base Map**: Switch between OpenStreetMap and satellite imagery
3. **Point Size**: Adjust the size of the points on the map
4. **Units**: Choose between meters and pixels for point sizing
5. **Region Selection**: Quickly zoom to different regions of the world
6. **Statistics**: View counts and percentages of displayed points

The map is rendered using the Lonboard library, which provides fast visualization of large geospatial datasets directly in the notebook.