In [None]:
import os
from pathlib import Path

import geopandas as gpd
import ibis
import ipywidgets as widgets
import numpy as np
import pandas as pd
import shapely
from IPython.display import display
from ipywidgets import Layout, Button, HBox, VBox, HTML
from ipywidgets import Output, HTMLMath
from palettable.colorbrewer.diverging import BrBG_10
# from sidecar import Sidecar

from lonboard import Map, ScatterplotLayer
from lonboard.colormap import apply_continuous_cmap

In [None]:
# Location of the iSamples GeoParquet export.
#
# The path can be overridden without editing the notebook by setting the
# ISAMPLES_PARQUET_PATH environment variable; when it is unset the original
# hard-coded export below is used.
#
# Other exports that have been used with this notebook:
# local_path = Path("/Users/raymondyee/Data/iSample/2025_02_20_10_30_49/isamples_export_2025_02_20_10_30_49_geo.parquet")
# local_path = Path("/Users/raymondyee/Data/iSample/OPENCONTEXT.parquet")
# local_path = Path("/Users/raymondyee/Data/iSample/oc_isamples_pqg.parquet")
DEFAULT_LOCAL_PATH = "/Users/raymondyee/Data/iSample/2025_04_21_16_23_46/isamples_export_2025_04_21_16_23_46_geo.parquet"
local_path = Path(os.environ.get("ISAMPLES_PARQUET_PATH", DEFAULT_LOCAL_PATH))

# Displayed as the cell output: True when the export is present on this machine.
local_path.exists()

In [None]:
# Full list of column names (presumably every column in the export -- verify
# against table.columns if the export format changes).
all_columns = [
    "sample_identifier",
    "label",
    "description",
    "source_collection",
    "has_sample_object_type",
    "has_material_category",
    "has_context_category",
    "informal_classification",
    "keywords",
    "produced_by",
    "curation",
    "registrant",
    "related_resource",
    "sampling_purpose",
    "sample_location_longitude",
    "sample_location_latitude",
    "geometry",
]

# Only this subset is read into the GeoDataFrame; switch to the commented
# line below to load every column instead.
columns = ["sample_identifier", "source_collection", "geometry"]
# columns = all_columns




In [None]:
# Load the selected columns of the export into a GeoDataFrame.
# Fail immediately with a clear message when the file is missing: silently
# skipping the load would leave `gdf` undefined and only surface later as a
# NameError in an unrelated cell.
if not local_path.exists():
    raise FileNotFoundError(f"iSamples GeoParquet export not found: {local_path}")
gdf = gpd.read_parquet(local_path, columns=columns)


In [None]:
# Open the same parquet file as an ibis table expression and show how many
# records each source collection contributes.

table = ibis.read_parquet(local_path)

source_collection_counts = table["source_collection"].value_counts()
result = source_collection_counts.execute()
print(result)


In [None]:
# Quick structural overview of the ibis table.

# Column names, then the schema (names with their data types)
print(table.columns)
print(table.schema())

# Total number of rows
row_count = table.count().execute()
print(row_count)

# First five rows, materialised by execute() (comparable to pandas head())
preview = table.limit(5).execute()
print(preview)

In [None]:
# Value counts for categorical columns
print("Source collections:")
print(table["source_collection"].value_counts().execute())

# A LIMIT without an ORDER BY returns an arbitrary subset of rows, so sort by
# the count column (the last column of the value_counts result, whatever the
# installed ibis version names it) before keeping the 10 most frequent values.
for heading, column in [
    ("Sample object types:", "has_sample_object_type"),
    ("Material categories:", "has_material_category"),
]:
    counts = table[column].value_counts()
    count_column = counts.columns[-1]
    print(heading)
    print(counts.order_by(ibis.desc(count_column)).limit(10).execute())

# Check for null values in every column.
# All per-column null counts are computed in a single aggregate query instead
# of executing one query per column against the parquet file.
null_counts = (
    table.aggregate([table[col].isnull().sum().name(col) for col in table.columns])
    .execute()
    .iloc[0]       # the aggregate yields exactly one row
    .to_dict()     # column name -> number of nulls
)
print("Null counts per column:")
for col, count in null_counts.items():
    print(f"{col}: {count}")

In [None]:
# Summary statistics for the two coordinate columns.
latitude = table["sample_location_latitude"]
longitude = table["sample_location_longitude"]


def _basic_stats(column):
    """Build the count/min/max/mean/std aggregates for one ibis column.

    Each aggregate is named after the reduction that produced it.
    """
    return [
        getattr(column, stat)().name(stat)
        for stat in ('count', 'min', 'max', 'mean', 'std')
    ]


print("Latitude statistics:")
lat_stats = table.aggregate(_basic_stats(latitude)).execute()
print(lat_stats)

print("Longitude statistics:")
lon_stats = table.aggregate(_basic_stats(longitude)).execute()
print(lon_stats)

# Quartiles of the latitude column, computed with quantile()
print("Latitude percentiles:")
lat_percentiles = table.aggregate([
    latitude.quantile(fraction).name(label)
    for fraction, label in ((0.25, '25%'), (0.50, '50%'), (0.75, '75%'))
]).execute()
print(lat_percentiles)

In [None]:
# Both summaries below are grouped by source collection.
by_collection = table.group_by("source_collection")

# Record count per collection, largest first
collection_counts = by_collection.aggregate(count=table.count())
collection_summary = collection_counts.order_by(ibis.desc("count")).execute()
print("Records per source collection:")
print(collection_summary)

# Per collection: total rows, rows with a non-null geometry, and that share
# expressed as a percentage.
has_geometry = ~table["geometry"].isnull()
geography_stats = by_collection.aggregate(
    total=table.count(),
    with_coords=has_geometry.sum(),
    coord_percentage=100 * has_geometry.mean(),
).execute()
print("Geographic data availability by collection:")
print(geography_stats)

In [None]:
# Show the dtype of each column that was loaded into gdf
gdf.dtypes

In [None]:
# Display gdf as the cell output
gdf

In [None]:
# Column names of gdf as a plain Python list
list(gdf.columns)

In [None]:
# Store source_collection as a pandas categorical (saves space and speeds up
# plotting). The column is replaced on gdf itself.
source_as_category = gdf['source_collection'].astype('category')
gdf['source_collection'] = source_as_category

# Confirm the conversion by printing the column's dtype
print(gdf['source_collection'].dtype)

In [None]:
# Keep only the rows whose geometry is both present and non-empty
gdf_valid = gdf[~gdf.geometry.isna() & ~gdf.geometry.is_empty]

n_total = len(gdf)
n_removed = n_total - len(gdf_valid)
# An empty input frame would otherwise divide by zero when computing the share
pct_removed = n_removed / n_total * 100 if n_total else 0.0

print(f"Original dataframe: {n_total:,} records")
print(f"After removing empty geometries: {len(gdf_valid):,} records")
print(f"Removed: {n_removed:,} records ({pct_removed:.2f}%)")


In [None]:
# reduce the size of gdf to make it easier to plot

# Europe
# gdf = gdf.cx[-11.83:25.5, 34.9:59]
# USA
# gdf = gdf.cx[-125:-66, 24:50]
# WORLD
# gdf = gdf.cx[-180:180, -90:90]

In [None]:
# Number of rows currently in gdf
len(gdf)

In [None]:
# Shuffle the valid rows reproducibly (fixed random_state). frac=1.0 keeps
# every row; pass a smaller fraction to plot fewer points.
gdf_sample = gdf_valid.sample(random_state=42, frac=1.0)

# One scatterplot layer built from the sampled rows, shown on an 800-high map
layer = ScatterplotLayer.from_geopandas(gdf_sample)
m = Map(layer, _height=800)
display(m)
#with sidecar:
#    display(m)


In [None]:
# Recolour the existing layer with one 4-component colour for all points
# (presumably RGBA, matching the 4-value colours used elsewhere in this notebook)
layer.get_fill_color = [0, 50, 200, 200]

In [None]:
# Map of all valid sample locations, coloured by source collection.
# (Map, ScatterplotLayer and numpy are imported in the notebook's import cell.)

# A categorical source_collection gives every collection an integer code,
# which the colour lookup below relies on.
gdf['source_collection'] = gdf['source_collection'].astype('category')

# Filter out null and empty geometries
gdf_valid = gdf[~gdf.geometry.isna() & ~gdf.geometry.is_empty]

# frac=1.0 keeps every row (in shuffled order); lower it if the dataset is
# too large to plot comfortably.
gdf_sample = gdf_valid.sample(frac=1.0, random_state=42)

# RGBA colour per known source collection
color_map = {
    "SESAR": [51, 102, 204, 255],       # Vibrant blue (#3366CC)
    "OPENCONTEXT": [220, 57, 18, 255],  # Crimson red (#DC3912)
    "GEOME": [16, 150, 24, 255],        # Forest green (#109618)
    "SMITHSONIAN": [255, 153, 0, 255]   # Deep orange (#FF9900)
}
default_color = [128, 128, 128, 255]    # Gray for any other (or missing) value

# Vectorised colour lookup instead of a Python loop over every point:
# build one palette row per category plus a trailing gray row. Missing values
# carry category code -1, which indexes that trailing row.
source_values = gdf_sample['source_collection']
palette = np.array(
    [color_map.get(name, default_color) for name in source_values.cat.categories]
    + [default_color],
    dtype=np.uint8,
)
colors = palette[source_values.cat.codes.to_numpy()]  # shape (n_points, 4), uint8

# Create a ScatterplotLayer with the pre-computed colors
layer = ScatterplotLayer.from_geopandas(
    gdf_sample,
    get_fill_color=colors,  # one RGBA row per point, aligned with gdf_sample
    get_radius=300,
    radius_units='meters',  # get_radius is expressed in meters, not pixels
    pickable=True
)

# Create and display the map
m = Map(layer, _height=800)
display(m)

In [None]:
# Small ipywidgets demo, independent of the sample data: interact builds a
# control for x from the (min, max, step) tuple and re-runs f when it changes.
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets
import math

@interact(x=(0,1000,1))
def f(x=20):
    """Return the factorial of x (x defaults to 20)."""
    return math.factorial(x)


In [None]:
# Checkbox panel: one checkbox per source collection, with the current
# selection echoed into an Output widget.

# Missing collection values are dropped first: NaN is not a valid (string)
# checkbox description and would make Checkbox construction fail.
source_collections = gdf_sample['source_collection'].dropna().unique()
checkboxes = {source: widgets.Checkbox(value=False, description=source) for source in source_collections}

# Create output widget
output = widgets.Output()


def on_checkbox_change(change):
    """Re-print the list of ticked source collections into `output`.

    `change` is the change notification passed by `observe`; it is not
    inspected, because the full selection is re-read from every checkbox.
    """
    with output:
        output.clear_output()
        selected_collections = [source for source, checkbox in checkboxes.items() if checkbox.value]
        # Output widgets capture printed text; they have no value to assign
        print(f"Selected collections: {', '.join(selected_collections)}")


# Register the callback with all checkboxes
for checkbox in checkboxes.values():
    checkbox.observe(on_checkbox_change, names='value')

# Display the checkboxes and output
display(widgets.VBox(list(checkboxes.values())), output)


In [None]:
# Number of sampled points per source collection
gdf_sample['source_collection'].value_counts()