diff --git a/.gitignore b/.gitignore index 90466f5..e5915f9 100644 --- a/.gitignore +++ b/.gitignore @@ -141,3 +141,5 @@ dmypy.json .idea /.quarto/ + +**/*.quarto_ipynb diff --git a/tutorials/parquet_cesium.qmd b/tutorials/parquet_cesium.qmd index 229a09d..bbc277f 100644 --- a/tutorials/parquet_cesium.qmd +++ b/tutorials/parquet_cesium.qmd @@ -28,9 +28,51 @@ Cesium.Ion.defaultAccessToken = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJqdGkiOi ```{ojs} //| echo: false -viewof parquet_path = Inputs.text({label:"Source", value:"https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg.parquet", width:"100%", submit:true}); +viewof parquet_path = Inputs.text({ + label:"Source", + value:"https://storage.googleapis.com/opencontext-parquet/oc_isamples_pqg.parquet", + placeholder: "URL or file:///path/to/file.parquet", + width:"100%", + submit:true +}); ``` +::: {.callout-tip collapse="true"} +#### Using a local cached file for faster performance + +DuckDB-WASM running in the browser **cannot access local files via `file://` URLs** due to browser security restrictions. However, you can use a local cached file when running `quarto preview`: + +**Local Development (recommended)** + +The repository includes a cached parquet file. To use it: + +1. Ensure the file exists in `docs/assets/oc_isamples_pqg.parquet` (691MB) + - The file must be in Quarto's output directory `docs/assets/`, not just the source `assets/` directory + - If needed, copy: `cp assets/oc_isamples_pqg.parquet docs/assets/` + +2. 
When running `quarto preview`, use the full localhost URL: + ``` + http://localhost:4979/assets/oc_isamples_pqg.parquet + ``` + (Replace `4979` with your actual preview port) + +**Alternative: Python HTTP server** +```bash +# In the directory containing your parquet file: +cd /path/to/your/parquet/directory +python3 -m http.server 8000 +``` + +Then use: `http://localhost:8000/oc_isamples_pqg.parquet` + +**Benefits of local cached file:** +- Much faster initial load (no network transfer) +- Works offline +- Matches the notebook's local file access pattern + +**Limitation:** Only works during local development, not on published GitHub Pages. +::: + ::: callout-warning #### Heads up: first interaction may be slow The first click or query can take a few seconds while the in‑browser database engine initializes and the remote Parquet file is fetched and indexed. Subsequent interactions are much faster because both the browser and DuckDB cache metadata and column chunks, so later queries reuse what was already loaded. 
@@ -77,19 +119,56 @@ async function loadData(query, params = [], waiting_id = null, key = "default") } locations = { - // get the content form the parquet file - const query = `SELECT pid, latitude, longitude FROM nodes WHERE otype='GeospatialCoordLocation'`; + // Get geographic locations with classification by usage type + const query = ` + WITH geo_classification AS ( + SELECT + geo.pid, + geo.latitude, + geo.longitude, + MAX(CASE WHEN e.p = 'sample_location' THEN 1 ELSE 0 END) as is_sample_location, + MAX(CASE WHEN e.p = 'site_location' THEN 1 ELSE 0 END) as is_site_location + FROM nodes geo + JOIN nodes e ON (geo.row_id = e.o[1]) + WHERE geo.otype = 'GeospatialCoordLocation' + GROUP BY geo.pid, geo.latitude, geo.longitude + ) + SELECT + pid, + latitude, + longitude, + CASE + WHEN is_sample_location = 1 AND is_site_location = 1 THEN 'both' + WHEN is_sample_location = 1 THEN 'sample_location_only' + WHEN is_site_location = 1 THEN 'site_location_only' + END as location_type + FROM geo_classification + `; const data = await loadData(query, [], "loading_1", "locations"); // Clear the existing PointPrimitiveCollection content.points.removeAll(); - //content.points = new Cesium.PointPrimitiveCollection(); - // create point primitives for cesium display + // Color and size styling by location type + const styles = { + sample_location_only: { + color: Cesium.Color.fromCssColorString('#2E86AB'), + size: 3 + }, // Blue - field collection points + site_location_only: { + color: Cesium.Color.fromCssColorString('#A23B72'), + size: 6 + }, // Purple - administrative markers + both: { + color: Cesium.Color.fromCssColorString('#F18F01'), + size: 5 + } // Orange - dual-purpose + }; + + // Create point primitives for cesium display const scalar = new Cesium.NearFarScalar(1.5e2, 2, 8.0e6, 0.2); - const color = Cesium.Color.PINK; - const point_size = 4; for (const row of data) { + const style = styles[row.location_type] || styles.both; // fallback to orange content.points.add({ id: 
row.pid, // https://cesium.com/learn/cesiumjs/ref-doc/Cartesian3.html#.fromDegrees @@ -98,8 +177,8 @@ locations = { row.latitude, //latitude 0,//randomCoordinateJitter(10.0, 10.0), //elevation, m ), - pixelSize: point_size, - color: color, + pixelSize: style.size, + color: style.color, scaleByDistance: scalar, }); } @@ -260,6 +339,66 @@ async function get_samples_2(pid) { return result ?? []; } +async function get_samples_at_geo_cord_location_via_sample_event(pid) { + if (pid === null || pid ==="" || pid == "unset") { + return []; + } + const q = ` + -- Path 1: Direct event location + SELECT DISTINCT + s.pid as sample_pid, + s.label as sample_label, + s.description as sample_description, + s.thumbnail_url, + s.alternate_identifiers, + event.label as event_label, + site.label as site_label, + site.pid as site_pid, + 'direct_event_location' as location_path + FROM nodes s + JOIN nodes e1 ON s.row_id = e1.s AND e1.p = 'produced_by' + JOIN nodes event ON e1.o[1] = event.row_id + JOIN nodes e2 ON event.row_id = e2.s AND e2.p = 'sample_location' + JOIN nodes g ON e2.o[1] = g.row_id + LEFT JOIN nodes e3 ON event.row_id = e3.s AND e3.p = 'sampling_site' + LEFT JOIN nodes site ON e3.o[1] = site.row_id + WHERE s.otype = 'MaterialSampleRecord' + AND event.otype = 'SamplingEvent' + AND g.otype = 'GeospatialCoordLocation' + AND g.pid = ? 
+ + UNION + + -- Path 2: Via site location + SELECT DISTINCT + s.pid as sample_pid, + s.label as sample_label, + s.description as sample_description, + s.thumbnail_url, + s.alternate_identifiers, + event.label as event_label, + site.label as site_label, + site.pid as site_pid, + 'via_site_location' as location_path + FROM nodes s + JOIN nodes e1 ON s.row_id = e1.s AND e1.p = 'produced_by' + JOIN nodes event ON e1.o[1] = event.row_id + JOIN nodes e2 ON event.row_id = e2.s AND e2.p = 'sampling_site' + JOIN nodes site ON e2.o[1] = site.row_id + JOIN nodes e3 ON site.row_id = e3.s AND e3.p = 'site_location' + JOIN nodes g ON e3.o[1] = g.row_id + WHERE s.otype = 'MaterialSampleRecord' + AND event.otype = 'SamplingEvent' + AND site.otype = 'SamplingSite' + AND g.otype = 'GeospatialCoordLocation' + AND g.pid = ? + + ORDER BY thumbnail_url IS NOT NULL DESC, sample_label + `; + const result = await loadData(q, [pid, pid], "loading_combined", "samples_combined"); + return result ?? []; +} + async function locationUsedBy(rowid){ if (rowid === undefined || rowid === null) { return []; @@ -273,6 +412,7 @@ mutable clickedPointId = "unset"; mutable geoLoading = false; mutable s1Loading = false; mutable s2Loading = false; +mutable combinedLoading = false; // Precompute selection-driven data with loading flags selectedGeoRecord = { @@ -302,9 +442,48 @@ selectedSamples2 = { } } +selectedSamplesCombined = { + mutable combinedLoading = true; + try { + return await get_samples_at_geo_cord_location_via_sample_event(clickedPointId); + } finally { + mutable combinedLoading = false; + } +} + md`Retrieved ${pointdata.length} locations from ${parquet_path}.`; ``` +```{ojs} +//| echo: false +// Center initial Cesium view on PKAP Survey Area and also set Home to PKAP! +{ + const viewer = content.viewer; + // PKAP Survey Area near Cyprus + // Source: https://opencontext.org/subjects/48fd434c-f6d3... 
+ const pkapLat = 34.987406; + const pkapLon = 33.708047; + const delta = 0.3; // degrees padding around point + const pkapRect = Cesium.Rectangle.fromDegrees( + pkapLon - delta, // west (lon) + pkapLat - delta, // south (lat) + pkapLon + delta, // east (lon) + pkapLat + delta // north (lat) + ); + + // Make the Home button go to PKAP as well + Cesium.Camera.DEFAULT_VIEW_RECTANGLE = pkapRect; + Cesium.Camera.DEFAULT_VIEW_FACTOR = 0.5; + + // Apply camera after the first render to avoid resize/tab visibility issues + const once = () => { + viewer.camera.setView({ destination: pkapRect }); + viewer.scene.postRender.removeEventListener(once); + }; + viewer.scene.postRender.addEventListener(once); +} +``` + ::: {.panel-tabset} ## Map @@ -321,10 +500,10 @@ md`Retrieved ${pointdata.length} locations from ${parquet_path}.`; viewof pointdata = { const data_table = Inputs.table(locations, { header: { - row_id:"Row ID", pid: "PID", latitude: "Latitude", - longitude: "Longitude" + longitude: "Longitude", + location_type: "Location Type" }, }); return data_table; @@ -361,6 +540,121 @@ ${JSON.stringify(testrecord, null, 2)} ` ``` +## Understanding Paths in the iSamples Property Graph + +### Why "Path 1" and "Path 2"? + +These terms describe the **two main ways to get from a MaterialSampleRecord to geographic coordinates**. They're not the only relationship paths in the graph, but they're the most commonly used for spatial queries. + +**Path 1 (Direct Event Location)** +``` +MaterialSampleRecord + → produced_by → +SamplingEvent + → sample_location → +GeospatialCoordLocation +``` + +**Path 2 (Via Sampling Site)** +``` +MaterialSampleRecord + → produced_by → +SamplingEvent + → sampling_site → +SamplingSite + → site_location → +GeospatialCoordLocation +``` + +**Key Differences:** +- **Path 1 is direct**: Event → Location (3 nodes, 2 hops from the sample) +- **Path 2 goes through Site**: Event → Site → Location (4 nodes, 3 hops from the sample) +- **Path 1** = "Where was this specific sample collected?" 
+- **Path 2** = "What named site is this sample from, and where is that site?" + +**Important:** The queries below use INNER JOIN for both paths, meaning samples must have connections through both paths to appear in results. Samples with only one path will be excluded. + +### Full Relationship Map (Beyond Path 1 and Path 2) + +The iSamples property graph contains many more relationships than just the geographic paths: + +``` + Agent + ↑ + | {responsibility, registrant} + | +MaterialSampleRecord ────produced_by──→ SamplingEvent ────sample_location──→ GeospatialCoordLocation + | | ↑ + | | | + | {keywords, └────sampling_site──→ SamplingSite ──site_location─┘ + | has_sample_object_type, + | has_material_category} + | + └──→ IdentifiedConcept +``` + +**Path Categories:** +- **PATH 1**: MaterialSampleRecord → SamplingEvent → GeospatialCoordLocation (direct location) +- **PATH 2**: MaterialSampleRecord → SamplingEvent → SamplingSite → GeospatialCoordLocation (via site) +- **AGENT PATH**: MaterialSampleRecord → SamplingEvent → Agent (who collected/registered) +- **CONCEPT PATH**: MaterialSampleRecord → IdentifiedConcept (types, keywords - direct, no event!) + +**Key Insight:** SamplingEvent is the central hub for most relationships, except concepts which attach directly to MaterialSampleRecord. + +### Query Pattern Analysis (from Eric Kansa's open-context-py) + +The following analysis is based on Eric's query functions that demonstrate different path traversal patterns: + +#### 1. `get_sample_data_via_sample_pid` - Uses BOTH Path 1 AND Path 2 +``` +MaterialSampleRecord (WHERE pid = ?) + → produced_by → SamplingEvent + ├─→ sample_location → GeospatialCoordLocation [Path 1] + └─→ sampling_site → SamplingSite [Path 2] + +Returns: sample metadata + lat/lon + site label/pid +Required: BOTH paths must exist (INNER JOIN) +``` + +#### 2. `get_sample_data_agents_sample_pid` - Uses AGENT PATH +``` +MaterialSampleRecord (WHERE pid = ?) 
+ → produced_by → SamplingEvent + → {responsibility, registrant} → Agent + +Returns: sample metadata + agent info (who collected/registered) +Independent of: Path 1 and Path 2 (no geographic data) +``` + +#### 3. `get_sample_types_and_keywords_via_sample_pid` - Uses CONCEPT PATH +``` +MaterialSampleRecord (WHERE pid = ?) + → {keywords, has_sample_object_type, has_material_category} → IdentifiedConcept + +Returns: sample metadata + classification keywords/types +Independent of: Path 1, Path 2, and SamplingEvent! +``` + +#### 4. `get_samples_at_geo_cord_location_via_sample_event` - REVERSE Path 1 + Path 2 +``` +GeospatialCoordLocation (WHERE pid = ?) ← START HERE (reverse!) + ← sample_location ← SamplingEvent [Path 1 REVERSED] + ├─→ sampling_site → SamplingSite [Path 2 enrichment] + └─← produced_by ← MaterialSampleRecord [complete chain] + +Returns: all samples at a given location + site info +Direction: geo → samples (opposite of other queries) +``` + +**Summary Table:** + +| Function | Path 1 | Path 2 | Direction | Notes | +|----------|--------|--------|-----------|-------| +| `get_sample_data_via_sample_pid` | ✅ Required | ✅ Required | Forward | INNER JOIN - no row if either missing | +| `get_sample_data_agents_sample_pid` | ❌ N/A | ❌ N/A | N/A | Uses agent path instead | +| `get_sample_types_and_keywords_via_sample_pid` | ❌ N/A | ❌ N/A | N/A | Direct edges to concepts | +| `get_samples_at_geo_cord_location_via_sample_event` | ✅ Either | ✅ Either | Reverse | Walks from geo to samples; as implemented on this page it is a `UNION` of a Path 1 query and a Path 2 query, so a sample is returned when either path reaches the location | + ## Related Sample Path 1 (selected)