From bbb09e97f2bdcea9a53a85392431570f8c917d5a Mon Sep 17 00:00:00 2001 From: Raymond Yee Date: Fri, 5 Dec 2025 08:25:17 -0800 Subject: [PATCH] refactor: Replace Path 1/Path 2 terminology with typed edge vocabulary MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace ambiguous "Path 1" and "Path 2" terminology with precise typed edge vocabulary throughout tutorial documentation: - Path 1 → "Direct Location (EVENT_SAMPLE_LOCATION)" - Path 2 → "Site-Mediated Location (EVENT_SAMPLING_SITE → SITE_LOCATION)" Add documentation of the 14 iSamples typed edge types which provide unambiguous vocabulary for describing relationships: - MSR_PRODUCED_BY, EVENT_SAMPLE_LOCATION, EVENT_SAMPLING_SITE, SITE_LOCATION, etc. Updated files: - parquet_cesium.qmd: Full documentation section rewrite, code comments, UI text - parquet_cesium_wide.qmd: Same refactoring with wide format column mappings - oc_parquet_enhanced.qmd: 2 Path references updated This change improves documentation precision and aligns with Eric's canonical query formulation which requires both direct location AND site provenance.
🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- tutorials/oc_parquet_enhanced.qmd | 10 +- tutorials/parquet_cesium.qmd | 193 +++++++++++++++++------------- tutorials/parquet_cesium_wide.qmd | 167 ++++++++++++++------------ 3 files changed, 206 insertions(+), 164 deletions(-) diff --git a/tutorials/oc_parquet_enhanced.qmd b/tutorials/oc_parquet_enhanced.qmd index ff4f329..0837bc6 100644 --- a/tutorials/oc_parquet_enhanced.qmd +++ b/tutorials/oc_parquet_enhanced.qmd @@ -190,16 +190,16 @@ viewof relationshipTable = Inputs.table(relationshipPatterns, { ❌ **Common Mistake**: Assuming direct Sample → Location relationships ✅ **Reality**: All location queries require multi-hop traversal through SamplingEvent -### The Correct Paths Discovered +### The Correct Traversals Discovered (Using Typed Edges) -**Path 1: Direct Event Location** +**Direct Location (via EVENT_SAMPLE_LOCATION)** ``` -MaterialSampleRecord → produced_by → SamplingEvent → sample_location → GeospatialCoordLocation +MaterialSampleRecord → MSR_PRODUCED_BY → SamplingEvent → EVENT_SAMPLE_LOCATION → GeospatialCoordLocation ``` -**Path 2: Via Site Location** +**Site-Mediated Location (via EVENT_SAMPLING_SITE → SITE_LOCATION)** ``` -MaterialSampleRecord → produced_by → SamplingEvent → sampling_site → SamplingSite → site_location → GeospatialCoordLocation +MaterialSampleRecord → MSR_PRODUCED_BY → SamplingEvent → EVENT_SAMPLING_SITE → SamplingSite → SITE_LOCATION → GeospatialCoordLocation ``` This discovery unlocked **1,096,274 samples** that were previously inaccessible due to incorrect query patterns!
diff --git a/tutorials/parquet_cesium.qmd b/tutorials/parquet_cesium.qmd index 8aa7e76..4ca952c 100644 --- a/tutorials/parquet_cesium.qmd +++ b/tutorials/parquet_cesium.qmd @@ -341,7 +341,7 @@ async function get_samples_1(pid) { if (pid === null || pid ==="" || pid == "unset") { return []; } - // Path 1: Direct event location - enhanced to match Eric's query structure + // Direct Location (EVENT_SAMPLE_LOCATION) - enhanced to match Eric's query structure const q = ` SELECT geo.latitude, @@ -395,7 +395,7 @@ async function get_samples_1(pid) { performance.mark('samples1-end'); performance.measure('samples1-query', 'samples1-start', 'samples1-end'); const queryTime = performance.getEntriesByName('samples1-query')[0].duration; - console.log(`Path 1 query executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); + console.log(`Direct location query (EVENT_SAMPLE_LOCATION) executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); return result ?? []; } @@ -403,7 +403,7 @@ async function get_samples_2(pid) { if (pid === null || pid ==="" || pid == "unset") { return []; } - // Path 2: Via site location - enhanced to match Eric's query structure + // Site-Mediated Location (EVENT_SAMPLING_SITE β†’ SITE_LOCATION) - enhanced to match Eric's query structure const q = ` SELECT geo.latitude, @@ -457,7 +457,7 @@ async function get_samples_2(pid) { performance.mark('samples2-end'); performance.measure('samples2-query', 'samples2-start', 'samples2-end'); const queryTime = performance.getEntriesByName('samples2-query')[0].duration; - console.log(`Path 2 query executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); + console.log(`Site-mediated location query (SITE_LOCATION) executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); return result ?? 
[]; } @@ -937,11 +937,11 @@ ${JSON.stringify(testrecord, null, 2)} This query implements Eric Kansa's authoritative `get_samples_at_geo_cord_location_via_sample_event` function from [open-context-py](https://github.com/ekansa/open-context-py/blob/staging/opencontext_py/apps/all_items/isamples/isamples_explore.py). -**Query Strategy (Path 1 Only)**: +**Query Strategy (EVENT_SAMPLE_LOCATION + EVENT_SAMPLING_SITE)**: - Starts at a GeospatialCoordLocation (clicked point) -- Walks **backward** via `sample_location` edges to find SamplingEvents that reference this location -- From those events, finds MaterialSampleRecords produced by them -- Requires site context (INNER JOIN on `sampling_site` β†’ SamplingSite) +- Walks **backward** via `EVENT_SAMPLE_LOCATION` edges to find SamplingEvents that reference this location +- From those events, finds MaterialSampleRecords produced by them via `MSR_PRODUCED_BY` +- Requires site context (INNER JOIN on `EVENT_SAMPLING_SITE` β†’ SamplingSite) - this is the provenance constraint **Returns**: - Geographic coordinates: `latitude`, `longitude` @@ -951,7 +951,7 @@ This query implements Eric Kansa's authoritative `get_samples_at_geo_cord_locati **Ordering**: Prioritizes samples with images (`ORDER BY has_thumbnail DESC`) -**Important**: This query only returns samples whose **sampling events directly reference this geolocation** via `sample_location` (Path 1). Samples that reach this location only through their site's `site_location` (Path 2) are **not included**. This means site marker locations may return 0 results if no events were recorded at that exact coordinate. +**Important**: This query only returns samples whose **sampling events directly reference this geolocation** via `EVENT_SAMPLE_LOCATION` (direct location). Samples that reach this location only through their site's `SITE_LOCATION` (site-mediated) are **not included**. 
This means site marker locations may return 0 results if no events were recorded at that exact coordinate. ```{ojs} //| echo: false @@ -1033,131 +1033,156 @@ html`${ ` : html`
- No samples found at this location via Path 1 (direct sampling events). + No samples found at this location via direct location (EVENT_SAMPLE_LOCATION).
` }` ``` -## Understanding Paths in the iSamples Property Graph +## Understanding Location Traversals Using Typed Edges -### Why "Path 1" and "Path 2"? +### The 14 iSamples Typed Edge Types -These terms describe the **two main ways to get from a MaterialSampleRecord to geographic coordinates**. They're not the only relationship paths in the graph, but they're the most commonly used for spatial queries. +The iSamples property graph uses **14 typed edges** to express all relationships. These provide a precise, unambiguous vocabulary for describing any query traversal: -**Path 1 (Direct Event Location)** +| Edge Type | From β†’ To | +|-----------|-----------| +| `MSR_PRODUCED_BY` | MaterialSampleRecord β†’ SamplingEvent | +| `MSR_REGISTRANT` | MaterialSampleRecord β†’ Agent | +| `MSR_KEYWORDS` | MaterialSampleRecord β†’ IdentifiedConcept | +| `MSR_HAS_CONTEXT_CATEGORY` | MaterialSampleRecord β†’ IdentifiedConcept | +| `MSR_HAS_MATERIAL_CATEGORY` | MaterialSampleRecord β†’ IdentifiedConcept | +| `MSR_HAS_SAMPLE_OBJECT_TYPE` | MaterialSampleRecord β†’ IdentifiedConcept | +| `MSR_CURATION` | MaterialSampleRecord β†’ MaterialSampleCuration | +| `MSR_RELATED_RESOURCE` | MaterialSampleRecord β†’ SampleRelation | +| `EVENT_SAMPLING_SITE` | SamplingEvent β†’ SamplingSite | +| `EVENT_SAMPLE_LOCATION` | SamplingEvent β†’ GeospatialCoordLocation | +| `EVENT_RESPONSIBILITY` | SamplingEvent β†’ Agent | +| `EVENT_HAS_CONTEXT_CATEGORY` | SamplingEvent β†’ IdentifiedConcept | +| `SITE_LOCATION` | SamplingSite β†’ GeospatialCoordLocation | +| `CURATION_RESPONSIBILITY` | MaterialSampleCuration β†’ Agent | + +### Two Ways to Get Geographic Coordinates + +There are **two traversal patterns** from a MaterialSampleRecord to geographic coordinates: + +**Direct Location (via EVENT_SAMPLE_LOCATION)** ``` MaterialSampleRecord - β†’ produced_by β†’ + β†’ MSR_PRODUCED_BY β†’ SamplingEvent - β†’ sample_location β†’ + β†’ EVENT_SAMPLE_LOCATION β†’ GeospatialCoordLocation ``` -**Path 2 (Via 
Sampling Site)** +**Site-Mediated Location (via EVENT_SAMPLING_SITE β†’ SITE_LOCATION)** ``` MaterialSampleRecord - β†’ produced_by β†’ + β†’ MSR_PRODUCED_BY β†’ SamplingEvent - β†’ sampling_site β†’ + β†’ EVENT_SAMPLING_SITE β†’ SamplingSite - β†’ site_location β†’ + β†’ SITE_LOCATION β†’ GeospatialCoordLocation ``` **Key Differences:** -- **Path 1 is direct**: Event β†’ Location (3 hops total) -- **Path 2 goes through Site**: Event β†’ Site β†’ Location (4 hops total) -- **Path 1** = "Where was this specific sample collected?" -- **Path 2** = "What named site is this sample from, and where is that site?" +- **Direct** uses `EVENT_SAMPLE_LOCATION`: 2 edges from Sample to Location +- **Site-Mediated** uses `EVENT_SAMPLING_SITE` + `SITE_LOCATION`: 3 edges from Sample to Location +- **Direct** = "Where was this specific sample collected?" +- **Site-Mediated** = "What named site is this sample from, and where is that site?" -**Important:** The queries below use INNER JOIN for both paths, meaning samples must have connections through both paths to appear in results. Samples with only one path will be excluded. +**Important:** The queries below use INNER JOIN for both traversals, meaning samples must have connections through both to appear in results. Samples with only one traversal will be excluded. 
-### Full Relationship Map (Beyond Path 1 and Path 2) +### Full Relationship Map Using Typed Edges -The iSamples property graph contains many more relationships than just the geographic paths: +The iSamples property graph contains many more relationships than just the geographic traversals: ``` Agent ↑ - | {responsibility, registrant} + | EVENT_RESPONSIBILITY, MSR_REGISTRANT | -MaterialSampleRecord ────produced_by──→ SamplingEvent ────sample_location──→ GeospatialCoordLocation +MaterialSampleRecord ─MSR_PRODUCED_BY─→ SamplingEvent ─EVENT_SAMPLE_LOCATION─→ GeospatialCoordLocation | | ↑ | | | - | {keywords, └────sampling_site──→ SamplingSite ──site_locationβ”€β”˜ - | has_sample_object_type, - | has_material_category} + | MSR_KEYWORDS, └─EVENT_SAMPLING_SITE─→ SamplingSite ─SITE_LOCATIONβ”€β”˜ + | MSR_HAS_SAMPLE_OBJECT_TYPE, + | MSR_HAS_MATERIAL_CATEGORY | └──→ IdentifiedConcept ``` -**Path Categories:** -- **PATH 1**: MaterialSampleRecord β†’ SamplingEvent β†’ GeospatialCoordLocation (direct location) -- **PATH 2**: MaterialSampleRecord β†’ SamplingEvent β†’ SamplingSite β†’ GeospatialCoordLocation (via site) -- **AGENT PATH**: MaterialSampleRecord β†’ SamplingEvent β†’ Agent (who collected/registered) -- **CONCEPT PATH**: MaterialSampleRecord β†’ IdentifiedConcept (types, keywords - direct, no event!) +**Edge Categories:** +- **DIRECT LOCATION**: `MSR_PRODUCED_BY` β†’ `EVENT_SAMPLE_LOCATION` +- **SITE-MEDIATED LOCATION**: `MSR_PRODUCED_BY` β†’ `EVENT_SAMPLING_SITE` β†’ `SITE_LOCATION` +- **AGENT EDGES**: `MSR_REGISTRANT`, `EVENT_RESPONSIBILITY` +- **CONCEPT EDGES**: `MSR_KEYWORDS`, `MSR_HAS_*_CATEGORY`, `MSR_HAS_SAMPLE_OBJECT_TYPE` -**Key Insight:** SamplingEvent is the central hub for most relationships, except concepts which attach directly to MaterialSampleRecord. +**Key Insight:** SamplingEvent is the central hub for most relationships, except concept edges which attach directly to MaterialSampleRecord. 
### Query Pattern Analysis (from Eric Kansa's open-context-py) -The following analysis is based on Eric's query functions that demonstrate different path traversal patterns: +The following analysis is based on Eric's query functions that demonstrate different edge traversal patterns: -#### 1. `get_sample_data_via_sample_pid` - Uses BOTH Path 1 AND Path 2 +#### 1. `get_sample_data_via_sample_pid` - Uses BOTH Location Traversals ``` MaterialSampleRecord (WHERE pid = ?) - β†’ produced_by β†’ SamplingEvent - β”œβ”€β†’ sample_location β†’ GeospatialCoordLocation [Path 1] - └─→ sampling_site β†’ SamplingSite [Path 2] + β†’ MSR_PRODUCED_BY β†’ SamplingEvent + β”œβ”€β†’ EVENT_SAMPLE_LOCATION β†’ GeospatialCoordLocation [Direct] + └─→ EVENT_SAMPLING_SITE β†’ SamplingSite [Site-Mediated] Returns: sample metadata + lat/lon + site label/pid -Required: BOTH paths must exist (INNER JOIN) +Required: BOTH traversals must exist (INNER JOIN) ``` -#### 2. `get_sample_data_agents_sample_pid` - Uses AGENT PATH +#### 2. `get_sample_data_agents_sample_pid` - Uses Agent Edges ``` MaterialSampleRecord (WHERE pid = ?) - β†’ produced_by β†’ SamplingEvent - β†’ {responsibility, registrant} β†’ Agent + β†’ MSR_PRODUCED_BY β†’ SamplingEvent + β†’ EVENT_RESPONSIBILITY β†’ Agent Returns: sample metadata + agent info (who collected/registered) -Independent of: Path 1 and Path 2 (no geographic data) +Independent of: Location traversals (no geographic data) ``` -#### 3. `get_sample_types_and_keywords_via_sample_pid` - Uses CONCEPT PATH +#### 3. `get_sample_types_and_keywords_via_sample_pid` - Uses Concept Edges ``` MaterialSampleRecord (WHERE pid = ?) - β†’ {keywords, has_sample_object_type, has_material_category} β†’ IdentifiedConcept + β†’ MSR_KEYWORDS β†’ IdentifiedConcept + β†’ MSR_HAS_SAMPLE_OBJECT_TYPE β†’ IdentifiedConcept + β†’ MSR_HAS_MATERIAL_CATEGORY β†’ IdentifiedConcept Returns: sample metadata + classification keywords/types -Independent of: Path 1, Path 2, and SamplingEvent! 
+Independent of: Location traversals and SamplingEvent! ``` -#### 4. `get_samples_at_geo_cord_location_via_sample_event` - REVERSE Path 1 + Path 2 +#### 4. `get_samples_at_geo_cord_location_via_sample_event` - Eric's Canonical Query ``` GeospatialCoordLocation (WHERE pid = ?) ← START HERE (reverse!) - ← sample_location ← SamplingEvent [Path 1 REVERSED] - β”œβ”€β†’ sampling_site β†’ SamplingSite [Path 2 enrichment] - └─← produced_by ← MaterialSampleRecord [complete chain] + ← EVENT_SAMPLE_LOCATION ← SamplingEvent [Direct, reversed] + β”œβ”€β†’ EVENT_SAMPLING_SITE β†’ SamplingSite [Site context - provenance!] + └─← MSR_PRODUCED_BY ← MaterialSampleRecord [complete chain] Returns: all samples at a given location + site info Direction: geo β†’ samples (opposite of other queries) ``` +**Key Insight (from Eric Kansa):** This canonical query requires BOTH `EVENT_SAMPLE_LOCATION` AND `EVENT_SAMPLING_SITE` - it's not just "find samples at this location" but "find samples at this location that have proper site provenance." 
+ **Summary Table:** -| Function | Path 1 | Path 2 | Direction | Notes | -|----------|--------|--------|-----------|-------| +| Function | Direct Location | Site-Mediated | Direction | Notes | +|----------|-----------------|---------------|-----------|-------| | `get_sample_data_via_sample_pid` | βœ… Required | βœ… Required | Forward | INNER JOIN - no row if either missing | -| `get_sample_data_agents_sample_pid` | ❌ N/A | ❌ N/A | N/A | Uses agent path instead | -| `get_sample_types_and_keywords_via_sample_pid` | ❌ N/A | ❌ N/A | N/A | Direct edges to concepts | -| `get_samples_at_geo_cord_location_via_sample_event` | βœ… Required | βœ… Required | Reverse | Walks from geo to samples | +| `get_sample_data_agents_sample_pid` | ❌ N/A | ❌ N/A | N/A | Uses agent edges instead | +| `get_sample_types_and_keywords_via_sample_pid` | ❌ N/A | ❌ N/A | N/A | Direct concept edges | +| `get_samples_at_geo_cord_location_via_sample_event` | βœ… Required | βœ… Required | Reverse | Geo β†’ samples with site provenance | -## Related Sample Path 1 (selected) +## Related Samples via Direct Location (EVENT_SAMPLE_LOCATION) - + -Path 1 (direct_event_location): find MaterialSampleRecord items whose producing SamplingEvent has a direct sample_location pointing to the clicked GeospatialCoordLocation (pid). +Direct Location (via EVENT_SAMPLE_LOCATION): find MaterialSampleRecord items whose producing SamplingEvent has a direct `sample_location` edge pointing to the clicked GeospatialCoordLocation (pid). - Chain: MaterialSampleRecord β†’ produced_by β†’ SamplingEvent β†’ sample_location β†’ GeospatialCoordLocation (clicked pid) - This matches the "direct_samples" concept in the Python notebook and is labeled as `location_path = 'direct_event_location'` in the query. @@ -1171,7 +1196,7 @@ samples_1 = selectedSamples1 //| echo: false html`${ s1Loading ? - html`
Loading Path 1 samples…
` + html`
Loading direct location samples…
` : samples_1 && samples_1.length > 0 ? html`
@@ -1238,23 +1263,23 @@ html`${
- Found ${samples_1.length} sample${samples_1.length !== 1 ? 's' : ''} via Path 1 (direct event location) + Found ${samples_1.length} sample${samples_1.length !== 1 ? 's' : ''} via direct location (EVENT_SAMPLE_LOCATION)
` : html`
- No samples found via Path 1 (direct event location). + No samples found via direct location (EVENT_SAMPLE_LOCATION).
` }` ``` -## Related Sample Path 2 (selected) +## Related Samples via Site-Mediated Location (EVENT_SAMPLING_SITE β†’ SITE_LOCATION) - + -Path 2 (via_site_location): find MaterialSampleRecord items whose producing SamplingEvent references a SamplingSite, and that site’s site_location points to the clicked GeospatialCoordLocation (pid). +Site-Mediated Location (via EVENT_SAMPLING_SITE β†’ SITE_LOCATION): find MaterialSampleRecord items whose producing SamplingEvent references a SamplingSite via `sampling_site` edge, and that site's `site_location` edge points to the clicked GeospatialCoordLocation (pid). -- Chain: MaterialSampleRecord β†’ produced_by β†’ SamplingEvent β†’ sampling_site β†’ SamplingSite β†’ site_location β†’ GeospatialCoordLocation (clicked pid) +- Chain: MaterialSampleRecord β†’ MSR_PRODUCED_BY β†’ SamplingEvent β†’ EVENT_SAMPLING_SITE β†’ SamplingSite β†’ SITE_LOCATION β†’ GeospatialCoordLocation (clicked pid) - This matches the "samples_via_sites" concept in the Python notebook and is labeled as `location_path = 'via_site_location'` in the query. ```{ojs} @@ -1266,7 +1291,7 @@ samples_2 = selectedSamples2 //| echo: false html`${ s2Loading ? - html`
Loading Path 2 samples…
` + html`
Loading site-mediated samples…
` : samples_2 && samples_2.length > 0 ? html`
@@ -1333,11 +1358,11 @@ html`${
- Found ${samples_2.length} sample${samples_2.length !== 1 ? 's' : ''} via Path 2 (via site location) + Found ${samples_2.length} sample${samples_2.length !== 1 ? 's' : ''} via site-mediated location (SITE_LOCATION)
` : html`
- No samples found via Path 2 (via site location). + No samples found via site-mediated location (SITE_LOCATION).
` }` ``` @@ -1349,24 +1374,24 @@ html`${ **Current implementation**: GeospatialCoordLocations are now color-coded by their semantic role in the property graph: -- πŸ”΅ **Blue (small)** - `sample_location_only`: Precise field collection points (Path 1) -- 🟣 **Purple (large)** - `site_location_only`: Administrative site markers (Path 2) -- 🟠 **Orange (medium)** - `both`: Dual-purpose locations (used for both Path 1 and Path 2) +- πŸ”΅ **Blue (small)** - `sample_location_only`: Precise field collection points (via EVENT_SAMPLE_LOCATION) +- 🟣 **Purple (large)** - `site_location_only`: Administrative site markers (via SITE_LOCATION) +- 🟠 **Orange (medium)** - `both`: Dual-purpose locations (used by both edge types) **Discovery**: Analysis of the OpenContext parquet data reveals that geos fall into three distinct categories based on their usage: -1. **`sample_location_only`**: Precise field collection points (Path 1) +1. **`sample_location_only`**: Precise field collection points (via EVENT_SAMPLE_LOCATION) - Most common category - Represents exact GPS coordinates where sampling events occurred - Varies per event, even within the same site -2. **`site_location_only`**: Administrative site markers (Path 2) +2. **`site_location_only`**: Administrative site markers (via SITE_LOCATION) - Represents general/reference locations for named archaeological sites - One coordinate per site - May not correspond to any actual collection point 3. **`both`**: 10,346 geos (5.2%) - Dual-purpose locations - - Used as BOTH `sample_location` AND `site_location` + - Used by BOTH `EVENT_SAMPLE_LOCATION` AND `SITE_LOCATION` edges - Primarily single-location sites (85% of all sites) - Occasionally one of many locations at multi-location sites (e.g., PKAP) @@ -1379,11 +1404,11 @@ html`${ ### Benefits of Current Implementation -1. **Educational**: Makes Path 1 vs Path 2 distinction visually concrete +1. 
**Educational**: Makes direct vs site-mediated location distinction visually concrete - Users can SEE the semantic difference between precise and administrative locations - - Blue points show where samples were actually collected (Path 1) - - Purple points show administrative site markers (Path 2) - - Demonstrates the complementary nature of the two geographic paths + - Blue points show where samples were actually collected (EVENT_SAMPLE_LOCATION) + - Purple points show administrative site markers (SITE_LOCATION) + - Demonstrates the complementary nature of the two location edge types 2. **Exploratory**: Enables visual understanding of spatial patterns - Archaeological sites appear as purple markers (large points) @@ -1432,7 +1457,7 @@ User clicks PKAP Survey Area marker (purple) - ⬜ Convex hull/region drawing for distributed sites - ⬜ Dynamic statistics display on site selection -This implementation transforms the visualization from uniform points into a pedagogical tool that visually demonstrates the Path 1 vs Path 2 distinction in the iSamples metadata model architecture. +This implementation transforms the visualization from uniform points into a pedagogical tool that visually demonstrates the `EVENT_SAMPLE_LOCATION` vs `SITE_LOCATION` distinction in the iSamples typed edge model. ::: ``` diff --git a/tutorials/parquet_cesium_wide.qmd b/tutorials/parquet_cesium_wide.qmd index b507e5d..4449d8c 100644 --- a/tutorials/parquet_cesium_wide.qmd +++ b/tutorials/parquet_cesium_wide.qmd @@ -351,13 +351,13 @@ async function getGeoRecord(pid) { return rows && rows.length ? 
rows[0] : null; } -// WIDE FORMAT: Path 1 - Direct event location +// WIDE FORMAT: Direct Location (p__sample_location / EVENT_SAMPLE_LOCATION) // Uses p__sample_location column instead of edge row JOINs async function get_samples_1(pid) { if (pid === null || pid ==="" || pid == "unset") { return []; } - // Path 1: Direct event location - WIDE FORMAT version + // Direct Location (EVENT_SAMPLE_LOCATION) - WIDE FORMAT version // Uses p__* columns instead of edge rows const q = ` SELECT @@ -397,17 +397,17 @@ async function get_samples_1(pid) { performance.mark('samples1-end'); performance.measure('samples1-query', 'samples1-start', 'samples1-end'); const queryTime = performance.getEntriesByName('samples1-query')[0].duration; - console.log(`Path 1 query (wide) executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); + console.log(`Direct location query (wide) executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); return result ?? []; } -// WIDE FORMAT: Path 2 - Via site location +// WIDE FORMAT: Site-Mediated Location (p__sampling_site β†’ p__site_location) // Uses p__site_location and p__sampling_site columns async function get_samples_2(pid) { if (pid === null || pid ==="" || pid == "unset") { return []; } - // Path 2: Via site location - WIDE FORMAT version + // Site-Mediated Location (SITE_LOCATION) - WIDE FORMAT version const q = ` SELECT geo.latitude, @@ -446,11 +446,11 @@ async function get_samples_2(pid) { performance.mark('samples2-end'); performance.measure('samples2-query', 'samples2-start', 'samples2-end'); const queryTime = performance.getEntriesByName('samples2-query')[0].duration; - console.log(`Path 2 query (wide) executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); + console.log(`Site-mediated location query (wide) executed in ${queryTime.toFixed(0)}ms - retrieved ${result?.length || 0} samples`); return result ?? 
[]; } -// WIDE FORMAT: Eric Kansa's authoritative query (Path 1 only) +// WIDE FORMAT: Eric Kansa's authoritative query (p__sample_location + p__sampling_site) // This is the wide format equivalent of get_samples_at_geo_cord_location_via_sample_event async function get_samples_at_geo_cord_location_via_sample_event(pid) { if (pid === null || pid ==="" || pid == "unset") { @@ -959,7 +959,7 @@ JOIN nodes AS se ON (se.otype = 'SamplingEvent' AND list_contains(se.p__sample_l This is typically **2-4x faster** over HTTP. ::: -**Query Strategy (Path 1 Only)**: +**Query Strategy (p__sample_location + p__sampling_site)**: - Starts at a GeospatialCoordLocation (clicked point) - Walks **backward** via `p__sample_location` column to find SamplingEvents that reference this location - From those events, finds MaterialSampleRecords via `p__produced_by` column @@ -973,7 +973,7 @@ This is typically **2-4x faster** over HTTP. **Ordering**: Prioritizes samples with images (`ORDER BY has_thumbnail DESC`) -**Important**: This query only returns samples whose **sampling events directly reference this geolocation** via `p__sample_location` (Path 1). Samples that reach this location only through their site's `p__site_location` (Path 2) are **not included**. This means site marker locations may return 0 results if no events were recorded at that exact coordinate. +**Important**: This query only returns samples whose **sampling events directly reference this geolocation** via `p__sample_location` (EVENT_SAMPLE_LOCATION). Samples that reach this location only through their site's `p__site_location` (SITE_LOCATION) are **not included**. This means site marker locations may return 0 results if no events were recorded at that exact coordinate. ```{ojs} //| echo: false @@ -1055,18 +1055,33 @@ html`${ ` : html`
- No samples found at this location via Path 1 (direct sampling events). + No samples found at this location via direct location (p__sample_location).
` }` ``` -## Understanding Paths in the iSamples Property Graph +## Understanding Location Traversals Using Typed Edges (Wide Format) -### Why "Path 1" and "Path 2"? +### The 14 iSamples Typed Edge Types -These terms describe the **two main ways to get from a MaterialSampleRecord to geographic coordinates**. They're not the only relationship paths in the graph, but they're the most commonly used for spatial queries. +The iSamples property graph uses **14 typed edges** to express all relationships. In wide format, these are represented as `p__*` columns on entity rows rather than separate edge rows. The table below maps the 10 edge types referenced in this tutorial to their columns; the remaining four (`MSR_CURATION`, `MSR_RELATED_RESOURCE`, `EVENT_HAS_CONTEXT_CATEGORY`, `CURATION_RESPONSIBILITY`) are not listed here: -**Path 1 (Direct Event Location) - Wide Format** +| Edge Type | Wide Format Column | From → To | +|-----------|-------------------|-----------| +| `MSR_PRODUCED_BY` | `p__produced_by` | MaterialSampleRecord → SamplingEvent | +| `MSR_REGISTRANT` | `p__registrant` | MaterialSampleRecord → Agent | +| `MSR_KEYWORDS` | `p__keywords` | MaterialSampleRecord → IdentifiedConcept | +| `MSR_HAS_CONTEXT_CATEGORY` | `p__has_context_category` | MaterialSampleRecord → IdentifiedConcept | +| `MSR_HAS_MATERIAL_CATEGORY` | `p__has_material_category` | MaterialSampleRecord → IdentifiedConcept | +| `MSR_HAS_SAMPLE_OBJECT_TYPE` | `p__has_sample_object_type` | MaterialSampleRecord → IdentifiedConcept | +| `EVENT_SAMPLING_SITE` | `p__sampling_site` | SamplingEvent → SamplingSite | +| `EVENT_SAMPLE_LOCATION` | `p__sample_location` | SamplingEvent → GeospatialCoordLocation | +| `EVENT_RESPONSIBILITY` | `p__responsibility` | SamplingEvent → Agent | +| `SITE_LOCATION` | `p__site_location` | SamplingSite → GeospatialCoordLocation | + +### Two Ways to Get Geographic Coordinates (Wide Format) + +**Direct Location (via p__sample_location / EVENT_SAMPLE_LOCATION)** ``` MaterialSampleRecord → p__produced_by → @@ -1075,7 +1090,7 @@ SamplingEvent GeospatialCoordLocation ``` -**Path 2 (Via Sampling Site) - Wide Format** +**Site-Mediated Location (via p__sampling_site +
p__site_location)** ``` MaterialSampleRecord β†’ p__produced_by β†’ @@ -1087,101 +1102,103 @@ GeospatialCoordLocation ``` **Key Differences:** -- **Path 1 is direct**: Event β†’ Location (3 hops total) -- **Path 2 goes through Site**: Event β†’ Site β†’ Location (4 hops total) -- **Path 1** = "Where was this specific sample collected?" -- **Path 2** = "What named site is this sample from, and where is that site?" +- **Direct** uses `p__sample_location`: 2 edges from Sample to Location +- **Site-Mediated** uses `p__sampling_site` + `p__site_location`: 3 edges from Sample to Location +- **Direct** = "Where was this specific sample collected?" +- **Site-Mediated** = "What named site is this sample from, and where is that site?" -**Wide Format Advantage**: Instead of JOINing through separate edge rows (otype='_edge_'), we directly access the `p__*` columns on entity rows. +**Wide Format Advantage**: Instead of JOINing through separate edge rows (otype='_edge_'), we directly access the `p__*` columns on entity rows using `list_contains()`. -**Important:** The queries below use INNER JOIN for both paths, meaning samples must have connections through both paths to appear in results. Samples with only one path will be excluded. +**Important:** The queries below use INNER JOIN for both traversals, meaning samples must have connections through both to appear in results. Samples with only one traversal will be excluded. 
-### Full Relationship Map (Beyond Path 1 and Path 2) - -The iSamples property graph contains many more relationships than just the geographic paths: +### Full Relationship Map Using Typed Edges (Wide Format) ``` Agent ↑ - | {p__responsibility, p__registrant} + | p__responsibility, p__registrant | -MaterialSampleRecord ──p__produced_by──→ SamplingEvent ──p__sample_location──→ GeospatialCoordLocation +MaterialSampleRecord ─p__produced_by─→ SamplingEvent ─p__sample_location─→ GeospatialCoordLocation | | ↑ | | | - | {p__keywords, └──p__sampling_site──→ SamplingSite ──p__site_locationβ”€β”˜ - | p__has_sample_object_type, - | p__has_material_category} + | p__keywords, └─p__sampling_site─→ SamplingSite ─p__site_locationβ”€β”˜ + | p__has_sample_object_type, + | p__has_material_category | └──→ IdentifiedConcept ``` -**Path Categories (Wide Format):** -- **PATH 1**: MaterialSampleRecord β†’ SamplingEvent β†’ GeospatialCoordLocation (via `p__produced_by`, `p__sample_location`) -- **PATH 2**: MaterialSampleRecord β†’ SamplingEvent β†’ SamplingSite β†’ GeospatialCoordLocation (via `p__sampling_site`, `p__site_location`) -- **AGENT PATH**: MaterialSampleRecord β†’ SamplingEvent β†’ Agent (via `p__responsibility`, `p__registrant`) -- **CONCEPT PATH**: MaterialSampleRecord β†’ IdentifiedConcept (via `p__keywords`, `p__has_sample_object_type`, `p__has_material_category` - direct, no event!) +**Edge Categories (Wide Format Columns):** +- **DIRECT LOCATION**: `p__produced_by` β†’ `p__sample_location` +- **SITE-MEDIATED LOCATION**: `p__produced_by` β†’ `p__sampling_site` β†’ `p__site_location` +- **AGENT EDGES**: `p__registrant`, `p__responsibility` +- **CONCEPT EDGES**: `p__keywords`, `p__has_*_category`, `p__has_sample_object_type` -**Key Insight:** SamplingEvent is the central hub for most relationships, except concepts which attach directly to MaterialSampleRecord. 
+**Key Insight:** SamplingEvent is the central hub for most relationships, except concept edges which attach directly to MaterialSampleRecord. -### Query Pattern Analysis (Wide Format Translations) +### Query Pattern Analysis (Wide Format) -The following analysis shows Eric's query functions translated to wide format: +The following analysis shows Eric's query functions in wide format: -#### 1. `get_sample_data_via_sample_pid` - Uses BOTH Path 1 AND Path 2 +#### 1. `get_sample_data_via_sample_pid` - Uses BOTH Location Traversals ``` MaterialSampleRecord (WHERE pid = ?) β†’ p__produced_by β†’ SamplingEvent - β”œβ”€β†’ p__sample_location β†’ GeospatialCoordLocation [Path 1] - └─→ p__sampling_site β†’ SamplingSite [Path 2] + β”œβ”€β†’ p__sample_location β†’ GeospatialCoordLocation [Direct] + └─→ p__sampling_site β†’ SamplingSite [Site-Mediated] Returns: sample metadata + lat/lon + site label/pid -Required: BOTH paths must exist (INNER JOIN) +Required: BOTH traversals must exist (INNER JOIN) ``` -#### 2. `get_sample_data_agents_sample_pid` - Uses AGENT PATH +#### 2. `get_sample_data_agents_sample_pid` - Uses Agent Edges ``` MaterialSampleRecord (WHERE pid = ?) β†’ p__produced_by β†’ SamplingEvent β†’ p__responsibility β†’ Agent Returns: sample metadata + agent info (who collected/registered) -Independent of: Path 1 and Path 2 (no geographic data) +Independent of: Location traversals (no geographic data) ``` -#### 3. `get_sample_types_and_keywords_via_sample_pid` - Uses CONCEPT PATH +#### 3. `get_sample_types_and_keywords_via_sample_pid` - Uses Concept Edges ``` MaterialSampleRecord (WHERE pid = ?) - β†’ {p__keywords, p__has_sample_object_type, p__has_material_category} β†’ IdentifiedConcept + β†’ p__keywords β†’ IdentifiedConcept + β†’ p__has_sample_object_type β†’ IdentifiedConcept + β†’ p__has_material_category β†’ IdentifiedConcept Returns: sample metadata + classification keywords/types -Independent of: Path 1, Path 2, and SamplingEvent! 
+Independent of: Location traversals and SamplingEvent! ``` -#### 4. `get_samples_at_geo_cord_location_via_sample_event` - REVERSE Path 1 + Path 2 +#### 4. `get_samples_at_geo_cord_location_via_sample_event` - Eric's Canonical Query ``` GeospatialCoordLocation (WHERE pid = ?) ← START HERE (reverse!) - ← p__sample_location ← SamplingEvent [Path 1 REVERSED] - ├─→ p__sampling_site → SamplingSite [Path 2 enrichment] + ← p__sample_location ← SamplingEvent [Direct, reversed] + ├─→ p__sampling_site → SamplingSite [Site context - provenance!] └─← p__produced_by ← MaterialSampleRecord [complete chain] Returns: all samples at a given location + site info Direction: geo → samples (opposite of other queries) ``` +**Key Insight (from Eric Kansa):** This canonical query requires BOTH `p__sample_location` AND `p__sampling_site` - it's not just "find samples at this location" but "find samples at this location that have proper site provenance." + **Summary Table:** -| Function | Path 1 | Path 2 | Direction | Notes | -|----------|--------|--------|-----------|-------| +| Function | Direct Location | Site-Mediated | Direction | Notes | +|----------|-----------------|---------------|-----------|-------| | `get_sample_data_via_sample_pid` | ✅ Required | ✅ Required | Forward | INNER JOIN - no row if either missing | -| `get_sample_data_agents_sample_pid` | ❌ N/A | ❌ N/A | N/A | Uses agent path instead | -| `get_sample_types_and_keywords_via_sample_pid` | ❌ N/A | ❌ N/A | N/A | Direct edges to concepts | -| `get_samples_at_geo_cord_location_via_sample_event` | ✅ Required | ✅ Required | Reverse | Walks from geo to samples | +| `get_sample_data_agents_sample_pid` | ❌ N/A | ❌ N/A | N/A | Uses agent edges instead | +| `get_sample_types_and_keywords_via_sample_pid` | ❌ N/A | ❌ N/A | N/A | Direct concept edges | +| `get_samples_at_geo_cord_location_via_sample_event` | ✅ Required | ✅ Required | Reverse | Geo → samples with site provenance | -## Related Sample 
Path 1 (selected) +## Related Samples via Direct Location (p__sample_location) - + -Path 1 (direct_event_location): find MaterialSampleRecord items whose producing SamplingEvent has a direct `p__sample_location` pointing to the clicked GeospatialCoordLocation (pid). +Direct Location (via p__sample_location / EVENT_SAMPLE_LOCATION): find MaterialSampleRecord items whose producing SamplingEvent has a direct `p__sample_location` pointing to the clicked GeospatialCoordLocation (pid). - Chain: MaterialSampleRecord → p__produced_by → SamplingEvent → p__sample_location → GeospatialCoordLocation (clicked pid) - This matches the "direct_samples" concept in the Python notebook and is labeled as `location_path = 'direct_event_location'` in the query. @@ -1195,7 +1212,7 @@ samples_1 = selectedSamples1 //| echo: false html`${ s1Loading ? - html`
Loading Path 1 samples…
` + html`
Loading direct location samples…
` : samples_1 && samples_1.length > 0 ? html`
@@ -1262,21 +1279,21 @@ html`${
- Found ${samples_1.length} sample${samples_1.length !== 1 ? 's' : ''} via Path 1 (direct event location) + Found ${samples_1.length} sample${samples_1.length !== 1 ? 's' : ''} via direct location (EVENT_SAMPLE_LOCATION)
` : html`
- No samples found via Path 1 (direct event location). + No samples found via direct location (EVENT_SAMPLE_LOCATION).
` }` ``` -## Related Sample Path 2 (selected) +## Related Samples via Site-Mediated Location (p__site_location) - + -Path 2 (via_site_location): find MaterialSampleRecord items whose producing SamplingEvent references a SamplingSite via `p__sampling_site`, and that site's `p__site_location` points to the clicked GeospatialCoordLocation (pid). +Site-Mediated Location (SITE_LOCATION): find MaterialSampleRecord items whose producing SamplingEvent references a SamplingSite via `p__sampling_site`, and that site's `p__site_location` points to the clicked GeospatialCoordLocation (pid). - Chain: MaterialSampleRecord → p__produced_by → SamplingEvent → p__sampling_site → SamplingSite → p__site_location → GeospatialCoordLocation (clicked pid) - This matches the "samples_via_sites" concept in the Python notebook and is labeled as `location_path = 'via_site_location'` in the query. @@ -1290,7 +1307,7 @@ samples_2 = selectedSamples2 //| echo: false html`${ s2Loading ? - html`
Loading Path 2 samples…
` + html`
Loading site-mediated samples…
` : samples_2 && samples_2.length > 0 ? html`
@@ -1357,11 +1374,11 @@ html`${
- Found ${samples_2.length} sample${samples_2.length !== 1 ? 's' : ''} via Path 2 (via site location) + Found ${samples_2.length} sample${samples_2.length !== 1 ? 's' : ''} via site-mediated location (SITE_LOCATION)
` : html`
- No samples found via Path 2 (via site location). + No samples found via site-mediated location (SITE_LOCATION).
` }` ``` @@ -1373,18 +1390,18 @@ html`${ **Current implementation**: GeospatialCoordLocations are now color-coded by their semantic role in the property graph: -- 🔵 **Blue (small)** - `sample_location_only`: Precise field collection points (Path 1) -- 🟣 **Purple (large)** - `site_location_only`: Administrative site markers (Path 2) -- 🟠 **Orange (medium)** - `both`: Dual-purpose locations (used for both Path 1 and Path 2) +- 🔵 **Blue (small)** - `sample_location_only`: Precise field collection points (EVENT_SAMPLE_LOCATION) +- 🟣 **Purple (large)** - `site_location_only`: Administrative site markers (SITE_LOCATION) +- 🟠 **Orange (medium)** - `both`: Dual-purpose locations (used for both direct and site-mediated location) **Discovery**: Analysis of the OpenContext parquet data reveals that geos fall into three distinct categories based on their usage: -1. **`sample_location_only`**: Precise field collection points (Path 1) +1. **`sample_location_only`**: Precise field collection points (EVENT_SAMPLE_LOCATION) - Most common category - Represents exact GPS coordinates where sampling events occurred - Varies per event, even within the same site -2. **`site_location_only`**: Administrative site markers (Path 2) +2. **`site_location_only`**: Administrative site markers (SITE_LOCATION) - Represents general/reference locations for named archaeological sites - One coordinate per site - May not correspond to any actual collection point @@ -1403,10 +1420,10 @@ html`${ ### Benefits of Current Implementation -1. **Educational**: Makes Path 1 vs Path 2 distinction visually concrete +1. 
**Educational**: Makes EVENT_SAMPLE_LOCATION vs SITE_LOCATION distinction visually concrete - Users can SEE the semantic difference between precise and administrative locations - - Blue points show where samples were actually collected (Path 1) - - Purple points show administrative site markers (Path 2) + - Blue points show where samples were actually collected (EVENT_SAMPLE_LOCATION) + - Purple points show administrative site markers (SITE_LOCATION) - Demonstrates the complementary nature of the two geographic paths 2. **Exploratory**: Enables visual understanding of spatial patterns @@ -1452,7 +1469,7 @@ LEFT JOIN nodes AS site ON (list_contains(site.p__site_location, geo.row_id)) - ⬜ Convex hull/region drawing for distributed sites - ⬜ Dynamic statistics display on site selection -This implementation transforms the visualization from uniform points into a pedagogical tool that visually demonstrates the Path 1 vs Path 2 distinction in the iSamples metadata model architecture. +This implementation transforms the visualization from uniform points into a pedagogical tool that visually demonstrates the EVENT_SAMPLE_LOCATION vs SITE_LOCATION distinction in the iSamples metadata model architecture. :::