diff --git a/rust/lance/src/dataset/mem_wal/index.rs b/rust/lance/src/dataset/mem_wal/index.rs index 116ea6c60ce..6f173109eff 100644 --- a/rust/lance/src/dataset/mem_wal/index.rs +++ b/rust/lance/src/dataset/mem_wal/index.rs @@ -18,10 +18,14 @@ mod arena_skiplist; mod btree; mod fts; mod hnsw; +mod pk_key; use std::collections::HashMap; +use std::sync::Arc; use std::sync::atomic::{AtomicUsize, Ordering}; +use datafusion::common::ScalarValue; + use super::memtable::batch_store::StoredBatch; use arrow_array::RecordBatch; use lance_core::datatypes::Schema as LanceSchema; @@ -44,6 +48,32 @@ pub type RowPosition = u64; pub use btree::{BTreeIndexConfig, BTreeMemIndex}; pub use fts::{FtsIndexConfig, FtsMemIndex, FtsQueryExpr, SearchOptions}; pub use hnsw::{HnswIndexConfig, HnswMemIndex}; +pub use pk_key::encode_pk_tuple; + +use pk_key::encode_pk_batch; + +/// Synthetic column the composite PK index is keyed on: the order-preserving +/// encoded tuple (see [`encode_pk_tuple`]), stored as `Binary` so a +/// [`BTreeMemIndex`]'s byte backend indexes it directly. +const PK_KEY_COLUMN: &str = "__pk_key__"; + +/// The memtable's primary-key index, used to answer "newest visible version of +/// this key" for dedup. Single-column PKs reuse the column's compact typed +/// [`BTreeMemIndex`] (no second copy); composite PKs key a `BTreeMemIndex` on +/// the order-preserving encoded tuple ([`encode_pk_tuple`]) instead. Either way +/// the lookup is a single seek on one `BTreeMemIndex`. +enum PkIndex { + /// Arity 1: aliases a `btree_indexes` entry, so the insert loop maintains it. + Single(Arc), + /// Arity >= 2: a `BTreeMemIndex` over the encoded-tuple `Binary` key, + /// maintained explicitly in the insert paths (the original batch lacks the + /// synthetic key column). `columns` are the PK columns in order, resolved + /// against each batch's schema at insert time. + Composite { + index: Arc, + columns: Vec, + }, +} // ============================================================================ // Index Store @@ -195,12 +225,17 @@ impl MemIndexConfig { /// therefore safe for scanners to read. Scanners snapshot this at plan /// construction time so every plan keys on a stable MVCC cursor. pub struct IndexStore { - /// BTree indexes keyed by index name. - btree_indexes: HashMap, + /// BTree indexes keyed by index name. `Arc` so the primary-key BTrees can be + /// shared into [`Self::pk_btrees`] without a second copy or a second insert. + btree_indexes: HashMap>, /// HNSW vector indexes keyed by index name. hnsw_indexes: HashMap, /// FTS indexes keyed by index name. fts_indexes: HashMap, + /// The primary-key index (single-column or composite), or `None` without a + /// primary key. Queried via [`Self::pk_newest_visible`] (see + /// [`Self::enable_pk_index`]). + pk_index: Option, /// Maximum batch position that is durable in the WAL and therefore /// visible to scanners. Advanced unconditionally after a WAL append /// succeeds; not gated on whether any indexes are configured. @@ -213,6 +248,7 @@ impl Default for IndexStore { btree_indexes: HashMap::new(), hnsw_indexes: HashMap::new(), fts_indexes: HashMap::new(), + pk_index: None, max_visible_batch_position: AtomicUsize::new(0), } } @@ -230,6 +266,16 @@ impl std::fmt::Debug for IndexStore { &self.hnsw_indexes.keys().collect::>(), ) .field("fts_indexes", &self.fts_indexes.keys().collect::>()) + .field( + "pk_index", + &match &self.pk_index { + None => "none".to_string(), + Some(PkIndex::Single(b)) => format!("single({})", b.column_name()), + Some(PkIndex::Composite { columns, .. }) => { + format!("composite({})", columns.join(", ")) + } + }, + ) .field( "max_visible_batch_position", &self.max_visible_batch_position.load(Ordering::Acquire), @@ -264,7 +310,7 @@ impl IndexStore { for config in configs { match config { MemIndexConfig::BTree(c) => { - let index = BTreeMemIndex::new(c.field_id, c.column.clone()); + let index = Arc::new(BTreeMemIndex::new(c.field_id, c.column.clone())); registry.btree_indexes.insert(c.name.clone(), index); } MemIndexConfig::Hnsw(c) => { @@ -293,7 +339,7 @@ impl IndexStore { /// the production memtable path goes through [`Self::from_configs`]. pub fn add_btree(&mut self, name: String, field_id: i32, column: String) { self.btree_indexes - .insert(name, BTreeMemIndex::new(field_id, column)); + .insert(name, Arc::new(BTreeMemIndex::new(field_id, column))); } /// Add an HNSW vector index with default build parameters. @@ -362,6 +408,158 @@ impl IndexStore { .insert(name, FtsMemIndex::with_params(field_id, column, params)); } + /// Maintain a primary-key index so the memtable can answer "newest visible + /// version of this key" (see [`Self::pk_newest_visible`]). + /// + /// Single-column PKs reuse an existing BTree on the field, else auto-create + /// one under a `__pk__*` name so the normal insert loop maintains it (no + /// second copy). Composite (arity >= 2) PKs key a `BTreeMemIndex` on the + /// order-preserving encoded tuple (synthetic [`PK_KEY_COLUMN`]), maintained + /// explicitly in the insert paths. Call once at construction, after + /// [`Self::from_configs`] and before any inserts; a no-op when `pk_columns` + /// is empty. + pub fn enable_pk_index(&mut self, pk_columns: &[(String, i32)]) { + self.pk_index = match pk_columns { + [] => None, + [(column, field_id)] => { + let btree = match self + .btree_indexes + .values() + .find(|b| b.field_id() == *field_id) + { + Some(existing) => existing.clone(), + None => { + let btree = Arc::new(BTreeMemIndex::new(*field_id, column.clone())); + self.btree_indexes + .insert(format!("__pk__{column}"), btree.clone()); + btree + } + }; + Some(PkIndex::Single(btree)) + } + multi => Some(PkIndex::Composite { + // Synthetic field id (-1): the composite index is held directly, + // never resolved by field id. + index: Arc::new(BTreeMemIndex::new(-1, PK_KEY_COLUMN.to_string())), + columns: multi.iter().map(|(c, _)| c.clone()).collect(), + }), + }; + } + + /// Whether the memtable has a primary-key index. + pub fn has_pk_index(&self) -> bool { + self.pk_index.is_some() + } + + /// Sorted `(value, row_id)` training batches for the flushed on-disk PK + /// BTree (the sidecar dedup index). Single-column emits the typed PK value; + /// composite emits the order-preserving `Binary` encoded tuple. Empty when + /// there is no primary key. Row positions line up 1:1 with the forward- + /// written data file, so they are the flushed row ids directly. + pub fn pk_training_batches(&self, batch_size: usize) -> Result> { + match &self.pk_index { + None => Ok(Vec::new()), + Some(PkIndex::Single(btree)) => btree.to_training_batches(batch_size), + Some(PkIndex::Composite { index, .. }) => index.to_training_batches(batch_size), + } + } + + /// Resolve the PK columns' positions in `batch` (composite insert helper). + fn pk_batch_indices(batch: &RecordBatch, columns: &[String]) -> Result> { + columns + .iter() + .map(|c| { + batch + .schema() + .column_with_name(c) + .map(|(i, _)| i) + .ok_or_else(|| { + Error::invalid_input(format!("PK column '{c}' not found in batch")) + }) + }) + .collect() + } + + /// Maintain the composite PK index for `batch` (no-op for single/no PK): + /// encode the PK columns into the synthetic [`PK_KEY_COLUMN`] `Binary` column + /// and feed that to the keyed `BTreeMemIndex`. + fn insert_composite_pk(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { + if let Some(PkIndex::Composite { index, columns }) = &self.pk_index { + let pk_indices = Self::pk_batch_indices(batch, columns)?; + let encoded = encode_pk_batch(batch, &pk_indices)?; + let schema = Arc::new(arrow_schema::Schema::new(vec![arrow_schema::Field::new( + PK_KEY_COLUMN, + arrow_schema::DataType::Binary, + false, + )])); + let key_batch = RecordBatch::try_new(schema, vec![Arc::new(encoded)]) + .map_err(|e| Error::invalid_input(e.to_string()))?; + index.insert(&key_batch, row_offset)?; + } + Ok(()) + } + + /// The newest row position of the primary-key tuple `values` (in PK order) + /// visible at `max_visible_row`, or `None`. A single seek either way: + /// single-column probes the typed BTree; composite probes the encoded-tuple + /// index. Collision-free, since `position` is the row identity. + pub fn pk_newest_visible( + &self, + values: &[ScalarValue], + max_visible_row: RowPosition, + ) -> Option { + match &self.pk_index { + None => None, + Some(PkIndex::Single(btree)) => btree.get_newest_visible(&values[0], max_visible_row), + Some(PkIndex::Composite { index, .. }) => { + // An unsupported PK type would have failed at insert, so the + // index can't hold a tuple this fails to encode. The probe key is + // the same `Binary`-encoded tuple the insert path indexed. + let key = encode_pk_tuple(values).ok()?; + index.get_newest_visible(&ScalarValue::Binary(Some(key)), max_visible_row) + } + } + } + + /// Whether `position` is the newest visible row of `values` — the recency + /// check the active index-search arms apply to drop predicate-crossing + /// stale hits. Callers gate on [`Self::has_pk_index`] first, since this is + /// `false` (drop) when the memtable has no primary-key index. + pub fn pk_is_newest( + &self, + values: &[ScalarValue], + position: RowPosition, + max_visible_row: RowPosition, + ) -> bool { + self.pk_newest_visible(values, max_visible_row) == Some(position) + } + + /// Whether `key` has any version visible at `max_visible_row` — the + /// cross-source block-list's existence query, snapshot-bounded so a + /// not-yet-visible write can't shadow an older visible copy. + /// + /// `key` is already in the index's key space: the typed PK value for a + /// single-column key, the `Binary`-encoded tuple for a composite one (built + /// by `block_list::on_disk_pk_key`, the same key the flushed on-disk index is + /// probed with). Both arities forward it straight to the keyed BTree. + pub fn pk_contains_key(&self, key: &ScalarValue, max_visible_row: RowPosition) -> bool { + match &self.pk_index { + None => false, + Some(PkIndex::Single(btree)) | Some(PkIndex::Composite { index: btree, .. }) => { + btree.get_newest_visible(key, max_visible_row).is_some() + } + } + } + + /// Whether the primary-key index holds no rows (or doesn't exist). + pub fn pk_is_empty(&self) -> bool { + match &self.pk_index { + None => true, + Some(PkIndex::Single(btree)) => btree.is_empty(), + Some(PkIndex::Composite { index, .. }) => index.is_empty(), + } + } + /// Insert a batch into all indexes. pub fn insert(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> { self.insert_with_batch_position(batch, row_offset, None) @@ -384,6 +582,9 @@ impl IndexStore { for index in self.fts_indexes.values() { index.insert(batch, row_offset)?; } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + self.insert_composite_pk(batch, row_offset)?; // Update global watermark after all indexes have been updated if let Some(bp) = batch_position { @@ -440,6 +641,12 @@ impl IndexStore { } } + // Single-column PK aliases a `btree_indexes` entry (maintained above); + // a composite PK has its own index, maintained here. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -552,6 +759,14 @@ impl IndexStore { .map(|(name, _idx_type, duration)| (name.to_string(), duration)) .collect(); + // Single-column PK aliases a `btree_indexes` entry — its thread above + // already maintained it (and joined). A composite PK has its own + // index; maintain it here before the watermark advances so the + // visible prefix is fully indexed. + for stored in batches { + self.insert_composite_pk(&stored.data, stored.row_offset)?; + } + // Update global watermark to the max batch position let max_bp = batches.iter().map(|b| b.batch_position).max().unwrap(); self.advance_max_visible_batch_position(max_bp); @@ -562,7 +777,7 @@ impl IndexStore { /// Get a BTree index by name. pub fn get_btree(&self, name: &str) -> Option<&BTreeMemIndex> { - self.btree_indexes.get(name) + self.btree_indexes.get(name).map(Arc::as_ref) } /// Get an HNSW vector index by name. @@ -583,6 +798,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.field_id() == field_id) + .map(Arc::as_ref) } /// Get an HNSW vector index by field ID. @@ -607,6 +823,7 @@ impl IndexStore { self.btree_indexes .values() .find(|idx| idx.column_name() == column) + .map(Arc::as_ref) } /// Get an HNSW vector index by column name. @@ -694,6 +911,73 @@ mod tests { .unwrap() } + /// Single-column `id` batch for primary-key lookup tests. + fn id_batch(ids: &[i32]) -> RecordBatch { + RecordBatch::try_new( + Arc::new(ArrowSchema::new(vec![Field::new( + "id", + DataType::Int32, + false, + )])), + vec![Arc::new(Int32Array::from(ids.to_vec()))], + ) + .unwrap() + } + + #[test] + fn pk_newest_visible_single_column() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0)]); + // id=1 at positions 0 and 2 (an update), id=2 at position 1. + store.insert(&id_batch(&[1, 2]), 0).unwrap(); + store.insert(&id_batch(&[1]), 2).unwrap(); + + let one = [ScalarValue::Int32(Some(1))]; + // Watermark above the update sees the newest position; below it, the older. + assert_eq!(store.pk_newest_visible(&one, 5), Some(2)); + assert_eq!(store.pk_newest_visible(&one, 1), Some(0)); + assert!(store.pk_is_newest(&one, 2, 5)); + assert!(!store.pk_is_newest(&one, 0, 5)); + // Absent key (probed by the typed value, as the block-list does). + assert!(!store.pk_contains_key(&ScalarValue::Int32(Some(9)), 5)); + } + + #[test] + fn pk_newest_visible_composite_seeks_encoded_tuple() { + let mut store = IndexStore::new(); + store.enable_pk_index(&[("id".to_string(), 0), ("name".to_string(), 1)]); + // Rows: (1,"a")@0, (1,"b")@1, (1,"a")@2 — an update of (1,"a"). + let schema = Arc::new(ArrowSchema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![1, 1, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a"])), + ], + ) + .unwrap(); + store.insert(&batch, 0).unwrap(); + + let tuple_1a = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + let tuple_1b = [ScalarValue::Int32(Some(1)), ScalarValue::from("b")]; + // (1,"a")'s newest visible row is its re-write at position 2. + assert_eq!(store.pk_newest_visible(&tuple_1a, 5), Some(2)); + assert!(store.pk_is_newest(&tuple_1a, 2, 5)); + assert!(!store.pk_is_newest(&tuple_1a, 0, 5)); + // (1,"b") only exists at position 1. + assert_eq!(store.pk_newest_visible(&tuple_1b, 5), Some(1)); + // Watermark below the re-write: the older (1,"a")@0 is the newest visible. + assert_eq!(store.pk_newest_visible(&tuple_1a, 1), Some(0)); + // An absent tuple (probed by its Binary-encoded key, as the block-list + // does). + let tuple_2a = [ScalarValue::Int32(Some(2)), ScalarValue::from("a")]; + let key_2a = ScalarValue::Binary(Some(encode_pk_tuple(&tuple_2a).unwrap())); + assert!(!store.pk_contains_key(&key_2a, 5)); + } + #[test] fn test_index_registry() { let schema = create_test_schema(); diff --git a/rust/lance/src/dataset/mem_wal/index/pk_key.rs b/rust/lance/src/dataset/mem_wal/index/pk_key.rs new file mode 100644 index 00000000000..b31fe42c995 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/index/pk_key.rs @@ -0,0 +1,204 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Composite primary-key encoding for MemWAL dedup. +//! +//! A multi-column primary key is reduced to a single order-preserving byte +//! string ([`encode_pk_tuple`]) so the whole tuple is one comparable key: +//! lexicographic byte order equals tuple order, and distinct tuples never +//! collide. Encoded as a `Binary` value, the tuple is indexed directly by a +//! [`super::BTreeMemIndex`] (its byte backend) — both in memory and, after +//! flush, as the on-disk BTree's `Binary` value column — so a probe builds +//! `ScalarValue::Binary(key)` and every layer agrees. +//! +//! Single-column primary keys do **not** use this — they key the typed +//! `BTreeMemIndex` on the column value directly. + +use arrow_array::{BinaryArray, RecordBatch}; +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; + +/// Sign-flip a signed integer to an order-preserving unsigned key (matches the +/// fixed-int BTree backend). Big-endian bytes of the result sort like the value. +#[inline] +fn encode_signed(v: i64) -> u64 { + (v as u64) ^ (1u64 << 63) +} + +/// Append an order-preserving encoding of one non-null byte string: each `0x00` +/// is escaped to `0x00 0xFF`, then a `0x00 0x00` terminator is appended. The +/// terminator sorts before any escaped content, so a prefix orders before its +/// extensions and no value can forge a column boundary. +fn encode_bytes(out: &mut Vec, bytes: &[u8]) { + for &b in bytes { + out.push(b); + if b == 0x00 { + out.push(0xFF); + } + } + out.extend_from_slice(&[0x00, 0x00]); +} + +/// Append the order-preserving encoding of a single PK column value. A leading +/// tag (`0x00` null / `0x01` non-null) makes nulls sort first and keeps the +/// per-column encoding self-delimiting (fixed-width for ints, terminated for +/// bytes), so concatenating columns stays injective and order-preserving. +fn encode_value(out: &mut Vec, value: &ScalarValue) -> Result<()> { + if value.is_null() { + out.push(0x00); + return Ok(()); + } + out.push(0x01); + macro_rules! be_signed { + ($v:expr) => { + out.extend_from_slice(&encode_signed($v as i64).to_be_bytes()) + }; + } + match value { + ScalarValue::Int8(Some(v)) => be_signed!(*v), + ScalarValue::Int16(Some(v)) => be_signed!(*v), + ScalarValue::Int32(Some(v)) => be_signed!(*v), + ScalarValue::Int64(Some(v)) => be_signed!(*v), + ScalarValue::Date32(Some(v)) => be_signed!(*v), + ScalarValue::Date64(Some(v)) => be_signed!(*v), + ScalarValue::UInt8(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt16(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt32(Some(v)) => out.extend_from_slice(&(*v as u64).to_be_bytes()), + ScalarValue::UInt64(Some(v)) => out.extend_from_slice(&v.to_be_bytes()), + ScalarValue::Boolean(Some(b)) => out.push(*b as u8), + ScalarValue::Utf8(Some(s)) | ScalarValue::LargeUtf8(Some(s)) => { + encode_bytes(out, s.as_bytes()) + } + ScalarValue::Binary(Some(b)) + | ScalarValue::LargeBinary(Some(b)) + | ScalarValue::FixedSizeBinary(_, Some(b)) => encode_bytes(out, b), + other => { + return Err(Error::invalid_input(format!( + "Unsupported primary-key column type for composite key: {other:?}" + ))); + } + } + Ok(()) +} + +/// Encode a PK tuple (values in PK column order) to one order-preserving key. +pub fn encode_pk_tuple(values: &[ScalarValue]) -> Result> { + let mut out = Vec::with_capacity(values.len() * 9); + for value in values { + encode_value(&mut out, value)?; + } + Ok(out) +} + +/// Encode row `row` of `batch`'s PK columns (at `pk_indices`) to one key. +fn encode_pk_row(batch: &RecordBatch, pk_indices: &[usize], row: usize) -> Result> { + let mut out = Vec::with_capacity(pk_indices.len() * 9); + for &col in pk_indices { + let value = ScalarValue::try_from_array(batch.column(col), row)?; + encode_value(&mut out, &value)?; + } + Ok(out) +} + +/// Encode every row of `batch`'s PK columns (at `pk_indices`) into a `Binary` +/// column of order-preserving composite keys — the form a [`super::BTreeMemIndex`] +/// indexes directly (its byte backend), so the composite PK reuses the same +/// index as a single-column one. +pub fn encode_pk_batch(batch: &RecordBatch, pk_indices: &[usize]) -> Result { + let mut keys: Vec> = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + keys.push(encode_pk_row(batch, pk_indices, row)?); + } + Ok(BinaryArray::from_iter_values(keys.iter())) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::{Int32Array, StringArray}; + use arrow_schema::{DataType, Field, Schema}; + use std::sync::Arc; + + fn tuple(a: i32, b: &str) -> Vec { + vec![ScalarValue::Int32(Some(a)), ScalarValue::from(b)] + } + + #[test] + fn encoding_is_order_preserving_and_injective() { + // Sorting tuples by their encoding must match tuple order, and distinct + // tuples must produce distinct bytes. + let tuples = [ + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + tuple(-1, "z"), + ]; + let mut encoded: Vec<(Vec, &Vec)> = tuples + .iter() + .map(|t| (encode_pk_tuple(t).unwrap(), t)) + .collect(); + encoded.sort_by(|x, y| x.0.cmp(&y.0)); + let order: Vec<_> = encoded.iter().map(|(_, t)| (*t).clone()).collect(); + // -1 < 1 < 2; within id=1, "a" < "ab" < "b". + assert_eq!( + order, + vec![ + tuple(-1, "z"), + tuple(1, "a"), + tuple(1, "ab"), + tuple(1, "b"), + tuple(2, "a"), + ] + ); + // Injective: 5 distinct tuples → 5 distinct keys. + let mut keys: Vec> = tuples.iter().map(|t| encode_pk_tuple(t).unwrap()).collect(); + keys.sort(); + keys.dedup(); + assert_eq!(keys.len(), 5); + } + + #[test] + fn null_sorts_first_and_is_distinct() { + let null_a = vec![ScalarValue::Int32(None), ScalarValue::from("a")]; + let one_a = tuple(1, "a"); + assert!(encode_pk_tuple(&null_a).unwrap() < encode_pk_tuple(&one_a).unwrap()); + assert_ne!( + encode_pk_tuple(&null_a).unwrap(), + encode_pk_tuple(&one_a).unwrap() + ); + } + + #[test] + fn prefix_safety_with_embedded_zero() { + // A string containing 0x00 must not collide with or sort incorrectly + // against a shorter one (escaping + terminator). + let with_zero = vec![ScalarValue::Binary(Some(vec![0x00]))]; + let empty = vec![ScalarValue::Binary(Some(vec![]))]; + assert!(encode_pk_tuple(&empty).unwrap() < encode_pk_tuple(&with_zero).unwrap()); + } + + #[test] + fn encode_pk_batch_matches_per_tuple_encoding() { + // Each row of the encoded `Binary` column equals `encode_pk_tuple` of + // that row's PK values — so the column a BTreeMemIndex indexes is exactly + // what a probe builds. + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("name", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(vec![2, 1])), + Arc::new(StringArray::from(vec!["a", "b"])), + ], + ) + .unwrap(); + let encoded = encode_pk_batch(&batch, &[0, 1]).unwrap(); + assert_eq!(encoded.value(0), encode_pk_tuple(&tuple(2, "a")).unwrap()); + assert_eq!(encoded.value(1), encode_pk_tuple(&tuple(1, "b")).unwrap()); + // (1,"b") encodes below (2,"a"). + assert!(encoded.value(1) < encoded.value(0)); + } +} diff --git a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs index f4d4d797acc..3df6215d7ce 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/batch_store.rs @@ -615,6 +615,22 @@ impl BatchStore { (0..end).collect() } + /// The inclusive maximum visible *row* position at `max_visible_batch_position`, + /// or `None` when no rows are visible. The visible batches are the committed + /// prefix `[0, last_visible_idx]`; each batch carries its cumulative + /// `row_offset`, so this is the end of the last visible batch minus one. + /// Used to bound MVCC seeks against the maintained PK-position index. + pub fn max_visible_row(&self, max_visible_batch_position: usize) -> Option { + let len = self.committed_len.load(Ordering::Acquire); + if len == 0 { + return None; + } + let last_visible_idx = max_visible_batch_position.min(len - 1); + let last = self.get(last_visible_idx)?; + let visible_end = last.row_offset + last.num_rows as u64; // exclusive + visible_end.checked_sub(1) + } + /// Check if a specific batch is visible at a given visibility position. #[inline] pub fn is_batch_visible( diff --git a/rust/lance/src/dataset/mem_wal/memtable/flush.rs b/rust/lance/src/dataset/mem_wal/memtable/flush.rs index 8f0e34db5ec..e55bd565924 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/flush.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/flush.rs @@ -449,6 +449,10 @@ impl MemTableFlusher { all_indexes.extend(fts_indexes); } + // Write the standalone primary-key dedup index (sidecar, not a manifest + // index — the block-list opens it directly by path). + self.create_pk_index(&gen_path, memtable.indexes()).await?; + // Write a single manifest that records the fragments, the // within-generation deletion vector, and all indexes, overwriting the // data-only v1 manifest created by Dataset::write. @@ -543,6 +547,49 @@ impl MemTableFlusher { Ok(created_indexes) } + /// Write the standalone primary-key dedup index for this generation. + /// + /// Unlike user indexes, this is a **sidecar**: it is not registered in the + /// manifest. The block-list opens it directly by path + /// ([`pk_index_path`]) and probes it with `Equals`. Single-column primary + /// keys index the typed value; composite keys index the order-preserving + /// `Binary` encoded tuple (see [`super::super::index::encode_pk_tuple`]). + /// Row positions line up 1:1 with the forward-written data file, so they are + /// the flushed row ids directly. No-op without a primary-key index. + async fn create_pk_index( + &self, + gen_path: &Path, + mem_indexes: Option<&super::super::index::IndexStore>, + ) -> Result<()> { + use datafusion::physical_plan::SendableRecordBatchStream; + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_index::scalar::btree::train_btree_index; + use lance_index::scalar::lance_format::LanceIndexStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let Some(registry) = mem_indexes else { + return Ok(()); + }; + let batches = registry.pk_training_batches(8192)?; + if batches.is_empty() { + return Ok(()); + } + + let schema = batches[0].schema(); + let store = LanceIndexStore::new( + self.object_store.clone(), + pk_index_path(gen_path), + Arc::new(LanceCache::no_cache()), + ); + let stream: SendableRecordBatchStream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(batches.into_iter().map(Ok)), + )); + train_btree_index(stream, &store, 8192, None, None).await?; + Ok(()) + } + /// Create FTS (Full-Text Search) indexes from in-memory data (uncommitted). /// /// Writes the FTS index files and returns index metadata without committing. @@ -1227,6 +1274,107 @@ mod tests { assert_eq!(rows.get(&3), Some(&"c2".to_string())); } + /// Flushing a memtable with a primary-key index writes a standalone sidecar + /// BTree at `{gen}/_pk_index` that the block-list can reopen by path and + /// probe by value — including for a within-gen-superseded PK (existence, + /// not visibility). + #[tokio::test] + async fn flushed_pk_index_sidecar_is_probeable() { + use lance_core::cache::LanceCache; + use lance_index::metrics::NoOpMetricsCollector; + use lance_index::registry::IndexPluginRegistry; + use lance_index::scalar::lance_format::LanceIndexStore; + use lance_index::scalar::{SargableQuery, SearchResult}; + + use super::super::super::index::IndexStore; + use crate::dataset::mem_wal::util::pk_index_path; + use datafusion::common::ScalarValue; + + let (store, base_path, _base_uri, _temp_dir) = create_local_store().await; + let shard_id = Uuid::new_v4(); + let manifest_store = Arc::new(ShardManifestStore::new( + store.clone(), + &base_path, + shard_id, + 2, + )); + let (epoch, _manifest) = manifest_store.claim_epoch(0).await.unwrap(); + + // Primary-key index on `id`, no user indexes. + let schema = create_pk_schema(); + let mut memtable = MemTable::new(schema.clone(), 1, vec![0]).unwrap(); + let mut registry = IndexStore::new(); + registry.enable_pk_index(&[("id".to_string(), 0)]); + memtable.set_indexes(registry); + + // id=1 updated in-gen (a -> a2); id=2 unique. + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(vec![1, 2, 1])), + Arc::new(StringArray::from(vec!["a", "b", "a2"])), + ], + ) + .unwrap(); + let frag_id = memtable.insert(batch).await.unwrap(); + memtable.mark_wal_flushed(&[frag_id], 1, &[0]); + + let flusher = MemTableFlusher::new( + store.clone(), + base_path.clone(), + _base_uri.clone(), + shard_id, + manifest_store.clone(), + ); + let result = flusher + .flush_with_indexes(&memtable, epoch, &[], 1) + .await + .unwrap(); + + // Reopen the sidecar directly by path (the block-list's route). + let gen_path = base_path + .clone() + .join("_mem_wal") + .join(shard_id.to_string()) + .join(result.generation.path.as_str()); + let index_store = Arc::new(LanceIndexStore::new( + store.clone(), + pk_index_path(&gen_path), + Arc::new(LanceCache::no_cache()), + )); + let registry = IndexPluginRegistry::with_default_plugins(); + let plugin = registry.get_plugin_by_name("BTree").unwrap(); + let details = + prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()).unwrap(); + let index = plugin + .load_index(index_store, &details, None, &LanceCache::no_cache()) + .await + .unwrap(); + + let contains = |id: i32| { + let index = index.clone(); + async move { + let result = index + .search( + &SargableQuery::Equals(ScalarValue::Int32(Some(id))), + &NoOpMetricsCollector, + ) + .await + .unwrap(); + match result { + SearchResult::Exact(s) | SearchResult::AtMost(s) | SearchResult::AtLeast(s) => { + !s.is_empty() + } + } + } + }; + // Both PKs present (id=1 even though its first version was superseded); + // an absent PK is not. + assert!(contains(1).await); + assert!(contains(2).await); + assert!(!contains(99).await); + } + /// Covers `finalize_generation` writing both a deletion vector *and* /// indexes into the same manifest — the deletion-only and index-only /// paths are exercised by sibling tests. diff --git a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs index 2c5192e28a1..17fa9c76a65 100644 --- a/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/memtable/scanner/builder.rs @@ -366,6 +366,14 @@ impl MemTableScanner { self } + /// The `max_visible_batch_position` snapshot this scanner latched at + /// construction. A downstream recency filter must key on this same snapshot + /// (not a fresh read of the IndexStore watermark, which a concurrent append + /// could have advanced) so it stays consistent with the rows the search saw. + pub fn max_visible_batch_position(&self) -> usize { + self.max_visible_batch_position + } + /// Include the _rowaddr column in output. /// /// Same value as _rowid but named for compatibility with LSM scanner. diff --git a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs index 684fde48da1..1faccc6a6b5 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/block_list.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/block_list.rs @@ -3,37 +3,118 @@ //! Per-source block-list construction for LSM vector search. //! -//! A generation's membership is an `Arc>` of PK hashes -//! ([`compute_pk_hash`]), built once (immutable gens cached). Each source gets a -//! `Vec>>` of the newer generations' sets (`NEWER(G)`; base: all -//! of them) — referenced, never merged. The KNN drops candidates whose PK is in -//! any (see [`super::exec::PkHashFilterExec`]). +//! A generation's membership is a [`GenMembership`]: in-memory generations +//! (active / frozen) are probed by value against their maintained primary-key +//! index (no per-query set), while flushed generations are probed against their +//! standalone on-disk PK BTree (the sidecar written at flush, opened by path and +//! queried with `Equals`). Each source gets a `Vec` of the newer +//! generations (`NEWER(G)`; base: all of them); the KNN drops a candidate whose +//! PK any of them contains (see [`super::exec::PkBlockFilterExec`]). //! -//! Cross-generation only: within-gen dups share a hash and fall to the global -//! dedup's `(generation, freshness)` tiebreaker. +//! Cross-generation only: within-gen dups collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. -use std::collections::{HashMap, HashSet}; -use std::sync::Arc; +use std::collections::HashMap; +use std::sync::{Arc, LazyLock}; -use arrow_array::RecordBatch; -use futures::TryStreamExt; -use lance_core::Result; +use datafusion::common::ScalarValue; +use lance_core::{Error, Result}; +use lance_index::metrics::NoOpMetricsCollector; +use lance_index::registry::IndexPluginRegistry; +use lance_index::scalar::lance_format::LanceIndexStore; +use lance_index::scalar::{ + IndexStore as ScalarIndexStore, SargableQuery, ScalarIndex, SearchResult, +}; use uuid::Uuid; use super::data_source::{LsmDataSource, LsmGeneration}; -use super::exec::{compute_pk_hash, resolve_pk_indices}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; -use crate::dataset::Dataset; -use crate::dataset::mem_wal::write::BatchStore; +use crate::dataset::mem_wal::index::encode_pk_tuple; +use crate::dataset::mem_wal::util::PK_INDEX_DIR; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use crate::session::Session; -/// Per-source blocked PK-hash sets, keyed by `(shard_id, generation)`. Each -/// value is the membership sets of the generations newer than that source. -pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>>>; +/// Default-plugin registry, used only to load the standalone PK BTree by its +/// `BTreeIndexDetails` type. Built once. +static PK_BTREE_REGISTRY: LazyLock> = + LazyLock::new(IndexPluginRegistry::with_default_plugins); + +/// One newer generation's PK membership, used to decide whether it shadows an +/// older source's row. +#[derive(Clone, Debug)] +pub enum GenMembership { + /// Probe the in-memory memtable's primary-key index, bounded to its visible + /// prefix (so a not-yet-visible write can't shadow an older visible copy). + InMemory { + index_store: Arc, + /// Inclusive visible row watermark; `None` when no rows are visible. + max_visible_row: Option, + }, + /// Probe the flushed generation's standalone on-disk PK BTree. + OnDisk(Arc), +} + +impl GenMembership { + /// Whether this generation visibly contains the primary `key` — the typed + /// value for a single-column PK, the encoded `Binary` tuple for a composite + /// one (built by [`on_disk_pk_key`]). The same key probes the in-memory + /// BTree and the flushed on-disk BTree, which now share a key space. + pub async fn contains(&self, key: &ScalarValue) -> Result { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => Ok(max_visible_row.is_some_and(|max| index_store.pk_contains_key(key, max))), + Self::OnDisk(index) => { + let result = index + .search(&SargableQuery::Equals(key.clone()), &NoOpMetricsCollector) + .await + .map_err(|e| Error::io(e.to_string()))?; + Ok(!search_is_empty(&result)) + } + } + } + + /// Whether this generation has no (visible) membership — used to skip adding + /// an empty blocked set. A flushed generation always has rows (flush rejects + /// an empty memtable), so it is never empty. + fn is_empty(&self) -> bool { + match self { + Self::InMemory { + index_store, + max_visible_row, + } => max_visible_row.is_none() || index_store.pk_is_empty(), + Self::OnDisk(_) => false, + } + } +} -/// A shard's generations paired with their PK-hash membership, before sorting. -type ShardGenSets = HashMap>)>>; +/// Whether a scalar search returned no rows (existence test for the block-list). +fn search_is_empty(result: &SearchResult) -> bool { + match result { + SearchResult::Exact(set) | SearchResult::AtMost(set) | SearchResult::AtLeast(set) => { + set.is_empty() + } + } +} + +/// The probe key for the on-disk PK BTree: a single-column PK indexes its typed +/// value directly; a composite PK indexes the order-preserving encoded tuple as +/// `Binary` (matching what flush wrote — see [`encode_pk_tuple`]). +pub fn on_disk_pk_key(values: &[ScalarValue]) -> Result { + match values { + [single] => Ok(single.clone()), + _ => Ok(ScalarValue::Binary(Some(encode_pk_tuple(values)?))), + } +} + +/// Per-source blocked memberships, keyed by `(shard_id, generation)`. Each value +/// is the memberships of the generations newer than that source. +pub type SourceBlockLists = HashMap<(Option, LsmGeneration), Vec>; + +/// A shard's generations paired with their membership, before sorting. +type ShardGenSets = HashMap>; /// Per-source `NEWER(G)`, keyed by `(shard_id, generation)`. Generations are /// per-shard, so a source is superseded only by strictly-newer generations of @@ -42,11 +123,10 @@ type ShardGenSets = HashMap>)>>; /// Only superseded sources get an entry; the newest of each shard never does. pub async fn compute_source_block_lists( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, flushed_cache: Option<&Arc>, ) -> Result { - // Hash each non-base source's membership, grouped by shard (generations are + // Membership per non-base source, grouped by shard (generations are // per-shard, so supersession is within-shard only). let mut by_shard: ShardGenSets = HashMap::new(); let mut has_base = false; @@ -55,15 +135,16 @@ pub async fn compute_source_block_lists( LsmDataSource::BaseTable { .. } => has_base = true, LsmDataSource::ActiveMemTable { batch_store, + index_store, shard_id, generation, .. } => { - let hashes = Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?); + let membership = in_memory_membership(batch_store, index_store); by_shard .entry(*shard_id) .or_default() - .push((*generation, hashes)); + .push((*generation, membership)); } LsmDataSource::FlushedMemTable { path, @@ -71,30 +152,29 @@ pub async fn compute_source_block_lists( generation, .. } => { - // Cached by immutable path so repeated searches skip the PK scan. - let hashes = flushed_pk_hashes(path, pk_columns, session, flushed_cache).await?; + let index = open_pk_index(path, session, flushed_cache).await?; by_shard .entry(*shard_id) .or_default() - .push((*generation, hashes)); + .push((*generation, GenMembership::OnDisk(index))); } } } let mut blocked: SourceBlockLists = HashMap::new(); // Base (shardless, oldest) is superseded by every non-base generation. - let mut base_blocked: Vec>> = Vec::new(); + let mut base_blocked: Vec = Vec::new(); for (shard, mut gens) in by_shard { // Newest-first: a gen's blocked list is its own shard's newer gens. gens.sort_by_key(|(generation, _)| std::cmp::Reverse(*generation)); - let mut newer: Vec>> = Vec::new(); - for (generation, hashes) in gens { + let mut newer: Vec = Vec::new(); + for (generation, membership) in gens { if !newer.is_empty() { blocked.insert((Some(shard), generation), newer.clone()); } - if !hashes.is_empty() { - base_blocked.push(hashes.clone()); - newer.push(hashes); + if !membership.is_empty() { + base_blocked.push(membership.clone()); + newer.push(membership); } } } @@ -104,260 +184,248 @@ pub async fn compute_source_block_lists( Ok(blocked) } -/// The fresh-tier block-list: one membership set per generation that shadows the -/// base table — active + frozen memtables (hashed now) and flushed generations -/// (from the cache). Same `Vec>>` shape the vector-search filter -/// consumes; a base/external reader can drop any row whose PK is in one of them. -/// The base source, if present, is skipped (it is what gets shadowed). +/// The fresh-tier block-list: one [`GenMembership`] per generation that shadows +/// the base table — active + frozen memtables (probed against their index) and +/// flushed generations (probed against their on-disk PK BTree). A base/external +/// reader can test any PK against these (via [`GenMembership::contains`]) to +/// decide whether the fresh tier shadows it. The base source, if present, is +/// skipped (it is what gets shadowed). pub async fn fresh_tier_block_list( sources: &[LsmDataSource], - pk_columns: &[String], session: Option<&Arc>, flushed_cache: Option<&Arc>, -) -> Result>>> { - let mut sets = Vec::new(); +) -> Result> { + let mut memberships = Vec::new(); for source in sources { - let set = match source { + let membership = match source { LsmDataSource::BaseTable { .. } => continue, - LsmDataSource::ActiveMemTable { batch_store, .. } => { - Arc::new(pk_hashes_from_batch_store(batch_store, pk_columns)?) - } + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + .. + } => in_memory_membership(batch_store, index_store), LsmDataSource::FlushedMemTable { path, .. } => { - flushed_pk_hashes(path, pk_columns, session, flushed_cache).await? + GenMembership::OnDisk(open_pk_index(path, session, flushed_cache).await?) } }; - if !set.is_empty() { - sets.push(set); - } - } - Ok(sets) -} - -/// Hash the PK membership of an in-memory memtable (active or frozen) from its -/// committed `BatchStore` rows. -pub fn pk_hashes_from_batch_store( - store: &BatchStore, - pk_columns: &[String], -) -> Result> { - let mut batches: Vec = Vec::with_capacity(store.len()); - for i in 0..store.len() { - if let Some(stored) = store.get(i) { - batches.push(stored.data.clone()); + if !membership.is_empty() { + memberships.push(membership); } } - pk_hashes_from_batches(&batches, pk_columns) + Ok(memberships) } -/// Hash every row's primary key across `batches` into a membership set. -fn pk_hashes_from_batches(batches: &[RecordBatch], pk_columns: &[String]) -> Result> { - let mut pk_hashes = HashSet::new(); - for batch in batches { - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row_idx in 0..batch.num_rows() { - pk_hashes.insert(compute_pk_hash(batch, &pk_indices, row_idx)); - } +/// Cross-source membership of an in-memory (active / frozen) memtable: a +/// snapshot-bounded probe of its maintained primary-key index. A memtable +/// without a primary-key index can't be probed, so it blocks nothing — the +/// production vector-search path always enables the index. +fn in_memory_membership( + batch_store: &Arc, + index_store: &Arc, +) -> GenMembership { + let max_visible_row = batch_store.max_visible_row(index_store.max_visible_batch_position()); + GenMembership::InMemory { + index_store: index_store.clone(), + max_visible_row, } - Ok(pk_hashes) } -/// Build (or fetch the cached) PK-hash membership for one flushed generation. -/// Cached by immutable path (single-flight); the build scans the flushed -/// dataset's PK columns. -async fn flushed_pk_hashes( +/// Open the standalone PK BTree at `{flushed gen}/_pk_index` for one flushed +/// generation. Reuses the flushed dataset's (session-configured) object store +/// and **its index cache**, then loads the sidecar directly by path through the +/// BTree plugin — it is not a manifest index. The opened index and its pages +/// are cached in the session's index cache (keyed by the immutable flushed +/// path), so repeated probes reuse them with no separate cache path and no +/// upfront scan; concurrent first-opens may each load before the cache fills. +async fn open_pk_index( path: &str, - pk_columns: &[String], session: Option<&Arc>, flushed_cache: Option<&Arc>, -) -> Result>> { - match flushed_cache { - Some(cache) => { - let build_cache = cache.clone(); - let build_path = path.to_string(); - let build_session = session.cloned(); - let build_pk = pk_columns.to_vec(); - cache - .get_or_build_pk_hashes( - path, - // `Box::pin` keeps this build future off the caller's future - // (avoids `clippy::large_futures`). - Box::pin(async move { - let dataset = open_flushed_dataset( - &build_path, - build_session.as_ref(), - Some(&build_cache), - ) - .await?; - scan_pk_hashes(&dataset, &build_pk).await - }), - ) - .await - } - None => { - let dataset = open_flushed_dataset(path, session, None).await?; - Ok(Arc::new(scan_pk_hashes(&dataset, pk_columns).await?)) - } +) -> Result> { + let dataset = open_flushed_dataset(path, session, flushed_cache).await?; + // Namespace the session index cache by the (immutable) flushed path so this + // sidecar's pages live alongside every other index instead of a bespoke + // cache. `fri_uuid` is None — flushed generations carry no fragment-reuse. + let index_cache = dataset.index_cache.for_index(path, None); + let index_dir = dataset.base.clone().join(PK_INDEX_DIR); + let store: Arc = Arc::new(LanceIndexStore::new( + dataset.object_store.clone(), + index_dir, + Arc::new(index_cache.clone()), + )); + + let plugin = PK_BTREE_REGISTRY.get_plugin_by_name("BTree")?; + // Cache the opened index in the session cache (mirrors `open_scalar_index`). + if let Some(index) = plugin + .get_from_cache(store.clone(), None, &index_cache) + .await? + { + return Ok(index); } + let details = prost_types::Any::from_msg(&lance_index::pbold::BTreeIndexDetails::default()) + .map_err(|e| Error::io(e.to_string()))?; + let index = plugin + .load_index(store, &details, None, &index_cache) + .await?; + plugin.put_in_cache(&index_cache, index.clone()).await?; + Ok(index) } -/// Scan a dataset's PK columns and fold them into a membership set, one batch -/// resident at a time (no full PK-column buffer). -async fn scan_pk_hashes(dataset: &Dataset, pk_columns: &[String]) -> Result> { - let pk_refs: Vec<&str> = pk_columns.iter().map(String::as_str).collect(); - let mut scanner = dataset.scan(); - scanner.project(&pk_refs)?; - let mut stream = scanner.try_into_stream().await?; - let mut hashes = HashSet::new(); - while let Some(batch) = stream.try_next().await? { - if batch.num_rows() == 0 { - continue; - } - let pk_indices = resolve_pk_indices(&batch, pk_columns) - .map_err(|e| lance_core::Error::invalid_input(e.to_string()))?; - for row in 0..batch.num_rows() { - hashes.insert(compute_pk_hash(&batch, &pk_indices, row)); - } +/// Test helper: write a flushed generation's standalone PK sidecar at +/// `{uri}/_pk_index` from `batches`, mirroring what flush does in production. +/// `pk_columns` are the primary-key column names (field ids are synthesized by +/// position — `insert` resolves columns by name). A no-op when no batch carries +/// the PK columns. +#[cfg(test)] +pub async fn write_test_pk_sidecar( + uri: &str, + batches: &[arrow_array::RecordBatch], + pk_columns: &[&str], +) -> Result<()> { + use datafusion::physical_plan::stream::RecordBatchStreamAdapter; + use lance_core::cache::LanceCache; + use lance_index::scalar::btree::train_btree_index; + use lance_io::object_store::ObjectStore; + + use crate::dataset::mem_wal::util::pk_index_path; + + let pk: Vec<(String, i32)> = pk_columns + .iter() + .enumerate() + .map(|(i, c)| (c.to_string(), i as i32)) + .collect(); + let mut index = IndexStore::new(); + index.enable_pk_index(&pk); + let mut offset = 0u64; + for batch in batches { + index.insert(batch, offset)?; + offset += batch.num_rows() as u64; } - Ok(hashes) + + let training = index.pk_training_batches(8192)?; + if training.is_empty() { + return Ok(()); + } + let schema = training[0].schema(); + let (object_store, base_path) = ObjectStore::from_uri(uri).await?; + let store = LanceIndexStore::new( + object_store, + pk_index_path(&base_path), + Arc::new(LanceCache::no_cache()), + ); + let stream = Box::pin(RecordBatchStreamAdapter::new( + schema, + futures::stream::iter(training.into_iter().map(Ok)), + )); + train_btree_index(stream, &store, 8192, None, None).await } #[cfg(test)] mod tests { use super::*; - use arrow_array::Int32Array; + use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; + use crate::dataset::mem_wal::write::IndexStore; + use arrow_array::{Int32Array, RecordBatch}; use arrow_schema::{DataType, Field, Schema}; use std::sync::Arc; + use uuid::Uuid; fn id_batch(ids: &[i32]) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() } - /// Hash a single Int32 `id` PK the way the planner does, so a test can probe - /// a returned blocked set by value. - fn hash_id(id: i32) -> u64 { - let batch = id_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - /// Whether `id`'s PK hash is blocked by any of a source's newer-gen sets. - fn blocks(sets: &[Arc>], id: i32) -> bool { - sets.iter().any(|s| s.contains(&hash_id(id))) - } - - #[test] - fn pk_hashes_collapse_within_gen_duplicates() { - // Two rows share pk=1 (a within-gen duplicate); pk=2 is unique. - let hashes = pk_hashes_from_batches(&[id_batch(&[1, 2, 1])], &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 2); // distinct pks: 1, 2 + /// An active/frozen memtable source whose PK index holds one row per id in + /// `ids` (positions 0..n), all committed and visible. + fn active_source(shard: Uuid, generation: u64, ids: &[i32]) -> LsmDataSource { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = id_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + LsmDataSource::ActiveMemTable { + batch_store: Arc::new(store), + index_store: Arc::new(index), + schema: id_batch(&[1]).schema(), + shard_id: shard, + generation: LsmGeneration::memtable(generation), + } } - #[test] - fn empty_batches_yield_empty_membership() { - let hashes = pk_hashes_from_batches(&[id_batch(&[])], &["id".to_string()]).unwrap(); - assert!(hashes.is_empty()); + /// Whether `id`'s PK is blocked by any of a source's newer-gen memberships. + async fn blocks(memberships: &[GenMembership], id: i32) -> bool { + let key = on_disk_pk_key(&[ScalarValue::Int32(Some(id))]).unwrap(); + for m in memberships { + if m.contains(&key).await.unwrap() { + return true; + } + } + false } #[test] - fn batch_store_membership_collapses_within_gen_dups() { - let store = BatchStore::with_capacity(8); - // Two single-row batches, both pk=1 (a within-gen update). - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[1])).unwrap(); - // A two-row batch: pk=2, pk=3. - store.append(id_batch(&[2, 3])).unwrap(); - - let hashes = pk_hashes_from_batch_store(&store, &["id".to_string()]).unwrap(); - assert_eq!(hashes.len(), 3); // distinct pks: 1, 2, 3 + fn on_disk_key_is_typed_for_single_and_binary_for_composite() { + // Single-column → the typed value; composite → encoded Binary. + let single = [ScalarValue::Int32(Some(7))]; + assert_eq!( + on_disk_pk_key(&single).unwrap(), + ScalarValue::Int32(Some(7)) + ); + let composite = [ScalarValue::Int32(Some(1)), ScalarValue::from("a")]; + assert!(matches!( + on_disk_pk_key(&composite).unwrap(), + ScalarValue::Binary(Some(_)) + )); } #[tokio::test] - async fn fresh_tier_block_list_one_set_per_in_memory_gen() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - + async fn fresh_tier_block_list_one_membership_per_in_memory_gen() { let shard = Uuid::new_v4(); - let mk = |ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; // Active gen 2: pk=1,2. Frozen gen 1: pk=3. - let sources = vec![mk(&[1, 2], 2), mk(&[3], 1)]; + let sources = vec![ + active_source(shard, 2, &[1, 2]), + active_source(shard, 1, &[3]), + ]; - let sets = fresh_tier_block_list(&sources, &["id".to_string()], None, None) - .await - .unwrap(); + let memberships = fresh_tier_block_list(&sources, None, None).await.unwrap(); - // One set per generation; together they cover pk=1,2,3 (not 4). - assert_eq!(sets.len(), 2); + // One membership per generation; together they cover pk=1,2,3 (not 4). + assert_eq!(memberships.len(), 2); for id in [1, 2, 3] { - assert!(blocks(&sets, id)); + assert!(blocks(&memberships, id).await); } - assert!(!blocks(&sets, 4)); + assert!(!blocks(&memberships, 4).await); } #[tokio::test] async fn block_lists_suppress_stale_across_in_memory_gens() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - let shard = Uuid::new_v4(); - let mk = |batches: &[&[i32]], generation: u64| { - let store = BatchStore::with_capacity(8); - for ids in batches { - store.append(id_batch(ids)).unwrap(); - } - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Frozen gen 1: stale pk=1. - // Active gen 2: pk=1 re-written, pk=2 new. - let sources = vec![mk(&[&[1]], 1), mk(&[&[1], &[2]], 2)]; + // Frozen gen 1: stale pk=1. Active gen 2: pk=1 re-written, pk=2 new. + let sources = vec![ + active_source(shard, 1, &[1]), + active_source(shard, 2, &[1, 2]), + ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // The newer active write supersedes the frozen copy: gen 1 is blocked on // pk=1, so its KNN drops pk=1. - assert!(blocks(&blocked[&(Some(shard), g1)], 1)); + assert!(blocks(&blocked[&(Some(shard), g1)], 1).await); // The active (newest) generation is superseded by nothing — no entry. assert!(!blocked.contains_key(&(Some(shard), g2))); } #[tokio::test] async fn block_lists_suppress_stale_base_row() { - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; use crate::dataset::{Dataset, WriteParams}; use arrow_array::RecordBatchIterator; - use uuid::Uuid; // Base (gen 0): pk=1 (stale), pk=3 (live). let base_batch = id_batch(&[1, 3]); @@ -372,89 +440,101 @@ mod tests { ); // Active gen 1: pk=1 re-written, pk=2 new. - let store = BatchStore::with_capacity(8); - store.append(id_batch(&[1])).unwrap(); - store.append(id_batch(&[2])).unwrap(); - let sources = vec![ LsmDataSource::BaseTable { dataset: base }, - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema, - shard_id: Uuid::new_v4(), - generation: LsmGeneration::memtable(1), - }, + active_source(Uuid::new_v4(), 1, &[1, 2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); // Base is blocked by every newer gen: pk=1 (re-written in gen 1) is - // blocked, pk=3 (base-only) is not. End-to-end drop: vector_search specs. + // blocked, pk=3 (base-only) is not. let base_blocked = blocked .get(&(None, LsmGeneration::BASE_TABLE)) .expect("base has a blocked set"); - assert!(blocks(base_blocked, 1)); - assert!(!blocks(base_blocked, 3)); + assert!(blocks(base_blocked, 1).await); + assert!(!blocks(base_blocked, 3).await); } #[tokio::test] async fn block_lists_are_keyed_per_shard() { // Regression: generations are per-shard, so a source must only be blocked - // by newer generations of its OWN shard. A generation-only key would - // cross-block same-generation sources from different shards. - use crate::dataset::mem_wal::scanner::data_source::{LsmDataSource, LsmGeneration}; - use crate::dataset::mem_wal::write::IndexStore; - use uuid::Uuid; - - let mk = |shard: Uuid, ids: &[i32], generation: u64| { - let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); - LsmDataSource::ActiveMemTable { - batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), - schema: id_batch(&[1]).schema(), - shard_id: shard, - generation: LsmGeneration::memtable(generation), - } - }; - - // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). - // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). + // by newer generations of its OWN shard. let a = Uuid::new_v4(); let b = Uuid::new_v4(); + // Two shards, each: frozen gen 1 (stale) + active gen 2 (re-write). + // Shard A keys pk=1; shard B keys pk=2 (disjoint partitions). let sources = vec![ - mk(a, &[1], 1), - mk(a, &[1], 2), - mk(b, &[2], 1), - mk(b, &[2], 2), + active_source(a, 1, &[1]), + active_source(a, 2, &[1]), + active_source(b, 1, &[2]), + active_source(b, 2, &[2]), ]; - let blocked = Box::pin(compute_source_block_lists( - &sources, - &["id".to_string()], - None, - None, - )) - .await - .unwrap(); + let blocked = Box::pin(compute_source_block_lists(&sources, None, None)) + .await + .unwrap(); let g1 = LsmGeneration::memtable(1); let g2 = LsmGeneration::memtable(2); // Each shard's gen 1 is blocked by its OWN gen 2 only. - assert!(blocks(&blocked[&(Some(a), g1)], 1)); - assert!(!blocks(&blocked[&(Some(a), g1)], 2)); - assert!(blocks(&blocked[&(Some(b), g1)], 2)); - assert!(!blocks(&blocked[&(Some(b), g1)], 1)); + assert!(blocks(&blocked[&(Some(a), g1)], 1).await); + assert!(!blocks(&blocked[&(Some(a), g1)], 2).await); + assert!(blocks(&blocked[&(Some(b), g1)], 2).await); + assert!(!blocks(&blocked[&(Some(b), g1)], 1).await); // The newest generation of each shard is superseded by nothing. assert!(!blocked.contains_key(&(Some(a), g2))); assert!(!blocked.contains_key(&(Some(b), g2))); } + + #[tokio::test] + async fn index_membership_is_snapshot_bounded() { + // The index-sourced membership only counts a PK whose version is visible + // at the source's watermark, so a newer generation's not-yet-visible + // write can't shadow an older generation's visible copy. + let shard = Uuid::new_v4(); + let schema = id_batch(&[1]).schema(); + + // Older frozen gen 1: pk=1. + let g1 = active_source(shard, 1, &[1]); + + // Newer active gen 2: pk=99 visible at position 0, then pk=1 written at + // position 1 but with the watermark left at batch 0 (so pk=1 is in the + // index yet not visible) — the concurrent-write race. + let g2_store = BatchStore::with_capacity(8); + let mut g2_index = IndexStore::new(); + g2_index.enable_pk_index(&[("id".to_string(), 0)]); + let b0 = id_batch(&[99]); + let (bp0, off0, _) = g2_store.append(b0.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b0, off0, Some(bp0)) // advances watermark to 0 + .unwrap(); + let b1 = id_batch(&[1]); + let (_, off1, _) = g2_store.append(b1.clone()).unwrap(); + g2_index + .insert_with_batch_position(&b1, off1, None) // index updated, watermark unchanged + .unwrap(); + let g2 = LsmDataSource::ActiveMemTable { + batch_store: Arc::new(g2_store), + index_store: Arc::new(g2_index), + schema, + shard_id: shard, + generation: LsmGeneration::memtable(2), + }; + + let blocked = Box::pin(compute_source_block_lists(&[g1, g2], None, None)) + .await + .unwrap(); + + let g1_block = &blocked[&(Some(shard), LsmGeneration::memtable(1))]; + // pk=99 is visible in gen 2 → it blocks gen 1's pk=99. + assert!(blocks(g1_block, 99).await); + // pk=1's only gen-2 copy is not yet visible → it must NOT shadow gen 1. + assert!( + !blocks(g1_block, 1).await, + "a not-yet-visible newer write must not shadow an older visible copy" + ); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/builder.rs b/rust/lance/src/dataset/mem_wal/scanner/builder.rs index ade4164d485..fb7087760c8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/builder.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/builder.rs @@ -457,21 +457,34 @@ impl LsmScanner { /// [`FlushedMemTableCache`] when one is set. pub async fn contains_pks(&self, pks: &RecordBatch) -> Result> { let sources = self.build_collector().collect()?; - let sets = super::block_list::fresh_tier_block_list( + let memberships = super::block_list::fresh_tier_block_list( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), ) .await?; let pk_indices = super::exec::resolve_pk_indices(pks, &self.pk_columns) .map_err(|e| Error::invalid_input(e.to_string()))?; - Ok((0..pks.num_rows()) - .map(|row| { - let hash = super::exec::compute_pk_hash(pks, &pk_indices, row); - sets.iter().any(|set| set.contains(&hash)) - }) - .collect()) + let mut contained = Vec::with_capacity(pks.num_rows()); + for row in 0..pks.num_rows() { + // Both in-memory and flushed generations probe by the same key (the + // typed value, or the encoded `Binary` tuple for a composite PK). + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(pks.column(col), row)) + .collect::>() + .map_err(|e| Error::invalid_input(e.to_string()))?; + let key = super::block_list::on_disk_pk_key(&values)?; + let mut found = false; + for membership in &memberships { + if membership.contains(&key).await? { + found = true; + break; + } + } + contained.push(found); + } + Ok(contained) } /// Build the data source collector. @@ -588,10 +601,14 @@ mod tests { }; let mk = |ids: &[i32], generation: u64| { let store = BatchStore::with_capacity(8); - store.append(id_batch(ids)).unwrap(); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + let b = id_batch(ids); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); InMemoryMemTableRef { batch_store: Arc::new(store), - index_store: Arc::new(IndexStore::new()), + index_store: Arc::new(index), schema: schema.clone(), generation, } diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec.rs b/rust/lance/src/dataset/mem_wal/scanner/exec.rs index 88fd617dc0a..115cffccc81 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec.rs @@ -9,22 +9,22 @@ //! - [`MemtableGenTagExec`]: Wraps a scan to add `_memtable_gen` column //! - [`BloomFilterGuardExec`]: Guards child execution with bloom filter check //! - [`CoalesceFirstExec`]: Returns first non-empty result with short-circuit -//! - [`WithinSourceDedupExec`]: Deduplicates rows with the same PK from a single source -//! - [`PkHashFilterExec`]: Drops rows whose PK hash was superseded by a newer generation (the cross-generation block-list) +//! - [`PkBlockFilterExec`]: Drops rows whose PK was superseded by a newer generation (the cross-generation block-list) +//! - [`NewestPkFilterExec`]: Drops active-memtable hits that aren't the newest visible version of their PK (the within-source recency filter) mod bloom_guard; mod coalesce_first; mod generation_tag; +mod newest_pk_filter; mod pk; -mod pk_hash_filter; -mod within_source_dedup; +mod pk_block_filter; pub use bloom_guard::{BloomFilterGuardExec, compute_pk_hash_from_scalars}; pub use coalesce_first::CoalesceFirstExec; pub use generation_tag::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec}; +pub use newest_pk_filter::NewestPkFilterExec; pub use pk::{ ROW_ADDRESS_COLUMN, compute_pk_hash, is_supported_pk_type, resolve_pk_indices, validate_pk_types, }; -pub use pk_hash_filter::PkHashFilterExec; -pub use within_source_dedup::{DedupDirection, WithinSourceDedupExec}; +pub use pk_block_filter::PkBlockFilterExec; diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs new file mode 100644 index 00000000000..e1495cb0bb1 --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/newest_pk_filter.rs @@ -0,0 +1,393 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop predicate-crossing stale rows from an active-memtable index search. +//! +//! The active memtable's HNSW / inverted index are append-only, so an updated +//! row's old entries stay live. When an update moves a row out of the query's +//! match set, the fresh version isn't in the index result, so a result-set +//! dedup (keep-newest among the returned rows) has nothing to suppress the +//! stale version against — and it leaks. +//! +//! This node closes that hole with a predicate-independent recency check: for +//! each hit it asks the memtable's maintained primary-key index +//! ([`IndexStore::pk_is_newest`]) whether the hit's own row position is the +//! newest version of its primary key visible at the query's `max_visible` +//! watermark, and keeps the hit **iff so**. A stale hit (some +//! newer version exists) is dropped even when that newer version never appears +//! in the result. This is exactly the seek point-lookup already does; the index +//! search arms simply didn't do it. + +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{Array, BooleanArray, RecordBatch, UInt64Array}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::{Stream, StreamExt}; + +use super::pk::resolve_pk_indices; +use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + +/// Keeps only the index hits that are the newest visible version of their PK. +/// +/// The input must expose all `pk_columns` and the `row_id_column` (`UInt64`, +/// the BatchStore row position). The output schema is unchanged. +pub struct NewestPkFilterExec { + input: Arc, + pk_columns: Vec, + row_id_column: String, + /// Holds the maintained primary-key index, queried per hit via + /// [`IndexStore::pk_is_newest`]. + index_store: Arc, + /// Resolves the `max_visible` row watermark from the visible batch prefix. + batch_store: Arc, + /// The MVCC batch-position snapshot the index search latched. Captured once + /// at plan time and shared with the search so the recency check keys on the + /// same snapshot the hits came from. + max_visible_batch_position: usize, + properties: Arc, +} + +impl fmt::Debug for NewestPkFilterExec { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + // `BatchStore` / `IndexStore` aren't `Debug`; show only the knobs. + f.debug_struct("NewestPkFilterExec") + .field("pk_columns", &self.pk_columns) + .field("row_id_column", &self.row_id_column) + .field( + "max_visible_batch_position", + &self.max_visible_batch_position, + ) + .finish() + } +} + +impl NewestPkFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + row_id_column: impl Into, + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + row_id_column: row_id_column.into(), + index_store, + batch_store, + max_visible_batch_position, + properties, + } + } + + /// The inclusive max visible row position for this snapshot, or `None` when + /// no rows are visible. + fn max_visible_row(&self) -> Option { + self.batch_store + .max_visible_row(self.max_visible_batch_position) + } +} + +impl DisplayAs for NewestPkFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "NewestPkFilterExec: pk=[{}], row_id={}, max_visible_batch={}", + self.pk_columns.join(", "), + self.row_id_column, + self.max_visible_batch_position, + ) + } + } + } +} + +impl ExecutionPlan for NewestPkFilterExec { + fn name(&self) -> &str { + "NewestPkFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "NewestPkFilterExec requires exactly one child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.row_id_column.clone(), + self.index_store.clone(), + self.batch_store.clone(), + self.max_visible_batch_position, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(NewestPkFilterStream { + input: input_stream, + pk_columns: self.pk_columns.clone(), + row_id_column: self.row_id_column.clone(), + index_store: self.index_store.clone(), + max_visible_row: self.max_visible_row(), + schema: self.schema(), + })) + } +} + +struct NewestPkFilterStream { + input: SendableRecordBatchStream, + pk_columns: Vec, + row_id_column: String, + index_store: Arc, + /// Inclusive watermark snapshot; `None` when no rows are visible. + max_visible_row: Option, + schema: SchemaRef, +} + +impl NewestPkFilterStream { + fn filter_batch(&self, batch: RecordBatch) -> DFResult { + // No primary-key index (memtable without a primary key), no visible + // rows, or an empty batch: nothing to dedup against, so pass it through. + if !self.index_store.has_pk_index() { + return Ok(batch); + } + let Some(max_visible_row) = self.max_visible_row else { + return Ok(batch); + }; + if batch.num_rows() == 0 { + return Ok(batch); + } + + let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; + let row_ids = batch + .column_by_name(&self.row_id_column) + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' not found in NewestPkFilterExec input", + self.row_id_column + )) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + DataFusionError::Internal(format!( + "Row-id column '{}' is not UInt64", + self.row_id_column + )) + })?; + + let mut keep = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + // A null row position can't be ordered; keep it rather than guess + // (callers always project a real position here). + if row_ids.is_null(row) { + keep.push(true); + continue; + } + let position = row_ids.value(row); + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + // Keep iff this hit is the newest visible version of its PK. + keep.push( + self.index_store + .pk_is_newest(&values, position, max_visible_row), + ); + } + filter_record_batch(&batch, &BooleanArray::from(keep)) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) + } +} + +impl Stream for NewestPkFilterStream { + type Item = DFResult; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + match self.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => Poll::Ready(Some(self.filter_batch(batch))), + other => other, + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for NewestPkFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + /// Single-column `id` PK batch, one per append so a caller can control + /// row-level visibility via `max_visible_batch_position`. + fn id_batch(id: i32) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(vec![id]))]).unwrap() + } + + /// Index-search "hits": `(id, _rowid)` pairs the filter evaluates. + fn hits(rows: &[(i32, u64)]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new(lance_core::ROW_ID, DataType::UInt64, true), + ])); + let ids: Vec = rows.iter().map(|(id, _)| *id).collect(); + let rowids: Vec = rows.iter().map(|(_, p)| *p).collect(); + RecordBatch::try_new( + schema, + vec![ + Arc::new(Int32Array::from(ids)), + Arc::new(UInt64Array::from(rowids)), + ], + ) + .unwrap() + } + + /// Build an active memtable whose PK index + BatchStore hold one row per + /// `id` in `appended` (positions 0..n), all committed. + fn active(appended: &[i32]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in appended { + let b = id_batch(id); + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + (Arc::new(index), batch_store) + } + + async fn run( + index_store: Arc, + batch_store: Arc, + max_visible_batch_position: usize, + hits_batch: RecordBatch, + ) -> Vec<(i32, u64)> { + let input = + TestMemoryExec::try_new_exec(&[vec![hits_batch.clone()]], hits_batch.schema(), None) + .unwrap(); + let exec = NewestPkFilterExec::new( + input, + vec!["id".to_string()], + lance_core::ROW_ID, + index_store, + batch_store, + max_visible_batch_position, + ); + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + let mut rows = Vec::new(); + for b in &out { + let ids = b.column(0).as_any().downcast_ref::().unwrap(); + let pos = b.column(1).as_any().downcast_ref::().unwrap(); + for i in 0..b.num_rows() { + rows.push((ids.value(i), pos.value(i))); + } + } + rows + } + + #[tokio::test] + async fn keeps_only_the_newest_visible_position_per_pk() { + // id=1 written at positions 0 and 2 (an update), id=2 at position 1; all + // visible. A stale hit (id=1 @ 0) is dropped; the newest (id=1 @ 2) and + // the unrelated id=2 survive — even though all three were "returned" by + // the index search. + let (index, store) = active(&[1, 2, 1]); + let rows = run(index, store, 2, hits(&[(1, 0), (2, 1), (1, 2)])).await; + assert_eq!(rows, vec![(2, 1), (1, 2)]); + } + + #[tokio::test] + async fn does_not_vanish_a_visible_row_under_a_newer_invisible_write() { + // The store/index hold id=1 at positions 0 and 2, but the query latched + // `max_visible_batch_position = 0` (only position 0 visible) — i.e. the + // update at position 2 was committed *after* this query's snapshot. The + // visible older row (id=1 @ 0) must be KEPT (its newest *visible* version + // is itself), not dropped because of the not-yet-visible position 2. + let (index, store) = active(&[1, 2, 1]); + let kept = run(index.clone(), store.clone(), 0, hits(&[(1, 0)])).await; + assert_eq!(kept, vec![(1, 0)], "visible row must not vanish"); + + // And the not-yet-visible position is itself dropped (outside snapshot). + let dropped = run(index, store, 0, hits(&[(1, 2)])).await; + assert!( + dropped.is_empty(), + "row beyond the snapshot must be dropped" + ); + } + + #[tokio::test] + async fn passes_through_when_no_pk_index() { + // A memtable without a primary-key index can't be deduped here, so the + // filter is a pass-through rather than dropping everything. + let batch_store = Arc::new(BatchStore::with_capacity(16)); + batch_store.append(id_batch(1)).unwrap(); + let index = Arc::new(IndexStore::new()); // no enable_pk_index + let rows = run(index, batch_store, 0, hits(&[(1, 0), (1, 9)])).await; + assert_eq!(rows, vec![(1, 0), (1, 9)]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs index 523dd30bf82..0707eb5e8dd 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk.rs @@ -4,7 +4,7 @@ //! Shared primary-key helpers for the LSM scanner execution nodes. //! //! Centralizes PK column resolution and per-row hashing so that every -//! consumer (e.g. [`super::WithinSourceDedupExec`], [`super::PkHashFilterExec`]) +//! consumer (e.g. [`super::PkBlockFilterExec`], [`super::NewestPkFilterExec`]) //! resolves and hashes a primary key the same way. The row hash is kept //! consistent with the variants supported by [`super::compute_pk_hash_from_scalars`] //! so a single PK produces the same hash regardless of which exec consumes it. diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs new file mode 100644 index 00000000000..4eb703e99ab --- /dev/null +++ b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_block_filter.rs @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright The Lance Authors + +//! Drop superseded rows from a per-source result by primary-key membership. +//! +//! Drops a row when any newer generation's membership ([`GenMembership`]) +//! contains its primary key — in-memory generations probe their PK index by +//! value, flushed generations probe their on-disk PK BTree with `Equals`. Used +//! both as the KNN post-filter (vector search, with over-fetch) and the +//! cross-generation scan filter (`k = 0`). +//! +//! Cross-generation only: within-gen duplicates collapse via the global dedup's +//! `(generation, freshness)` tiebreaker. +//! +//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns +//! when a source had >= k candidates but < k survived (over-fetch too small). +//! +//! Perf note: the on-disk probe is one `Equals` per row per flushed generation. +//! It is not disk-bound in steady state — the opened index and its (small, +//! memtable-sized) pages are held by the injected `FlushedMemTableCache` / +//! `LanceCache`, so after the first touch every probe is memory-resident. The +//! residual per-row cost is the in-memory BTree search plus a `SearchResult` +//! allocation per lookup; a batched/existence-only membership probe (avoiding +//! the per-row allocation) is the future optimization. + +use std::any::Any; +use std::fmt; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{Context, Poll}; + +use arrow::compute::filter_record_batch; +use arrow_array::{BooleanArray, RecordBatch}; +use arrow_schema::SchemaRef; +use datafusion::common::ScalarValue; +use datafusion::error::{DataFusionError, Result as DFResult}; +use datafusion::execution::TaskContext; +use datafusion::physical_expr::EquivalenceProperties; +use datafusion::physical_plan::{ + DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, + SendableRecordBatchStream, +}; +use futures::future::BoxFuture; +use futures::{FutureExt, Stream, StreamExt}; +use tracing::warn; + +use super::super::block_list::{GenMembership, on_disk_pk_key}; +use super::pk::resolve_pk_indices; + +/// Filters out rows whose PK is contained in any newer generation's membership. +#[derive(Debug)] +pub struct PkBlockFilterExec { + input: Arc, + pk_columns: Vec, + /// Newer generations' membership; a row is blocked if any contains its PK. + blocked: Vec, + /// Target neighbor count, used only to warn on a per-source under-fetch. + k: usize, + properties: Arc, +} + +impl PkBlockFilterExec { + pub fn new( + input: Arc, + pk_columns: Vec, + blocked: Vec, + k: usize, + ) -> Self { + // A filter preserves the input schema and partitioning. + let properties = Arc::new(PlanProperties::new( + EquivalenceProperties::new(input.schema()), + input.output_partitioning().clone(), + input.pipeline_behavior(), + input.boundedness(), + )); + Self { + input, + pk_columns, + blocked, + k, + properties, + } + } +} + +impl DisplayAs for PkBlockFilterExec { + fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { + match t { + DisplayFormatType::Default + | DisplayFormatType::Verbose + | DisplayFormatType::TreeRender => { + write!( + f, + "PkBlockFilterExec: pk_cols=[{}], gens={}", + self.pk_columns.join(", "), + self.blocked.len(), + ) + } + } + } +} + +impl ExecutionPlan for PkBlockFilterExec { + fn name(&self) -> &str { + "PkBlockFilterExec" + } + + fn as_any(&self) -> &dyn Any { + self + } + + fn schema(&self) -> SchemaRef { + self.input.schema() + } + + fn properties(&self) -> &Arc { + &self.properties + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.input] + } + + fn with_new_children( + self: Arc, + children: Vec>, + ) -> DFResult> { + if children.len() != 1 { + return Err(DataFusionError::Internal( + "PkBlockFilterExec requires exactly one child".to_string(), + )); + } + Ok(Arc::new(Self::new( + children[0].clone(), + self.pk_columns.clone(), + self.blocked.clone(), + self.k, + ))) + } + + fn execute( + &self, + partition: usize, + context: Arc, + ) -> DFResult { + let input_stream = self.input.execute(partition, context)?; + Ok(Box::pin(PkBlockFilterStream { + input: input_stream, + pk_columns: self.pk_columns.clone(), + blocked: self.blocked.clone(), + k: self.k, + schema: self.schema(), + pending: None, + input_seen: 0, + kept: 0, + warned: false, + })) + } +} + +struct PkBlockFilterStream { + input: SendableRecordBatchStream, + pk_columns: Vec, + blocked: Vec, + k: usize, + schema: SchemaRef, + /// The in-flight filter for the batch currently being processed (the probe + /// is async, so a batch is filtered off-poll and resumed here). + pending: Option>>, + input_seen: usize, + kept: usize, + warned: bool, +} + +/// Keep only the rows no newer-gen membership contains. Async because flushed +/// generations are probed against their on-disk PK BTree. +async fn filter_batch( + batch: RecordBatch, + pk_columns: Vec, + blocked: Vec, +) -> DFResult { + if blocked.is_empty() || batch.num_rows() == 0 { + return Ok(batch); + } + let pk_indices = resolve_pk_indices(&batch, &pk_columns)?; + let to_df = |e: lance_core::Error| DataFusionError::Execution(e.to_string()); + + let mut keep = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + let values: Vec = pk_indices + .iter() + .map(|&col| ScalarValue::try_from_array(batch.column(col), row)) + .collect::>()?; + let key = on_disk_pk_key(&values).map_err(to_df)?; + let mut blocked_row = false; + for membership in &blocked { + if membership.contains(&key).await.map_err(to_df)? { + blocked_row = true; + break; + } + } + keep.push(!blocked_row); + } + filter_record_batch(&batch, &BooleanArray::from(keep)) + .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) +} + +impl Stream for PkBlockFilterStream { + type Item = DFResult; + + fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + let this = self.get_mut(); + loop { + // Drive an in-flight filter to completion before pulling more input. + if let Some(fut) = this.pending.as_mut() { + return match fut.as_mut().poll(cx) { + Poll::Ready(Ok(out)) => { + this.pending = None; + this.kept += out.num_rows(); + Poll::Ready(Some(Ok(out))) + } + Poll::Ready(Err(e)) => { + this.pending = None; + Poll::Ready(Some(Err(e))) + } + Poll::Pending => Poll::Pending, + }; + } + + match this.input.poll_next_unpin(cx) { + Poll::Ready(Some(Ok(batch))) => { + this.input_seen += batch.num_rows(); + this.pending = Some( + filter_batch(batch, this.pk_columns.clone(), this.blocked.clone()).boxed(), + ); + // Loop to poll the just-created future. + } + Poll::Ready(Some(Err(e))) => return Poll::Ready(Some(Err(e))), + Poll::Ready(None) => { + // >= k candidates in, < k out: over-fetch missed superseded rows. + if !this.warned && this.input_seen >= this.k && this.kept < this.k { + warn!( + k = this.k, + fetched = this.input_seen, + kept = this.kept, + "LSM vector search: < k live rows survived the PK post-filter; \ + raise the over-fetch factor or use a true KNN prefilter." + ); + this.warned = true; + } + return Poll::Ready(None); + } + Poll::Pending => return Poll::Pending, + } + } + } +} + +impl datafusion::physical_plan::RecordBatchStream for PkBlockFilterStream { + fn schema(&self) -> SchemaRef { + self.schema.clone() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use arrow_array::Int32Array; + use arrow_schema::{DataType, Field, Schema}; + use datafusion::prelude::SessionContext; + use datafusion_physical_plan::test::TestMemoryExec; + use futures::TryStreamExt; + + fn int_batch(ids: &[i32]) -> RecordBatch { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); + RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() + } + + /// An in-memory membership whose PK index holds `ids` (positions 0..n). + fn membership(ids: &[i32]) -> GenMembership { + let store = BatchStore::with_capacity(16); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for &id in ids { + let b = int_batch(&[id]); + let (bp, off, _) = store.append(b.clone()).unwrap(); + index.insert_with_batch_position(&b, off, Some(bp)).unwrap(); + } + let max_visible_row = store.max_visible_row(index.max_visible_batch_position()); + GenMembership::InMemory { + index_store: Arc::new(index), + max_visible_row, + } + } + + async fn run(exec: PkBlockFilterExec) -> Vec { + let ctx = SessionContext::new(); + let out: Vec = exec + .execute(0, ctx.task_ctx()) + .unwrap() + .try_collect() + .await + .unwrap(); + out.iter() + .flat_map(|b| { + b.column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap() + .values() + .to_vec() + }) + .collect() + } + + #[tokio::test] + async fn drops_rows_blocked_by_a_newer_generation() { + let b = int_batch(&[10, 20, 30]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = + PkBlockFilterExec::new(input, vec!["id".to_string()], vec![membership(&[20])], 1); + assert_eq!(run(exec).await, vec![10, 30]); + } + + #[tokio::test] + async fn blocks_a_pk_present_in_any_generation() { + // Two newer-gen memberships: a row is dropped if either contains its PK. + let b = int_batch(&[10, 20, 30]); + let blocked = vec![membership(&[10]), membership(&[30])]; + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], blocked, 1); + assert_eq!(run(exec).await, vec![20]); + } + + #[tokio::test] + async fn empty_blocked_keeps_all_rows() { + let b = int_batch(&[1, 2, 3]); + let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); + let exec = PkBlockFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); + assert_eq!(run(exec).await, vec![1, 2, 3]); + } +} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs deleted file mode 100644 index ee473047d01..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/pk_hash_filter.rs +++ /dev/null @@ -1,350 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! Drop superseded rows from a per-source KNN result by primary-key hash. -//! -//! Drops a row when its PK hash ([`super::compute_pk_hash`]) is in any `blocked` -//! set — the newer generations' membership (`Arc`, shared, never merged; -//! base table: all generations). Only the KNN output is hashed. -//! -//! Cross-generation only: within-gen duplicates share a hash, so the global -//! dedup's `(generation, freshness)` tiebreaker collapses those instead. -//! -//! Post-filters an over-fetched KNN (the planner's `overfetch_factor`); warns -//! when a source had >= k candidates but < k survived (over-fetch too small). - -use std::any::Any; -use std::collections::HashSet; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow::compute::filter_record_batch; -use arrow_array::{BooleanArray, RecordBatch}; -use arrow_schema::SchemaRef; -use datafusion::error::{DataFusionError, Result as DFResult}; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::EquivalenceProperties; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt}; -use tracing::warn; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Filters out rows whose PK hash is in any set of `blocked`. -#[derive(Debug)] -pub struct PkHashFilterExec { - input: Arc, - pk_columns: Vec, - /// Newer generations' membership; a row is blocked if any set holds its hash. - blocked: Vec>>, - /// Target neighbor count, used only to warn on a per-source under-fetch. - k: usize, - properties: Arc, -} - -impl PkHashFilterExec { - pub fn new( - input: Arc, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - ) -> Self { - // A filter preserves the input schema and partitioning. - let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(input.schema()), - input.output_partitioning().clone(), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - blocked, - k, - properties, - } - } -} - -impl DisplayAs for PkHashFilterExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - let total: usize = self.blocked.iter().map(|s| s.len()).sum(); - write!( - f, - "PkHashFilterExec: pk_cols=[{}], gens={}, blocked={}", - self.pk_columns.join(", "), - self.blocked.len(), - total, - ) - } - } - } -} - -impl ExecutionPlan for PkHashFilterExec { - fn name(&self) -> &str { - "PkHashFilterExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.input.schema() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(DataFusionError::Internal( - "PkHashFilterExec requires exactly one child".to_string(), - )); - } - Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.blocked.clone(), - self.k, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(PkHashFilterStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - blocked: self.blocked.clone(), - k: self.k, - schema: self.schema(), - input_seen: 0, - kept: 0, - warned: false, - })) - } -} - -struct PkHashFilterStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - blocked: Vec>>, - k: usize, - schema: SchemaRef, - input_seen: usize, - kept: usize, - warned: bool, -} - -impl PkHashFilterStream { - fn filter_batch(&self, batch: RecordBatch) -> DFResult { - if self.blocked.is_empty() || batch.num_rows() == 0 { - return Ok(batch); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let keep: BooleanArray = (0..batch.num_rows()) - .map(|row| { - let hash = compute_pk_hash(&batch, &pk_indices, row); - !self.blocked.iter().any(|set| set.contains(&hash)) - }) - .collect(); - filter_record_batch(&batch, &keep) - .map_err(|e| DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for PkHashFilterStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - match self.input.poll_next_unpin(cx) { - Poll::Ready(Some(Ok(batch))) => { - self.input_seen += batch.num_rows(); - match self.filter_batch(batch) { - Ok(out) => { - self.kept += out.num_rows(); - Poll::Ready(Some(Ok(out))) - } - Err(e) => Poll::Ready(Some(Err(e))), - } - } - Poll::Ready(None) => { - // >= k candidates in, < k out: the over-fetch missed superseded rows. - if !self.warned && self.input_seen >= self.k && self.kept < self.k { - warn!( - k = self.k, - fetched = self.input_seen, - kept = self.kept, - "LSM vector search: < k live rows survived the PK-hash post-filter; \ - raise the over-fetch factor or use a true KNN prefilter." - ); - self.warned = true; - } - Poll::Ready(None) - } - other => other, - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for PkHashFilterStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - /// Hash a single-column Int32 PK value the way the exec does, so a test can - /// build blocked sets from values rather than hand-computed hashes. - fn hash_int_pk(id: i32) -> u64 { - let batch = int_batch(&[id]); - let pk_indices = resolve_pk_indices(&batch, &["id".to_string()]).unwrap(); - compute_pk_hash(&batch, &pk_indices, 0) - } - - fn int_batch(ids: &[i32]) -> RecordBatch { - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); - RecordBatch::try_new(schema, vec![Arc::new(Int32Array::from(ids.to_vec()))]).unwrap() - } - - fn blocked(ids: &[i32]) -> Vec>> { - vec![Arc::new(ids.iter().map(|&id| hash_int_pk(id)).collect())] - } - - async fn run(exec: PkHashFilterExec) -> Vec { - let ctx = SessionContext::new(); - let out: Vec = exec - .execute(0, ctx.task_ctx()) - .unwrap() - .try_collect() - .await - .unwrap(); - out.iter() - .flat_map(|b| { - b.column_by_name("id") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap() - .values() - .to_vec() - }) - .collect() - } - - #[tokio::test] - async fn drops_rows_with_blocked_pk_hash() { - let b = int_batch(&[10, 20, 30]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], blocked(&[20]), 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn blocks_a_pk_present_in_any_generation_set() { - // Two newer-gen sets: a row is dropped if either contains its PK. - let b = int_batch(&[10, 20, 30]); - let sets = vec![ - Arc::new(HashSet::from([hash_int_pk(10)])), - Arc::new(HashSet::from([hash_int_pk(30)])), - ]; - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], sets, 1); - assert_eq!(run(exec).await, vec![20]); - } - - #[tokio::test] - async fn empty_blocked_keeps_all_rows() { - let b = int_batch(&[1, 2, 3]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, vec!["id".to_string()], Vec::new(), 1); - assert_eq!(run(exec).await, vec![1, 2, 3]); - } - - #[tokio::test] - async fn null_pk_is_hashed_consistently_and_blockable() { - // A null PK hashes deterministically (compute_pk_hash hashes is_null), - // so a superseded null-key row can be dropped like any other. - let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, true)])); - let with_null = |ids: Vec>| { - RecordBatch::try_new(schema.clone(), vec![Arc::new(Int32Array::from(ids))]).unwrap() - }; - let pk = vec!["id".to_string()]; - let null_row = with_null(vec![None]); - let pk_indices = resolve_pk_indices(&null_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &null_row, - &pk_indices, - 0, - )]))]; - - // Rows: 10, NULL, 30 — only the NULL-key row is dropped. - let b = with_null(vec![Some(10), None, Some(30)]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![10, 30]); - } - - #[tokio::test] - async fn composite_pk_hash_matches_block_set() { - // Composite PK (id, name): block the (2, "b") tuple only. - let schema = Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, false), - ])); - let mk = |ids: &[i32], names: &[&str]| { - RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - ], - ) - .unwrap() - }; - let pk = vec!["id".to_string(), "name".to_string()]; - let one_row = mk(&[2], &["b"]); - let pk_indices = resolve_pk_indices(&one_row, &pk).unwrap(); - let sets = vec![Arc::new(HashSet::from([compute_pk_hash( - &one_row, - &pk_indices, - 0, - )]))]; - - // (1,"a") and (2,"a") survive; only the exact (2,"b") tuple is dropped. - let b = mk(&[1, 2, 2], &["a", "a", "b"]); - let input = TestMemoryExec::try_new_exec(&[vec![b.clone()]], b.schema(), None).unwrap(); - let exec = PkHashFilterExec::new(input, pk, sets, 1); - assert_eq!(run(exec).await, vec![1, 2]); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs b/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs deleted file mode 100644 index be5dae6a668..00000000000 --- a/rust/lance/src/dataset/mem_wal/scanner/exec/within_source_dedup.rs +++ /dev/null @@ -1,432 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright The Lance Authors - -//! WithinSourceDedupExec - Deduplicates rows with the same primary key from a -//! single LSM source, keeping the newest insert. -//! -//! In MemWAL/LSM mode the same primary key can be written multiple times into -//! the same memtable. The active memtable stores rows in insert order (larger -//! `_rowaddr` = newer), while flushed memtables are reverse-written so that -//! within a flushed file the smallest `_rowid` is the newest insert (see -//! `memtable/flush.rs:152` and `hnsw/storage.rs:307`). Point lookup uses this -//! node to collapse such duplicates *within a single source* so that the -//! downstream `CoalesceFirstExec` / `LIMIT` sees at most one row per primary -//! key per source. - -use std::any::Any; -use std::collections::HashMap; -use std::fmt; -use std::pin::Pin; -use std::sync::Arc; -use std::task::{Context, Poll}; - -use arrow_array::{Array, RecordBatch, UInt64Array}; -use arrow_schema::SchemaRef; -use datafusion::error::Result as DFResult; -use datafusion::execution::TaskContext; -use datafusion::physical_expr::{EquivalenceProperties, Partitioning}; -use datafusion::physical_plan::{ - DisplayAs, DisplayFormatType, ExecutionPlan, ExecutionPlanProperties, PlanProperties, - SendableRecordBatchStream, -}; -use futures::{Stream, StreamExt, ready}; - -use super::pk::{compute_pk_hash, resolve_pk_indices}; - -/// Among rows that share a primary key, which row-address extreme identifies -/// the newest insert to keep. The kept row is always the freshest; only the -/// row address (`_rowaddr`/`_rowid`) used to find it differs by source. -#[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum DedupDirection { - /// Keep the row with the largest row-address value (active memtable: larger - /// `_rowaddr` = inserted later). - KeepMaxRowAddr, - /// Keep the row with the smallest row-address value (flushed memtable under - /// reverse-write: smaller `_rowid` = inserted later). - KeepMinRowAddr, -} - -/// Deduplicates rows from a single source by primary key, keeping the row -/// whose `row_addr_column` value wins per [`DedupDirection`]. -/// -/// # Required columns -/// -/// The input must expose: -/// - All `pk_columns` -/// - `row_addr_column` of `UInt64` type -/// -/// The output schema is unchanged from the input. Callers that need to hide -/// the row-address column from downstream consumers should compose this node -/// with `project_to_canonical` or `null_columns`. -/// -/// # Performance -/// -/// Memory: `O(unique primary keys in input)`. For point lookup the input is -/// already filtered to a single primary key so the map holds at most one -/// entry. -#[derive(Debug)] -pub struct WithinSourceDedupExec { - input: Arc, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - properties: Arc, -} - -impl WithinSourceDedupExec { - pub fn new( - input: Arc, - pk_columns: Vec, - row_addr_column: impl Into, - direction: DedupDirection, - ) -> Self { - let schema = input.schema(); - let properties = Arc::new(PlanProperties::new( - EquivalenceProperties::new(schema.clone()), - Partitioning::UnknownPartitioning(1), - input.pipeline_behavior(), - input.boundedness(), - )); - Self { - input, - pk_columns, - row_addr_column: row_addr_column.into(), - direction, - schema, - properties, - } - } - - pub fn pk_columns(&self) -> &[String] { - &self.pk_columns - } - - pub fn row_addr_column(&self) -> &str { - &self.row_addr_column - } - - pub fn direction(&self) -> DedupDirection { - self.direction - } -} - -impl DisplayAs for WithinSourceDedupExec { - fn fmt_as(&self, t: DisplayFormatType, f: &mut fmt::Formatter) -> fmt::Result { - match t { - DisplayFormatType::Default - | DisplayFormatType::Verbose - | DisplayFormatType::TreeRender => { - write!( - f, - "WithinSourceDedupExec: pk=[{}], row_addr={}, direction={:?}", - self.pk_columns.join(", "), - self.row_addr_column, - self.direction, - ) - } - } - } -} - -impl ExecutionPlan for WithinSourceDedupExec { - fn name(&self) -> &str { - "WithinSourceDedupExec" - } - - fn as_any(&self) -> &dyn Any { - self - } - - fn schema(&self) -> SchemaRef { - self.schema.clone() - } - - fn properties(&self) -> &Arc { - &self.properties - } - - fn children(&self) -> Vec<&Arc> { - vec![&self.input] - } - - fn with_new_children( - self: Arc, - children: Vec>, - ) -> DFResult> { - if children.len() != 1 { - return Err(datafusion::error::DataFusionError::Internal( - "WithinSourceDedupExec requires exactly one child".to_string(), - )); - } - Ok(Arc::new(Self::new( - children[0].clone(), - self.pk_columns.clone(), - self.row_addr_column.clone(), - self.direction, - ))) - } - - fn execute( - &self, - partition: usize, - context: Arc, - ) -> DFResult { - let input_stream = self.input.execute(partition, context)?; - Ok(Box::pin(WithinSourceDedupStream { - input: input_stream, - pk_columns: self.pk_columns.clone(), - row_addr_column: self.row_addr_column.clone(), - direction: self.direction, - schema: self.schema.clone(), - winners: HashMap::new(), - emitted: false, - })) - } -} - -/// One winning row, materialized as a single-row `RecordBatch` so we don't -/// have to keep the source batch alive after we've picked the winner. -struct Winner { - batch: RecordBatch, - row_addr: u64, -} - -struct WithinSourceDedupStream { - input: SendableRecordBatchStream, - pk_columns: Vec, - row_addr_column: String, - direction: DedupDirection, - schema: SchemaRef, - winners: HashMap, - emitted: bool, -} - -impl WithinSourceDedupStream { - fn consume_batch(&mut self, batch: RecordBatch) -> DFResult<()> { - if batch.num_rows() == 0 { - return Ok(()); - } - let pk_indices = resolve_pk_indices(&batch, &self.pk_columns)?; - let row_addr_array = batch - .column_by_name(&self.row_addr_column) - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' not found in batch", - self.row_addr_column - )) - })? - .as_any() - .downcast_ref::() - .ok_or_else(|| { - datafusion::error::DataFusionError::Internal(format!( - "Row-address column '{}' is not UInt64", - self.row_addr_column - )) - })?; - - for row_idx in 0..batch.num_rows() { - if row_addr_array.is_null(row_idx) { - // A NULL row address can't be ordered against a real one. Skip - // rather than guess — callers should always project a real - // row-address column for dedup-eligible sources. - continue; - } - let row_addr = row_addr_array.value(row_idx); - let pk_hash = compute_pk_hash(&batch, &pk_indices, row_idx); - - let take_row = match self.winners.get(&pk_hash) { - None => true, - Some(existing) => match self.direction { - DedupDirection::KeepMaxRowAddr => row_addr > existing.row_addr, - DedupDirection::KeepMinRowAddr => row_addr < existing.row_addr, - }, - }; - - if take_row { - let single = batch.slice(row_idx, 1); - self.winners.insert( - pk_hash, - Winner { - batch: single, - row_addr, - }, - ); - } - } - Ok(()) - } - - fn finalize(&mut self) -> DFResult { - if self.winners.is_empty() { - return Ok(RecordBatch::new_empty(self.schema.clone())); - } - let batches: Vec = self.winners.drain().map(|(_, w)| w.batch).collect(); - let batch_refs: Vec<&RecordBatch> = batches.iter().collect(); - arrow_select::concat::concat_batches(&self.schema, batch_refs) - .map_err(|e| datafusion::error::DataFusionError::ArrowError(Box::new(e), None)) - } -} - -impl Stream for WithinSourceDedupStream { - type Item = DFResult; - - fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { - loop { - if self.emitted { - return Poll::Ready(None); - } - match ready!(self.input.poll_next_unpin(cx)) { - Some(Ok(batch)) => { - if let Err(e) = self.consume_batch(batch) { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - } - Some(Err(e)) => { - self.emitted = true; - return Poll::Ready(Some(Err(e))); - } - None => { - self.emitted = true; - return Poll::Ready(Some(self.finalize())); - } - } - } - } -} - -impl datafusion::physical_plan::RecordBatchStream for WithinSourceDedupStream { - fn schema(&self) -> SchemaRef { - self.schema.clone() - } -} - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::{Float32Array, Int32Array, StringArray}; - use arrow_schema::{DataType, Field, Schema}; - use datafusion::prelude::SessionContext; - use datafusion_physical_plan::test::TestMemoryExec; - use futures::TryStreamExt; - - fn create_test_schema() -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("id", DataType::Int32, false), - Field::new("name", DataType::Utf8, true), - Field::new("_distance", DataType::Float32, true), - Field::new("_row_addr", DataType::UInt64, true), - ])) - } - - fn batch(ids: &[i32], names: &[&str], distances: &[f32], row_addr: &[u64]) -> RecordBatch { - let schema = create_test_schema(); - RecordBatch::try_new( - schema, - vec![ - Arc::new(Int32Array::from(ids.to_vec())), - Arc::new(StringArray::from(names.to_vec())), - Arc::new(Float32Array::from(distances.to_vec())), - Arc::new(UInt64Array::from(row_addr.to_vec())), - ], - ) - .unwrap() - } - - async fn run(batches: Vec, direction: DedupDirection) -> Vec { - let schema = create_test_schema(); - let input = TestMemoryExec::try_new_exec(&[batches], schema, None).unwrap(); - let exec = - WithinSourceDedupExec::new(input, vec!["id".to_string()], "_row_addr", direction); - let ctx = SessionContext::new(); - let stream = exec.execute(0, ctx.task_ctx()).unwrap(); - stream.try_collect().await.unwrap() - } - - fn extract(batches: &[RecordBatch]) -> Vec<(i32, String, u64)> { - let mut out = Vec::new(); - for b in batches { - let ids = b.column(0).as_any().downcast_ref::().unwrap(); - let names = b.column(1).as_any().downcast_ref::().unwrap(); - let addr = b.column(3).as_any().downcast_ref::().unwrap(); - for i in 0..b.num_rows() { - out.push((ids.value(i), names.value(i).to_string(), addr.value(i))); - } - } - out.sort_by_key(|(id, _, _)| *id); - out - } - - #[tokio::test] - async fn keep_max_picks_largest_row_addr() { - // Active-memtable case: same pk inserted twice; newer = larger _rowaddr. - let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[10, 99, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 99)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn keep_min_picks_smallest_row_addr() { - // Flushed-memtable case under reverse-write: newer = smaller _rowid. - let b1 = batch( - &[1, 1, 2], - &["old", "new", "two"], - &[0.1, 0.2, 0.3], - &[99, 10, 5], - ); - let out = run(vec![b1], DedupDirection::KeepMinRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 2); - assert_eq!(rows[0], (1, "new".to_string(), 10)); - assert_eq!(rows[1], (2, "two".to_string(), 5)); - } - - #[tokio::test] - async fn dedup_across_batches() { - let b1 = batch(&[1, 2], &["a", "b"], &[0.1, 0.2], &[1, 1]); - let b2 = batch(&[1, 3], &["a_new", "c"], &[0.5, 0.4], &[7, 1]); - let out = run(vec![b1, b2], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 3); - assert_eq!(rows[0], (1, "a_new".to_string(), 7)); - assert_eq!(rows[1], (2, "b".to_string(), 1)); - assert_eq!(rows[2], (3, "c".to_string(), 1)); - } - - #[tokio::test] - async fn empty_input() { - let out = run(vec![], DedupDirection::KeepMaxRowAddr).await; - let total: usize = out.iter().map(|b| b.num_rows()).sum(); - assert_eq!(total, 0); - } - - #[tokio::test] - async fn null_row_addr_skipped() { - // Rows with NULL row address can't be ordered — they're dropped so they - // don't accidentally become winners against real values. - let schema = create_test_schema(); - let b = RecordBatch::try_new( - schema.clone(), - vec![ - Arc::new(Int32Array::from(vec![1, 1])), - Arc::new(StringArray::from(vec!["nulladdr", "real"])), - Arc::new(Float32Array::from(vec![0.1, 0.2])), - Arc::new(UInt64Array::from(vec![None, Some(5)])), - ], - ) - .unwrap(); - let out = run(vec![b], DedupDirection::KeepMaxRowAddr).await; - let rows = extract(&out); - assert_eq!(rows.len(), 1); - assert_eq!(rows[0], (1, "real".to_string(), 5)); - } -} diff --git a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs index 39abf7e8c71..0c2d3b039fe 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/flushed_cache.rs @@ -41,12 +41,10 @@ use crate::session::Session; pub struct FlushedMemTableCache { // `moka`'s async cache gives a bounded size plus single-flight // `try_get_with`, so concurrent first-queries on a just-flushed - // generation open the dataset exactly once. + // generation open the dataset exactly once. The opened dataset carries the + // session index cache, which also backs each generation's standalone PK + // dedup index (see `block_list::open_pk_index`) — no separate cache path. inner: moka::future::Cache>, - // Per-generation set of PK hashes for the vector-search block-list, keyed by - // the same immutable flushed path. Built lazily on the first query that needs - // it (single-flight) so repeated searches skip re-scanning the PK column. - pk_hashes: moka::future::Cache>>, } impl FlushedMemTableCache { @@ -63,10 +61,6 @@ impl FlushedMemTableCache { // into at build time. .support_invalidation_closures() .build(), - pk_hashes: moka::future::Cache::builder() - .max_capacity(max_entries) - .support_invalidation_closures() - .build(), } } @@ -96,21 +90,6 @@ impl FlushedMemTableCache { .map_err(|e: Arc| Error::cloned(e.to_string())) } - /// Get the cached set of PK hashes for `path`, building it (exactly once) on - /// a miss via `build`. The flushed path is immutable, so a cached set is - /// never stale; concurrent first-queries share one build via `moka`'s - /// single-flight `try_get_with`. - pub async fn get_or_build_pk_hashes( - &self, - path: &str, - build: impl std::future::Future>>, - ) -> Result>> { - self.pk_hashes - .try_get_with(path.to_string(), async move { build.await.map(Arc::new) }) - .await - .map_err(|e: Arc| Error::cloned(e.to_string())) - } - /// Drop cached entries whose path is not in `live_paths`. /// /// Called by the consumer after compaction retires generations. Purely a @@ -125,10 +104,6 @@ impl FlushedMemTableCache { let _ = self .inner .invalidate_entries_if(move |path, _| !live.contains(path)); - let live = live_paths.clone(); - let _ = self - .pk_hashes - .invalidate_entries_if(move |path, _| !live.contains(path)); } } @@ -250,34 +225,6 @@ mod tests { assert_eq!(cache.inner.entry_count(), 1, "exactly one entry cached"); } - #[tokio::test] - async fn pk_hashes_cached_reuses_first_build() { - // The PK-hash set is keyed by the immutable flushed path: a hit returns - // the first-built set and never runs the second build closure. - let cache = FlushedMemTableCache::new(8); - let path = "memory://shard/gen_1"; - let first = cache - .get_or_build_pk_hashes(path, async { Ok(HashSet::from([1u64, 2])) }) - .await - .unwrap(); - let second = cache - .get_or_build_pk_hashes(path, async { - // Different contents; must be ignored because the path is cached. - Ok(HashSet::from([9u64])) - }) - .await - .unwrap(); - assert!( - Arc::ptr_eq(&first, &second), - "a PK-hash cache hit must reuse the first-built set" - ); - assert_eq!( - second.len(), - 2, - "cached set keeps the first build's contents" - ); - } - #[tokio::test] async fn test_retain_paths_drops_unreferenced() { let temp_dir = tempfile::tempdir().unwrap(); diff --git a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs index e3ef44d8b1a..2f1c27a6cd8 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/fts_search.rs @@ -44,7 +44,7 @@ use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec; use datafusion::physical_plan::union::UnionExec; -use lance_core::{Error, ROW_ID, Result, is_system_column}; +use lance_core::{Error, Result, is_system_column}; use lance_index::scalar::FullTextSearchQuery; use lance_index::scalar::inverted::query::FtsQuery as IndexFtsQuery; use tracing::instrument; @@ -52,7 +52,7 @@ use tracing::instrument; use super::block_list::compute_source_block_lists; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{DedupDirection, PkHashFilterExec, WithinSourceDedupExec}; +use super::exec::{NewestPkFilterExec, PkBlockFilterExec}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::project_to_canonical; use crate::dataset::mem_wal::memtable::scanner::MemTableScanner; @@ -154,13 +154,12 @@ impl LsmFtsSearchPlanner { return self.empty_plan(&target_schema); } - // Per-source PK-hash block sets for cross-generation dedup (NEWER(G) - // per shard; base = union of all gens). Query-type-agnostic — same - // call the vector planner makes. `Box::pin` keeps the future off + // Per-source PK block sets for cross-generation dedup (NEWER(G) per + // shard; base = union of all gens). Query-type-agnostic — same call the + // vector planner makes. `Box::pin` keeps the future off // `clippy::large_futures`. let block_lists = Box::pin(compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -173,7 +172,7 @@ impl LsmFtsSearchPlanner { let blocked = block_lists.get(&(source.shard_id(), source.generation())); // Over-fetch a blocked source so the post-filter still yields k live // rows. The active arm returns all matches (no builder limit), so its - // within-source dedup needs no over-fetch hint. + // within-source recency filter needs no over-fetch hint. let fetch_k = if blocked.is_some() { ((k as f64) * overfetch).ceil() as usize } else { @@ -181,24 +180,19 @@ impl LsmFtsSearchPlanner { }; let plan = self - .build_source_plan(source, column, &query, fetch_k, projection, is_active) + .build_source_plan(source, column, &query, fetch_k, projection) .await?; // Dedup, mirroring LsmVectorSearchPlanner: - // * active: collapse duplicate-PK appends to the newest insert - // (larger _rowid = inserted later). The FTS index is append-only, - // so an in-memtable update leaves both versions searchable. + // * active: already wrapped in `NewestPkFilterExec` inside + // `build_source_plan` (drops predicate-crossing stale hits, which a + // result-set dedup can't catch). // * flushed/base: drop rows superseded by a newer generation via the // block-list (within-gen is handled by the flushed deletion vector). let deduped = if is_active { - Arc::new(WithinSourceDedupExec::new( - plan, - self.pk_columns.clone(), - ROW_ID, - DedupDirection::KeepMaxRowAddr, - )) as Arc + plan } else if let Some(set) = blocked { - Arc::new(PkHashFilterExec::new( + Arc::new(PkBlockFilterExec::new( plan, self.pk_columns.clone(), set.clone(), @@ -263,7 +257,6 @@ impl LsmFtsSearchPlanner { query: &FullTextSearchQuery, k: usize, projection: Option<&[String]>, - emit_row_id: bool, ) -> Result> { match source { LsmDataSource::BaseTable { dataset } => { @@ -301,11 +294,12 @@ impl LsmFtsSearchPlanner { MemTableScanner::new(batch_store.clone(), index_store.clone(), schema.clone()); let cols = self.fts_scanner_projection(projection); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); - // Emit `_rowid` (row position) so the planner can collapse - // duplicate-PK appends via WithinSourceDedupExec before the union. - if emit_row_id { - scanner.with_row_id(); - } + // Expose the row position so the recency filter can identify the + // newest visible version of each PK. The append-only inverted + // index keeps an updated row's old postings live, so a stale hit + // can match a query the fresh row no longer does; the filter + // drops it. `project_to_canonical` strips `_rowid` afterward. + scanner.with_row_id(); // `MemTableScanner::full_text_search` takes a raw match // string; richer query shapes (phrase/boolean/fuzzy) can // be plumbed through once the MemTable scanner accepts a @@ -324,7 +318,19 @@ impl LsmFtsSearchPlanner { // today; the per-partition Sort+fetch above bounds the // emitted rows. let _ = k; - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Drop predicate-crossing stale hits: keep a hit iff it is the + // newest visible version of its PK (collapses duplicate-PK + // appends too — supersedes the old WithinSourceDedupExec). + let filtered: Arc = Arc::new(NewestPkFilterExec::new( + plan, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store.clone(), + batch_store.clone(), + scanner.max_visible_batch_position(), + )); + Ok(filtered) } } } @@ -478,6 +484,7 @@ mod tests { // Active memtable with its own FTS index, containing a matching row. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); let active_batch = make_batch( &schema, @@ -646,6 +653,7 @@ mod tests { let schema = fts_schema(); let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); // First append (positions 0,1): id=1 is the stale version of the PK. @@ -725,4 +733,88 @@ mod tests { "dedup must keep the newest (max row-position) version" ); } + + #[tokio::test] + async fn active_stale_update_predicate_crossing_leaks() { + // A PK update that crosses out of the match set: pk=1 inserted as + // "alpha lance", then updated to "beta lance". The append-only inverted + // index keeps the old "alpha" posting live, so an "alpha" search still + // matches the STALE pk=1 row — and the fresh "beta lance" row isn't even + // a candidate, so a result-set dedup has nothing to suppress it against. + // `NewestPkFilterExec` drops it predicate-independently: pk=1's newest + // visible row is "beta lance", so the "alpha" hit is not the newest. + let schema = fts_schema(); + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut indexes = IndexStore::new(); + indexes.enable_pk_index(&[("id".to_string(), 0)]); + indexes.add_fts("text_fts".to_string(), 1, "text".to_string()); + + // Insert pk=1 ("alpha lance") and an unrelated live pk=2 ("alpha foo"). + let b1 = make_batch(&schema, &[1, 2], &["alpha lance", "alpha foo"]); + let (_, _, bp1) = batch_store.append(b1.clone()).unwrap(); + indexes + .insert_with_batch_position(&b1, 0, Some(bp1)) + .unwrap(); + + // Update pk=1 → "beta lance" (no longer matches "alpha"). + let b2 = make_batch(&schema, &[1], &["beta lance"]); + let (_, _, bp2) = batch_store.append(b2.clone()).unwrap(); + indexes + .insert_with_batch_position(&b2, 1, Some(bp2)) + .unwrap(); + let indexes = Arc::new(indexes); + + let tmp = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", tmp.path().to_str().unwrap()); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + uuid::Uuid::new_v4(), + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store: indexes, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmFtsSearchPlanner::new(collector, vec!["id".to_string()], schema); + let plan = planner + .plan_search( + "text", + FullTextSearchQuery::new("alpha".to_string()), + 10, + None, + ) + .await + .expect("planner should produce a plan"); + + let ctx = datafusion::prelude::SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + + let mut ids: Vec = Vec::new(); + for b in &batches { + let col = b + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..b.num_rows() { + ids.push(col.value(i)); + } + } + + assert!( + !ids.contains(&1), + "stale pk=1 (now 'beta lance') leaked on an 'alpha' search; got ids={ids:?}" + ); + assert!( + ids.contains(&2), + "live pk=2 ('alpha foo') must still match 'alpha'; got ids={ids:?}" + ); + } } diff --git a/rust/lance/src/dataset/mem_wal/scanner/planner.rs b/rust/lance/src/dataset/mem_wal/scanner/planner.rs index f3f15e2e680..32cf9ed7880 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/planner.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/planner.rs @@ -15,7 +15,7 @@ use tracing::instrument; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkHashFilterExec, ROW_ADDRESS_COLUMN}; +use super::exec::{MEMTABLE_GEN_COLUMN, MemtableGenTagExec, PkBlockFilterExec, ROW_ADDRESS_COLUMN}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, @@ -82,7 +82,7 @@ impl LsmScanPlanner { /// Each source is independently newest-per-PK (active via the fused /// [`MemTableDedupScanExec`](super::super::memtable::scanner), flushed via /// its within-generation deletion vector) and a cross-generation block-list - /// ([`PkHashFilterExec`]) drops any PK superseded by a newer generation. + /// ([`PkBlockFilterExec`]) drops any PK superseded by a newer generation. /// Each PK therefore survives in exactly one source, so a plain /// `UnionExec` carries at most one row per PK — no cross-source dedup, /// sort, or merge needed. `_memtable_gen` / `_rowaddr` are output-only and @@ -119,7 +119,6 @@ impl LsmScanPlanner { // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -138,7 +137,7 @@ impl LsmScanPlanner { // Drop cross-generation stale rows (PKs superseded by a newer gen). // `k = 0`: there is no top-k, so the under-fetch warning never fires. let scan = match block_lists.get(&(source.shard_id(), source.generation())) { - Some(set) => Arc::new(PkHashFilterExec::new( + Some(set) => Arc::new(PkBlockFilterExec::new( scan, self.pk_columns.clone(), set.clone(), @@ -413,13 +412,36 @@ mod integration_tests { .unwrap() } - /// Create a dataset at the given URI with the provided batches. + /// Create a dataset at the given URI with the provided batches. Also writes + /// the standalone PK sidecar (on `id`) so a flushed-generation source can be + /// probed by the block-list; harmless for a base table (never probed). async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) .await - .unwrap() + .unwrap(); + if has_id { + super::super::block_list::write_test_pk_sidecar(uri, &batches, &["id"]) + .await + .unwrap(); + } + dataset + } + + /// Build an in-memory memtable's `(batch_store, index_store)` with the PK + /// index enabled and populated (mirrors production — the block-list needs + /// the PK index to dedup in-memory generations). + fn pk_indexed(batches: &[RecordBatch]) -> (Arc, Arc) { + let batch_store = Arc::new(BatchStore::with_capacity(100)); + let mut index = IndexStore::new(); + index.enable_pk_index(&[("id".to_string(), 0)]); + for b in batches { + let (bp, off, _) = batch_store.append(b.clone()).unwrap(); + index.insert_with_batch_position(b, off, Some(bp)).unwrap(); + } + (batch_store, Arc::new(index)) } /// Setup a multi-level LSM structure with: @@ -470,10 +492,8 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Create active memtable - let batch_store = Arc::new(BatchStore::with_capacity(100)); - let index_store = Arc::new(IndexStore::new()); - let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); - let _ = batch_store.append(active_batch); + let (batch_store, index_store) = + pk_indexed(&[create_test_batch(&schema, &[5, 6, 7], "active")]); let active_memtable = InMemoryMemTables { active: InMemoryMemTableRef { @@ -515,18 +535,18 @@ mod integration_tests { // Verify the plan (gen DESC order: active -> gen2 -> gen1 -> base): // - plain UnionExec at top // - active arm: MemTableDedupScanExec (newest gen, not block-listed) - // - older arms: PkHashFilterExec (cross-gen block-list) -> LanceRead + // - older arms: PkBlockFilterExec (cross-gen block-list) -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... CoalescePartitionsExec UnionExec MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -549,9 +569,9 @@ mod integration_tests { // Verify the plan with `_memtable_gen` tags (gen DESC order): // - plain UnionExec at top - // - each arm: MemtableGenTagExec -> (PkHashFilterExec ->) data source + // - each arm: MemtableGenTagExec -> (PkBlockFilterExec ->) data source // - gen3 (active): MemtableGenTagExec -> MemTableDedupScanExec - // - gen2/gen1/base: MemtableGenTagExec -> PkHashFilterExec -> LanceRead + // - gen2/gen1/base: MemtableGenTagExec -> PkBlockFilterExec -> LanceRead assert_plan_node_equals( plan, "ProjectionExec:... @@ -560,13 +580,13 @@ mod integration_tests { MemtableGenTagExec: gen=gen3 MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... MemtableGenTagExec: gen=gen1 - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -647,14 +667,14 @@ mod integration_tests { } // base/gen1/gen2 all hold PKs superseded by a newer generation, so each - // is wrapped in a `PkHashFilterExec`; the newest (active) arm is not. + // is wrapped in a `PkBlockFilterExec`; the newest (active) arm is not. let plan = scanner.create_plan().await.unwrap(); let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "filtered-read plan must apply the cross-gen block-list, got:\n{}", plan_str ); @@ -730,21 +750,21 @@ mod integration_tests { .with_flushed_generation(2, "gen_2".to_string()); // Frozen gen3 (sealed, NOT in the manifest) and active gen4. - let frozen_store = Arc::new(BatchStore::with_capacity(100)); - let _ = frozen_store.append(create_test_batch(&schema, &[6, 7], "frozen")); + let (frozen_store, frozen_index) = + pk_indexed(&[create_test_batch(&schema, &[6, 7], "frozen")]); let frozen = InMemoryMemTableRef { batch_store: frozen_store, - index_store: Arc::new(IndexStore::new()), + index_store: frozen_index, schema: schema.clone(), generation: 3, }; - let active_store = Arc::new(BatchStore::with_capacity(100)); - let _ = active_store.append(create_test_batch(&schema, &[7, 8], "active")); + let (active_store, active_index) = + pk_indexed(&[create_test_batch(&schema, &[7, 8], "active")]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store: active_store, - index_store: Arc::new(IndexStore::new()), + index_store: active_index, schema: schema.clone(), generation: 4, }, @@ -969,12 +989,12 @@ mod integration_tests { ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1037,14 +1057,14 @@ mod integration_tests { MemTableDedupScanExec: projection=[id, name, _rowaddr], with_row_id=false, with_row_address=true MemtableGenTagExec: gen=gen2 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_2... MemtableGenTagExec: gen=gen1 ProjectionExec: expr=[id@0 as id, name@1 as name, NULL as _rowaddr] - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...gen_1... MemtableGenTagExec: gen=base - PkHashFilterExec: pk_cols=[id]... + PkBlockFilterExec: pk_cols=[id]... LanceRead:...base/data...refine_filter=--", ) .await @@ -1113,6 +1133,8 @@ mod integration_tests { let mut index_store = IndexStore::new(); // Add BTree index on id column (field_id=0) index_store.add_btree("id_idx".to_string(), 0, "id".to_string()); + // Reuse it as the PK index so the block-list can dedup this generation. + index_store.enable_pk_index(&[("id".to_string(), 0)]); let active_batch = create_test_batch(&schema, &[5, 6, 7], "active"); let _ = batch_store.append(active_batch.clone()); @@ -1177,7 +1199,7 @@ mod integration_tests { // 1. Verify overall structure assert!(plan_str.contains("UnionExec"), "Should have UnionExec"); assert!( - plan_str.contains("PkHashFilterExec"), + plan_str.contains("PkBlockFilterExec"), "older generations should be block-list filtered" ); assert!( @@ -1365,7 +1387,6 @@ mod integration_tests { // Active memtable: id=10 inserted ("keep") then updated to NULL within // the same generation; id=20 ("active_20") is a control that matches. - let batch_store = Arc::new(BatchStore::with_capacity(16)); let active_batch = RecordBatch::try_new( schema.clone(), vec![ @@ -1378,12 +1399,12 @@ mod integration_tests { ], ) .unwrap(); - batch_store.append(active_batch).unwrap(); + let (batch_store, index_store) = pk_indexed(&[active_batch]); let in_memory = InMemoryMemTables { active: InMemoryMemTableRef { batch_store, - index_store: Arc::new(IndexStore::new()), + index_store, schema: schema.clone(), generation: 1, }, diff --git a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs index a6063c2930c..7bd51518a6c 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/point_lookup.rs @@ -9,11 +9,14 @@ use std::collections::HashMap; use std::sync::Arc; use arrow_array::{Array, RecordBatch}; -use arrow_schema::SchemaRef; +use arrow_schema::{SchemaRef, SortOptions}; use datafusion::common::ScalarValue; use datafusion::execution::TaskContext; +use datafusion::physical_expr::expressions::Column; +use datafusion::physical_expr::{LexOrdering, PhysicalSortExpr}; use datafusion::physical_plan::ExecutionPlan; use datafusion::physical_plan::limit::GlobalLimitExec; +use datafusion::physical_plan::sorts::sort::SortExec; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::prelude::{Expr, SessionContext}; use futures::TryStreamExt; @@ -27,10 +30,7 @@ use crate::dataset::mem_wal::memtable::batch_store::BatchStore; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{ - BloomFilterGuardExec, CoalesceFirstExec, DedupDirection, WithinSourceDedupExec, - compute_pk_hash_from_scalars, -}; +use super::exec::{BloomFilterGuardExec, CoalesceFirstExec, compute_pk_hash_from_scalars}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::{ build_scanner_projection, canonical_output_schema, null_columns, project_to_canonical, @@ -573,19 +573,29 @@ impl LsmPointLookupPlanner { // multiple rows sharing the target primary key. scanner.with_row_id(); let raw = scanner.create_plan().await?; - // Within the active memtable, larger `_rowid` = newer - // insert. After dedup there is exactly one row per PK. - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - raw, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); + // The filter already restricts to the exact PK value, so the + // scan yields that key's insert history. Within the active + // memtable larger `_rowid` = newer insert, so sorting `_rowid` + // DESC and keeping the first row picks the newest version — one + // row per (value-exact) PK. + let rowid_idx = raw.schema().index_of(lance_core::ROW_ID)?; + let ordering = LexOrdering::new(vec![PhysicalSortExpr { + expr: Arc::new(Column::new(lance_core::ROW_ID, rowid_idx)), + options: SortOptions { + descending: true, + nulls_first: false, + }, + }]) + .ok_or_else(|| { + lance_core::Error::internal("point-lookup: failed to build _rowid ordering") + })?; + let newest: Arc = + Arc::new(SortExec::new(ordering, raw).with_fetch(Some(1))); // Per-source `_rowid` would collide with the base table's; // NULL it before canonicalization (the value is internal to // this arm). project_to_canonical drops it entirely when // the user didn't request `_rowid` in the projection. - null_columns(deduped, &[lance_core::ROW_ID])? + null_columns(newest, &[lance_core::ROW_ID])? } }; project_to_canonical(scan, &target) @@ -642,10 +652,6 @@ fn probe_position( pk_column: &str, pk_value: &ScalarValue, ) -> Result { - let Some(btree) = index_store.get_btree_by_column(pk_column) else { - return Ok(ProbePos::NoIndex); - }; - // Visible batches are the committed prefix [0, last_visible_idx]; each // `StoredBatch` carries its cumulative `row_offset`, so visibility and the // position→batch mapping are O(1)/O(log) with no per-probe allocation. @@ -661,22 +667,37 @@ fn probe_position( if visible_end == 0 { return Ok(ProbePos::Miss); } + let max_visible_row = visible_end - 1; - // Newest visible position of the key — a single seek-and-stop on the - // ordered skiplist (largest key ≤ (value, max_visible_row)). No range - // collect, no allocation. - let Some(pos) = btree.get_newest_visible(pk_value, visible_end - 1) else { + // A single-column primary key always has a value-keyed BTree (reused or + // auto-created — see `IndexStore::enable_pk_index`): collision-free, so one + // seek yields the answer with no re-check. Absent only when the table has no + // PK index, where the caller falls back to the plan path. + let Some(btree) = index_store.get_btree_by_column(pk_column) else { + return Ok(ProbePos::NoIndex); + }; + let Some(pos) = btree.get_newest_visible(pk_value, max_visible_row) else { return Ok(ProbePos::Miss); }; + let (batch_idx, row) = resolve_position(batch_store, last_visible_idx, pos)?; + Ok(ProbePos::Found { batch_idx, row }) +} - // Binary-search the owning batch by `row_offset` (appended in order). +/// Map a global row `position` to its `(batch_idx, row_in_batch)` by binary +/// searching the visible batch prefix on cumulative `row_offset` (batches are +/// appended in order). +fn resolve_position( + batch_store: &BatchStore, + last_visible_idx: usize, + position: u64, +) -> Result<(usize, usize)> { let (mut lo, mut hi) = (0usize, last_visible_idx); while lo < hi { let mid = lo + (hi - lo).div_ceil(2); let off = batch_store.get(mid).map(|b| b.row_offset).ok_or_else(|| { lance_core::Error::internal("point-lookup: batch index out of range during search") })?; - if off <= pos { + if off <= position { lo = mid; } else { hi = mid - 1; @@ -685,10 +706,7 @@ fn probe_position( let stored = batch_store .get(lo) .ok_or_else(|| lance_core::Error::internal("point-lookup: resolved batch missing"))?; - Ok(ProbePos::Found { - batch_idx: lo, - row: (pos - stored.row_offset) as usize, - }) + Ok((lo, (position - stored.row_offset) as usize)) } /// Gather `rows` from `batch_store`'s batch `batch_idx` into the `target` @@ -1097,8 +1115,8 @@ mod tests { // Regression: same primary key inserted twice into one active // memtable must return the *newest* row. The bug was that // `FilterExec → LIMIT 1` over an insert-ordered scan returned the - // first (oldest) match. `WithinSourceDedupExec` collapses by PK, - // keeping the row with the largest `_rowid` (insert order). + // first (oldest) match. The plan-path active arm now sorts `_rowid` + // DESC and keeps the first row (largest `_rowid` = newest insert). use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use futures::TryStreamExt; @@ -1168,6 +1186,88 @@ mod tests { ); } + #[tokio::test] + async fn test_point_lookup_probes_auto_created_pk_btree() { + // No user `add_btree` on the PK column — only `enable_pk_index`, which + // auto-creates a BTree on the primary key (the production default). The + // fast probe must resolve the newest visible version through that + // collision-free BTree rather than falling back to the plan path. + use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + + let schema = create_pk_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + // No `add_btree` — `enable_pk_index` auto-creates the PK BTree. + index_store.enable_pk_index(&[("id".to_string(), 0)]); + + // pk=1 written twice (the newer second), plus an unrelated pk=2. + let b_old = create_test_batch(&schema, &[1], "old"); + let b_new = create_test_batch(&schema, &[1], "new"); + let b_other = create_test_batch(&schema, &[2], "two"); + let (_, _, bp_old) = batch_store.append(b_old.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_old, 0, Some(bp_old)) + .unwrap(); + let (_, _, bp_new) = batch_store.append(b_new.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_new, 1, Some(bp_new)) + .unwrap(); + let (_, _, bp_other) = batch_store.append(b_other.clone()).unwrap(); + index_store + .insert_with_batch_position(&b_other, 2, Some(bp_other)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + let planner = LsmPointLookupPlanner::new(collector, vec!["id".to_string()], schema); + + // `lookup` takes the fast probe path (single-column PK, no system cols). + let hit = planner + .lookup(&[ScalarValue::Int32(Some(1))], None) + .await + .unwrap() + .expect("pk=1 must be found via the PK-position index probe"); + assert_eq!(hit.num_rows(), 1); + let name = hit + .column_by_name("name") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!( + name.value(0), + "new_1", + "probe must return the newest version" + ); + + // An absent key resolves to None (no on-disk sources to consult). + assert!( + planner + .lookup(&[ScalarValue::Int32(Some(999))], None) + .await + .unwrap() + .is_none(), + "absent key must miss" + ); + } + #[tokio::test] async fn test_point_lookup_flushed_memtable_returns_newest_duplicate() { // Regression / invariant pin: when a flushed memtable contains two diff --git a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs index b6b1f952b25..162d94b6c18 100644 --- a/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs +++ b/rust/lance/src/dataset/mem_wal/scanner/vector_search.rs @@ -27,7 +27,6 @@ use crate::io::exec::TakeExec; use super::collector::LsmDataSourceCollector; use super::data_source::LsmDataSource; -use super::exec::{DedupDirection, WithinSourceDedupExec}; use super::flushed_cache::{FlushedMemTableCache, open_flushed_dataset}; use super::projection::{ DISTANCE_COLUMN, build_scanner_projection, canonical_output_schema, null_columns, @@ -38,10 +37,12 @@ use crate::session::Session; /// Plans vector search queries over LSM data. /// /// Each source is independently newest-per-PK before the union — the active -/// memtable via an over-fetched KNN + within-source dedup, flushed generations -/// via their within-generation deletion vector — and the cross-generation -/// block-list ([`super::exec::PkHashFilterExec`]) drops any PK superseded by a -/// newer generation. So each PK reaches the union from exactly one source and a +/// memtable via an over-fetched KNN + a newest-per-PK recency filter +/// ([`super::exec::NewestPkFilterExec`], which drops a hit that isn't the newest +/// visible version of its PK), flushed generations via their within-generation +/// deletion vector — and the cross-generation block-list +/// ([`super::exec::PkBlockFilterExec`]) drops any PK superseded by a newer +/// generation. So each PK reaches the union from exactly one source and a /// distance-ordered merge yields the global top-k; no cross-source dedup is /// needed. /// @@ -54,15 +55,15 @@ use crate::session::Session; /// UnionExec /// ProjectionExec (canonical output schema) /// SortExec(_distance, fetch=k) -/// WithinSourceDedupExec: KeepMaxRowAddr (active) +/// NewestPkFilterExec: newest-per-PK recency (active) /// KNNExec: active memtable, fetch=ceil(k*overfetch) /// ProjectionExec (canonical output schema) /// ProjectionExec (null_columns _rowid) -/// PkHashFilterExec: block-list (flushed) +/// PkBlockFilterExec: block-list (flushed) /// KNNExec: flushed gen N, fetch=ceil(k*overfetch) (fast_search) /// … one per flushed gen … /// ProjectionExec (canonical output schema) -/// PkHashFilterExec: block-list (base) +/// PkBlockFilterExec: block-list (base) /// KNNExec: base table, k (fast_search)[.refine()?] /// ``` /// @@ -168,7 +169,7 @@ impl LsmVectorSearchPlanner { /// the rows that filtering drops: /// /// - `factor < 1.0` (e.g. `0.0`): **stale filtering off.** The per-source - /// block-list / [`super::exec::PkHashFilterExec`] is not built or applied, + /// block-list / [`super::exec::PkBlockFilterExec`] is not built or applied, /// so rows superseded by a newer generation can surface. The global PK /// dedup still runs, so it still suppresses stale copies in the cases /// where both the stale and the fresh row reach it. @@ -210,11 +211,10 @@ impl LsmVectorSearchPlanner { // live candidates after the post-filter. let overfetch_factor = overfetch_factor.max(1.0); - // Per-source PK-hash block sets (`NEWER(G)`; base = union of all gens). + // Per-source PK block sets (`NEWER(G)`; base = union of all gens). // `Box::pin` keeps the future off `clippy::large_futures`. let block_lists = Box::pin(super::block_list::compute_source_block_lists( &sources, - &self.pk_columns, self.session.as_ref(), self.flushed_cache.as_ref(), )) @@ -248,7 +248,7 @@ impl LsmVectorSearchPlanner { } else { k }; - let knn = Box::pin(self.build_knn_plan( + let (knn, active_max_visible) = Box::pin(self.build_knn_plan( source, query_vector, fetch_k, @@ -259,23 +259,35 @@ impl LsmVectorSearchPlanner { .await?; // Make each source independently newest-per-PK before the union: // * active: the append-only HNSW returns one node per inserted - // version, so collapse duplicate PKs to the newest insert - // (KeepMaxRowAddr on `_rowid`) and re-sort by distance. This - // stays probabilistic — a fresh version evicted from the - // over-fetched top-k still leaks. + // version *and* leaves stale versions of updated PKs live. The + // recency filter keeps only the hit that is the newest visible + // version of its PK (per the maintained MVCC PK-position index), + // closing the predicate-crossing stale read, then re-sort by + // distance. // * flushed/base: drop cross-gen superseded rows via the // block-list (within-gen is handled by the flushed DV). let knn = if is_active { - let deduped: Arc = Arc::new(WithinSourceDedupExec::new( - knn, - self.pk_columns.clone(), - lance_core::ROW_ID, - DedupDirection::KeepMaxRowAddr, - )); - sort_by_distance(deduped, k)? + let (batch_store, index_store) = match source { + LsmDataSource::ActiveMemTable { + batch_store, + index_store, + .. + } => (batch_store.clone(), index_store.clone()), + _ => unreachable!("is_active implies ActiveMemTable"), + }; + let filtered: Arc = + Arc::new(super::exec::NewestPkFilterExec::new( + knn, + self.pk_columns.clone(), + lance_core::ROW_ID, + index_store, + batch_store, + active_max_visible.expect("active arm returns its max_visible snapshot"), + )); + sort_by_distance(filtered, k)? } else { match blocked { - Some(set) => Arc::new(super::exec::PkHashFilterExec::new( + Some(set) => Arc::new(super::exec::PkBlockFilterExec::new( knn, self.pk_columns.clone(), set.clone(), @@ -364,11 +376,15 @@ impl LsmVectorSearchPlanner { merged_sorted }; - // Under-fetch is warned per-source inside `PkHashFilterExec`. + // Under-fetch is warned per-source inside `PkBlockFilterExec`. Ok(result) } /// Build KNN plan for a single data source. + /// + /// Returns the plan and, for the active memtable, the `max_visible_batch_position` + /// snapshot its scanner latched — threaded into the recency filter so it keys + /// on the same snapshot the search saw (`None` for base / flushed sources). async fn build_knn_plan( &self, source: &LsmDataSource, @@ -377,7 +393,7 @@ impl LsmVectorSearchPlanner { nprobes: usize, projection: Option<&[String]>, refine: bool, - ) -> Result> { + ) -> Result<(Arc, Option)> { match source { LsmDataSource::BaseTable { dataset } => { let mut scanner = dataset.scan(); @@ -402,7 +418,7 @@ impl LsmVectorSearchPlanner { if refine { scanner.refine(1); } - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::FlushedMemTable { path, .. } => { let dataset = @@ -418,7 +434,7 @@ impl LsmVectorSearchPlanner { scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); scanner.fast_search(); - scanner.create_plan().await + Ok((scanner.create_plan().await?, None)) } LsmDataSource::ActiveMemTable { batch_store, @@ -436,8 +452,8 @@ impl LsmVectorSearchPlanner { build_scanner_projection(projection, &self.base_schema, &self.pk_columns); scanner.project(&cols.iter().map(|s| s.as_str()).collect::>()); // Expose `_rowid` (BatchStore row offset, monotonic with - // insert order) so [`WithinSourceDedupExec`] can collapse - // duplicate-PK rows to the newest insert. The value is + // insert order) so `NewestPkFilterExec` can compare each hit's + // position against the PK-position index. The value is // per-source and NULL'd before reaching the canonical merge. // (VectorIndexExec only plumbs `with_row_id`, not // `with_row_address`, but the two yield identical values @@ -447,7 +463,9 @@ impl LsmVectorSearchPlanner { scanner.nearest(&self.vector_column, query_arr, k); scanner.nprobes(nprobes); scanner.distance_metric(self.distance_type); - scanner.create_plan().await + let plan = scanner.create_plan().await?; + // Capture the scanner's own latched snapshot for the recency filter. + Ok((plan, Some(scanner.max_visible_batch_position()))) } } } @@ -567,10 +585,23 @@ mod tests { async fn create_dataset(uri: &str, batches: Vec) -> Dataset { let schema = batches[0].schema(); - let reader = RecordBatchIterator::new(batches.into_iter().map(Ok), schema); - Dataset::write(reader, uri, Some(WriteParams::default())) + let has_id = schema.column_with_name("id").is_some(); + let reader = RecordBatchIterator::new(batches.clone().into_iter().map(Ok), schema); + let dataset = Dataset::write(reader, uri, Some(WriteParams::default())) + .await + .unwrap(); + // Also write the standalone PK sidecar (on `id`) so a flushed-generation + // source can be probed by the block-list (harmless for a base table). + if has_id { + crate::dataset::mem_wal::scanner::block_list::write_test_pk_sidecar( + uri, + &batches, + &["id"], + ) .await - .unwrap() + .unwrap(); + } + dataset } #[tokio::test] @@ -641,6 +672,7 @@ mod tests { // Active memtable with HNSW index over the "vector" column. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -759,6 +791,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -838,6 +871,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -951,6 +985,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1007,8 +1042,7 @@ mod tests { plan_str ); assert!( - plan_str.contains("WithinSourceDedupExec") - && plan_str.contains("SortPreservingMergeExec"), + plan_str.contains("NewestPkFilterExec") && plan_str.contains("SortPreservingMergeExec"), "expected per-arm dedup + distance merge, got:\n{}", plan_str ); @@ -1091,6 +1125,7 @@ mod tests { // "right" vector close to the query, plus an unrelated pk=2. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1210,6 +1245,7 @@ mod tests { // Active memtable: id=3 with HNSW index. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1436,9 +1472,9 @@ mod tests { #[tokio::test] async fn test_vector_search_dedup_within_active_memtable() { // Regression: same PK inserted twice into one active memtable with - // *different* vectors. HNSW indexes each as a distinct node, so - // without WithinSourceDedupExec a KNN can return both candidates - // for the same PK and pollute top-k. The newer insert must win. + // *different* vectors. HNSW indexes each as a distinct node, so without + // the recency filter a KNN can return both candidates for the same PK + // and pollute top-k. The newer insert must win. use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; use datafusion::prelude::SessionContext; @@ -1450,6 +1486,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1513,14 +1550,14 @@ mod tests { .await .unwrap(); - // The active arm collapses duplicate-PK HNSW nodes itself via - // WithinSourceDedupExec — there is no cross-source dedup fallback. + // The active arm collapses duplicate-PK HNSW nodes itself via the + // recency filter — there is no cross-source dedup fallback. let plan_str = format!( "{}", datafusion::physical_plan::displayable(plan.as_ref()).indent(true) ); assert!( - plan_str.contains("WithinSourceDedupExec"), + plan_str.contains("NewestPkFilterExec"), "active vector arm must self-dedup, got:\n{}", plan_str ); @@ -1549,10 +1586,120 @@ mod tests { ); } + #[tokio::test] + async fn test_vector_search_active_stale_update_out_of_neighborhood() { + // BUG REPRODUCTION (vector case: a PK update that moves out of the neighborhood). + // + // Within a *single* active memtable, pk=1 is first inserted ON the query + // (distance ~0), then updated to a FAR vector. The append-only HNSW keeps + // both nodes live. A result-set dedup only collapses duplicate PKs that + // are BOTH present in the over-fetched candidate set. + // + // Here the fresh (far) pk=1 is evicted from the candidate set — there are + // enough nearer filler rows that it ranks below the fetch cutoff — so the + // dedup never sees it and the STALE near pk=1 leaks as the nearest hit. + // This is the predicate-crossing hole: the row that *would* suppress the + // stale version isn't in the result set, so result-set dedup can't help. + // + // Desired (NewestPkFilterExec) behaviour: pk=1's newest row-position is + // the far one, computed predicate-independently over the whole memtable, + // so the stale near node is dropped and pk=1 must NOT surface at ~0. + use crate::dataset::mem_wal::scanner::collector::{InMemoryMemTableRef, InMemoryMemTables}; + use crate::dataset::mem_wal::write::{BatchStore, IndexStore}; + use datafusion::prelude::SessionContext; + use futures::TryStreamExt; + + let schema = create_vector_schema(); + let temp_dir = tempfile::tempdir().unwrap(); + let base_uri = format!("{}/base", temp_dir.path().to_str().unwrap()); + + let batch_store = Arc::new(BatchStore::with_capacity(16)); + let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); + index_store.add_hnsw( + "vector_hnsw".to_string(), + 1, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + 64, + 8, + ); + + // First append: stale pk=1 ON the query, plus five filler rows strictly + // farther than pk=1 but far nearer than the eventual fresh pk=1. + let q = [0.1, 0.2, 0.3, 0.4]; + let stale_then_fillers = batch_rows( + &schema, + &[ + (1, q), + (10, [0.11, 0.21, 0.31, 0.41]), + (11, [0.13, 0.23, 0.33, 0.43]), + (12, [0.15, 0.25, 0.35, 0.45]), + (13, [0.17, 0.27, 0.37, 0.47]), + (14, [0.19, 0.29, 0.39, 0.49]), + ], + ); + let (_, _, bp0) = batch_store.append(stale_then_fillers.clone()).unwrap(); + index_store + .insert_with_batch_position(&stale_then_fillers, 0, Some(bp0)) + .unwrap(); + + // Second append: the UPDATE — pk=1 moved far from the query. This is the + // newest version (largest row position) but it sits well outside top-k. + let fresh_pk1 = batch_rows(&schema, &[(1, [9.0, 9.0, 9.0, 9.0])]); + let (_, _, bp1) = batch_store.append(fresh_pk1.clone()).unwrap(); + index_store + .insert_with_batch_position(&fresh_pk1, 1, Some(bp1)) + .unwrap(); + let index_store = Arc::new(index_store); + + let shard_id = uuid::Uuid::new_v4(); + let collector = LsmDataSourceCollector::without_base_table(base_uri, vec![]) + .with_in_memory_memtables( + shard_id, + InMemoryMemTables { + active: InMemoryMemTableRef { + batch_store, + index_store, + schema: schema.clone(), + generation: 1, + }, + frozen: vec![], + }, + ); + + let planner = LsmVectorSearchPlanner::new( + collector, + vec!["id".to_string()], + schema, + "vector".to_string(), + lance_linalg::distance::DistanceType::L2, + ); + + // k=3, no over-fetch: the candidate set is {pk1@near, two nearest + // fillers}; fresh pk1@far ranks 7th and never enters the candidates. + let query = create_query_vector(); + let plan = planner + .plan_search(&query, 3, 1, None, false, 1.0) + .await + .unwrap(); + let ctx = SessionContext::new(); + let stream = plan.execute(0, ctx.task_ctx()).unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + let rows = collect_id_dist(&batches); + + assert!( + !rows.iter().any(|&(id, d)| id == 1 && d.abs() < 1e-3), + "stale near pk=1 leaked: its live vector is far from the query, so it \ + must not appear at distance ~0. results={:?}", + rows + ); + } + #[tokio::test] async fn test_vector_search_stale_read_when_fresh_falls_out_of_top_k() { // Regression for the cross-generation stale-read gap that the - // PkHashFilterExec block-list closes. + // PkBlockFilterExec block-list closes. // // Scenario: // * Base (gen 0): stale pk=1 sitting on the query (distance ~0). @@ -1587,6 +1734,7 @@ mod tests { // active arm surfaces pk=2 and drops fresh pk=1. let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1783,6 +1931,7 @@ mod tests { // Active (gen 1): pk 1,2,3 re-inserted with a far vector (the fresh value). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -1987,6 +2136,7 @@ mod tests { // Active: (1,1) re-inserted far (fresh) + an unrelated nearby (2,2). let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id1".to_string(), 0), ("id2".to_string(), 1)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, @@ -2091,6 +2241,7 @@ mod tests { let batch_store = Arc::new(BatchStore::with_capacity(16)); let mut index_store = IndexStore::new(); + index_store.enable_pk_index(&[("id".to_string(), 0)]); index_store.add_hnsw( "vector_hnsw".to_string(), 1, diff --git a/rust/lance/src/dataset/mem_wal/util.rs b/rust/lance/src/dataset/mem_wal/util.rs index d1413b84b2a..3f5090f6b40 100644 --- a/rust/lance/src/dataset/mem_wal/util.rs +++ b/rust/lance/src/dataset/mem_wal/util.rs @@ -169,6 +169,16 @@ pub fn flushed_memtable_path( shard_base_path(base_path, shard_id).join(format!("{}_gen_{}", random_hash, generation)) } +/// Subdirectory of a flushed generation holding its standalone primary-key +/// dedup index (a sidecar BTree, not registered in the manifest). Both the +/// flush writer and the block-list probe join this onto the generation path. +pub const PK_INDEX_DIR: &str = "_pk_index"; + +/// Path to a flushed generation's standalone primary-key dedup index. +pub fn pk_index_path(gen_path: &Path) -> Path { + gen_path.clone().join(PK_INDEX_DIR) +} + /// Generate an 8-character random hex string for flushed MemTable directories. pub fn generate_random_hash() -> String { let bytes: [u8; 4] = rand::random(); diff --git a/rust/lance/src/dataset/mem_wal/write.rs b/rust/lance/src/dataset/mem_wal/write.rs index 0788b657366..736b5e77833 100644 --- a/rust/lance/src/dataset/mem_wal/write.rs +++ b/rust/lance/src/dataset/mem_wal/write.rs @@ -844,6 +844,16 @@ async fn replay_memtable_from_wal( Ok(position) } +/// Pair each primary-key column name with its field id (both derived from the +/// schema's primary key, in the same order) for [`IndexStore::enable_pk_index`]. +fn pk_index_columns(pk_columns: &[String], pk_field_ids: &[i32]) -> Vec<(String, i32)> { + pk_columns + .iter() + .cloned() + .zip(pk_field_ids.iter().copied()) + .collect() +} + /// Shared state for writer operations. struct SharedWriterState { state: Arc>, @@ -853,6 +863,9 @@ struct SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + /// Primary-key column names, used to (re)enable the PK-position index on + /// each fresh active memtable created at freeze. + pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -868,6 +881,7 @@ impl SharedWriterState { config: ShardWriterConfig, schema: Arc, pk_field_ids: Vec, + pk_columns: Vec, max_memtable_batches: usize, max_memtable_rows: usize, index_configs: Vec, @@ -880,6 +894,7 @@ impl SharedWriterState { config, schema, pk_field_ids, + pk_columns, max_memtable_batches, max_memtable_rows, index_configs, @@ -906,12 +921,13 @@ impl SharedWriterState { )?; if !self.index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + let mut indexes = IndexStore::from_configs( &self.index_configs, self.max_memtable_rows, self.max_memtable_batches, - )?); - new_memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&self.pk_columns, &self.pk_field_ids)); + new_memtable.set_indexes_arc(Arc::new(indexes)); } let mut old_memtable = std::mem::replace(&mut state.memtable, new_memtable); @@ -1279,11 +1295,9 @@ impl ShardWriter { ) -> Result { // Create MemTable with primary key field IDs from schema let lance_schema = Schema::try_from(schema.as_ref())?; - let pk_field_ids: Vec = lance_schema - .unenforced_primary_key() - .iter() - .map(|f| f.id) - .collect(); + let pk_fields = lance_schema.unenforced_primary_key(); + let pk_field_ids: Vec = pk_fields.iter().map(|f| f.id).collect(); + let pk_columns: Vec = pk_fields.iter().map(|f| f.name.clone()).collect(); let mut memtable = MemTable::with_capacity( schema.clone(), manifest.current_generation, @@ -1292,14 +1306,17 @@ impl ShardWriter { config.max_memtable_batches, )?; - // Create indexes if configured and set them on the MemTable. + // Create indexes if configured and set them on the MemTable. The + // PK-position index is enabled before any WAL replay below so replayed + // rows are recorded in it. if !index_configs.is_empty() { - let indexes = Arc::new(IndexStore::from_configs( + let mut indexes = IndexStore::from_configs( index_configs, config.max_memtable_rows, config.max_memtable_batches, - )?); - memtable.set_indexes_arc(indexes); + )?; + indexes.enable_pk_index(&pk_index_columns(&pk_columns, &pk_field_ids)); + memtable.set_indexes_arc(Arc::new(indexes)); } // Replay any WAL entries written after the last successfully-flushed @@ -1387,6 +1404,7 @@ impl ShardWriter { config.clone(), schema.clone(), pk_field_ids, + pk_columns, config.max_memtable_batches, config.max_memtable_rows, index_configs.to_vec(),