From b1185292e60f4709b7bf20e6304a14ee1e07f182 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 10 May 2026 01:33:21 -0700
Subject: [PATCH 01/37] refactor(mem_wal): redesign FTS mem index for
 single-writer multi-reader

Replace the SkipMap<(token, row_position)> postings layout with per-term
ArcSwap<TermSlice> slices and an ArcSwap<Snapshot>-published per-batch
visibility watermark. Readers grab a snapshot, walk per-term chunks
filtered by chunk.batch_position < snapshot.visible_count, and score
with snapshot-coupled BM25 stats. Writer publishes the snapshot only
after every term chunk for a batch is linked, so readers never observe
a partial document or BM25 stats out of sync with the postings.

Also: tokenize-time tokenizer ownership moves out of a shared Mutex
into a tokenizer pool plus a writer-dedicated slot, so search calls do
not serialize against the writer; add Utf8View to the supported text
types; add memory_usage() for size-based flush triggers; reuse
lance-index InvertedIndex's TokenSet/DocSet/PostingListBuilder for the
flush path.
---
 rust/lance/Cargo.toml                       |    1 +
 rust/lance/src/dataset/mem_wal/index/fts.rs | 2579 +++++++++----------
 2 files changed, 1251 insertions(+), 1329 deletions(-)
diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index 83d0f5f989f..e1e5a3bc823 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -68,6 +68,7 @@ rayon.workspace = true
 futures.workspace = true
 uuid.workspace = true
 arrow.workspace = true
+arc-swap.workspace = true
 # TODO: use datafusion sub-modules to reduce build size?
 datafusion.workspace = true
 datafusion-functions.workspace = true
diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index e928790aa64..5b924617bd6 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -1,36 +1,45 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-//! In-memory Full-Text Search (FTS) index.
+//! In-memory Full-Text Search (FTS) index with single-writer / multi-reader
+//! semantics.
 //!
-//! Provides inverted index for text search using crossbeam-skiplist.
-//! Uses the same tokenization as Lance's InvertedIndex for consistency.
+//! # Concurrency model
 //!
-//! ## Current Features
-//! - BM25 scoring algorithm for relevance ranking
-//! - Automatic result ordering by score (descending)
-//! - Single-column term queries
-//! - Phrase queries with slop support
+//! - **One writer** (`insert` / `insert_with_batch_position`) at a time per
+//!   index. Callers are responsible for that invariant; this is consistent
+//!   with `IndexStore`'s usage from `ShardWriter`.
+//! - **Many readers** (`search*`, `expand_fuzzy`, `to_index_builder_reversed`)
+//!   in parallel with the writer. Reads are lock-free aside from a brief
+//!   tokenizer-pool checkout.
+//! - **Per-batch monotonic visibility**: a reader either sees every row of
+//!   batch `b` or none of them. A reader never sees a batch numbered above
+//!   the published `visible_count`. BM25 statistics observed by a reader
+//!   (`doc_count`, `total_tokens`, per-term `df`) are mutually consistent
+//!   with the postings the reader walks.
 //!
-//! ## Pending Features (TODO)
-//! - Multi-column search: Search across multiple columns simultaneously
-//! - Boolean queries: MUST/SHOULD/MUST_NOT for complex query logic
-//! - Fuzzy matching: Typo tolerance with configurable edit distance
-//! - Boost queries: Positive/negative boosting for relevance tuning
-//! - WAND factor: Performance/recall tradeoff control
-//! - Per-term/column boost: Fine-grained relevance weighting
+//! Visibility is published atomically by replacing a single `Snapshot` value
+//! via `ArcSwap`. The writer first installs all term chunks into the
+//! per-term `ArcSwap<TermSlice>` slots, then atomically swaps in a new
+//! `Snapshot` whose `visible_count` covers the new batch. Readers load the
+//! `Snapshot` first and filter every term chunk by `batch_position <
+//! snapshot.visible_count`.
 //!
-//! **Note**: FTS index flush to persistent storage is NOT YET IMPLEMENTED.
-//! The in-memory index works for real-time queries on MemTable data,
-//! but is skipped during MemTable flush.
+//! # On-disk format
+//!
+//! At flush time we hand off to `lance_index::scalar::inverted::builder::InnerBuilder`
+//! via `to_index_builder_reversed`. The on-disk format is unchanged from
+//! Lance's existing inverted index.
 
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
+use std::sync::Arc;
 use std::sync::Mutex;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
-use arrow_array::RecordBatch;
+use arc_swap::ArcSwap;
+use arrow_array::{Array, LargeStringArray, RecordBatch, StringArray, StringViewArray};
+use arrow_schema::DataType;
 use crossbeam_skiplist::SkipMap;
-use datafusion::common::ScalarValue;
 use lance_core::{Error, Result};
 use lance_index::scalar::InvertedIndexParams;
 use lance_index::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer;
@@ -38,18 +47,11 @@ use lance_tokenizer::TokenStream;
 
 use super::RowPosition;
 
-/// Composite key for FTS index.
-///
-/// By combining (token, row_position), each entry is unique.
-#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord)]
-pub struct FtsKey {
-    /// The indexed token (lowercase).
-    pub token: String,
-    /// Row position (makes the key unique for tokens appearing in multiple docs).
-    pub row_position: RowPosition,
-}
+// ============================================================================
+// Public types preserved from previous API
+// ============================================================================
 
-/// In-memory FTS (Full-Text Search) index entry (returned from search).
+/// In-memory FTS index entry returned from search.
 #[derive(Debug, Clone)]
 pub struct FtsEntry {
     /// Row position in MemTable.
@@ -59,9 +61,6 @@ pub struct FtsEntry {
 }
 
 /// Full-text search query expression for composable queries.
-///
-/// Supports simple term matches, phrase queries, fuzzy matching, and Boolean
-/// combinations with MUST/SHOULD/MUST_NOT logic.
 #[derive(Debug, Clone)]
 pub enum FtsQueryExpr {
     /// Simple term match query.
@@ -85,9 +84,9 @@ pub enum FtsQueryExpr {
         /// The search query string.
         query: String,
         /// Maximum edit distance (Levenshtein distance).
-        /// None means auto-fuzziness based on token length.
+        /// `None` means auto-fuzziness based on token length.
         fuzziness: Option<u32>,
-        /// Maximum number of terms to expand to (default 50).
+        /// Maximum number of terms to expand to.
         max_expansions: usize,
         /// Boost factor applied to the score (default 1.0).
         boost: f32,
@@ -101,18 +100,14 @@ pub enum FtsQueryExpr {
         /// No MUST_NOT clause may match (excludes documents).
         must_not: Vec<Self>,
     },
-    /// Boosting query with positive and optional negative components.
-    ///
-    /// Documents matching the positive query are returned.
-    /// If a negative query is provided, documents matching both positive
-    /// and negative have their scores reduced by `negative_boost`.
+    /// Boosting query with a positive and an optional negative component.
     Boost {
-        /// The primary query (documents must match this).
+        /// Documents must match this query.
         positive: Box<Self>,
-        /// Optional query to demote matching documents.
+        /// Optional query whose matches are demoted.
         negative: Option<Box<Self>>,
-        /// Boost factor for documents matching negative query (typically < 1.0).
-        /// Score becomes: original_score * negative_boost for docs matching negative.
+        /// Multiplier applied to documents matching `negative` (typically
+        /// `< 1.0` to demote).
         negative_boost: f32,
     },
 }
@@ -124,23 +119,10 @@ pub const DEFAULT_MAX_EXPANSIONS: usize = 50;
 pub const DEFAULT_WAND_FACTOR: f32 = 1.0;
 
 /// Search options for controlling performance/recall tradeoffs.
-///
-/// The WAND (Weak AND) factor allows trading recall for performance:
-/// - `wand_factor = 1.0`: Full recall (default), all matching documents returned
-/// - `wand_factor < 1.0`: Faster but may miss some results. Documents with
-///   scores below `top_k_score * wand_factor` are pruned.
-///
-/// # Example
-/// ```ignore
-/// let options = SearchOptions::default()
-///     .with_limit(10)
-///     .with_wand_factor(0.5);
-/// let results = index.search_with_options(&query, options);
-/// ```
 #[derive(Debug, Clone)]
 pub struct SearchOptions {
     /// WAND factor for early termination (0.0 to 1.0).
-    /// 1.0 = full recall, <1.0 = faster but may miss low-scoring results.
+    /// 1.0 = full recall (default).
     pub wand_factor: f32,
     /// Maximum number of results to return. None means unlimited.
     pub limit: Option<usize>,
@@ -156,16 +138,11 @@ impl Default for SearchOptions {
 }
 
 impl SearchOptions {
-    /// Create new SearchOptions with default values.
     pub fn new() -> Self {
         Self::default()
     }
 
     /// Set the WAND factor for early termination.
-    ///
-    /// - 1.0 = full recall (default)
-    /// - 0.5 = prune documents scoring below 50% of the current k-th best score
-    /// - 0.0 = only return the absolute best match
     pub fn with_wand_factor(mut self, wand_factor: f32) -> Self {
         self.wand_factor = wand_factor.clamp(0.0, 1.0);
         self
@@ -179,7 +156,6 @@ impl SearchOptions {
 }
 
 impl FtsQueryExpr {
-    /// Create a simple match query.
     pub fn match_query(query: impl Into<String>) -> Self {
         Self::Match {
             query: query.into(),
@@ -187,7 +163,6 @@ impl FtsQueryExpr {
         }
     }
 
-    /// Create a phrase query with exact matching (slop=0).
     pub fn phrase(query: impl Into<String>) -> Self {
         Self::Phrase {
             query: query.into(),
@@ -196,7 +171,6 @@ impl FtsQueryExpr {
         }
     }
 
-    /// Create a phrase query with specified slop.
     pub fn phrase_with_slop(query: impl Into<String>, slop: u32) -> Self {
         Self::Phrase {
             query: query.into(),
@@ -205,22 +179,15 @@ impl FtsQueryExpr {
         }
     }
 
-    /// Create a fuzzy match query with auto-fuzziness.
-    ///
-    /// Auto-fuzziness is calculated based on token length:
-    /// - 0-2 chars: 0 (exact match)
-    /// - 3-5 chars: 1
-    /// - 6+ chars: 2
     pub fn fuzzy(query: impl Into<String>) -> Self {
         Self::Fuzzy {
             query: query.into(),
-            fuzziness: None, // auto
+            fuzziness: None,
             max_expansions: DEFAULT_MAX_EXPANSIONS,
             boost: 1.0,
         }
     }
 
-    /// Create a fuzzy match query with specified edit distance.
     pub fn fuzzy_with_distance(query: impl Into<String>, fuzziness: u32) -> Self {
         Self::Fuzzy {
             query: query.into(),
@@ -230,7 +197,6 @@ impl FtsQueryExpr {
         }
     }
 
-    /// Create a fuzzy match query with specified edit distance and max expansions.
     pub fn fuzzy_with_options(
         query: impl Into<String>,
         fuzziness: Option<u32>,
@@ -244,14 +210,10 @@ impl FtsQueryExpr {
         }
     }
 
-    /// Create a Boolean query.
     pub fn boolean() -> BooleanQueryBuilder {
         BooleanQueryBuilder::new()
     }
 
-    /// Create a boosting query with only a positive component.
-    ///
-    /// This is equivalent to just running the positive query.
     pub fn boosting(positive: Self) -> Self {
         Self::Boost {
             positive: Box::new(positive),
@@ -260,17 +222,6 @@ impl FtsQueryExpr {
         }
     }
 
-    /// Create a boosting query with positive and negative components.
-    ///
-    /// Documents matching the positive query are returned.
-    /// Documents matching both positive and negative have their scores
-    /// multiplied by `negative_boost` (typically < 1.0 to demote).
-    ///
-    /// # Arguments
-    ///
-    /// * `positive` - The primary query (documents must match this)
-    /// * `negative` - Query to demote matching documents
-    /// * `negative_boost` - Multiplier for documents matching negative (e.g., 0.5)
     pub fn boosting_with_negative(positive: Self, negative: Self, negative_boost: f32) -> Self {
         Self::Boost {
             positive: Box::new(positive),
@@ -279,7 +230,6 @@ impl FtsQueryExpr {
         }
     }
 
-    /// Apply a boost factor to this query.
     pub fn with_boost(self, boost: f32) -> Self {
         match self {
             Self::Match { query, .. } => Self::Match { query, boost },
@@ -295,42 +245,15 @@ impl FtsQueryExpr {
                 max_expansions,
                 boost,
             },
-            Self::Boolean {
-                must,
-                should,
-                must_not,
-            } => {
-                // For Boolean queries, boost is not directly applied
-                // (would need to apply to sub-queries)
-                Self::Boolean {
-                    must,
-                    should,
-                    must_not,
-                }
-            }
-            Self::Boost {
-                positive,
-                negative,
-                negative_boost,
-            } => {
-                // For Boost queries, we wrap the positive in a boosted match
-                // This is a bit unusual - typically you'd boost individual sub-queries
-                Self::Boost {
-                    positive,
-                    negative,
-                    negative_boost,
-                }
-            }
+            // Boolean and Boost don't carry a top-level boost field today.
+            // Preserved as-is to keep behavior identical to the previous impl.
+            other @ (Self::Boolean { .. } | Self::Boost { .. }) => other,
         }
     }
 }
 
-/// Calculate auto-fuzziness based on token length.
-///
-/// This follows the same algorithm as Lance's existing InvertedIndex:
-/// - 0-2 chars: 0 (exact match only)
-/// - 3-5 chars: 1 edit allowed
-/// - 6+ chars: 2 edits allowed
+/// Auto-fuzziness based on token length:
+/// 0–2 chars → 0, 3–5 chars → 1, 6+ chars → 2.
 pub fn auto_fuzziness(token: &str) -> u32 {
     match token.chars().count() {
         0..=2 => 0,
@@ -339,17 +262,13 @@ pub fn auto_fuzziness(token: &str) -> u32 {
     }
 }
 
-/// Calculate Levenshtein distance between two strings.
-///
-/// Returns the minimum number of single-character edits (insertions,
-/// deletions, or substitutions) required to transform one string into another.
+/// Levenshtein distance using two-row dynamic programming.
 pub fn levenshtein_distance(a: &str, b: &str) -> u32 {
     let a_chars: Vec<char> = a.chars().collect();
     let b_chars: Vec<char> = b.chars().collect();
     let m = a_chars.len();
     let n = b_chars.len();
 
-    // Handle edge cases
     if m == 0 {
         return n as u32;
     }
@@ -357,21 +276,17 @@ pub fn levenshtein_distance(a: &str, b: &str) -> u32 {
         return m as u32;
     }
 
-    // Use two rows instead of full matrix for space efficiency
     let mut prev_row: Vec<u32> = (0..=n as u32).collect();
     let mut curr_row: Vec<u32> = vec![0; n + 1];
 
     for (i, a_char) in a_chars.iter().enumerate() {
         curr_row[0] = (i + 1) as u32;
-
         for (j, b_char) in b_chars.iter().enumerate() {
             let cost = if a_char == b_char { 0 } else { 1 };
-
-            curr_row[j + 1] = (prev_row[j + 1] + 1) // deletion
-                .min(curr_row[j] + 1) // insertion
-                .min(prev_row[j] + cost); // substitution
+            curr_row[j + 1] = (prev_row[j + 1] + 1)
+                .min(curr_row[j] + 1)
+                .min(prev_row[j] + cost);
         }
-
         std::mem::swap(&mut prev_row, &mut curr_row);
     }
 
@@ -387,30 +302,25 @@ pub struct BooleanQueryBuilder {
 }
 
 impl BooleanQueryBuilder {
-    /// Create a new Boolean query builder.
     pub fn new() -> Self {
         Self::default()
     }
 
-    /// Add a MUST clause (document must match).
     pub fn must(mut self, query: FtsQueryExpr) -> Self {
         self.must.push(query);
         self
     }
 
-    /// Add a SHOULD clause (document should match, adds to score).
     pub fn should(mut self, query: FtsQueryExpr) -> Self {
         self.should.push(query);
         self
     }
 
-    /// Add a MUST_NOT clause (document must not match).
     pub fn must_not(mut self, query: FtsQueryExpr) -> Self {
         self.must_not.push(query);
         self
     }
 
-    /// Build the Boolean query.
     pub fn build(self) -> FtsQueryExpr {
         FtsQueryExpr::Boolean {
             must: self.must,
@@ -420,51 +330,292 @@ impl BooleanQueryBuilder {
     }
 }
 
-/// Posting value stored in the inverted index.
-/// Contains term frequency and positions for phrase query support.
-#[derive(Clone, Debug)]
-pub struct PostingValue {
-    /// Term frequency in the document.
-    pub frequency: u32,
-    /// Token positions within the document (0-indexed).
-    /// Used for phrase matching.
-    pub positions: Vec<u32>,
+// ============================================================================
+// Internal types
+// ============================================================================
+
+/// Compressed sparse row (CSR) layout for per-document positions.
+///
+/// `offsets[i]..offsets[i+1]` is the slice of `data` belonging to the i-th
+/// document in this `TermChunk`. `offsets.len() == row_positions.len() + 1`.
+#[derive(Debug)]
+struct Positions {
+    offsets: Vec<u32>,
+    data: Vec<u32>,
+}
+
+impl Positions {
+    fn empty() -> Self {
+        Self {
+            offsets: vec![0],
+            data: Vec::new(),
+        }
+    }
+
+    fn push_doc(&mut self, positions: &[u32]) {
+        self.data.extend_from_slice(positions);
+        self.offsets.push(self.data.len() as u32);
+    }
+
+    fn doc_positions(&self, doc_idx: usize) -> &[u32] {
+        let start = self.offsets[doc_idx] as usize;
+        let end = self.offsets[doc_idx + 1] as usize;
+        &self.data[start..end]
+    }
+
+    fn memory_size(&self) -> usize {
+        self.offsets.capacity() * std::mem::size_of::<u32>()
+            + self.data.capacity() * std::mem::size_of::<u32>()
+    }
+}
+
+/// Postings produced by a single batch insert for a single term.
+///
+/// SoA layout for cache locality and trivial handoff to
+/// `PostingListBuilder`. `row_positions` is sorted ascending.
+#[derive(Debug)]
+struct TermChunk {
+    batch_position: usize,
+    row_positions: Vec<u64>,
+    frequencies: Vec<u32>,
+    /// Present iff `params.has_positions()` was true at construction time.
+    /// Independent of that, in-memory phrase queries always work when
+    /// positions were tracked at insert time.
+    positions: Option<Positions>,
+}
+
+impl TermChunk {
+    fn doc_count(&self) -> usize {
+        self.row_positions.len()
+    }
+
+    fn memory_size(&self) -> usize {
+        let base = std::mem::size_of::<Self>()
+            + self.row_positions.capacity() * std::mem::size_of::<u64>()
+            + self.frequencies.capacity() * std::mem::size_of::<u32>();
+        base + self.positions.as_ref().map_or(0, Positions::memory_size)
+    }
+}
+
+/// Append-only list of `TermChunk`s for a single term, replaced atomically
+/// via `ArcSwap`.
+#[derive(Debug, Default)]
+struct TermSlice {
+    chunks: Vec<Arc<TermChunk>>,
+}
+
+impl TermSlice {
+    fn empty() -> Arc<Self> {
+        Arc::new(Self { chunks: Vec::new() })
+    }
+
+    /// Returns a fresh `Arc<TermSlice>` containing all current chunks plus the
+    /// new one. The previous slice is left unchanged so any reader holding
+    /// it continues to see a consistent state.
+    fn with_chunk_appended(&self, chunk: Arc<TermChunk>) -> Arc<Self> {
+        let mut chunks = Vec::with_capacity(self.chunks.len() + 1);
+        chunks.extend(self.chunks.iter().cloned());
+        chunks.push(chunk);
+        Arc::new(Self { chunks })
+    }
+
+    fn memory_size(&self) -> usize {
+        std::mem::size_of::<Self>()
+            + self.chunks.capacity() * std::mem::size_of::<Arc<TermChunk>>()
+            + self.chunks.iter().map(|c| c.memory_size()).sum::<usize>()
+    }
+}
+
+/// Per-batch row metadata.
+#[derive(Debug)]
+struct BatchMeta {
+    batch_position: usize,
+    row_offset: u64,
+    /// `doc_lengths[i]` is the token count of the row at `row_offset + i`.
+    doc_lengths: Vec<u32>,
+    rows: u32,
+}
+
+impl BatchMeta {
+    fn dl(&self, row_position: u64) -> Option<u32> {
+        if row_position < self.row_offset {
+            return None;
+        }
+        let idx = (row_position - self.row_offset) as usize;
+        self.doc_lengths.get(idx).copied()
+    }
+
+    fn memory_size(&self) -> usize {
+        std::mem::size_of::<Self>() + self.doc_lengths.capacity() * std::mem::size_of::<u32>()
+    }
+}
+
+/// Atomic snapshot of the visible state. Replaced via `ArcSwap` after each
+/// batch is fully linked.
+#[derive(Debug)]
+struct Snapshot {
+    /// Number of batches visible to readers. `0` means empty index.
+    visible_count: usize,
+    /// Visible-batch metadata. `batches.len() >= visible_count`; the writer
+    /// may have already appended a still-invisible batch to this list, but
+    /// readers walk only `batches[0..visible_count]`. In the publication
+    /// order described in the module docs, readers never observe
+    /// `visible_count` exceeding the actual length.
+    batches: Arc<[Arc<BatchMeta>]>,
+    /// `Σ batches[i].rows` for `i < visible_count`.
+    cumulative_doc_count: u64,
+    /// `Σ batches[i].doc_lengths.iter().sum()` for `i < visible_count`.
+    cumulative_total_tokens: u64,
+}
+
+impl Snapshot {
+    fn empty() -> Arc<Self> {
+        Arc::new(Self {
+            visible_count: 0,
+            batches: Arc::from(Vec::<Arc<BatchMeta>>::new().into_boxed_slice()),
+            cumulative_doc_count: 0,
+            cumulative_total_tokens: 0,
+        })
+    }
+
+    fn batch_for(&self, batch_position: usize) -> Option<&Arc<BatchMeta>> {
+        // Visible batches are densely numbered starting at 0 in the order
+        // they were inserted. Use direct index when possible.
+        self.batches
+            .get(batch_position)
+            .filter(|m| m.batch_position == batch_position)
+            .or_else(|| {
+                self.batches[..self.visible_count]
+                    .iter()
+                    .find(|m| m.batch_position == batch_position)
+            })
+    }
+}
+
+/// Bounded pool of reader tokenizers with a writer-dedicated slot.
+///
+/// `LanceTokenizer::token_stream_for_*` takes `&mut self`, so each concurrent
+/// caller needs its own tokenizer instance. Builds are cheap for English but
+/// can load dictionaries for CJK tokenizers, so we amortize via the pool.
+struct TokenizerPool {
+    template: Box<dyn LanceTokenizer>,
+    free: Mutex<Vec<Box<dyn LanceTokenizer>>>,
+    cap: usize,
+}
+
+impl std::fmt::Debug for TokenizerPool {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("TokenizerPool")
+            .field("cap", &self.cap)
+            .finish()
+    }
+}
+
+impl TokenizerPool {
+    fn new(params: &InvertedIndexParams, cap: usize) -> Result<Self> {
+        let template = params.build()?;
+        Ok(Self {
+            template,
+            free: Mutex::new(Vec::new()),
+            cap: cap.max(1),
+        })
+    }
+
+    /// Acquire a tokenizer. Pops from the free list, otherwise clones the
+    /// template.
+    fn acquire(&self) -> Box<dyn LanceTokenizer> {
+        if let Some(t) = self.free.lock().ok().and_then(|mut g| g.pop()) {
+            return t;
+        }
+        self.template.box_clone()
+    }
+
+    /// Return a tokenizer to the pool, dropping it if the pool is at cap.
+    fn release(&self, tokenizer: Box<dyn LanceTokenizer>) {
+        if let Ok(mut g) = self.free.lock()
+            && g.len() < self.cap
+        {
+            g.push(tokenizer);
+        }
+    }
+}
+
+/// RAII guard that returns the tokenizer to the pool on drop.
+struct PooledTokenizer<'a> {
+    pool: &'a TokenizerPool,
+    inner: Option<Box<dyn LanceTokenizer>>,
+}
+
+impl<'a> PooledTokenizer<'a> {
+    fn new(pool: &'a TokenizerPool) -> Self {
+        Self {
+            pool,
+            inner: Some(pool.acquire()),
+        }
+    }
+
+    fn get_mut(&mut self) -> &mut dyn LanceTokenizer {
+        self.inner.as_mut().expect("tokenizer in scope").as_mut()
+    }
+}
+
+impl<'a> Drop for PooledTokenizer<'a> {
+    fn drop(&mut self) {
+        if let Some(t) = self.inner.take() {
+            self.pool.release(t);
+        }
+    }
 }
 
-/// In-memory FTS index for full-text search.
+// ============================================================================
+// FtsMemIndex
+// ============================================================================
+
+/// In-memory full-text search index. See module docs for the concurrency
+/// model and visibility contract.
 pub struct FtsMemIndex {
-    /// Field ID this index is built on.
     field_id: i32,
-    /// Column name (for Arrow batch lookups).
     column_name: String,
-    /// Inverted index: (token, row_position) -> (frequency, positions).
-    postings: SkipMap<FtsKey, PostingValue>,
-    /// Total document count.
-    doc_count: AtomicUsize,
-    /// Tokenizer for text processing (same as Lance's InvertedIndex).
-    tokenizer: Mutex<Box<dyn LanceTokenizer>>,
-    /// The parameters used to create the tokenizer (for flush).
     params: InvertedIndexParams,
-    /// Document lengths: row_position -> token count (for BM25).
-    doc_lengths: SkipMap<u64, u32>,
-    /// Total token count across all documents (for computing avgdl).
-    total_tokens: AtomicUsize,
-    /// Document frequency: term -> number of documents containing the term.
-    doc_freq: SkipMap<String, AtomicUsize>,
+
+    tokenizer_pool: Arc<TokenizerPool>,
+    /// Writer-only tokenizer slot. Held under a Mutex purely so `insert`
+    /// can take `&self`. Single-writer assumption means this is uncontested.
+    writer_tokenizer: Mutex<Box<dyn LanceTokenizer>>,
+
+    /// Per-term posting slices. `Arc<str>` interns the term so a single
+    /// allocation backs every chunk that mentions it.
+    terms: SkipMap<Arc<str>, ArcSwap<TermSlice>>,
+
+    /// Atomically-swapped visibility snapshot.
+    snapshot: ArcSwap<Snapshot>,
+
+    /// Strictly-monotonic batch position counter, used by `insert` (the
+    /// no-explicit-position variant) to assign sequential ids. Reads of
+    /// this counter are not part of the visibility contract — callers
+    /// passing explicit positions to `insert_with_batch_position` must keep
+    /// these monotonic themselves.
+    next_batch_position: AtomicUsize,
 }
 
 impl std::fmt::Debug for FtsMemIndex {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let snap = self.snapshot.load();
         f.debug_struct("FtsMemIndex")
             .field("field_id", &self.field_id)
             .field("column_name", &self.column_name)
-            .field("doc_count", &self.doc_count)
+            .field("visible_doc_count", &snap.cumulative_doc_count)
+            .field("visible_batch_count", &snap.visible_count)
             .field("params", &self.params)
             .finish()
     }
 }
 
 impl FtsMemIndex {
+    /// Default reader-tokenizer pool capacity. Small but enough to absorb
+    /// short bursts of concurrent search calls without thrashing.
+    const DEFAULT_TOKENIZER_POOL_CAP: usize = 8;
+
     /// Create a new FTS index for the given field with default parameters.
     pub fn new(field_id: i32, column_name: String) -> Self {
         Self::with_params(field_id, column_name, InvertedIndexParams::default())
@@ -472,547 +623,439 @@ impl FtsMemIndex {
 
     /// Create a new FTS index with custom tokenizer parameters.
     pub fn with_params(field_id: i32, column_name: String, params: InvertedIndexParams) -> Self {
-        let tokenizer = params.build().expect("Failed to build tokenizer");
+        let pool = TokenizerPool::new(&params, Self::DEFAULT_TOKENIZER_POOL_CAP)
+            .expect("Failed to build tokenizer");
+        let writer_tokenizer = pool.template.box_clone();
         Self {
             field_id,
             column_name,
-            postings: SkipMap::new(),
-            doc_count: AtomicUsize::new(0),
-            tokenizer: Mutex::new(tokenizer),
             params,
-            doc_lengths: SkipMap::new(),
-            total_tokens: AtomicUsize::new(0),
-            doc_freq: SkipMap::new(),
+            tokenizer_pool: Arc::new(pool),
+            writer_tokenizer: Mutex::new(writer_tokenizer),
+            terms: SkipMap::new(),
+            snapshot: ArcSwap::from(Snapshot::empty()),
+            next_batch_position: AtomicUsize::new(0),
         }
     }
 
-    /// Get the field ID this index is built on.
     pub fn field_id(&self) -> i32 {
         self.field_id
     }
 
-    /// Get the inverted index parameters.
+    pub fn column_name(&self) -> &str {
+        &self.column_name
+    }
+
     pub fn params(&self) -> &InvertedIndexParams {
         &self.params
     }
 
-    /// Insert documents from a batch into the index.
+    /// Number of visible documents.
+    pub fn doc_count(&self) -> usize {
+        self.snapshot.load().cumulative_doc_count as usize
+    }
+
+    /// Whether there are any visible documents.
+    pub fn is_empty(&self) -> bool {
+        self.snapshot.load().visible_count == 0
+    }
+
+    /// Total number of (term, doc) postings currently stored.
+    ///
+    /// Counts every chunk regardless of visibility — the writer's invariant
+    /// is that chunks become visible together with the snapshot bump, so
+    /// in steady state this equals the number of visible postings.
+    pub fn entry_count(&self) -> usize {
+        self.terms
+            .iter()
+            .map(|e| {
+                let slice = e.value().load();
+                slice.chunks.iter().map(|c| c.doc_count()).sum::<usize>()
+            })
+            .sum()
+    }
+
+    /// Estimated bytes of heap memory held by this index.
+    pub fn memory_usage(&self) -> usize {
+        let mut total = std::mem::size_of::<Self>();
+        for entry in self.terms.iter() {
+            // ~32 bytes for SkipMap node overhead plus the interned term itself.
+            let term: &Arc<str> = entry.key();
+            total += std::mem::size_of::<Arc<str>>() + term.len() + 32;
+            let slice = entry.value().load();
+            total += slice.memory_size();
+        }
+        let snap = self.snapshot.load();
+        total += snap.batches.iter().map(|b| b.memory_size()).sum::<usize>();
+        total
+    }
+
+    // ------------------------------------------------------------------
+    // Insert
+    // ------------------------------------------------------------------
+
+    /// Insert a batch using a sequentially-derived batch position.
     pub fn insert(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> {
-        let col_idx = batch
+        let batch_position = self.next_batch_position.fetch_add(1, Ordering::Relaxed);
+        self.insert_with_batch_position(batch, row_offset, batch_position)
+    }
+
+    /// Insert a batch with an explicit batch position. The position is part
+    /// of the visibility contract — see module docs.
+    pub fn insert_with_batch_position(
+        &self,
+        batch: &RecordBatch,
+        row_offset: u64,
+        batch_position: usize,
+    ) -> Result<()> {
+        // Keep next_batch_position ahead of any explicit caller-supplied
+        // position so a later `insert` (no-position) doesn't collide.
+        let _ = self
+            .next_batch_position
+            .fetch_max(batch_position.saturating_add(1), Ordering::Relaxed);
+
+        let Some(col_idx) = batch
             .schema()
             .column_with_name(&self.column_name)
-            .map(|(idx, _)| idx);
-
-        if col_idx.is_none() {
-            return Ok(());
-        }
-
-        let column = batch.column(col_idx.unwrap());
-
-        for row_idx in 0..batch.num_rows() {
-            let value = ScalarValue::try_from_array(column.as_ref(), row_idx)?;
-            let row_position = row_offset + row_idx as u64;
-
-            if let ScalarValue::Utf8(Some(text)) | ScalarValue::LargeUtf8(Some(text)) = value {
-                // Use the tokenizer (same as InvertedIndex)
-                // Track both frequency and positions for each term
-                let mut term_data: HashMap<String, (u32, Vec<u32>)> = HashMap::new();
-                {
-                    let mut tokenizer = self.tokenizer.lock().unwrap();
-                    let mut token_stream = tokenizer.token_stream_for_doc(&text);
-                    let mut position: u32 = 0;
-                    while let Some(token) = token_stream.next() {
-                        let entry = term_data.entry(token.text.clone()).or_default();
-                        entry.0 += 1; // frequency
-                        entry.1.push(position); // position
-                        position += 1;
-                    }
-                }
+            .map(|(idx, _)| idx)
+        else {
+            // Column missing: nothing to index, but advance the snapshot so
+            // the caller's position counter stays in sync.
+            return self.publish_empty_batch(batch.num_rows() as u32, row_offset, batch_position);
+        };
 
-                // Calculate document length (total token count in this doc)
-                let doc_length: u32 = term_data.values().map(|(freq, _)| freq).sum();
-                self.doc_lengths.insert(row_position, doc_length);
-                self.total_tokens
-                    .fetch_add(doc_length as usize, Ordering::Relaxed);
+        let column = batch.column(col_idx);
+        let texts = extract_texts(column.as_ref())?;
+        debug_assert_eq!(texts.len(), batch.num_rows());
+
+        let mut tok_guard = self
+            .writer_tokenizer
+            .lock()
+            .expect("writer tokenizer poisoned — single-writer invariant violated");
+        let tokenizer: &mut dyn LanceTokenizer = tok_guard.as_mut();
+
+        // Per-term builders (frequency + per-doc positions, indexed by
+        // local doc index in this batch).
+        let mut term_builders: HashMap<Arc<str>, BatchTermBuilder> = HashMap::new();
+        let mut doc_lengths: Vec<u32> = Vec::with_capacity(batch.num_rows());
+        let mut total_tokens: u64 = 0;
+
+        for (local_doc_idx, text_opt) in texts.iter().enumerate() {
+            // Track each doc's position even for null/missing rows so the
+            // dense `doc_lengths` array stays aligned with `row_offset + i`.
+            let mut doc_token_count: u32 = 0;
+            // term -> (frequency, positions). Keyed by `String` so we only
+            // pay the `Arc<str>` allocation once per unique term per doc,
+            // when transferring to `term_builders` below.
+            let mut per_doc: HashMap<String, (u32, Vec<u32>)> = HashMap::new();
+
+            if let Some(text) = text_opt {
+                let mut stream = tokenizer.token_stream_for_doc(text);
+                let mut position: u32 = 0;
+                while let Some(tok) = stream.next() {
+                    let entry = per_doc
+                        .entry(tok.text.clone())
+                        .or_insert_with(|| (0, Vec::new()));
+                    entry.0 += 1;
+                    entry.1.push(position);
+                    position += 1;
+                    doc_token_count += 1;
+                }
+            }
 
-                for (token, (freq, positions)) in term_data {
-                    // Update document frequency for this term
-                    if let Some(entry) = self.doc_freq.get(&token) {
-                        entry.value().fetch_add(1, Ordering::Relaxed);
+            doc_lengths.push(doc_token_count);
+            total_tokens += doc_token_count as u64;
+
+            let row_position = row_offset + local_doc_idx as u64;
+            for (term, (freq, positions)) in per_doc {
+                // Reuse an interned `Arc<str>` if we've already seen the
+                // term in this batch. The first occurrence pays the
+                // allocation; subsequent occurrences clone the Arc.
+                let term_arc: Arc<str> =
+                    if let Some((existing, _)) = term_builders.get_key_value(term.as_str()) {
+                        Arc::clone(existing)
                     } else {
-                        self.doc_freq.insert(token.clone(), AtomicUsize::new(1));
-                    }
-
-                    let key = FtsKey {
-                        token,
-                        row_position,
+                        Arc::<str>::from(term.as_str())
                     };
-                    self.postings.insert(
-                        key,
-                        PostingValue {
-                            frequency: freq,
-                            positions,
-                        },
-                    );
-                }
+                let builder = term_builders
+                    .entry(term_arc)
+                    .or_insert_with(BatchTermBuilder::new);
+                builder.push_doc(row_position, freq, positions);
             }
+        }
 
-            self.doc_count.fetch_add(1, Ordering::Relaxed);
+        // Drop the tokenizer guard before any work that could be slow so we
+        // don't hold it across allocations.
+        drop(tok_guard);
+
+        // Install per-term chunks first so any chunk a reader can possibly
+        // see (via a later snapshot store) is fully linked.
+        for (term, builder) in term_builders {
+            let chunk = builder.build(batch_position);
+            let entry = self
+                .terms
+                .get_or_insert_with(term, TermSlice::empty_arc_swap);
+            let cur = entry.value().load();
+            entry.value().store(cur.with_chunk_appended(chunk));
         }
 
+        let new_meta = Arc::new(BatchMeta {
+            batch_position,
+            row_offset,
+            doc_lengths,
+            rows: batch.num_rows() as u32,
+        });
+
+        self.publish_batch(new_meta, total_tokens);
         Ok(())
     }
 
-    /// Search for documents containing a term.
-    ///
-    /// The term is tokenized using the same tokenizer as the index.
-    /// Returns all matching documents with their BM25 scores.
-    pub fn search(&self, term: &str) -> Vec<FtsEntry> {
-        // Tokenize the search term using token_stream_for_search
-        let tokens: Vec<String> = {
-            let mut tokenizer = self.tokenizer.lock().unwrap();
-            let mut token_stream = tokenizer.token_stream_for_search(term);
-            let mut tokens = Vec::new();
-            while let Some(token) = token_stream.next() {
-                tokens.push(token.text.clone());
-            }
-            tokens
-        };
-
-        // BM25 parameters
-        const K1: f32 = 1.2;
-        const B: f32 = 0.75;
-
-        let n = self.doc_count.load(Ordering::Relaxed) as f32;
-        let total_tokens = self.total_tokens.load(Ordering::Relaxed) as f32;
-        let avgdl = if n > 0.0 { total_tokens / n } else { 1.0 };
-
-        // Collect term frequencies per document for all query tokens
-        // Map: row_position -> Vec<(term_freq, doc_freq_for_term)>
-        let mut doc_term_info: HashMap<RowPosition, Vec<(u32, usize)>> = HashMap::new();
-
-        for token in &tokens {
-            // Get document frequency for this term
-            let df = self
-                .doc_freq
-                .get(token)
-                .map(|e| e.value().load(Ordering::Relaxed))
-                .unwrap_or(0);
+    /// Publish a snapshot for an "empty" batch (e.g. column missing) so the
+    /// visibility counters keep up with the writer's batch_position stream.
+    fn publish_empty_batch(&self, rows: u32, row_offset: u64, batch_position: usize) -> Result<()> {
+        let meta = Arc::new(BatchMeta {
+            batch_position,
+            row_offset,
+            doc_lengths: vec![0; rows as usize],
+            rows,
+        });
+        self.publish_batch(meta, 0);
+        Ok(())
+    }
 
-            if df == 0 {
-                continue;
-            }
+    fn publish_batch(&self, new_meta: Arc<BatchMeta>, batch_total_tokens: u64) {
+        let cur = self.snapshot.load();
+        let new_rows = new_meta.rows as u64;
+        let mut batches: Vec<Arc<BatchMeta>> = Vec::with_capacity(cur.batches.len() + 1);
+        batches.extend(cur.batches.iter().cloned());
+        batches.push(new_meta);
+        let new_snap = Snapshot {
+            visible_count: cur.visible_count + 1,
+            batches: Arc::from(batches.into_boxed_slice()),
+            cumulative_doc_count: cur.cumulative_doc_count + new_rows,
+            cumulative_total_tokens: cur.cumulative_total_tokens + batch_total_tokens,
+        };
+        self.snapshot.store(Arc::new(new_snap));
+    }
 
-            let start = FtsKey {
-                token: token.clone(),
-                row_position: 0,
-            };
-            let end = FtsKey {
-                token: token.clone(),
-                row_position: u64::MAX,
-            };
+    // ------------------------------------------------------------------
+    // Read path
+    // ------------------------------------------------------------------
 
-            for entry in self.postings.range(start..=end) {
-                doc_term_info
-                    .entry(entry.key().row_position)
-                    .or_default()
-                    .push((entry.value().frequency, df));
-            }
+    /// Search for documents containing a term.
+    ///
+    /// The term is tokenized using the configured tokenizer. Returns all
+    /// matching documents with BM25 scores. Result order is unspecified;
+    /// use `search_with_options` for sorted/limited output.
+    pub fn search(&self, term: &str) -> Vec<FtsEntry> {
+        let snap = self.snapshot.load_full();
+        if snap.visible_count == 0 {
+            return Vec::new();
         }
 
-        // Compute BM25 score for each document
-        doc_term_info
-            .into_iter()
-            .map(|(row_position, term_infos)| {
-                let dl = self
-                    .doc_lengths
-                    .get(&row_position)
-                    .map(|e| *e.value() as f32)
-                    .unwrap_or(1.0);
-
-                let mut score: f32 = 0.0;
-                for (tf, df) in term_infos {
-                    // IDF = log((N - n + 0.5) / (n + 0.5) + 1)
-                    let df_f = df as f32;
-                    let idf = ((n - df_f + 0.5) / (df_f + 0.5) + 1.0).ln();
-
-                    // BM25 term score = IDF * (tf * (k1 + 1)) / (tf + k1 * (1 - b + b * (dl / avgdl)))
-                    let tf_f = tf as f32;
-                    let numerator = tf_f * (K1 + 1.0);
-                    let denominator = tf_f + K1 * (1.0 - B + B * (dl / avgdl));
-                    score += idf * (numerator / denominator);
-                }
+        let tokens = self.tokenize_for_search(term);
 
-                FtsEntry {
-                    row_position,
-                    score,
-                }
-            })
-            .collect()
+        score_terms(&snap, &self.terms, &tokens)
     }
 
-    /// Search for documents containing an exact phrase.
-    ///
-    /// The phrase is tokenized and documents must contain all tokens
-    /// in the correct order (within the specified slop distance).
-    ///
-    /// # Arguments
-    /// * `phrase` - The phrase to search for
-    /// * `slop` - Maximum allowed distance between consecutive tokens.
-    ///   0 means exact phrase match (tokens must be adjacent).
-    ///   1 allows one intervening token, etc.
-    ///
-    /// Returns matching documents with BM25 scores.
+    /// Search for documents containing an exact phrase, optionally allowing
+    /// `slop` intervening tokens between consecutive query tokens.
     pub fn search_phrase(&self, phrase: &str, slop: u32) -> Vec<FtsEntry> {
-        // Tokenize the phrase
-        let tokens: Vec<String> = {
-            let mut tokenizer = self.tokenizer.lock().unwrap();
-            let mut token_stream = tokenizer.token_stream_for_search(phrase);
-            let mut tokens = Vec::new();
-            while let Some(token) = token_stream.next() {
-                tokens.push(token.text.clone());
-            }
-            tokens
-        };
+        let snap = self.snapshot.load_full();
+        if snap.visible_count == 0 {
+            return Vec::new();
+        }
 
+        let tokens = self.tokenize_for_search(phrase);
         if tokens.is_empty() {
-            return vec![];
+            return Vec::new();
         }
-
-        // Single token phrase is just a regular search
         if tokens.len() == 1 {
-            return self.search(phrase);
+            // Same shortcut as the previous implementation: a single-token
+            // phrase reduces to a regular term search.
+            return score_terms(&snap, &self.terms, &tokens);
         }
 
-        // BM25 parameters
-        const K1: f32 = 1.2;
-        const B: f32 = 0.75;
-
-        let n = self.doc_count.load(Ordering::Relaxed) as f32;
-        let total_tokens = self.total_tokens.load(Ordering::Relaxed) as f32;
-        let avgdl = if n > 0.0 { total_tokens / n } else { 1.0 };
-
-        // Collect posting lists for each token
-        // Map: token_index -> Map<row_position, PostingValue>
-        let mut token_postings: Vec<HashMap<RowPosition, PostingValue>> = Vec::new();
-
-        for token in &tokens {
-            let start = FtsKey {
-                token: token.clone(),
-                row_position: 0,
-            };
-            let end = FtsKey {
-                token: token.clone(),
-                row_position: u64::MAX,
-            };
-
-            let mut postings_for_token: HashMap<RowPosition, PostingValue> = HashMap::new();
-            for entry in self.postings.range(start..=end) {
-                postings_for_token.insert(entry.key().row_position, entry.value().clone());
+        // Gather visible chunks per token.
+        let mut per_token_chunks: Vec<Vec<Arc<TermChunk>>> = Vec::with_capacity(tokens.len());
+        for tok in &tokens {
+            match self.terms.get(tok.as_str()) {
+                Some(entry) => {
+                    let slice = entry.value().load_full();
+                    let visible: Vec<Arc<TermChunk>> = slice
+                        .chunks
+                        .iter()
+                        .filter(|c| c.batch_position < snap.visible_count)
+                        .cloned()
+                        .collect();
+                    if visible.is_empty() {
+                        return Vec::new();
+                    }
+                    per_token_chunks.push(visible);
+                }
+                None => return Vec::new(),
             }
-            token_postings.push(postings_for_token);
         }
 
-        // Find documents that contain ALL tokens
-        let first_token_docs: Vec<RowPosition> = token_postings[0].keys().copied().collect();
-
-        let mut matching_docs: Vec<FtsEntry> = Vec::new();
+        // Per-term `df` for IDF.
+        let dfs: Vec<u32> = per_token_chunks
+            .iter()
+            .map(|chunks| chunks.iter().map(|c| c.doc_count() as u32).sum::<u32>())
+            .collect();
 
-        for row_position in first_token_docs {
-            // Check if this document contains all tokens
-            let all_tokens_present = token_postings
-                .iter()
-                .all(|tp| tp.contains_key(&row_position));
-            if !all_tokens_present {
-                continue;
-            }
+        let n = snap.cumulative_doc_count as f32;
+        let avgdl = if n > 0.0 {
+            snap.cumulative_total_tokens as f32 / n
+        } else {
+            1.0
+        };
 
-            // Check if the phrase matches (positions are in order within slop)
-            if self.check_phrase_positions(&token_postings, row_position, slop) {
-                // Calculate BM25 score
-                let dl = self
-                    .doc_lengths
-                    .get(&row_position)
-                    .map(|e| *e.value() as f32)
-                    .unwrap_or(1.0);
-
-                let mut score: f32 = 0.0;
-                for (token_idx, token) in tokens.iter().enumerate() {
-                    let df = self
-                        .doc_freq
-                        .get(token)
-                        .map(|e| e.value().load(Ordering::Relaxed))
-                        .unwrap_or(1) as f32;
-                    let tf = token_postings[token_idx]
-                        .get(&row_position)
-                        .map(|p| p.frequency as f32)
-                        .unwrap_or(1.0);
-
-                    // IDF = log((N - n + 0.5) / (n + 0.5) + 1)
-                    let idf = ((n - df + 0.5) / (df + 0.5) + 1.0).ln();
-
-                    // BM25 term score
-                    let numerator = tf * (K1 + 1.0);
-                    let denominator = tf + K1 * (1.0 - B + B * (dl / avgdl));
-                    score += idf * (numerator / denominator);
+        // Find candidate documents: rows that appear in every token's
+        // posting set. Start with the smallest token's docs to bound work.
+        let smallest_idx = (0..per_token_chunks.len())
+            .min_by_key(|&i| {
+                per_token_chunks[i]
+                    .iter()
+                    .map(|c| c.doc_count())
+                    .sum::<usize>()
+            })
+            .unwrap();
+        let candidate_chunks = &per_token_chunks[smallest_idx];
+
+        let mut results = Vec::new();
+        for chunk in candidate_chunks {
+            for (doc_idx, &row_position) in chunk.row_positions.iter().enumerate() {
+                let pos = chunk
+                    .positions
+                    .as_ref()
+                    .map(|p| p.doc_positions(doc_idx).to_vec());
+                let Some(pos) = pos else { continue };
+                let mut all_positions: Vec<Vec<u32>> = vec![Vec::new(); tokens.len()];
+                all_positions[smallest_idx] = pos;
+                let mut all_present = true;
+                let mut frequencies = vec![0u32; tokens.len()];
+                frequencies[smallest_idx] = chunk.frequencies[doc_idx];
+                for (ti, chunks) in per_token_chunks.iter().enumerate() {
+                    if ti == smallest_idx {
+                        continue;
+                    }
+                    match find_doc_in_chunks(chunks, row_position) {
+                        Some((c, doc_idx_other)) => {
+                            frequencies[ti] = c.frequencies[doc_idx_other];
+                            all_positions[ti] = c
+                                .positions
+                                .as_ref()
+                                .map(|p| p.doc_positions(doc_idx_other).to_vec())
+                                .unwrap_or_default();
+                        }
+                        None => {
+                            all_present = false;
+                            break;
+                        }
+                    }
+                }
+                if !all_present {
+                    continue;
+                }
+                if !phrase_matches(&all_positions, slop) {
+                    continue;
                 }
 
-                matching_docs.push(FtsEntry {
+                let dl = lookup_dl(&snap, row_position).unwrap_or(1) as f32;
+                let mut score = 0.0f32;
+                for (ti, _tok) in tokens.iter().enumerate() {
+                    let tf = frequencies[ti] as f32;
+                    let df = dfs[ti] as f32;
+                    score += bm25_term_score(tf, df.max(1.0), n, dl, avgdl);
+                }
+                results.push(FtsEntry {
                     row_position,
                     score,
                 });
             }
         }
 
-        matching_docs
-    }
-
-    /// Check if phrase positions match within the given slop.
-    ///
-    /// Uses relative position algorithm: for each token, compute
-    /// `relative_pos = doc_position - query_position`. If all tokens
-    /// have the same relative position (within slop), the phrase matches.
-    fn check_phrase_positions(
-        &self,
-        token_postings: &[HashMap<RowPosition, PostingValue>],
-        row_position: RowPosition,
-        slop: u32,
-    ) -> bool {
-        // Get positions for each token in this document
-        let mut all_positions: Vec<&Vec<u32>> = Vec::new();
-        for tp in token_postings {
-            if let Some(posting) = tp.get(&row_position) {
-                all_positions.push(&posting.positions);
-            } else {
-                return false;
-            }
-        }
-
-        // For each position of the first token, check if we can form a phrase
-        for &first_pos in all_positions[0] {
-            if Self::check_phrase_from_position(&all_positions, first_pos, slop) {
-                return true;
-            }
-        }
-
-        false
-    }
-
-    /// Check if a phrase can be formed starting from a given position of the first token.
-    fn check_phrase_from_position(all_positions: &[&Vec<u32>], first_pos: u32, slop: u32) -> bool {
-        let mut expected_pos = first_pos;
-
-        for positions in all_positions.iter().skip(1) {
-            // Find a position for this token that's within slop of expected
-            // For slop=0, next token must be at expected_pos+1 (adjacent)
-            // For slop=1, next token can be at expected_pos+1 or expected_pos+2
-            let min_pos = expected_pos.saturating_add(1);
-            let max_pos = expected_pos.saturating_add(1 + slop);
-
-            // Find the actual position used (smallest valid one)
-            if let Some(&actual_pos) = positions
-                .iter()
-                .filter(|&&pos| pos >= min_pos && pos <= max_pos)
-                .min()
-            {
-                expected_pos = actual_pos;
-            } else {
-                return false;
-            }
-        }
-
-        true
-    }
-
-    /// Get the number of entries in the index.
-    /// Note: This counts (token, row_position) pairs, not unique tokens.
-    pub fn entry_count(&self) -> usize {
-        self.postings.len()
-    }
-
-    /// Get the document count.
-    pub fn doc_count(&self) -> usize {
-        self.doc_count.load(Ordering::Relaxed)
-    }
-
-    /// Check if the index is empty.
-    pub fn is_empty(&self) -> bool {
-        self.doc_count.load(Ordering::Relaxed) == 0
-    }
-
-    /// Get the column name.
-    pub fn column_name(&self) -> &str {
-        &self.column_name
+        results
     }
 
     /// Expand a term to fuzzy matches within the specified edit distance.
-    ///
-    /// Returns a list of (matching_term, edit_distance) tuples, sorted by
-    /// edit distance (closest matches first), limited to max_expansions.
+    /// Returns `(matched_term, distance)` pairs sorted by distance, capped
+    /// at `max_expansions`.
     pub fn expand_fuzzy(
         &self,
         term: &str,
         max_distance: u32,
         max_expansions: usize,
     ) -> Vec<(String, u32)> {
+        let snap = self.snapshot.load();
         let mut matches: Vec<(String, u32)> = Vec::new();
 
-        // If max_distance is 0, only exact matches
         if max_distance == 0 {
-            if self.doc_freq.get(term).is_some() {
+            if let Some(entry) = self.terms.get(term)
+                && has_visible_chunk(&entry.value().load(), snap.visible_count)
+            {
                 matches.push((term.to_string(), 0));
             }
             return matches;
         }
 
-        // Iterate through all tokens in doc_freq
-        for entry in self.doc_freq.iter() {
-            let indexed_term = entry.key();
-            let distance = levenshtein_distance(term, indexed_term);
-
-            if distance <= max_distance {
-                matches.push((indexed_term.clone(), distance));
+        for entry in self.terms.iter() {
+            let key: &Arc<str> = entry.key();
+            if !has_visible_chunk(&entry.value().load(), snap.visible_count) {
+                continue;
+            }
+            let dist = levenshtein_distance(term, key);
+            if dist <= max_distance {
+                matches.push((key.to_string(), dist));
             }
         }
-
-        // Sort by distance (prefer closer matches)
         matches.sort_by_key(|(_, d)| *d);
-
-        // Limit to max_expansions
         matches.truncate(max_expansions);
-
         matches
     }
 
-    /// Search for documents using fuzzy matching.
-    ///
-    /// Each query token is expanded to fuzzy matches within the edit distance,
-    /// then searched. Results from all expansions are combined.
+    /// Search for documents using fuzzy matching on each query token.
     pub fn search_fuzzy(
         &self,
         query: &str,
         fuzziness: Option<u32>,
         max_expansions: usize,
     ) -> Vec<FtsEntry> {
-        // Tokenize the query
-        let tokens: Vec<String> = {
-            let mut tokenizer = self.tokenizer.lock().unwrap();
-            let mut token_stream = tokenizer.token_stream_for_search(query);
-            let mut tokens = Vec::new();
-            while let Some(token) = token_stream.next() {
-                tokens.push(token.text.clone());
-            }
-            tokens
-        };
+        let snap = self.snapshot.load_full();
+        if snap.visible_count == 0 {
+            return Vec::new();
+        }
 
+        let tokens = self.tokenize_for_search(query);
         if tokens.is_empty() {
-            return vec![];
+            return Vec::new();
         }
 
-        // BM25 parameters
-        const K1: f32 = 1.2;
-        const B: f32 = 0.75;
-
-        let n = self.doc_count.load(Ordering::Relaxed) as f32;
-        let total_tokens = self.total_tokens.load(Ordering::Relaxed) as f32;
-        let avgdl = if n > 0.0 { total_tokens / n } else { 1.0 };
-
-        // Collect term frequencies per document for all expanded tokens
-        // Map: row_position -> Vec<(term_freq, doc_freq_for_term)>
-        let mut doc_term_info: HashMap<RowPosition, Vec<(u32, usize)>> = HashMap::new();
-
-        for token in &tokens {
-            // Determine fuzziness for this token
-            let max_distance = fuzziness.unwrap_or_else(|| auto_fuzziness(token));
-
-            // Expand to fuzzy matches
-            let expanded = self.expand_fuzzy(token, max_distance, max_expansions);
-
-            for (matched_term, _distance) in expanded {
-                // Get document frequency for this term
-                let df = self
-                    .doc_freq
-                    .get(&matched_term)
-                    .map(|e| e.value().load(Ordering::Relaxed))
-                    .unwrap_or(0);
-
-                if df == 0 {
-                    continue;
-                }
-
-                let start = FtsKey {
-                    token: matched_term.clone(),
-                    row_position: 0,
-                };
-                let end = FtsKey {
-                    token: matched_term,
-                    row_position: u64::MAX,
-                };
-
-                for entry in self.postings.range(start..=end) {
-                    doc_term_info
-                        .entry(entry.key().row_position)
-                        .or_default()
-                        .push((entry.value().frequency, df));
-                }
+        let mut expanded: Vec<String> = Vec::new();
+        for tok in &tokens {
+            let max_dist = fuzziness.unwrap_or_else(|| auto_fuzziness(tok));
+            for (matched, _) in self.expand_fuzzy(tok, max_dist, max_expansions) {
+                expanded.push(matched);
             }
         }
+        if expanded.is_empty() {
+            return Vec::new();
+        }
 
-        // Compute BM25 score for each document
-        doc_term_info
-            .into_iter()
-            .map(|(row_position, term_infos)| {
-                let dl = self
-                    .doc_lengths
-                    .get(&row_position)
-                    .map(|e| *e.value() as f32)
-                    .unwrap_or(1.0);
-
-                let mut score: f32 = 0.0;
-                for (tf, df) in term_infos {
-                    // IDF = log((N - n + 0.5) / (n + 0.5) + 1)
-                    let df_f = df as f32;
-                    let idf = ((n - df_f + 0.5) / (df_f + 0.5) + 1.0).ln();
-
-                    // BM25 term score
-                    let tf_f = tf as f32;
-                    let numerator = tf_f * (K1 + 1.0);
-                    let denominator = tf_f + K1 * (1.0 - B + B * (dl / avgdl));
-                    score += idf * (numerator / denominator);
-                }
-
-                FtsEntry {
-                    row_position,
-                    score,
-                }
-            })
-            .collect()
+        score_terms(&snap, &self.terms, &expanded)
     }
 
     /// Execute a query expression and return matching documents with scores.
-    ///
-    /// This is the main entry point for executing complex queries including
-    /// match, phrase, fuzzy, and Boolean queries.
-    ///
-    /// For performance optimization with limits, use `search_with_options()` instead.
     pub fn search_query(&self, query: &FtsQueryExpr) -> Vec<FtsEntry> {
         match query {
             FtsQueryExpr::Match { query, boost } => {
                 let mut results = self.search(query);
-                if *boost != 1.0 {
-                    for entry in &mut results {
-                        entry.score *= boost;
-                    }
-                }
+                apply_boost(&mut results, *boost);
                 results
             }
             FtsQueryExpr::Phrase { query, slop, boost } => {
                 let mut results = self.search_phrase(query, *slop);
-                if *boost != 1.0 {
-                    for entry in &mut results {
-                        entry.score *= boost;
-                    }
-                }
+                apply_boost(&mut results, *boost);
                 results
             }
             FtsQueryExpr::Fuzzy {
@@ -1022,11 +1065,7 @@ impl FtsMemIndex {
                 boost,
             } => {
                 let mut results = self.search_fuzzy(query, *fuzziness, *max_expansions);
-                if *boost != 1.0 {
-                    for entry in &mut results {
-                        entry.score *= boost;
-                    }
-                }
+                apply_boost(&mut results, *boost);
                 results
             }
             FtsQueryExpr::Boolean {
@@ -1042,131 +1081,73 @@ impl FtsMemIndex {
         }
     }
 
-    /// Execute a query with options for performance/recall tradeoffs.
-    ///
-    /// This method extends `search_query()` with:
-    /// - **WAND factor**: Early termination based on score threshold.
-    ///   With `wand_factor < 1.0`, documents scoring below
-    ///   `threshold = top_k_score * wand_factor` are pruned after scoring.
-    /// - **Limit**: Maximum number of results to return (top-k by score).
-    ///
-    /// Results are always sorted by score in descending order.
-    ///
-    /// # Arguments
-    /// * `query` - The query expression to execute
-    /// * `options` - Search options including wand_factor and limit
-    ///
-    /// # Example
-    /// ```ignore
-    /// let options = SearchOptions::default()
-    ///     .with_limit(10)
-    ///     .with_wand_factor(0.8);
-    /// let results = index.search_with_options(&query, options);
-    /// ```
+    /// Execute a query with options (sort + WAND prune + limit).
     pub fn search_with_options(
         &self,
         query: &FtsQueryExpr,
         options: SearchOptions,
     ) -> Vec<FtsEntry> {
-        // Execute the query to get all results
         let mut results = self.search_query(query);
-
-        // Sort by score descending
         results.sort_by(|a, b| {
             b.score
                 .partial_cmp(&a.score)
                 .unwrap_or(std::cmp::Ordering::Equal)
         });
-
-        // Apply WAND factor pruning if wand_factor < 1.0 and we have a limit
         if options.wand_factor < 1.0 {
             if let Some(limit) = options.limit {
                 if results.len() > limit {
-                    // Get the k-th best score (at position limit-1)
                     let top_k_score = results[limit - 1].score;
                     let threshold = top_k_score * options.wand_factor;
-
-                    // Keep results scoring above the threshold, plus all results up to limit
-                    // This ensures we don't accidentally prune results that would be in top-k
-                    results.retain(|e| e.score >= threshold);
-                }
-            } else {
-                // No limit but wand_factor < 1.0: prune relative to max score
-                if let Some(max_entry) = results.first() {
-                    let threshold = max_entry.score * options.wand_factor;
                     results.retain(|e| e.score >= threshold);
                 }
+            } else if let Some(max_entry) = results.first() {
+                let threshold = max_entry.score * options.wand_factor;
+                results.retain(|e| e.score >= threshold);
             }
         }
-
-        // Apply limit
         if let Some(limit) = options.limit {
             results.truncate(limit);
         }
-
         results
     }
 
-    /// Execute a boosting query.
-    ///
-    /// Returns documents matching the positive query. Documents that also
-    /// match the negative query have their scores multiplied by `negative_boost`.
     fn search_boost(
         &self,
         positive: &FtsQueryExpr,
         negative: Option<&FtsQueryExpr>,
         negative_boost: f32,
     ) -> Vec<FtsEntry> {
-        // Execute positive query to get base results
         let mut results = self.search_query(positive);
-
-        // If no negative query, just return positive results
-        let Some(neg_query) = negative else {
+        let Some(neg) = negative else {
             return results;
         };
-
-        // Execute negative query
-        let negative_results = self.search_query(neg_query);
-
-        // Build a set of row positions that match the negative query
-        let negative_positions: std::collections::HashSet<RowPosition> =
-            negative_results.iter().map(|e| e.row_position).collect();
-
-        // Apply negative boost to documents matching both queries
+        let negative_results = self.search_query(neg);
+        let negative_set: HashSet<RowPosition> = negative_results
+            .into_iter()
+            .map(|e| e.row_position)
+            .collect();
         for entry in &mut results {
-            if negative_positions.contains(&entry.row_position) {
+            if negative_set.contains(&entry.row_position) {
                 entry.score *= negative_boost;
             }
         }
-
         results
     }
 
-    /// Execute a Boolean query with MUST/SHOULD/MUST_NOT logic.
-    ///
-    /// - MUST: All clauses must match (intersection). Scores are summed.
-    /// - SHOULD: At least one clause should match (union). Scores are added.
-    /// - MUST_NOT: No clause may match (exclusion).
-    ///
-    /// If only SHOULD clauses are present, at least one must match.
-    /// If MUST clauses are present, SHOULD clauses just add to the score.
     fn search_boolean(
         &self,
         must: &[FtsQueryExpr],
         should: &[FtsQueryExpr],
         must_not: &[FtsQueryExpr],
     ) -> Vec<FtsEntry> {
-        // Collect MUST_NOT results for exclusion
-        let excluded: std::collections::HashSet<RowPosition> = must_not
+        let excluded: HashSet<RowPosition> = must_not
             .iter()
             .flat_map(|q| self.search_query(q))
             .map(|e| e.row_position)
             .collect();
 
-        // Start with MUST clauses (intersection)
         let mut result_map: HashMap<RowPosition, f32> = if must.is_empty() {
-            // No MUST clauses: start with all SHOULD results
-            let mut map = HashMap::new();
+            let mut map: HashMap<RowPosition, f32> = HashMap::new();
             for q in should {
                 for entry in self.search_query(q) {
                     *map.entry(entry.row_position).or_default() += entry.score;
@@ -1174,29 +1155,22 @@ impl FtsMemIndex {
             }
             map
         } else {
-            // Execute first MUST clause
             let first_results = self.search_query(&must[0]);
             let mut map: HashMap<RowPosition, f32> = first_results
                 .into_iter()
                 .map(|e| (e.row_position, e.score))
                 .collect();
-
-            // Intersect with remaining MUST clauses
             for q in must.iter().skip(1) {
                 let results = self.search_query(q);
                 let result_set: HashMap<RowPosition, f32> = results
                     .into_iter()
                     .map(|e| (e.row_position, e.score))
                     .collect();
-
-                // Keep only documents in both sets, sum scores
                 map = map
                     .into_iter()
                     .filter_map(|(pos, score)| result_set.get(&pos).map(|s| (pos, score + s)))
                     .collect();
             }
-
-            // Add SHOULD clause scores (don't require match since MUST already filters)
             for q in should {
                 for entry in self.search_query(q) {
                     if let Some(score) = map.get_mut(&entry.row_position) {
@@ -1204,16 +1178,13 @@ impl FtsMemIndex {
                     }
                 }
             }
-
             map
         };
 
-        // Filter out MUST_NOT results
         for pos in &excluded {
             result_map.remove(pos);
         }
 
-        // Convert to FtsEntry list
         result_map
             .into_iter()
             .map(|(row_position, score)| FtsEntry {
@@ -1223,18 +1194,27 @@ impl FtsMemIndex {
             .collect()
     }
 
-    /// Export the in-memory FTS index to an `InnerBuilder` for direct flush.
-    ///
-    /// This creates an `InnerBuilder` containing all the index data with
-    /// reversed row positions for efficient LSM scan. The builder can then
-    /// be written directly to disk without re-tokenizing the documents.
-    ///
-    /// # Arguments
-    /// * `partition_id` - Partition ID for the index files
-    /// * `total_rows` - Total number of rows in the MemTable (for position reversal)
+    fn tokenize_for_search(&self, text: &str) -> Vec<String> {
+        let mut tok = PooledTokenizer::new(&self.tokenizer_pool);
+        let mut stream = tok.get_mut().token_stream_for_search(text);
+        let mut out = Vec::new();
+        while let Some(t) = stream.next() {
+            out.push(t.text.clone());
+        }
+        out
+    }
+
+    // ------------------------------------------------------------------
+    // Flush to Lance inverted index format
+    // ------------------------------------------------------------------
+
+    /// Export the in-memory FTS index to an `InnerBuilder` ready to be
+    /// written to disk.
     ///
-    /// # Returns
-    /// An `InnerBuilder` ready to be written to disk
+    /// `total_rows` is the total number of rows in the MemTable being
+    /// flushed; row positions are reversed (`reversed = total_rows - pos -
+    /// 1`) to match the LSM-friendly newest-first flush order used by the
+    /// rest of MemWAL.
     pub fn to_index_builder_reversed(
         &self,
         partition_id: u64,
@@ -1243,120 +1223,344 @@ impl FtsMemIndex {
         use lance_index::scalar::inverted::builder::{InnerBuilder, PositionRecorder};
         use lance_index::scalar::inverted::{DocSet, PostingListBuilder, TokenSet};
 
-        if self.is_empty() {
+        let snap = self.snapshot.load_full();
+        let with_position = self.params.has_positions();
+
+        if snap.visible_count == 0 {
             return Ok(InnerBuilder::new(
                 partition_id,
-                self.params.has_positions(),
+                with_position,
                 Default::default(),
             ));
         }
 
         let total_rows_u64 = total_rows as u64;
-        let with_position = self.params.has_positions();
-
-        // Step 1: Build DocSet with reversed row positions
-        // Collect (original_pos, num_tokens) -> (reversed_pos, num_tokens)
-        let mut doc_entries: Vec<(u64, u32)> = self
-            .doc_lengths
-            .iter()
-            .map(|e| {
-                let original_pos = *e.key();
-                let reversed_pos = total_rows_u64 - original_pos - 1;
-                (reversed_pos, *e.value())
-            })
-            .collect();
 
-        // Sort by reversed position so doc_id assignment matches flushed data order
-        doc_entries.sort_by_key(|(pos, _)| *pos);
+        // Step 1: collect (reversed_pos, num_tokens) for every visible row.
+        // We walk visible BatchMetas in order so the reverse-sort below has
+        // a small, dense input.
+        let mut entries: Vec<(u64, u32)> = Vec::new();
+        for batch in snap.batches.iter().take(snap.visible_count) {
+            for i in 0..batch.rows as usize {
+                let original_pos = batch.row_offset + i as u64;
+                if original_pos >= total_rows_u64 {
+                    return Err(Error::io(format!(
+                        "FTS flush: row position {} >= total_rows {}",
+                        original_pos, total_rows
+                    )));
+                }
+                let reversed = total_rows_u64 - original_pos - 1;
+                entries.push((reversed, batch.doc_lengths[i]));
+            }
+        }
+        entries.sort_by_key(|(rev, _)| *rev);
 
-        // Build DocSet and create mapping from reversed_pos -> doc_id
+        // Step 2: assign doc_ids in ascending reversed-pos order.
         let mut docs = DocSet::default();
-        let mut reversed_pos_to_doc_id: HashMap<u64, u32> =
-            HashMap::with_capacity(doc_entries.len());
-        for (idx, (reversed_pos, num_tokens)) in doc_entries.into_iter().enumerate() {
-            docs.append(reversed_pos, num_tokens);
-            reversed_pos_to_doc_id.insert(reversed_pos, idx as u32);
+        let mut original_to_doc_id: HashMap<u64, u32> = HashMap::with_capacity(entries.len());
+        for (rev, num_tokens) in &entries {
+            let doc_id = docs.append(*rev, *num_tokens);
+            // Recover the original position from the reversed position so
+            // we can map per-term postings without a second pass.
+            let original = total_rows_u64 - rev - 1;
+            original_to_doc_id.insert(original, doc_id);
         }
 
-        // Step 2: Build TokenSet and group postings by token
+        // Step 3: walk terms in alphabetic (skip-list) order and emit
+        // posting lists into the builder.
         let mut tokens = TokenSet::default();
-        let mut token_postings: HashMap<String, Vec<(u32, PostingValue)>> = HashMap::new();
-
-        for entry in self.postings.iter() {
-            let token = entry.key().token.clone();
-            let original_pos = entry.key().row_position;
-            let reversed_pos = total_rows_u64 - original_pos - 1;
-            let doc_id = *reversed_pos_to_doc_id.get(&reversed_pos).ok_or_else(|| {
-                Error::io(format!(
-                    "FTS index internal error: doc_id not found for reversed position {} (original: {}, total_rows: {})",
-                    reversed_pos, original_pos, total_rows
-                ))
-            })?;
-
-            token_postings
-                .entry(token)
-                .or_default()
-                .push((doc_id, entry.value().clone()));
-        }
-
-        // Assign token IDs in sorted order for FST format
-        let mut sorted_tokens: Vec<_> = token_postings.keys().cloned().collect();
-        sorted_tokens.sort();
-        for token in &sorted_tokens {
-            tokens.add(token.clone());
-        }
-
-        // Step 3: Build posting lists
-        let mut posting_lists: Vec<PostingListBuilder> = (0..tokens.len())
-            .map(|_| PostingListBuilder::new(with_position))
-            .collect();
+        let mut posting_lists: Vec<PostingListBuilder> = Vec::new();
+        for entry in self.terms.iter() {
+            let term: &Arc<str> = entry.key();
+            let slice = entry.value().load();
+
+            // Collect per-doc postings filtered by visibility.
+            let mut docs_for_term: Vec<(u32, u32, Option<Vec<u32>>)> = Vec::new();
+            for chunk in &slice.chunks {
+                if chunk.batch_position >= snap.visible_count {
+                    continue;
+                }
+                for (i, row_position) in chunk.row_positions.iter().enumerate() {
+                    let Some(&doc_id) = original_to_doc_id.get(row_position) else {
+                        // Should not happen: a chunk's batch_position is
+                        // visible only if its meta is visible, and the meta
+                        // contributed every row above.
+                        continue;
+                    };
+                    let pos = if with_position {
+                        chunk
+                            .positions
+                            .as_ref()
+                            .map(|p| p.doc_positions(i).to_vec())
+                    } else {
+                        None
+                    };
+                    docs_for_term.push((doc_id, chunk.frequencies[i], pos));
+                }
+            }
+            if docs_for_term.is_empty() {
+                continue;
+            }
+            docs_for_term.sort_by_key(|(doc_id, _, _)| *doc_id);
+
+            let token_id = tokens.add(term.to_string()) as usize;
+            // PostingListBuilder vector is indexed by token_id. New token
+            // ids are assigned monotonically by `add`, so we just push.
+            debug_assert_eq!(token_id, posting_lists.len());
+            posting_lists.push(PostingListBuilder::new(with_position));
+
+            let plb = &mut posting_lists[token_id];
+            for (doc_id, freq, pos) in docs_for_term {
+                let recorder = if with_position {
+                    PositionRecorder::Position(pos.unwrap_or_default().into())
+                } else {
+                    PositionRecorder::Count(freq)
+                };
+                plb.add(doc_id, recorder);
+            }
+        }
+
+        let mut builder = InnerBuilder::new(partition_id, with_position, Default::default());
+        builder.set_tokens(tokens);
+        builder.set_docs(docs);
+        builder.set_posting_lists(posting_lists);
+        Ok(builder)
+    }
+}
+
+// ============================================================================
+// Internal helpers
+// ============================================================================
+
+impl TermSlice {
+    fn empty_arc_swap() -> ArcSwap<Self> {
+        ArcSwap::from(Self::empty())
+    }
+}
+
+/// Helper used during insert to accumulate a single batch's contribution to
+/// one term.
+struct BatchTermBuilder {
+    row_positions: Vec<u64>,
+    frequencies: Vec<u32>,
+    positions: Vec<Vec<u32>>,
+}
+
+impl BatchTermBuilder {
+    fn new() -> Self {
+        Self {
+            row_positions: Vec::new(),
+            frequencies: Vec::new(),
+            positions: Vec::new(),
+        }
+    }
+
+    fn push_doc(&mut self, row_position: u64, frequency: u32, positions: Vec<u32>) {
+        self.row_positions.push(row_position);
+        self.frequencies.push(frequency);
+        self.positions.push(positions);
+    }
+
+    /// Always materializes the per-doc positions: in-memory phrase queries
+    /// rely on them regardless of `params.has_positions()`. The flush path
+    /// consults `params.has_positions()` and emits a `Count` recorder
+    /// instead of `Position` when positions should not be persisted.
+    fn build(self, batch_position: usize) -> Arc<TermChunk> {
+        let mut p = Positions::empty();
+        for doc in &self.positions {
+            p.push_doc(doc);
+        }
+        Arc::new(TermChunk {
+            batch_position,
+            row_positions: self.row_positions,
+            frequencies: self.frequencies,
+            positions: Some(p),
+        })
+    }
+}
+
+/// Borrowed text for a row, or `None` for null/missing.
+type TextOpt<'a> = Option<&'a str>;
+
+fn extract_texts(column: &dyn Array) -> Result<Vec<TextOpt<'_>>> {
+    match column.data_type() {
+        DataType::Utf8 => {
+            let array = column
+                .as_any()
+                .downcast_ref::<StringArray>()
+                .expect("Utf8 array");
+            Ok((0..array.len())
+                .map(|i| (!array.is_null(i)).then(|| array.value(i)))
+                .collect())
+        }
+        DataType::LargeUtf8 => {
+            let array = column
+                .as_any()
+                .downcast_ref::<LargeStringArray>()
+                .expect("LargeUtf8 array");
+            Ok((0..array.len())
+                .map(|i| (!array.is_null(i)).then(|| array.value(i)))
+                .collect())
+        }
+        DataType::Utf8View => {
+            let array = column
+                .as_any()
+                .downcast_ref::<StringViewArray>()
+                .expect("Utf8View array");
+            Ok((0..array.len())
+                .map(|i| (!array.is_null(i)).then(|| array.value(i)))
+                .collect())
+        }
+        other => Err(Error::invalid_input(format!(
+            "FTS index only supports Utf8, LargeUtf8, and Utf8View columns; got {other:?}"
+        ))),
+    }
+}
+
+fn has_visible_chunk(slice: &TermSlice, visible_count: usize) -> bool {
+    slice
+        .chunks
+        .iter()
+        .any(|c| c.batch_position < visible_count)
+}
+
+fn lookup_dl(snap: &Snapshot, row_position: u64) -> Option<u32> {
+    snap.batches[..snap.visible_count]
+        .iter()
+        .find_map(|b| b.dl(row_position))
+}
+
+fn find_doc_in_chunks(
+    chunks: &[Arc<TermChunk>],
+    row_position: u64,
+) -> Option<(&Arc<TermChunk>, usize)> {
+    for chunk in chunks {
+        if let Ok(idx) = chunk.row_positions.binary_search(&row_position) {
+            return Some((chunk, idx));
+        }
+    }
+    None
+}
+
+const BM25_K1: f32 = 1.2;
+const BM25_B: f32 = 0.75;
+
+fn bm25_term_score(tf: f32, df: f32, n: f32, dl: f32, avgdl: f32) -> f32 {
+    let idf = ((n - df + 0.5) / (df + 0.5) + 1.0).ln();
+    let numerator = tf * (BM25_K1 + 1.0);
+    let denominator = tf + BM25_K1 * (1.0 - BM25_B + BM25_B * (dl / avgdl));
+    idf * (numerator / denominator)
+}
 
-        for (token, mut postings) in token_postings {
-            let token_id = tokens.get(&token).ok_or_else(|| {
-                Error::io(format!(
-                    "FTS index internal error: token '{}' not found in TokenSet",
-                    token
-                ))
-            })? as usize;
+/// Score a list of query tokens against the visible state. Each token
+/// contributes its BM25 term score to every document containing it.
+fn score_terms(
+    snap: &Snapshot,
+    terms: &SkipMap<Arc<str>, ArcSwap<TermSlice>>,
+    tokens: &[String],
+) -> Vec<FtsEntry> {
+    let n = snap.cumulative_doc_count as f32;
+    let avgdl = if n > 0.0 {
+        snap.cumulative_total_tokens as f32 / n
+    } else {
+        1.0
+    };
+
+    let mut doc_scores: HashMap<RowPosition, f32> = HashMap::new();
+
+    for token in tokens {
+        let Some(entry) = terms.get(token.as_str()) else {
+            continue;
+        };
+        let slice = entry.value().load_full();
+        // df = total visible postings for this term.
+        let df: u32 = slice
+            .chunks
+            .iter()
+            .filter(|c| c.batch_position < snap.visible_count)
+            .map(|c| c.doc_count() as u32)
+            .sum();
+        if df == 0 {
+            continue;
+        }
+        let df_f = df as f32;
+        for chunk in &slice.chunks {
+            if chunk.batch_position >= snap.visible_count {
+                continue;
+            }
+            let Some(meta) = snap.batch_for(chunk.batch_position) else {
+                continue;
+            };
+            for (i, &row_position) in chunk.row_positions.iter().enumerate() {
+                let dl = meta.dl(row_position).unwrap_or(1) as f32;
+                let tf = chunk.frequencies[i] as f32;
+                let score = bm25_term_score(tf, df_f, n, dl, avgdl);
+                *doc_scores.entry(row_position).or_default() += score;
+            }
+        }
+    }
 
-            // Sort postings by doc_id for proper ordering
-            postings.sort_by_key(|(doc_id, _)| *doc_id);
+    doc_scores
+        .into_iter()
+        .map(|(row_position, score)| FtsEntry {
+            row_position,
+            score,
+        })
+        .collect()
+}
 
-            for (doc_id, value) in postings {
-                let position_recorder = if with_position {
-                    PositionRecorder::Position(value.positions.into())
-                } else {
-                    PositionRecorder::Count(value.frequency)
-                };
-                posting_lists[token_id].add(doc_id, position_recorder);
-            }
+fn phrase_matches(positions: &[Vec<u32>], slop: u32) -> bool {
+    if positions.is_empty() {
+        return false;
+    }
+    for &first_pos in &positions[0] {
+        if phrase_from_position(positions, first_pos, slop) {
+            return true;
         }
+    }
+    false
+}
 
-        // Step 4: Create InnerBuilder with all the data
-        let mut builder = InnerBuilder::new(partition_id, with_position, Default::default());
-        builder.set_tokens(tokens);
-        builder.set_docs(docs);
-        builder.set_posting_lists(posting_lists);
+fn phrase_from_position(positions: &[Vec<u32>], first_pos: u32, slop: u32) -> bool {
+    let mut expected = first_pos;
+    for token_positions in positions.iter().skip(1) {
+        let min = expected.saturating_add(1);
+        let max = expected.saturating_add(1 + slop);
+        match token_positions
+            .iter()
+            .filter(|&&p| p >= min && p <= max)
+            .min()
+        {
+            Some(&p) => expected = p,
+            None => return false,
+        }
+    }
+    true
+}
 
-        Ok(builder)
+fn apply_boost(results: &mut [FtsEntry], boost: f32) {
+    if boost == 1.0 {
+        return;
+    }
+    for r in results.iter_mut() {
+        r.score *= boost;
     }
 }
 
+// ============================================================================
+// Configuration
+// ============================================================================
+
 /// Configuration for a Full-Text Search index.
 #[derive(Debug, Clone)]
 pub struct FtsIndexConfig {
-    /// Index name.
     pub name: String,
-    /// Field ID the index is built on.
     pub field_id: i32,
-    /// Column name (for Arrow batch lookups).
     pub column: String,
-    /// Tokenizer parameters (same as InvertedIndex).
     pub params: InvertedIndexParams,
 }
 
 impl FtsIndexConfig {
-    /// Create a new FtsIndexConfig with default tokenizer parameters.
     pub fn new(name: String, field_id: i32, column: String) -> Self {
         Self {
             name,
@@ -1366,7 +1570,6 @@ impl FtsIndexConfig {
         }
     }
 
-    /// Create a new FtsIndexConfig with custom tokenizer parameters.
     pub fn with_params(
         name: String,
         field_id: i32,
@@ -1382,6 +1585,10 @@ impl FtsIndexConfig {
     }
 }
 
+// ============================================================================
+// Tests
+// ============================================================================
+
 #[cfg(test)]
 mod tests {
     use super::*;
@@ -1421,40 +1628,31 @@ mod tests {
 
         assert_eq!(index.doc_count(), 3);
 
-        // "hello" appears in docs 0 and 2
         let entries = index.search("hello");
-        assert!(!entries.is_empty());
         assert_eq!(entries.len(), 2);
 
-        // "world" appears in docs 0 and 1
         let entries = index.search("world");
-        assert!(!entries.is_empty());
         assert_eq!(entries.len(), 2);
 
-        // "goodbye" appears only in doc 1 (row position 1)
         let entries = index.search("goodbye");
-        assert!(!entries.is_empty());
         assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 1);
 
-        // Non-existent term returns empty Vec
         let entries = index.search("nonexistent");
         assert!(entries.is_empty());
     }
 
     fn create_phrase_test_batch(schema: &ArrowSchema) -> RecordBatch {
-        // Note: The tokenizer filters stop words (the, and, very, etc.) and lowercases.
-        // Positions are assigned to non-filtered tokens only.
         RecordBatch::try_new(
             Arc::new(schema.clone()),
             vec![
                 Arc::new(Int32Array::from(vec![0, 1, 2, 3, 4])),
                 Arc::new(StringArray::from(vec![
-                    "alpha beta gamma",               // 0: alpha=0, beta=1, gamma=2
-                    "beta alpha gamma",               // 1: beta=0, alpha=1, gamma=2
-                    "alpha delta beta gamma",         // 2: alpha=0, delta=1, beta=2, gamma=3
-                    "alpha gamma",                    // 3: alpha=0, gamma=1
-                    "alpha delta epsilon beta gamma", // 4: alpha=0, delta=1, epsilon=2, beta=3, gamma=4
+                    "alpha beta gamma",
+                    "beta alpha gamma",
+                    "alpha delta beta gamma",
+                    "alpha gamma",
+                    "alpha delta epsilon beta gamma",
                 ])),
             ],
         )
@@ -1469,19 +1667,10 @@ mod tests {
         let batch = create_phrase_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Exact phrase "alpha beta" with slop=0 should match only doc 0
-        // Doc 0: "alpha beta gamma" - alpha=0, beta=1 (adjacent)
-        // Doc 2: "alpha delta beta gamma" - alpha=0, beta=2 (NOT adjacent, slop needed)
         let entries = index.search_phrase("alpha beta", 0);
-        assert_eq!(
-            entries.len(),
-            1,
-            "Expected 1 match for 'alpha beta', got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 0);
 
-        // "hello world" exact phrase
         let batch2 = create_test_batch(&schema);
         let index2 = FtsMemIndex::new(1, "description".to_string());
         index2.insert(&batch2, 0).unwrap();
@@ -1490,7 +1679,6 @@ mod tests {
         assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 0);
 
-        // "goodbye world" exact phrase
         let entries = index2.search_phrase("goodbye world", 0);
         assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 1);
@@ -1504,69 +1692,24 @@ mod tests {
         let batch = create_phrase_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Positions after tokenization (no stop words filtered):
-        // Doc 0: "alpha beta gamma" - alpha=0, beta=1, gamma=2
-        // Doc 2: "alpha delta beta gamma" - alpha=0, delta=1, beta=2, gamma=3
-        // Doc 4: "alpha delta epsilon beta gamma" - alpha=0, delta=1, epsilon=2, beta=3, gamma=4
-
-        // "alpha beta" with slop=0 should match only doc 0
-        // Doc 0: alpha=0, beta=1 (adjacent)
         let entries = index.search_phrase("alpha beta", 0);
-        assert_eq!(
-            entries.len(),
-            1,
-            "slop=0 matches: {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 0);
 
-        // "alpha beta" with slop=1 should match docs 0 and 2
-        // Doc 0: alpha=0, beta=1 (diff=1, within slop=1)
-        // Doc 2: alpha=0, beta=2 (diff=2, slop=1 allows pos 1-2)
-        // Doc 4: alpha=0, beta=3 (diff=3, slop=1 does NOT allow pos 3)
         let entries = index.search_phrase("alpha beta", 1);
-        assert_eq!(
-            entries.len(),
-            2,
-            "slop=1 matches: {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 2);
         let positions: Vec<_> = entries.iter().map(|e| e.row_position).collect();
         assert!(positions.contains(&0));
         assert!(positions.contains(&2));
 
-        // "alpha beta" with slop=2 should match docs 0, 2, and 4
         let entries = index.search_phrase("alpha beta", 2);
-        assert_eq!(
-            entries.len(),
-            3,
-            "slop=2 matches: {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 3);
 
-        // "alpha gamma" with slop=0 should match docs 1 and 3 (adjacent)
-        // Doc 1: "beta alpha gamma" - alpha=1, gamma=2 (adjacent)
-        // Doc 3: "alpha gamma" - alpha=0, gamma=1 (adjacent)
         let entries = index.search_phrase("alpha gamma", 0);
-        assert_eq!(
-            entries.len(),
-            2,
-            "alpha gamma slop=0: {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 2);
 
-        // "alpha gamma" with slop=1 should match docs 0, 1, 2, and 3
-        // Doc 0: alpha=0, gamma=2 (diff=2, slop=1 allows pos 1-2)
-        // Doc 1: alpha=1, gamma=2 (adjacent)
-        // Doc 2: alpha=0, gamma=3 (diff=3, slop=1 allows pos 1-2, gamma at 3 NOT in range)
-        // Doc 3: alpha=0, gamma=1 (adjacent)
         let entries = index.search_phrase("alpha gamma", 1);
-        assert_eq!(
-            entries.len(),
-            3,
-            "alpha gamma slop=1: {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 3);
     }
 
     #[test]
@@ -1577,21 +1720,16 @@ mod tests {
         let batch = create_phrase_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // "beta alpha" with slop=0 should not match in most docs (wrong order)
-        // Doc 1 has "beta alpha gamma" - beta=0, alpha=1, so "beta alpha" matches there!
         let entries = index.search_phrase("beta alpha", 0);
-        assert_eq!(entries.len(), 1); // matches doc 1
+        assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 1);
 
-        // Non-existent phrase
         let entries = index.search_phrase("nonexistent phrase", 0);
         assert!(entries.is_empty());
 
-        // Partial phrase not in any doc
         let entries = index.search_phrase("alpha hello", 0);
         assert!(entries.is_empty());
 
-        // "gamma alpha" should not match (wrong order in all docs)
         let entries = index.search_phrase("gamma alpha", 0);
         assert!(entries.is_empty());
     }
@@ -1604,10 +1742,8 @@ mod tests {
         let batch = create_phrase_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Single token phrase should behave like regular search
         let phrase_entries = index.search_phrase("alpha", 0);
         let search_entries = index.search("alpha");
-
         assert_eq!(phrase_entries.len(), search_entries.len());
     }
 
@@ -1619,20 +1755,11 @@ mod tests {
         let batch = create_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Empty phrase
         let entries = index.search_phrase("", 0);
         assert!(entries.is_empty());
     }
 
-    // ====== Boolean Query Tests ======
-
     fn create_boolean_test_batch(schema: &ArrowSchema) -> RecordBatch {
-        // Test documents for Boolean queries:
-        // Doc 0: "rust programming language"
-        // Doc 1: "python programming language"
-        // Doc 2: "rust web server"
-        // Doc 3: "python web framework"
-        // Doc 4: "javascript programming"
         RecordBatch::try_new(
             Arc::new(schema.clone()),
             vec![
@@ -1657,20 +1784,13 @@ mod tests {
         let batch = create_boolean_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // MUST: rust AND programming
-        // Should match doc 0 only ("rust programming language")
         let query = FtsQueryExpr::boolean()
             .must(FtsQueryExpr::match_query("rust"))
             .must(FtsQueryExpr::match_query("programming"))
             .build();
 
         let entries = index.search_query(&query);
-        assert_eq!(
-            entries.len(),
-            1,
-            "Expected 1 match for MUST(rust, programming), got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 0);
     }
 
@@ -1682,21 +1802,13 @@ mod tests {
         let batch = create_boolean_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // SHOULD: rust OR python
-        // Should match docs 0, 1, 2, 3 (all containing rust or python)
         let query = FtsQueryExpr::boolean()
             .should(FtsQueryExpr::match_query("rust"))
             .should(FtsQueryExpr::match_query("python"))
             .build();
 
         let entries = index.search_query(&query);
-        assert_eq!(
-            entries.len(),
-            4,
-            "Expected 4 matches for SHOULD(rust, python), got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
-
+        assert_eq!(entries.len(), 4);
         let positions: Vec<_> = entries.iter().map(|e| e.row_position).collect();
         assert!(positions.contains(&0));
         assert!(positions.contains(&1));
@@ -1712,18 +1824,12 @@ mod tests {
         let batch = create_boolean_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // MUST_NOT alone with no MUST or SHOULD returns empty
-        // (nothing to include, only exclusions)
         let query = FtsQueryExpr::boolean()
             .must_not(FtsQueryExpr::match_query("rust"))
             .build();
 
         let entries = index.search_query(&query);
-        assert!(
-            entries.is_empty(),
-            "MUST_NOT only should return empty, got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert!(entries.is_empty());
     }
 
     #[test]
@@ -1734,33 +1840,17 @@ mod tests {
         let batch = create_boolean_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // MUST: programming, SHOULD: rust
-        // Should match docs 0, 1, 4 (all with programming)
-        // Doc 0 should have higher score (also matches rust)
         let query = FtsQueryExpr::boolean()
             .must(FtsQueryExpr::match_query("programming"))
             .should(FtsQueryExpr::match_query("rust"))
             .build();
 
         let entries = index.search_query(&query);
-        assert_eq!(
-            entries.len(),
-            3,
-            "Expected 3 matches for MUST(programming) SHOULD(rust), got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 3);
 
-        // Find doc 0 and doc 1 scores
         let doc0 = entries.iter().find(|e| e.row_position == 0).unwrap();
         let doc1 = entries.iter().find(|e| e.row_position == 1).unwrap();
-
-        // Doc 0 has both programming and rust, should score higher than doc 1 (only programming)
-        assert!(
-            doc0.score > doc1.score,
-            "Doc 0 (rust+programming) should score higher than doc 1 (programming only). Doc0: {}, Doc1: {}",
-            doc0.score,
-            doc1.score
-        );
+        assert!(doc0.score > doc1.score);
     }
 
     #[test]
@@ -1771,25 +1861,18 @@ mod tests {
         let batch = create_boolean_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // MUST: programming, MUST_NOT: python
-        // Should match docs 0 and 4 (programming but not python)
         let query = FtsQueryExpr::boolean()
             .must(FtsQueryExpr::match_query("programming"))
             .must_not(FtsQueryExpr::match_query("python"))
             .build();
 
         let entries = index.search_query(&query);
-        assert_eq!(
-            entries.len(),
-            2,
-            "Expected 2 matches for MUST(programming) MUST_NOT(python), got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 2);
 
         let positions: Vec<_> = entries.iter().map(|e| e.row_position).collect();
-        assert!(positions.contains(&0)); // rust programming language
-        assert!(positions.contains(&4)); // javascript programming
-        assert!(!positions.contains(&1)); // python programming language - excluded
+        assert!(positions.contains(&0));
+        assert!(positions.contains(&4));
+        assert!(!positions.contains(&1));
     }
 
     #[test]
@@ -1800,10 +1883,6 @@ mod tests {
         let batch = create_boolean_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // MUST: web, SHOULD: rust, MUST_NOT: framework
-        // Docs with "web": 2 (rust web server), 3 (python web framework)
-        // After MUST_NOT framework: only doc 2
-        // Doc 2 also matches SHOULD(rust), so should have higher score
         let query = FtsQueryExpr::boolean()
             .must(FtsQueryExpr::match_query("web"))
             .should(FtsQueryExpr::match_query("rust"))
@@ -1811,12 +1890,7 @@ mod tests {
             .build();
 
         let entries = index.search_query(&query);
-        assert_eq!(
-            entries.len(),
-            1,
-            "Expected 1 match for MUST(web) SHOULD(rust) MUST_NOT(framework), got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 1);
         assert_eq!(entries[0].row_position, 2);
     }
 
@@ -1828,20 +1902,12 @@ mod tests {
         let batch = create_boolean_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // MUST: phrase("programming language")
-        // Should match docs 0 and 1
         let query = FtsQueryExpr::boolean()
             .must(FtsQueryExpr::phrase("programming language"))
             .build();
 
         let entries = index.search_query(&query);
-        assert_eq!(
-            entries.len(),
-            2,
-            "Expected 2 matches for MUST(phrase 'programming language'), got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
-
+        assert_eq!(entries.len(), 2);
         let positions: Vec<_> = entries.iter().map(|e| e.row_position).collect();
         assert!(positions.contains(&0));
         assert!(positions.contains(&1));
@@ -1855,7 +1921,6 @@ mod tests {
         let batch = create_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Test FtsQueryExpr::Match
         let query = FtsQueryExpr::match_query("hello");
         let entries = index.search_query(&query);
         assert_eq!(entries.len(), 2);
@@ -1869,7 +1934,6 @@ mod tests {
         let batch = create_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Test FtsQueryExpr::Phrase
         let query = FtsQueryExpr::phrase("hello world");
         let entries = index.search_query(&query);
         assert_eq!(entries.len(), 1);
@@ -1884,7 +1948,6 @@ mod tests {
         let batch = create_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Test boost
         let query_no_boost = FtsQueryExpr::match_query("hello");
         let query_with_boost = FtsQueryExpr::match_query("hello").with_boost(2.0);
 
@@ -1892,77 +1955,44 @@ mod tests {
         let entries_with_boost = index.search_query(&query_with_boost);
 
         assert_eq!(entries_no_boost.len(), entries_with_boost.len());
-
-        // Boosted scores should be 2x
-        for (e1, e2) in entries_no_boost.iter().zip(entries_with_boost.iter()) {
+        for e1 in &entries_no_boost {
+            let e2 = entries_with_boost
+                .iter()
+                .find(|e| e.row_position == e1.row_position)
+                .unwrap();
             let expected = e1.score * 2.0;
-            assert!(
-                (e2.score - expected).abs() < 0.001,
-                "Boosted score {} should be 2x original {}",
-                e2.score,
-                e1.score
-            );
+            assert!((e2.score - expected).abs() < 0.001);
         }
     }
 
-    // ====== Fuzzy Matching Tests ======
-
     #[test]
     fn test_levenshtein_distance() {
-        // Identical strings
         assert_eq!(levenshtein_distance("hello", "hello"), 0);
-
-        // Single character difference
-        assert_eq!(levenshtein_distance("hello", "hallo"), 1); // substitution
-        assert_eq!(levenshtein_distance("hello", "hell"), 1); // deletion
-        assert_eq!(levenshtein_distance("hello", "helloo"), 1); // insertion
-
-        // Two character differences
+        assert_eq!(levenshtein_distance("hello", "hallo"), 1);
+        assert_eq!(levenshtein_distance("hello", "hell"), 1);
+        assert_eq!(levenshtein_distance("hello", "helloo"), 1);
         assert_eq!(levenshtein_distance("hello", "hxllo"), 1);
         assert_eq!(levenshtein_distance("hello", "hxxlo"), 2);
-
-        // Completely different strings
         assert_eq!(levenshtein_distance("abc", "xyz"), 3);
-
-        // Empty strings
         assert_eq!(levenshtein_distance("", ""), 0);
         assert_eq!(levenshtein_distance("hello", ""), 5);
         assert_eq!(levenshtein_distance("", "hello"), 5);
-
-        // Case sensitivity
         assert_eq!(levenshtein_distance("Hello", "hello"), 1);
     }
 
     #[test]
     fn test_auto_fuzziness() {
-        // 0-2 chars: 0 fuzziness
         assert_eq!(auto_fuzziness(""), 0);
         assert_eq!(auto_fuzziness("a"), 0);
         assert_eq!(auto_fuzziness("ab"), 0);
-
-        // 3-5 chars: 1 fuzziness
         assert_eq!(auto_fuzziness("abc"), 1);
         assert_eq!(auto_fuzziness("abcd"), 1);
         assert_eq!(auto_fuzziness("abcde"), 1);
-
-        // 6+ chars: 2 fuzziness
         assert_eq!(auto_fuzziness("abcdef"), 2);
         assert_eq!(auto_fuzziness("programming"), 2);
     }
 
     fn create_fuzzy_test_batch(schema: &ArrowSchema) -> RecordBatch {
-        // Test documents for fuzzy matching.
-        // Note: The tokenizer stems words, so we use unstemmed single tokens
-        // for predictable fuzzy matching tests.
-        // Levenshtein distance examples:
-        // - "alpha" to "alpho" = 1 (substitution: a -> o)
-        // - "alpha" to "alphax" = 1 (insertion)
-        // - "alpha" to "alph" = 1 (deletion)
-        // Doc 0: "alpha beta gamma"
-        // Doc 1: "alpho beta delta" (typo: 'alpho' instead of 'alpha', distance=1)
-        // Doc 2: "alpha delta epsilon"
-        // Doc 3: "omega zeta"
-        // Doc 4: "alphax gamma" (typo: extra 'x', distance=1)
         RecordBatch::try_new(
             Arc::new(schema.clone()),
             vec![
@@ -1987,18 +2017,11 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Exact match with fuzziness=0: "alpha" exists in index
         let matches = index.expand_fuzzy("alpha", 0, 50);
-        assert_eq!(
-            matches.len(),
-            1,
-            "Expected 1 match for 'alpha', got {:?}",
-            matches
-        );
+        assert_eq!(matches.len(), 1);
         assert_eq!(matches[0].0, "alpha");
         assert_eq!(matches[0].1, 0);
 
-        // Non-existent term with fuzziness=0
         let matches = index.expand_fuzzy("nonexistent", 0, 50);
         assert!(matches.is_empty());
     }
@@ -2011,22 +2034,9 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // "alpho" (typo, substitution distance=1 from "alpha") should match "alpha"
         let matches = index.expand_fuzzy("alpho", 1, 50);
-        assert!(
-            matches
-                .iter()
-                .any(|(term, dist)| term == "alpha" && *dist == 1),
-            "Expected 'alpha' with distance 1, got {:?}",
-            matches
-        );
-
-        // Also matches itself since it's in the index
-        assert!(
-            matches.iter().any(|(term, _)| term == "alpho"),
-            "Expected 'alpho' in matches, got {:?}",
-            matches
-        );
+        assert!(matches.iter().any(|(t, d)| t == "alpha" && *d == 1));
+        assert!(matches.iter().any(|(t, _)| t == "alpho"));
     }
 
     #[test]
@@ -2037,13 +2047,8 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // With very high distance, should be limited by max_expansions
         let matches = index.expand_fuzzy("a", 10, 3);
-        assert!(
-            matches.len() <= 3,
-            "Expected at most 3 matches, got {}",
-            matches.len()
-        );
+        assert!(matches.len() <= 3);
     }
 
     #[test]
@@ -2054,17 +2059,8 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Search with typo "alpho" should match documents with "alpha" or "alpho"
         let entries = index.search_fuzzy("alpho", Some(1), 50);
-        assert!(!entries.is_empty(), "Expected matches for fuzzy 'alpho'");
-
-        // Should match docs with alpha (0, 2) and alpho (1)
-        let positions: Vec<_> = entries.iter().map(|e| e.row_position).collect();
-        assert!(
-            positions.contains(&0) || positions.contains(&1) || positions.contains(&2),
-            "Expected to match docs with alpha/alpho, got {:?}",
-            positions
-        );
+        assert!(!entries.is_empty());
     }
 
     #[test]
@@ -2075,9 +2071,8 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // "alpho" (5 chars) should get auto-fuzziness of 1
         let entries = index.search_fuzzy("alpho", None, 50);
-        assert!(!entries.is_empty(), "Expected matches with auto-fuzziness");
+        assert!(!entries.is_empty());
     }
 
     #[test]
@@ -2088,13 +2083,8 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Search for something completely different with low fuzziness
         let entries = index.search_fuzzy("xyz", Some(0), 50);
-        assert!(entries.is_empty(), "Expected no matches for 'xyz'");
-
-        // Even with fuzziness=1, "xyz" shouldn't match anything meaningful
-        // (this may or may not be empty depending on what 3-letter words are in the index)
-        let _ = index.search_fuzzy("xyz", Some(1), 50);
+        assert!(entries.is_empty());
     }
 
     #[test]
@@ -2105,13 +2095,9 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Test FtsQueryExpr::Fuzzy via search_query
         let query = FtsQueryExpr::fuzzy("alpho");
         let entries = index.search_query(&query);
-        assert!(
-            !entries.is_empty(),
-            "Expected matches for fuzzy query 'alpho'"
-        );
+        assert!(!entries.is_empty());
     }
 
     #[test]
@@ -2122,13 +2108,9 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Exact distance: "alpho" has distance 1 from "alpha"
         let query = FtsQueryExpr::fuzzy_with_distance("alpho", 1);
         let entries = index.search_query(&query);
-        assert!(
-            !entries.is_empty(),
-            "Expected matches for fuzzy query with distance 1"
-        );
+        assert!(!entries.is_empty());
     }
 
     #[test]
@@ -2144,22 +2126,15 @@ mod tests {
 
         let entries_no_boost = index.search_query(&query_no_boost);
         let entries_with_boost = index.search_query(&query_with_boost);
-
         assert_eq!(entries_no_boost.len(), entries_with_boost.len());
 
-        // Boosted scores should be 2x
         for e1 in &entries_no_boost {
             let e2 = entries_with_boost
                 .iter()
                 .find(|e| e.row_position == e1.row_position)
                 .unwrap();
             let expected = e1.score * 2.0;
-            assert!(
-                (e2.score - expected).abs() < 0.001,
-                "Boosted score {} should be 2x original {}",
-                e2.score,
-                e1.score
-            );
+            assert!((e2.score - expected).abs() < 0.001);
         }
     }
 
@@ -2171,48 +2146,19 @@ mod tests {
         let batch = create_fuzzy_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // MUST: fuzzy("alpho", distance=1), MUST_NOT: "delta"
-        // "alpho" matches "alpha" (distance=1) and itself
-        // Doc 0: "alpha beta gamma" - matches fuzzy alpho, no delta -> included
-        // Doc 1: "alpho beta delta" - matches fuzzy alpho, has delta -> excluded
-        // Doc 2: "alpha delta epsilon" - matches fuzzy alpho, has delta -> excluded
-        // Doc 4: "alphax gamma" - matches fuzzy alpho via alphax (dist=1 to alpho), no delta -> included
         let query = FtsQueryExpr::boolean()
             .must(FtsQueryExpr::fuzzy_with_distance("alpho", 1))
             .must_not(FtsQueryExpr::match_query("delta"))
             .build();
 
         let entries = index.search_query(&query);
-
-        // Should not contain docs 1 and 2 (have "delta")
         let positions: Vec<_> = entries.iter().map(|e| e.row_position).collect();
-        assert!(
-            !positions.contains(&1),
-            "Doc 1 should be excluded due to MUST_NOT, got {:?}",
-            positions
-        );
-        assert!(
-            !positions.contains(&2),
-            "Doc 2 should be excluded due to MUST_NOT, got {:?}",
-            positions
-        );
-        // Doc 0 should be included
-        assert!(
-            positions.contains(&0),
-            "Doc 0 should be included, got {:?}",
-            positions
-        );
+        assert!(!positions.contains(&1));
+        assert!(!positions.contains(&2));
+        assert!(positions.contains(&0));
     }
 
-    // ====== Boost Query Tests ======
-
     fn create_boost_test_batch(schema: &ArrowSchema) -> RecordBatch {
-        // Test documents for boost queries:
-        // Doc 0: "rust programming language" - matches rust, programming, language
-        // Doc 1: "python programming language" - matches python, programming, language
-        // Doc 2: "rust web server" - matches rust, web, server
-        // Doc 3: "python web framework" - matches python, web, framework
-        // Doc 4: "javascript programming" - matches javascript, programming
         RecordBatch::try_new(
             Arc::new(schema.clone()),
             vec![
@@ -2237,17 +2183,9 @@ mod tests {
         let batch = create_boost_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Boosting query with only positive component (same as regular query)
         let query = FtsQueryExpr::boosting(FtsQueryExpr::match_query("programming"));
         let entries = index.search_query(&query);
-
-        // Should match docs 0, 1, 4 (all with "programming")
-        assert_eq!(
-            entries.len(),
-            3,
-            "Expected 3 matches for 'programming', got {:?}",
-            entries.iter().map(|e| e.row_position).collect::<Vec<_>>()
-        );
+        assert_eq!(entries.len(), 3);
     }
 
     #[test]
@@ -2258,46 +2196,19 @@ mod tests {
         let batch = create_boost_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Boosting query: find "programming", demote docs with "python"
         let query = FtsQueryExpr::boosting_with_negative(
             FtsQueryExpr::match_query("programming"),
             FtsQueryExpr::match_query("python"),
-            0.5, // Demote python docs by half
+            0.5,
         );
         let entries = index.search_query(&query);
-
-        // Should still match docs 0, 1, 4 (all with "programming")
         assert_eq!(entries.len(), 3);
 
-        // Find scores for each doc
-        let doc0 = entries.iter().find(|e| e.row_position == 0); // rust programming
-        let doc1 = entries.iter().find(|e| e.row_position == 1); // python programming
-        let doc4 = entries.iter().find(|e| e.row_position == 4); // javascript programming
-
-        assert!(doc0.is_some() && doc1.is_some() && doc4.is_some());
-
-        // Doc 1 (python) should have lower score than doc 0 (rust) due to negative boost
-        // Doc 0 and doc 4 should have similar scores (neither match "python")
-        let score0 = doc0.unwrap().score;
-        let score1 = doc1.unwrap().score;
-        let score4 = doc4.unwrap().score;
-
-        // Doc 1 was demoted by 0.5, so it should have roughly half the score
-        assert!(
-            score1 < score0,
-            "Doc 1 (python) should have lower score than doc 0 (rust). Doc0: {}, Doc1: {}",
-            score0,
-            score1
-        );
-
-        // Doc 0 and doc 4 should have similar scores (both not demoted)
-        // They may differ slightly due to BM25 scoring differences, but doc 1 should be lower
-        assert!(
-            score1 < score4,
-            "Doc 1 (python) should have lower score than doc 4 (javascript). Doc1: {}, Doc4: {}",
-            score1,
-            score4
-        );
+        let doc0 = entries.iter().find(|e| e.row_position == 0).unwrap();
+        let doc1 = entries.iter().find(|e| e.row_position == 1).unwrap();
+        let doc4 = entries.iter().find(|e| e.row_position == 4).unwrap();
+        assert!(doc1.score < doc0.score);
+        assert!(doc1.score < doc4.score);
     }
 
     #[test]
@@ -2308,59 +2219,32 @@ mod tests {
         let batch = create_boost_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Compare different negative boost factors
         let query_no_demote = FtsQueryExpr::boosting_with_negative(
             FtsQueryExpr::match_query("programming"),
             FtsQueryExpr::match_query("python"),
-            1.0, // No demotion
+            1.0,
         );
-
         let query_half_demote = FtsQueryExpr::boosting_with_negative(
             FtsQueryExpr::match_query("programming"),
             FtsQueryExpr::match_query("python"),
-            0.5, // Half score for python
+            0.5,
         );
-
         let query_zero_demote = FtsQueryExpr::boosting_with_negative(
             FtsQueryExpr::match_query("programming"),
             FtsQueryExpr::match_query("python"),
-            0.0, // Zero score for python
+            0.0,
         );
 
-        let results_no_demote = index.search_query(&query_no_demote);
-        let results_half_demote = index.search_query(&query_half_demote);
-        let results_zero_demote = index.search_query(&query_zero_demote);
-
-        // Get doc 1 (python programming) scores
-        let score_no_demote = results_no_demote
-            .iter()
-            .find(|e| e.row_position == 1)
-            .unwrap()
-            .score;
-        let score_half_demote = results_half_demote
-            .iter()
-            .find(|e| e.row_position == 1)
-            .unwrap()
-            .score;
-        let score_zero_demote = results_zero_demote
-            .iter()
-            .find(|e| e.row_position == 1)
-            .unwrap()
-            .score;
+        let r_no = index.search_query(&query_no_demote);
+        let r_half = index.search_query(&query_half_demote);
+        let r_zero = index.search_query(&query_zero_demote);
 
-        // Verify demotion factors are applied correctly
-        assert!(
-            (score_half_demote - score_no_demote * 0.5).abs() < 0.001,
-            "Half demotion should give half score. Expected {}, got {}",
-            score_no_demote * 0.5,
-            score_half_demote
-        );
+        let s_no = r_no.iter().find(|e| e.row_position == 1).unwrap().score;
+        let s_half = r_half.iter().find(|e| e.row_position == 1).unwrap().score;
+        let s_zero = r_zero.iter().find(|e| e.row_position == 1).unwrap().score;
 
-        assert!(
-            score_zero_demote.abs() < 0.001,
-            "Zero demotion should give zero score, got {}",
-            score_zero_demote
-        );
+        assert!((s_half - s_no * 0.5).abs() < 0.001);
+        assert!(s_zero.abs() < 0.001);
     }
 
     #[test]
@@ -2371,92 +2255,24 @@ mod tests {
         let batch = create_boost_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Boosting query where negative doesn't match any positive results
         let query = FtsQueryExpr::boosting_with_negative(
-            FtsQueryExpr::match_query("rust"),   // Matches docs 0, 2
-            FtsQueryExpr::match_query("python"), // Matches docs 1, 3 (no overlap!)
+            FtsQueryExpr::match_query("rust"),
+            FtsQueryExpr::match_query("python"),
             0.1,
         );
-
         let entries = index.search_query(&query);
-
-        // Should match docs 0, 2 (rust docs)
         assert_eq!(entries.len(), 2);
 
-        // Scores should not be demoted (no overlap with python)
-        let query_baseline = FtsQueryExpr::match_query("rust");
-        let baseline_entries = index.search_query(&query_baseline);
-
+        let baseline = index.search_query(&FtsQueryExpr::match_query("rust"));
         for entry in &entries {
-            let baseline = baseline_entries
+            let b = baseline
                 .iter()
                 .find(|e| e.row_position == entry.row_position)
                 .unwrap();
-            assert!(
-                (entry.score - baseline.score).abs() < 0.001,
-                "Scores should match when no negative overlap. Got {} vs {}",
-                entry.score,
-                baseline.score
-            );
-        }
-    }
-
-    #[test]
-    fn test_boost_query_nested() {
-        let schema = create_test_schema();
-        let index = FtsMemIndex::new(1, "description".to_string());
-
-        let batch = create_boost_test_batch(&schema);
-        index.insert(&batch, 0).unwrap();
-
-        // Nested boost: positive is a Boolean query
-        let positive_query = FtsQueryExpr::boolean()
-            .should(FtsQueryExpr::match_query("programming"))
-            .should(FtsQueryExpr::match_query("web"))
-            .build();
-
-        let query = FtsQueryExpr::boosting_with_negative(
-            positive_query,
-            FtsQueryExpr::match_query("python"),
-            0.5,
-        );
-
-        let entries = index.search_query(&query);
-
-        // Should match docs 0, 1, 2, 3, 4 (programming or web)
-        assert!(entries.len() >= 4, "Should match multiple docs");
-
-        // Python docs (1, 3) should be demoted
-        let python_docs: Vec<_> = entries
-            .iter()
-            .filter(|e| e.row_position == 1 || e.row_position == 3)
-            .collect();
-
-        let non_python_docs: Vec<_> = entries
-            .iter()
-            .filter(|e| e.row_position != 1 && e.row_position != 3)
-            .collect();
-
-        // At least some python docs should have lower scores
-        if !python_docs.is_empty() && !non_python_docs.is_empty() {
-            let max_python_score = python_docs.iter().map(|e| e.score).fold(0.0f32, f32::max);
-            let max_non_python_score = non_python_docs
-                .iter()
-                .map(|e| e.score)
-                .fold(0.0f32, f32::max);
-
-            // This is a soft check - depends on BM25 scoring details
-            // Just verify the demotion is happening
-            assert!(
-                python_docs.iter().any(|e| e.score < max_non_python_score)
-                    || max_python_score <= max_non_python_score,
-                "Python docs should generally have lower scores"
-            );
+            assert!((entry.score - b.score).abs() < 0.001);
         }
     }
 
-    // ====== WAND Factor / Search Options Tests ======
-
     #[test]
     fn test_search_options_default() {
         let options = SearchOptions::default();
@@ -2467,28 +2283,19 @@ mod tests {
     #[test]
     fn test_search_options_builder() {
         let options = SearchOptions::new().with_wand_factor(0.5).with_limit(10);
-
         assert_eq!(options.wand_factor, 0.5);
         assert_eq!(options.limit, Some(10));
     }
 
     #[test]
     fn test_search_options_wand_factor_clamped() {
-        // wand_factor should be clamped to [0.0, 1.0]
         let options = SearchOptions::new().with_wand_factor(2.0);
         assert_eq!(options.wand_factor, 1.0);
-
         let options = SearchOptions::new().with_wand_factor(-0.5);
         assert_eq!(options.wand_factor, 0.0);
     }
 
     fn create_wand_test_batch(schema: &ArrowSchema) -> RecordBatch {
-        // Test documents with varying relevance:
-        // Doc 0: "alpha alpha alpha beta" - high relevance for "alpha" (3 occurrences)
-        // Doc 1: "alpha beta gamma" - medium relevance for "alpha" (1 occurrence)
-        // Doc 2: "beta gamma delta" - no relevance for "alpha"
-        // Doc 3: "alpha alpha" - medium-high relevance for "alpha" (2 occurrences, shorter doc)
-        // Doc 4: "alpha" - some relevance for "alpha" (1 occurrence, very short doc)
         RecordBatch::try_new(
             Arc::new(schema.clone()),
             vec![
@@ -2514,20 +2321,10 @@ mod tests {
         index.insert(&batch, 0).unwrap();
 
         let query = FtsQueryExpr::match_query("alpha");
-
-        // Full recall (wand_factor = 1.0)
-        let options = SearchOptions::default();
-        let results = index.search_with_options(&query, options);
-
-        // Should return all docs containing "alpha" (docs 0, 1, 3, 4)
-        assert_eq!(results.len(), 4, "Expected 4 matches with full recall");
-
-        // Results should be sorted by score descending
+        let results = index.search_with_options(&query, SearchOptions::default());
+        assert_eq!(results.len(), 4);
         for i in 1..results.len() {
-            assert!(
-                results[i - 1].score >= results[i].score,
-                "Results should be sorted by score descending"
-            );
+            assert!(results[i - 1].score >= results[i].score);
         }
     }
 
@@ -2540,26 +2337,14 @@ mod tests {
         index.insert(&batch, 0).unwrap();
 
         let query = FtsQueryExpr::match_query("alpha");
-
-        // Limit to top 2 results
         let options = SearchOptions::new().with_limit(2);
         let results = index.search_with_options(&query, options);
+        assert_eq!(results.len(), 2);
 
-        assert_eq!(results.len(), 2, "Expected 2 matches with limit=2");
-
-        // Should be the top 2 by score
-        let full_results = index.search_query(&query);
-        let mut full_sorted = full_results;
-        full_sorted.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
-
-        assert_eq!(
-            results[0].row_position, full_sorted[0].row_position,
-            "First result should be highest scorer"
-        );
-        assert_eq!(
-            results[1].row_position, full_sorted[1].row_position,
-            "Second result should be second highest scorer"
-        );
+        let mut full = index.search_query(&query);
+        full.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
+        assert_eq!(results[0].row_position, full[0].row_position);
+        assert_eq!(results[1].row_position, full[1].row_position);
     }
 
     #[test]
@@ -2571,37 +2356,18 @@ mod tests {
         index.insert(&batch, 0).unwrap();
 
         let query = FtsQueryExpr::match_query("alpha");
+        let mut full = index.search_query(&query);
+        full.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
 
-        // Get full results first to understand the score distribution
-        let full_results = index.search_query(&query);
-        let mut full_sorted = full_results.clone();
-        full_sorted.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
-
-        // With wand_factor = 0.0, should only keep results at or above threshold (max_score * 0.0 = 0)
-        // Actually with wand_factor = 0.0, threshold = max_score * 0.0 = 0, so all positive scores pass
-        // The real test is to use a higher wand_factor like 0.5
         let options = SearchOptions::new().with_wand_factor(0.5);
         let results = index.search_with_options(&query, options);
 
-        // Results should be pruned based on threshold
         if !results.is_empty() {
-            let max_score = full_sorted[0].score;
+            let max_score = full[0].score;
             let threshold = max_score * 0.5;
-
             for result in &results {
-                assert!(
-                    result.score >= threshold - 0.001, // small epsilon for float comparison
-                    "With wand_factor=0.5, all results should score >= {} but got {}",
-                    threshold,
-                    result.score
-                );
+                assert!(result.score >= threshold - 0.001);
             }
-
-            // Should have fewer or equal results compared to full results
-            assert!(
-                results.len() <= full_results.len(),
-                "Pruned results should not exceed full results"
-            );
         }
     }
 
@@ -2614,22 +2380,9 @@ mod tests {
         index.insert(&batch, 0).unwrap();
 
         let query = FtsQueryExpr::match_query("alpha");
-
-        // Get full results to understand score distribution
-        let full_results = index.search_query(&query);
-        assert!(
-            full_results.len() >= 3,
-            "Need at least 3 results for this test"
-        );
-
-        // With limit=2 and wand_factor=0.5, prune docs scoring below 50% of 2nd best
         let options = SearchOptions::new().with_limit(2).with_wand_factor(0.5);
         let results = index.search_with_options(&query, options);
-
-        // Should have at most 2 results (the limit)
-        assert!(results.len() <= 2, "Should not exceed limit");
-
-        // Results should be sorted by score
+        assert!(results.len() <= 2);
         if results.len() > 1 {
             assert!(results[0].score >= results[1].score);
         }
@@ -2643,15 +2396,10 @@ mod tests {
         let batch = create_wand_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Query for something that doesn't exist
         let query = FtsQueryExpr::match_query("nonexistent");
         let options = SearchOptions::new().with_limit(10).with_wand_factor(0.5);
         let results = index.search_with_options(&query, options);
-
-        assert!(
-            results.is_empty(),
-            "Should return empty for non-matching query"
-        );
+        assert!(results.is_empty());
     }
 
     #[test]
@@ -2662,7 +2410,6 @@ mod tests {
         let batch = create_wand_test_batch(&schema);
         index.insert(&batch, 0).unwrap();
 
-        // Boolean query: alpha SHOULD beta
         let query = FtsQueryExpr::boolean()
             .should(FtsQueryExpr::match_query("alpha"))
             .should(FtsQueryExpr::match_query("beta"))
@@ -2670,11 +2417,185 @@ mod tests {
 
         let options = SearchOptions::new().with_limit(3);
         let results = index.search_with_options(&query, options);
-
-        assert!(results.len() <= 3, "Should not exceed limit");
-        // Results should be sorted by score descending
+        assert!(results.len() <= 3);
         for i in 1..results.len() {
             assert!(results[i - 1].score >= results[i].score);
         }
     }
+
+    // ====== New tests for SWMR semantics, Utf8View, memory accounting ======
+
+    #[test]
+    fn test_utf8view_insert_and_search() {
+        // Mirror of test_fts_index_insert_and_search but with a Utf8View column.
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("description", DataType::Utf8View, true),
+        ]));
+        let index = FtsMemIndex::new(1, "description".to_string());
+
+        let view =
+            arrow_array::StringViewArray::from(vec!["hello world", "goodbye world", "hello again"]);
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![Arc::new(Int32Array::from(vec![0, 1, 2])), Arc::new(view)],
+        )
+        .unwrap();
+        index.insert(&batch, 0).unwrap();
+
+        assert_eq!(index.doc_count(), 3);
+        assert_eq!(index.search("hello").len(), 2);
+        assert_eq!(index.search("world").len(), 2);
+        assert_eq!(index.search("goodbye").len(), 1);
+    }
+
+    #[test]
+    fn test_memory_usage_grows_with_inserts() {
+        let schema = create_test_schema();
+        let index = FtsMemIndex::new(1, "description".to_string());
+
+        let empty = index.memory_usage();
+        index.insert(&create_test_batch(&schema), 0).unwrap();
+        let after_one = index.memory_usage();
+        index
+            .insert(&create_phrase_test_batch(&schema), 100)
+            .unwrap();
+        let after_two = index.memory_usage();
+
+        assert!(after_one > empty, "memory should grow after first insert");
+        assert!(
+            after_two > after_one,
+            "memory should grow after second insert"
+        );
+    }
+
+    #[test]
+    fn test_partial_doc_never_visible_phrase() {
+        // A phrase query inside a single document must either match fully
+        // (both phrase tokens present) or not match at all — readers must
+        // never observe a half-inserted doc that contains only one of the
+        // phrase tokens. Since `search_phrase` only returns rows where
+        // every token's position constraint holds, any returned entry
+        // implicitly proves both tokens were visible together.
+        use std::sync::Arc;
+        let schema = create_test_schema();
+        let index = Arc::new(FtsMemIndex::new(1, "description".to_string()));
+
+        let stop = Arc::new(std::sync::atomic::AtomicBool::new(false));
+        let mut readers = Vec::new();
+        for _ in 0..4 {
+            let idx = index.clone();
+            let stop = stop.clone();
+            readers.push(std::thread::spawn(move || {
+                while !stop.load(Ordering::Relaxed) {
+                    let entries = idx.search_phrase("hello world", 0);
+                    for e in &entries {
+                        // "hello world" appears as the doc at offset 0 of
+                        // every inserted batch, whose row_offset is a
+                        // multiple of 100 — so any matched row_position
+                        // must be a multiple of 100 (the row_offset itself,
+                        // not offset+1 or offset+2).
+                        assert_eq!(
+                            e.row_position % 100,
+                            0,
+                            "phrase 'hello world' should only match the row_offset row of each batch, but got row_position {}",
+                            e.row_position
+                        );
+                        assert!(e.score.is_finite() && e.score >= 0.0);
+                    }
+                }
+            }));
+        }
+
+        for i in 0..50 {
+            let batch = create_test_batch(&schema);
+            index.insert(&batch, (i * 100) as u64).unwrap();
+        }
+        std::thread::sleep(std::time::Duration::from_millis(20));
+        stop.store(true, Ordering::Relaxed);
+
+        for r in readers {
+            r.join().unwrap();
+        }
+    }
+
+    #[test]
+    fn test_swmr_visibility_torture() {
+        use std::sync::Arc;
+        use std::sync::atomic::{AtomicBool, Ordering};
+        let schema = create_test_schema();
+        let index = Arc::new(FtsMemIndex::new(1, "description".to_string()));
+        let stop = Arc::new(AtomicBool::new(false));
+
+        // 8 reader threads issuing match queries.
+        let mut readers = Vec::new();
+        for _ in 0..8 {
+            let idx = index.clone();
+            let stop = stop.clone();
+            readers.push(std::thread::spawn(move || {
+                while !stop.load(Ordering::Relaxed) {
+                    let r = idx.search("hello");
+                    // BM25 scores must be finite and non-negative.
+                    for e in &r {
+                        assert!(e.score.is_finite());
+                        assert!(e.score >= 0.0);
+                    }
+                }
+            }));
+        }
+
+        // One writer thread.
+        let writer = {
+            let idx = index.clone();
+            std::thread::spawn(move || {
+                for i in 0..200 {
+                    let batch = create_test_batch(&schema);
+                    idx.insert(&batch, (i * 100) as u64).unwrap();
+                }
+            })
+        };
+
+        writer.join().unwrap();
+        stop.store(true, Ordering::Relaxed);
+        for r in readers {
+            r.join().unwrap();
+        }
+
+        // After all inserts, doc_count must be 200 batches × 3 rows.
+        assert_eq!(index.doc_count(), 600);
+    }
+
+    #[test]
+    fn test_to_index_builder_reversed_smoke() {
+        // Ensure flush works on a minimal input.
+        let schema = create_test_schema();
+        let index = FtsMemIndex::new(1, "description".to_string());
+        let batch = create_test_batch(&schema);
+        index.insert(&batch, 0).unwrap();
+
+        let builder = index.to_index_builder_reversed(42, 3).unwrap();
+        // The builder can be consumed by callers; we just check it built.
+        assert!(builder.id() > 0 || builder.id() == 42);
+    }
+
+    #[test]
+    fn test_unsupported_column_type_errors() {
+        let schema = Arc::new(ArrowSchema::new(vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("description", DataType::Int32, true),
+        ]));
+        let index = FtsMemIndex::new(1, "description".to_string());
+
+        let batch = RecordBatch::try_new(
+            schema,
+            vec![
+                Arc::new(Int32Array::from(vec![0, 1])),
+                Arc::new(Int32Array::from(vec![1, 2])),
+            ],
+        )
+        .unwrap();
+
+        let err = index.insert(&batch, 0).unwrap_err();
+        assert!(err.to_string().contains("only supports"), "{err}");
+    }
 }

From e604d84c97b55fc7b5a0f9b887d5cb793fd773c8 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 10 May 2026 01:52:23 -0700
Subject: [PATCH 02/37] fix(mem_wal): thread snapshot through compound FTS
 queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Boolean and Boost queries called search_query recursively for each
sub-clause, and every leaf (search / search_phrase / search_fuzzy)
took its own self.snapshot.load_full(). A writer publishing between
sub-queries left the compound result mixing BM25 stats from different
snapshots — n, avgdl, and df disagreed across leaves of the same
query, so the summed score wasn't valid for any point in time.

Snapshot once at search_query / search_with_options and pass the
Arc<Snapshot> through every leaf and every recursive call. Public
search* entry points keep their signatures and snapshot internally.

Also: filter entry_count() by visibility; correct stale doc-comments
on Snapshot::batches, TermChunk::positions, and Snapshot::batch_for;
hoist test-body 'use std::sync::Arc;' to the test module's top.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 137 ++++++++++++++------
 1 file changed, 98 insertions(+), 39 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index 5b924617bd6..739ab21de2a 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -378,9 +378,11 @@ struct TermChunk {
     batch_position: usize,
     row_positions: Vec<u64>,
     frequencies: Vec<u32>,
-    /// Present iff `params.has_positions()` was true at construction time.
-    /// Independent of that, in-memory phrase queries always work when
-    /// positions were tracked at insert time.
+    /// Per-doc token positions. Always `Some` in the current
+    /// implementation (the writer tracks positions unconditionally so
+    /// in-memory phrase queries work even when `params.has_positions()`
+    /// is false). Kept as `Option` for forward compatibility with a
+    /// future "no-positions" mode that drops the allocation.
     positions: Option<Positions>,
 }
 
@@ -456,11 +458,11 @@ impl BatchMeta {
 struct Snapshot {
     /// Number of batches visible to readers. `0` means empty index.
     visible_count: usize,
-    /// Visible-batch metadata. `batches.len() >= visible_count`; the writer
-    /// may have already appended a still-invisible batch to this list, but
-    /// readers walk only `batches[0..visible_count]`. In the publication
-    /// order described in the module docs, readers never observe
-    /// `visible_count` exceeding the actual length.
+    /// Visible-batch metadata, written to in publish order. Always
+    /// `batches.len() == visible_count` for any snapshot the writer has
+    /// stored (each `publish_batch` appends a single entry and bumps
+    /// `visible_count` by one). The slice is kept exactly the visible
+    /// length so readers can iterate it directly without re-bounding.
     batches: Arc<[Arc<BatchMeta>]>,
     /// `Σ batches[i].rows` for `i < visible_count`.
     cumulative_doc_count: u64,
@@ -479,8 +481,12 @@ impl Snapshot {
     }
 
     fn batch_for(&self, batch_position: usize) -> Option<&Arc<BatchMeta>> {
-        // Visible batches are densely numbered starting at 0 in the order
-        // they were inserted. Use direct index when possible.
+        // The fast path assumes batch_position equals the index in
+        // `batches` (true when callers use the no-arg `insert()` and let
+        // the index assign sequential positions). When callers pass
+        // explicit positions to `insert_with_batch_position`, those
+        // positions can be sparse / out of order, so we fall back to a
+        // linear search through visible batches.
         self.batches
             .get(batch_position)
             .filter(|m| m.batch_position == batch_position)
@@ -660,17 +666,22 @@ impl FtsMemIndex {
         self.snapshot.load().visible_count == 0
     }
 
-    /// Total number of (term, doc) postings currently stored.
+    /// Total number of visible (term, doc) postings.
     ///
-    /// Counts every chunk regardless of visibility — the writer's invariant
-    /// is that chunks become visible together with the snapshot bump, so
-    /// in steady state this equals the number of visible postings.
+    /// Sums posting counts only over chunks whose batch is visible per the
+    /// current snapshot, so this matches what readers can actually walk.
     pub fn entry_count(&self) -> usize {
+        let visible = self.snapshot.load().visible_count;
         self.terms
             .iter()
             .map(|e| {
                 let slice = e.value().load();
-                slice.chunks.iter().map(|c| c.doc_count()).sum::<usize>()
+                slice
+                    .chunks
+                    .iter()
+                    .filter(|c| c.batch_position < visible)
+                    .map(|c| c.doc_count())
+                    .sum::<usize>()
             })
             .sum()
     }
@@ -849,19 +860,30 @@ impl FtsMemIndex {
     /// use `search_with_options` for sorted/limited output.
     pub fn search(&self, term: &str) -> Vec<FtsEntry> {
         let snap = self.snapshot.load_full();
+        self.search_with_snapshot(term, &snap)
+    }
+
+    fn search_with_snapshot(&self, term: &str, snap: &Arc<Snapshot>) -> Vec<FtsEntry> {
         if snap.visible_count == 0 {
             return Vec::new();
         }
-
         let tokens = self.tokenize_for_search(term);
-
-        score_terms(&snap, &self.terms, &tokens)
+        score_terms(snap, &self.terms, &tokens)
     }
 
     /// Search for documents containing an exact phrase, optionally allowing
     /// `slop` intervening tokens between consecutive query tokens.
     pub fn search_phrase(&self, phrase: &str, slop: u32) -> Vec<FtsEntry> {
         let snap = self.snapshot.load_full();
+        self.search_phrase_with_snapshot(phrase, slop, &snap)
+    }
+
+    fn search_phrase_with_snapshot(
+        &self,
+        phrase: &str,
+        slop: u32,
+        snap: &Arc<Snapshot>,
+    ) -> Vec<FtsEntry> {
         if snap.visible_count == 0 {
             return Vec::new();
         }
@@ -873,7 +895,7 @@ impl FtsMemIndex {
         if tokens.len() == 1 {
             // Same shortcut as the previous implementation: a single-token
             // phrase reduces to a regular term search.
-            return score_terms(&snap, &self.terms, &tokens);
+            return score_terms(snap, &self.terms, &tokens);
         }
 
         // Gather visible chunks per token.
@@ -961,7 +983,7 @@ impl FtsMemIndex {
                     continue;
                 }
 
-                let dl = lookup_dl(&snap, row_position).unwrap_or(1) as f32;
+                let dl = lookup_dl(snap, row_position).unwrap_or(1) as f32;
                 let mut score = 0.0f32;
                 for (ti, _tok) in tokens.iter().enumerate() {
                     let tf = frequencies[ti] as f32;
@@ -987,7 +1009,17 @@ impl FtsMemIndex {
         max_distance: u32,
         max_expansions: usize,
     ) -> Vec<(String, u32)> {
-        let snap = self.snapshot.load();
+        let snap = self.snapshot.load_full();
+        self.expand_fuzzy_with_snapshot(term, max_distance, max_expansions, &snap)
+    }
+
+    fn expand_fuzzy_with_snapshot(
+        &self,
+        term: &str,
+        max_distance: u32,
+        max_expansions: usize,
+        snap: &Arc<Snapshot>,
+    ) -> Vec<(String, u32)> {
         let mut matches: Vec<(String, u32)> = Vec::new();
 
         if max_distance == 0 {
@@ -1022,6 +1054,16 @@ impl FtsMemIndex {
         max_expansions: usize,
     ) -> Vec<FtsEntry> {
         let snap = self.snapshot.load_full();
+        self.search_fuzzy_with_snapshot(query, fuzziness, max_expansions, &snap)
+    }
+
+    fn search_fuzzy_with_snapshot(
+        &self,
+        query: &str,
+        fuzziness: Option<u32>,
+        max_expansions: usize,
+        snap: &Arc<Snapshot>,
+    ) -> Vec<FtsEntry> {
         if snap.visible_count == 0 {
             return Vec::new();
         }
@@ -1034,7 +1076,8 @@ impl FtsMemIndex {
         let mut expanded: Vec<String> = Vec::new();
         for tok in &tokens {
             let max_dist = fuzziness.unwrap_or_else(|| auto_fuzziness(tok));
-            for (matched, _) in self.expand_fuzzy(tok, max_dist, max_expansions) {
+            for (matched, _) in self.expand_fuzzy_with_snapshot(tok, max_dist, max_expansions, snap)
+            {
                 expanded.push(matched);
             }
         }
@@ -1042,19 +1085,33 @@ impl FtsMemIndex {
             return Vec::new();
         }
 
-        score_terms(&snap, &self.terms, &expanded)
+        score_terms(snap, &self.terms, &expanded)
     }
 
     /// Execute a query expression and return matching documents with scores.
+    ///
+    /// Snapshots the index state once at entry so the entire compound
+    /// query — including every leaf invoked recursively from `Boolean` /
+    /// `Boost` — sees the same `Snapshot`. This preserves the per-batch
+    /// monotonic visibility contract for compound queries.
     pub fn search_query(&self, query: &FtsQueryExpr) -> Vec<FtsEntry> {
+        let snap = self.snapshot.load_full();
+        self.search_query_with_snapshot(query, &snap)
+    }
+
+    fn search_query_with_snapshot(
+        &self,
+        query: &FtsQueryExpr,
+        snap: &Arc<Snapshot>,
+    ) -> Vec<FtsEntry> {
         match query {
             FtsQueryExpr::Match { query, boost } => {
-                let mut results = self.search(query);
+                let mut results = self.search_with_snapshot(query, snap);
                 apply_boost(&mut results, *boost);
                 results
             }
             FtsQueryExpr::Phrase { query, slop, boost } => {
-                let mut results = self.search_phrase(query, *slop);
+                let mut results = self.search_phrase_with_snapshot(query, *slop, snap);
                 apply_boost(&mut results, *boost);
                 results
             }
@@ -1064,7 +1121,8 @@ impl FtsMemIndex {
                 max_expansions,
                 boost,
             } => {
-                let mut results = self.search_fuzzy(query, *fuzziness, *max_expansions);
+                let mut results =
+                    self.search_fuzzy_with_snapshot(query, *fuzziness, *max_expansions, snap);
                 apply_boost(&mut results, *boost);
                 results
             }
@@ -1072,12 +1130,12 @@ impl FtsMemIndex {
                 must,
                 should,
                 must_not,
-            } => self.search_boolean(must, should, must_not),
+            } => self.search_boolean(must, should, must_not, snap),
             FtsQueryExpr::Boost {
                 positive,
                 negative,
                 negative_boost,
-            } => self.search_boost(positive, negative.as_deref(), *negative_boost),
+            } => self.search_boost(positive, negative.as_deref(), *negative_boost, snap),
         }
     }
 
@@ -1087,7 +1145,8 @@ impl FtsMemIndex {
         query: &FtsQueryExpr,
         options: SearchOptions,
     ) -> Vec<FtsEntry> {
-        let mut results = self.search_query(query);
+        let snap = self.snapshot.load_full();
+        let mut results = self.search_query_with_snapshot(query, &snap);
         results.sort_by(|a, b| {
             b.score
                 .partial_cmp(&a.score)
@@ -1116,12 +1175,13 @@ impl FtsMemIndex {
         positive: &FtsQueryExpr,
         negative: Option<&FtsQueryExpr>,
         negative_boost: f32,
+        snap: &Arc<Snapshot>,
     ) -> Vec<FtsEntry> {
-        let mut results = self.search_query(positive);
+        let mut results = self.search_query_with_snapshot(positive, snap);
         let Some(neg) = negative else {
             return results;
         };
-        let negative_results = self.search_query(neg);
+        let negative_results = self.search_query_with_snapshot(neg, snap);
         let negative_set: HashSet<RowPosition> = negative_results
             .into_iter()
             .map(|e| e.row_position)
@@ -1139,29 +1199,30 @@ impl FtsMemIndex {
         must: &[FtsQueryExpr],
         should: &[FtsQueryExpr],
         must_not: &[FtsQueryExpr],
+        snap: &Arc<Snapshot>,
     ) -> Vec<FtsEntry> {
         let excluded: HashSet<RowPosition> = must_not
             .iter()
-            .flat_map(|q| self.search_query(q))
+            .flat_map(|q| self.search_query_with_snapshot(q, snap))
             .map(|e| e.row_position)
             .collect();
 
         let mut result_map: HashMap<RowPosition, f32> = if must.is_empty() {
             let mut map: HashMap<RowPosition, f32> = HashMap::new();
             for q in should {
-                for entry in self.search_query(q) {
+                for entry in self.search_query_with_snapshot(q, snap) {
                     *map.entry(entry.row_position).or_default() += entry.score;
                 }
             }
             map
         } else {
-            let first_results = self.search_query(&must[0]);
+            let first_results = self.search_query_with_snapshot(&must[0], snap);
             let mut map: HashMap<RowPosition, f32> = first_results
                 .into_iter()
                 .map(|e| (e.row_position, e.score))
                 .collect();
             for q in must.iter().skip(1) {
-                let results = self.search_query(q);
+                let results = self.search_query_with_snapshot(q, snap);
                 let result_set: HashMap<RowPosition, f32> = results
                     .into_iter()
                     .map(|e| (e.row_position, e.score))
@@ -1172,7 +1233,7 @@ impl FtsMemIndex {
                     .collect();
             }
             for q in should {
-                for entry in self.search_query(q) {
+                for entry in self.search_query_with_snapshot(q, snap) {
                     if let Some(score) = map.get_mut(&entry.row_position) {
                         *score += entry.score;
                     }
@@ -1595,6 +1656,7 @@ mod tests {
     use arrow_array::{Int32Array, StringArray};
     use arrow_schema::{DataType, Field, Schema as ArrowSchema};
     use std::sync::Arc;
+    use std::sync::atomic::AtomicBool;
 
     fn create_test_schema() -> Arc<ArrowSchema> {
         Arc::new(ArrowSchema::new(vec![
@@ -2477,7 +2539,6 @@ mod tests {
         // phrase tokens. Since `search_phrase` only returns rows where
         // every token's position constraint holds, any returned entry
         // implicitly proves both tokens were visible together.
-        use std::sync::Arc;
         let schema = create_test_schema();
         let index = Arc::new(FtsMemIndex::new(1, "description".to_string()));
 
@@ -2521,8 +2582,6 @@ mod tests {
 
     #[test]
     fn test_swmr_visibility_torture() {
-        use std::sync::Arc;
-        use std::sync::atomic::{AtomicBool, Ordering};
         let schema = create_test_schema();
         let index = Arc::new(FtsMemIndex::new(1, "description".to_string()));
         let stop = Arc::new(AtomicBool::new(false));

From 967813094b623d795c47e8b63788248f29f6805c Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 10 May 2026 11:13:21 -0700
Subject: [PATCH 03/37] bench: add fineweb FTS end-to-end benchmark
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New rust/lance/benches/mem_wal_fineweb_fts.rs covers three metrics
across the 12 configs in the design doc:

- write throughput at memtable sizes 100k / 500k / 1M
- MemTable FTS query latency (avg/p50/p95) over 100 high-frequency
  tokens + 50 sampled phrases
- consistency: |memtable_top10 ∩ post_flush_disk_top10| / |union| as a
  user-approved replacement for recall@k

The bench downloads HuggingFaceFW/fineweb sample/10BT shards, caches
them, and is fully env-driven so a single binary handles every config.
Driver script bench/run_fineweb_fts.sh loops the 12 configs, uploads
each result.json to S3, and prints a summary.

Also: make `dataset::mem_wal::index` public so the bench can call
`FtsMemIndex::search_with_options` directly to time the MemTable read
path.
---
 bench/run_fineweb_fts.sh                  | 121 +++
 rust/lance/Cargo.toml                     |   5 +-
 rust/lance/benches/mem_wal_fineweb_fts.rs | 952 ++++++++++++++++++++++
 rust/lance/src/dataset/mem_wal.rs         |   2 +-
 4 files changed, 1078 insertions(+), 2 deletions(-)
 create mode 100755 bench/run_fineweb_fts.sh
 create mode 100644 rust/lance/benches/mem_wal_fineweb_fts.rs

diff --git a/bench/run_fineweb_fts.sh b/bench/run_fineweb_fts.sh
new file mode 100755
index 00000000000..5709052e86e
--- /dev/null
+++ b/bench/run_fineweb_fts.sh
@@ -0,0 +1,121 @@
+#!/usr/bin/env bash
+# Driver for the fineweb FTS benchmark.
+#
+# Runs the 12 configs (3 memtable sizes × durable yes/no × FTS yes/no), saves
+# each result.json locally and to S3, and at the end prints a small summary.
+#
+# Usage:
+#   ./bench/run_fineweb_fts.sh [run_id]
+#
+# Env vars (optional):
+#   DATASET_PREFIX   default: s3://jack-devland-build/bench/mem-fts-fineweb
+#   BENCH_BASE_ROWS  default: 1000000
+#   BENCH_INGEST_ROWS default: 1000000
+#   BENCH_BATCH_SIZE default: 1000
+#   AWS_DEFAULT_REGION default: us-east-1
+
+set -euo pipefail
+
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+RUN_ID="${1:-$(date -u +%Y%m%dT%H%M%SZ)}"
+DATASET_PREFIX="${DATASET_PREFIX:-s3://jack-devland-build/bench/mem-fts-fineweb}"
+BENCH_BASE_ROWS="${BENCH_BASE_ROWS:-1000000}"
+BENCH_INGEST_ROWS="${BENCH_INGEST_ROWS:-1000000}"
+BENCH_BATCH_SIZE="${BENCH_BATCH_SIZE:-1000}"
+export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
+
+LOCAL_DIR="bench/results/${RUN_ID}"
+mkdir -p "$LOCAL_DIR"
+
+BIN="target/release/mem_wal_fineweb_fts"
+if [ ! -x "$BIN" ]; then
+    echo "building bench binary..."
+    cargo build --release -p lance --bench mem_wal_fineweb_fts
+    # criterion-style bench output goes to deps/; resolve it.
+    BIN="$(ls -t target/release/deps/mem_wal_fineweb_fts-* | grep -v '\.d$' | head -1)"
+fi
+echo "using bench binary: $BIN"
+
+CONFIGS=(
+    "100000 0 0"
+    "100000 0 1"
+    "100000 1 0"
+    "100000 1 1"
+    "500000 0 0"
+    "500000 0 1"
+    "500000 1 0"
+    "500000 1 1"
+    "1000000 0 0"
+    "1000000 0 1"
+    "1000000 1 0"
+    "1000000 1 1"
+)
+
+echo "=== Run $RUN_ID =="
+echo "  prefix: $DATASET_PREFIX"
+echo "  base_rows: $BENCH_BASE_ROWS  ingest_rows: $BENCH_INGEST_ROWS  batch_size: $BENCH_BATCH_SIZE"
+echo ""
+
+for cfg in "${CONFIGS[@]}"; do
+    read -r MT D F <<< "$cfg"
+    if [ "$MT" = "1000000" ]; then SZ="1M"; elif [ "$MT" = "500000" ]; then SZ="500k"; else SZ="100k"; fi
+    NAME="mt${SZ}_durable${D}_fts${F}"
+    OUT="$LOCAL_DIR/${NAME}.json"
+    LOG="$LOCAL_DIR/${NAME}.log"
+    echo ">>> $NAME"
+    if [ -f "$OUT" ]; then
+        echo "    result already exists, skipping"
+        continue
+    fi
+    set +e
+    BENCH_RUN_ID="$RUN_ID" \
+    DATASET_PREFIX="$DATASET_PREFIX" \
+    BENCH_MAX_MEMTABLE_ROWS="$MT" \
+    DURABLE_WRITE="$D" \
+    FTS_ENABLED="$F" \
+    BENCH_BASE_ROWS="$BENCH_BASE_ROWS" \
+    BENCH_INGEST_ROWS="$BENCH_INGEST_ROWS" \
+    BENCH_BATCH_SIZE="$BENCH_BATCH_SIZE" \
+    BENCH_CACHE_DIR="${BENCH_CACHE_DIR:-/mnt/data/fineweb}" \
+    RESULT_FILE="$OUT" \
+    "$BIN" --bench --nocapture 2>&1 | tee "$LOG"
+    RC=${PIPESTATUS[0]}
+    set -e
+    if [ "$RC" -ne 0 ]; then
+        echo "    !!! config failed (rc=$RC); see $LOG"
+    fi
+    # Upload to S3 alongside the dataset.
+    if [ -f "$OUT" ]; then
+        aws s3 cp "$OUT" "$DATASET_PREFIX/$RUN_ID/results/${NAME}.json" || true
+        aws s3 cp "$LOG" "$DATASET_PREFIX/$RUN_ID/results/${NAME}.log" || true
+    fi
+done
+
+echo ""
+echo "=== summary ==="
+python3 - <<PY
+import glob, json, os
+results = []
+for p in sorted(glob.glob(os.path.join("$LOCAL_DIR", "*.json"))):
+    try:
+        with open(p) as f: r = json.load(f)
+        results.append(r)
+    except Exception as e:
+        print(f"  failed to read {p}: {e}")
+
+print(f"{'config':30s} {'rows/s':>10} {'p95_ms':>7} {'mt_p95_ms':>10} {'cons_mean':>10}")
+for r in results:
+    name = r["config_name"]
+    tp = r["ingest"]["rows_per_sec"]
+    p95 = r["ingest"]["put_p95_ms"]
+    rd = r.get("read")
+    mt = rd["mt_latency_p95_ms"] if rd else 0
+    cm = rd["consistency_mean"] if rd else 0
+    print(f"{name:30s} {tp:>10.0f} {p95:>7.2f} {mt:>10.2f} {cm:>10.3f}")
+PY
+
+echo ""
+echo "Results:"
+echo "  local: $LOCAL_DIR"
+echo "  s3:    $DATASET_PREFIX/$RUN_ID/results/"
diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index e1e5a3bc823..19eb1aa9c8a 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -68,7 +68,6 @@ rayon.workspace = true
 futures.workspace = true
 uuid.workspace = true
 arrow.workspace = true
-arc-swap.workspace = true
 # TODO: use datafusion sub-modules to reduce build size?
 datafusion.workspace = true
 datafusion-functions.workspace = true
@@ -237,5 +236,9 @@ harness = false
 name = "mem_wal_shard_writer_backpressure"
 harness = false
 
+[[bench]]
+name = "mem_wal_fineweb_fts"
+harness = false
+
 [lints]
 workspace = true
diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
new file mode 100644
index 00000000000..066502b7a7a
--- /dev/null
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -0,0 +1,952 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! End-to-end benchmark for the SWMR FTS mem index using HuggingFace fineweb.
+//!
+//! For a single configuration, this binary:
+//!  1. Downloads (and caches) one fineweb sample shard.
+//!  2. Slices `BENCH_BASE_ROWS + BENCH_INGEST_ROWS` rows from it.
+//!  3. (Once per `BENCH_RUN_ID`) writes the base 1M-row Lance dataset to
+//!     `${DATASET_PREFIX}/${BENCH_RUN_ID}/base/`.
+//!  4. Ingests `BENCH_INGEST_ROWS` rows through `ShardWriter` with the
+//!     configured `max_memtable_rows`, `DURABLE_WRITE`, and `FTS_ENABLED`.
+//!     Records write throughput.
+//!  5. If FTS is enabled and `--with-read-test` is set: ingests
+//!     `max_memtable_rows` rows into a fresh dataset with auto-flush
+//!     disabled, queries the MemTable for 150 prebuilt queries (latency),
+//!     forces a flush, queries the on-disk FTS for the same queries, and
+//!     reports the top-10 set overlap (the user-approved "consistency"
+//!     proxy for recall).
+//!
+//! Writes a structured `result.json` to `${RESULT_FILE}`. All paths,
+//! credentials, and tunables are env-var driven so the same binary drives
+//! all 12 configs from a shell loop.
+//!
+//! See `~/ai/analysis/lance/jack-MemTableFTSBetter/fineweb-fts-bench/DESIGN.md`.
+
+#![recursion_limit = "256"]
+#![allow(clippy::print_stdout, clippy::print_stderr)]
+
+use std::collections::{BTreeMap, HashMap, HashSet};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use arrow_array::{Array, ArrayRef, Int64Array, RecordBatch, RecordBatchIterator, StringArray};
+use arrow_schema::{DataType, Field, Schema as ArrowSchema};
+use futures::TryStreamExt;
+use lance::dataset::mem_wal::index::{FtsQueryExpr, SearchOptions};
+use lance::dataset::mem_wal::write::ShardWriterConfig;
+use lance::dataset::mem_wal::{DatasetMemWalExt, MemWalConfig};
+use lance::dataset::{Dataset, WriteParams};
+use lance::index::DatasetIndexExt;
+use lance_index::IndexType;
+use lance_index::scalar::{FullTextSearchQuery, ScalarIndexParams};
+use lance_index::scalar::inverted::tokenizer::InvertedIndexParams;
+use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder;
+use serde::Serialize;
+use uuid::Uuid;
+
+const TEXT_COL: &str = "text";
+const FTS_INDEX_NAME: &str = "text_fts";
+const HF_API_LISTING: &str =
+    "https://huggingface.co/api/datasets/HuggingFaceFW/fineweb/tree/main/sample/10BT";
+const HF_FILE_BASE: &str = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/";
+
+// ----------------------------------------------------------------------
+// Configuration (env-driven)
+// ----------------------------------------------------------------------
+
+#[derive(Debug, Clone)]
+struct Config {
+    dataset_prefix: String,
+    run_id: String,
+    config_name: String,
+    max_memtable_rows: usize,
+    durable_write: bool,
+    fts_enabled: bool,
+    base_rows: usize,
+    ingest_rows: usize,
+    batch_size: usize,
+    cache_dir: std::path::PathBuf,
+    result_file: std::path::PathBuf,
+    /// When true, run the read-perf + consistency sub-test in addition to
+    /// the throughput test. Auto-disabled when `fts_enabled = false`.
+    with_read_test: bool,
+    /// How many high-frequency single-token queries to include.
+    num_token_queries: usize,
+    /// How many random 2-token phrase queries to include.
+    num_phrase_queries: usize,
+    /// Top-K used for read latency and consistency.
+    top_k: usize,
+}
+
+impl Config {
+    fn from_env() -> Self {
+        let dataset_prefix = std::env::var("DATASET_PREFIX")
+            .unwrap_or_else(|_| "/tmp/bench/mem_fts_fineweb".to_string());
+        let run_id = std::env::var("BENCH_RUN_ID").unwrap_or_else(|_| "dev".to_string());
+        let max_memtable_rows = env_usize("BENCH_MAX_MEMTABLE_ROWS", 100_000);
+        let durable_write = env_bool("DURABLE_WRITE", false);
+        let fts_enabled = env_bool("FTS_ENABLED", false);
+        let base_rows = env_usize("BENCH_BASE_ROWS", 1_000_000);
+        let ingest_rows = env_usize("BENCH_INGEST_ROWS", 1_000_000);
+        let batch_size = env_usize("BENCH_BATCH_SIZE", 1000);
+        let cache_dir = std::env::var("BENCH_CACHE_DIR")
+            .unwrap_or_else(|_| {
+                std::env::temp_dir()
+                    .join("mem_wal_fineweb_fts_cache")
+                    .to_string_lossy()
+                    .into_owned()
+            })
+            .into();
+        let result_file = std::env::var("RESULT_FILE")
+            .unwrap_or_else(|_| "result.json".to_string())
+            .into();
+        let with_read_test = env_bool("BENCH_WITH_READ_TEST", true) && fts_enabled;
+        let num_token_queries = env_usize("BENCH_NUM_TOKEN_QUERIES", 100);
+        let num_phrase_queries = env_usize("BENCH_NUM_PHRASE_QUERIES", 50);
+        let top_k = env_usize("BENCH_TOP_K", 10);
+
+        let config_name = format!(
+            "mt{}_durable{}_fts{}",
+            human_size(max_memtable_rows),
+            if durable_write { "1" } else { "0" },
+            if fts_enabled { "1" } else { "0" },
+        );
+
+        Self {
+            dataset_prefix,
+            run_id,
+            config_name,
+            max_memtable_rows,
+            durable_write,
+            fts_enabled,
+            base_rows,
+            ingest_rows,
+            batch_size,
+            cache_dir,
+            result_file,
+            with_read_test,
+            num_token_queries,
+            num_phrase_queries,
+            top_k,
+        }
+    }
+
+    #[allow(dead_code)]
+    fn base_uri(&self) -> String {
+        format!("{}/{}/base", self.dataset_prefix, self.run_id)
+    }
+
+    fn ingest_uri(&self) -> String {
+        format!(
+            "{}/{}/ingest_{}",
+            self.dataset_prefix, self.run_id, self.config_name
+        )
+    }
+
+    fn read_test_uri(&self) -> String {
+        format!(
+            "{}/{}/readtest_{}",
+            self.dataset_prefix, self.run_id, self.config_name
+        )
+    }
+}
+
+fn env_usize(key: &str, default: usize) -> usize {
+    std::env::var(key)
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .unwrap_or(default)
+}
+
+fn env_bool(key: &str, default: bool) -> bool {
+    match std::env::var(key).ok().as_deref() {
+        Some("1") | Some("yes") | Some("true") | Some("YES") | Some("TRUE") => true,
+        Some("0") | Some("no") | Some("false") | Some("NO") | Some("FALSE") => false,
+        _ => default,
+    }
+}
+
+fn human_size(n: usize) -> String {
+    if n % 1_000_000 == 0 {
+        format!("{}M", n / 1_000_000)
+    } else if n % 1_000 == 0 {
+        format!("{}k", n / 1_000)
+    } else {
+        n.to_string()
+    }
+}
+
+// ----------------------------------------------------------------------
+// HF fineweb shard loading
+// ----------------------------------------------------------------------
+
+#[derive(serde::Deserialize)]
+struct HfTreeEntry {
+    #[serde(rename = "type")]
+    kind: String,
+    path: String,
+}
+
+async fn list_shard_paths() -> lance_core::Result<Vec<String>> {
+    let entries: Vec<HfTreeEntry> = reqwest::get(HF_API_LISTING)
+        .await
+        .map_err(|e| lance_core::Error::io(format!("listing HTTP: {}", e)))?
+        .json()
+        .await
+        .map_err(|e| lance_core::Error::io(format!("listing JSON: {}", e)))?;
+    let mut shards: Vec<String> = entries
+        .into_iter()
+        .filter(|e| e.kind == "file" && e.path.ends_with(".parquet"))
+        .map(|e| e.path)
+        .collect();
+    shards.sort();
+    Ok(shards)
+}
+
+async fn download_shard(rel_path: &str, dest: &std::path::Path) -> lance_core::Result<()> {
+    if dest.exists() {
+        return Ok(());
+    }
+    let url = format!("{}{}", HF_FILE_BASE, rel_path);
+    let max_attempts = 5;
+    for attempt in 1..=max_attempts {
+        println!(
+            "downloading {} (attempt {}/{}) ...",
+            rel_path, attempt, max_attempts
+        );
+        let result: lance_core::Result<bytes::Bytes> = async {
+            let resp = reqwest::get(&url)
+                .await
+                .map_err(|e| lance_core::Error::io(format!("download HTTP: {}", e)))?;
+            if !resp.status().is_success() {
+                return Err(lance_core::Error::io(format!(
+                    "download {} → status {}",
+                    url,
+                    resp.status()
+                )));
+            }
+            resp.bytes()
+                .await
+                .map_err(|e| lance_core::Error::io(format!("read body: {}", e)))
+        }
+        .await;
+        match result {
+            Ok(bytes) => {
+                std::fs::write(dest, &bytes)
+                    .map_err(|e| lance_core::Error::io(format!("write: {}", e)))?;
+                println!(
+                    "  wrote {:.1} MB to {}",
+                    bytes.len() as f64 / 1024.0 / 1024.0,
+                    dest.display()
+                );
+                return Ok(());
+            }
+            Err(e) if attempt < max_attempts => {
+                let backoff = Duration::from_secs(2u64.pow(attempt as u32));
+                eprintln!(
+                    "  attempt {} failed: {}; retrying in {:?}",
+                    attempt, e, backoff
+                );
+                tokio::time::sleep(backoff).await;
+            }
+            Err(e) => return Err(e),
+        }
+    }
+    unreachable!()
+}
+
+async fn read_shard_text(
+    path: &std::path::Path,
+    out: &mut Vec<String>,
+    max_rows: usize,
+) -> lance_core::Result<usize> {
+    let file = tokio::fs::File::open(path)
+        .await
+        .map_err(|e| lance_core::Error::io(format!("open parquet: {}", e)))?;
+    let builder = ParquetRecordBatchStreamBuilder::new(file)
+        .await
+        .map_err(|e| lance_core::Error::io(format!("parquet builder: {}", e)))?;
+    let mut stream = builder
+        .build()
+        .map_err(|e| lance_core::Error::io(format!("parquet stream: {}", e)))?;
+
+    let mut taken = 0usize;
+    while taken < max_rows {
+        let Some(rb) = stream
+            .try_next()
+            .await
+            .map_err(|e| lance_core::Error::io(format!("parquet read: {}", e)))?
+        else {
+            break;
+        };
+        let col = rb
+            .column_by_name("text")
+            .ok_or_else(|| lance_core::Error::io("text column missing".to_string()))?;
+        let strs = col
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .ok_or_else(|| lance_core::Error::io("text column not StringArray".to_string()))?;
+        for i in 0..strs.len() {
+            if taken >= max_rows {
+                break;
+            }
+            if strs.is_null(i) {
+                continue;
+            }
+            out.push(strs.value(i).to_string());
+            taken += 1;
+        }
+    }
+    Ok(taken)
+}
+
+async fn load_corpus(
+    needed_rows: usize,
+    cache_dir: &std::path::Path,
+) -> lance_core::Result<Vec<String>> {
+    std::fs::create_dir_all(cache_dir)
+        .map_err(|e| lance_core::Error::io(format!("mkdir cache: {}", e)))?;
+    let shards = list_shard_paths().await?;
+    println!("fineweb sample/10BT has {} parquet shards", shards.len());
+
+    let mut buf: Vec<String> = Vec::with_capacity(needed_rows);
+    for rel_path in &shards {
+        if buf.len() >= needed_rows {
+            break;
+        }
+        let local_name = rel_path.rsplit('/').next().unwrap_or(rel_path);
+        let local = cache_dir.join(local_name);
+        download_shard(rel_path, &local).await?;
+        let want = needed_rows - buf.len();
+        let got = read_shard_text(&local, &mut buf, want).await?;
+        println!(
+            "  shard {} → {} text rows (cumulative {})",
+            local_name,
+            got,
+            buf.len()
+        );
+    }
+    if buf.len() < needed_rows {
+        eprintln!(
+            "  warning: dataset exhausted at {} rows (asked {})",
+            buf.len(),
+            needed_rows
+        );
+    }
+    Ok(buf)
+}
+
+// ----------------------------------------------------------------------
+// Schema + batch helpers
+// ----------------------------------------------------------------------
+
+fn make_schema() -> Arc<ArrowSchema> {
+    let mut id_meta = HashMap::new();
+    id_meta.insert(
+        "lance-schema:unenforced-primary-key".to_string(),
+        "true".to_string(),
+    );
+    let id = Field::new("id", DataType::Int64, false).with_metadata(id_meta);
+    Arc::new(ArrowSchema::new(vec![
+        id,
+        Field::new(TEXT_COL, DataType::Utf8, true),
+    ]))
+}
+
+fn make_batch(start_id: i64, texts: &[String], schema: Arc<ArrowSchema>) -> RecordBatch {
+    let n = texts.len();
+    let ids: Vec<i64> = (start_id..start_id + n as i64).collect();
+    let id_arr: ArrayRef = Arc::new(Int64Array::from(ids));
+    let text_arr: ArrayRef = Arc::new(StringArray::from(texts.to_vec()));
+    RecordBatch::try_new(schema, vec![id_arr, text_arr]).unwrap()
+}
+
+// ----------------------------------------------------------------------
+// Base dataset
+// ----------------------------------------------------------------------
+
+async fn build_base_if_absent(
+    base_uri: &str,
+    schema: Arc<ArrowSchema>,
+    base_texts: &[String],
+    batch_size: usize,
+    fts_enabled: bool,
+) -> lance_core::Result<()> {
+    if Dataset::open(base_uri).await.is_ok() {
+        println!("base dataset already exists at {}, skipping build", base_uri);
+        return Ok(());
+    }
+    println!(
+        "building base dataset at {} ({} rows, batch_size {})",
+        base_uri,
+        base_texts.len(),
+        batch_size
+    );
+    let total = base_texts.len();
+    let mut batches = Vec::with_capacity(total.div_ceil(batch_size));
+    let mut start = 0usize;
+    while start < total {
+        let end = (start + batch_size).min(total);
+        batches.push(Ok(make_batch(
+            start as i64,
+            &base_texts[start..end],
+            schema.clone(),
+        )));
+        start = end;
+    }
+    let reader = RecordBatchIterator::new(batches.into_iter(), schema.clone());
+    let mut dataset = Dataset::write(reader, base_uri, Some(WriteParams::default())).await?;
+    if fts_enabled {
+        let fts_params = InvertedIndexParams::default();
+        dataset
+            .create_index(
+                &[TEXT_COL],
+                IndexType::Inverted,
+                Some(FTS_INDEX_NAME.to_string()),
+                &fts_params,
+                true,
+            )
+            .await?;
+    } else {
+        // Even when MemWAL is configured without FTS, we still need a
+        // BTree index on `id` so MemWAL has at least one maintained
+        // index to reference.
+        let pk_params = ScalarIndexParams::default();
+        dataset
+            .create_index(
+                &["id"],
+                IndexType::BTree,
+                Some("id_btree".to_string()),
+                &pk_params,
+                true,
+            )
+            .await?;
+    }
+    let maintained = if fts_enabled {
+        vec![FTS_INDEX_NAME.to_string()]
+    } else {
+        vec!["id_btree".to_string()]
+    };
+    dataset
+        .initialize_mem_wal(MemWalConfig {
+            shard_spec: None,
+            maintained_indexes: maintained,
+        })
+        .await?;
+    Ok(())
+}
+
+// ----------------------------------------------------------------------
+// Ingest
+// ----------------------------------------------------------------------
+
+#[derive(Debug, Clone, Serialize)]
+struct IngestStats {
+    rows: usize,
+    wall_seconds: f64,
+    rows_per_sec: f64,
+    /// p95 per-`put` latency in milliseconds.
+    put_p95_ms: f64,
+    put_p50_ms: f64,
+    put_max_ms: f64,
+    num_puts: usize,
+}
+
+async fn ingest_via_shard_writer(
+    target_uri: &str,
+    schema: Arc<ArrowSchema>,
+    base_texts: &[String],
+    ingest_texts: &[String],
+    cfg: &Config,
+    disable_auto_flush: bool,
+) -> lance_core::Result<IngestStats> {
+    // Build a fresh ingest dataset by cloning the base.
+    println!("preparing ingest dataset at {}", target_uri);
+    build_base_if_absent(
+        target_uri,
+        schema.clone(),
+        base_texts,
+        cfg.batch_size,
+        cfg.fts_enabled,
+    )
+    .await?;
+    let dataset = Arc::new(Dataset::open(target_uri).await?);
+
+    let shard_id = Uuid::new_v4();
+    let max_memtable_rows = if disable_auto_flush {
+        cfg.ingest_rows.saturating_mul(2).max(2_000_000)
+    } else {
+        cfg.max_memtable_rows
+    };
+    let max_memtable_size = if disable_auto_flush {
+        16 * 1024 * 1024 * 1024 // 16 GiB
+    } else {
+        16 * 1024 * 1024 * 1024
+    };
+    let writer_config = ShardWriterConfig {
+        shard_id,
+        shard_spec_id: 0,
+        durable_write: cfg.durable_write,
+        sync_indexed_write: true,
+        max_memtable_size,
+        max_memtable_rows,
+        max_memtable_batches: 4_000_000,
+        max_wal_flush_interval: Some(Duration::from_millis(200)),
+        max_unflushed_memtable_bytes: usize::MAX / 2,
+        ..ShardWriterConfig::default()
+    };
+    let writer = dataset
+        .as_ref()
+        .mem_wal_writer(shard_id, writer_config)
+        .await?;
+
+    // Ingest IDs start above the base table's last id to keep PK unique.
+    let id_offset: i64 = cfg.base_rows as i64;
+    let n = ingest_texts.len();
+    let bs = cfg.batch_size;
+    let total_batches = n.div_ceil(bs);
+
+    let mut put_latencies: Vec<u128> = Vec::with_capacity(total_batches);
+    let start = Instant::now();
+    for i in 0..total_batches {
+        let lo = i * bs;
+        let hi = (lo + bs).min(n);
+        let batch = make_batch(
+            id_offset + lo as i64,
+            &ingest_texts[lo..hi],
+            schema.clone(),
+        );
+        let put_t = Instant::now();
+        writer.put(vec![batch]).await?;
+        put_latencies.push(put_t.elapsed().as_micros());
+        if (i + 1) % 100 == 0 {
+            let so_far = start.elapsed().as_secs_f64();
+            let rate = (i + 1) as f64 * bs as f64 / so_far.max(1e-9);
+            println!(
+                "  ingest progress: {}/{} batches ({:.0} rows/s)",
+                i + 1,
+                total_batches,
+                rate
+            );
+        }
+    }
+    // Wait for index update to catch up if sync_indexed_write didn't fully drain.
+    let target_batch_pos = total_batches.saturating_sub(1);
+    loop {
+        let active = writer.active_memtable_ref().await?;
+        if active.index_store.max_indexed_batch_position() >= target_batch_pos {
+            break;
+        }
+        drop(active);
+        tokio::time::sleep(Duration::from_millis(50)).await;
+    }
+    let elapsed = start.elapsed();
+    drop(writer);
+
+    put_latencies.sort_unstable();
+    let p50 = put_latencies[put_latencies.len() / 2] as f64 / 1000.0;
+    let p95 = put_latencies[put_latencies.len() * 95 / 100] as f64 / 1000.0;
+    let max = *put_latencies.iter().max().unwrap_or(&0) as f64 / 1000.0;
+    Ok(IngestStats {
+        rows: n,
+        wall_seconds: elapsed.as_secs_f64(),
+        rows_per_sec: n as f64 / elapsed.as_secs_f64().max(1e-9),
+        put_p50_ms: p50,
+        put_p95_ms: p95,
+        put_max_ms: max,
+        num_puts: total_batches,
+    })
+}
+
+// ----------------------------------------------------------------------
+// Query set
+// ----------------------------------------------------------------------
+
+#[derive(Debug, Clone, Serialize)]
+struct QuerySet {
+    tokens: Vec<String>,
+    phrases: Vec<String>,
+}
+
+fn build_query_set(sample_texts: &[&str], cfg: &Config) -> QuerySet {
+    use lance_tokenizer::TokenStream;
+    // ASCII stop-word-ish list used by the default English analyzer; we only
+    // need a coarse filter for query selection here.
+    const STOPWORDS: &[&str] = &[
+        "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "as", "by", "is",
+        "was", "are", "were", "be", "been", "being", "this", "that", "these", "those", "it", "its",
+        "but", "not", "no", "if", "then", "than", "so", "do", "does", "did", "have", "has", "had",
+        "will", "would", "should", "could", "can", "may", "might", "must", "i", "you", "he", "she",
+        "we", "they", "them", "his", "her", "their", "our", "us", "me", "my", "your", "him",
+    ];
+    let mut tokenizer = InvertedIndexParams::default()
+        .build()
+        .expect("default tokenizer builds");
+    let mut freq: HashMap<String, u64> = HashMap::new();
+    for t in sample_texts.iter().take(50_000) {
+        let mut stream = tokenizer.token_stream_for_doc(t);
+        while let Some(tok) = stream.next() {
+            if tok.text.len() < 3 || tok.text.len() > 24 {
+                continue;
+            }
+            if STOPWORDS.contains(&tok.text.as_str()) {
+                continue;
+            }
+            *freq.entry(tok.text.clone()).or_default() += 1;
+        }
+    }
+    let mut by_freq: Vec<(String, u64)> = freq.into_iter().collect();
+    by_freq.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+    let tokens: Vec<String> = by_freq
+        .into_iter()
+        .map(|(t, _)| t)
+        .take(cfg.num_token_queries)
+        .collect();
+
+    // Phrase queries: walk a deterministic stride of rows, take the first
+    // two consecutive non-stopword non-short tokens.
+    let mut phrases = Vec::with_capacity(cfg.num_phrase_queries);
+    let stride = sample_texts.len().max(1) / cfg.num_phrase_queries.max(1);
+    let mut idx = 0usize;
+    while phrases.len() < cfg.num_phrase_queries && idx < sample_texts.len() {
+        let t = sample_texts[idx];
+        let mut stream = tokenizer.token_stream_for_doc(t);
+        let mut acc: Vec<String> = Vec::new();
+        while let Some(tok) = stream.next() {
+            if tok.text.len() < 3 || tok.text.len() > 24 {
+                continue;
+            }
+            if STOPWORDS.contains(&tok.text.as_str()) {
+                continue;
+            }
+            acc.push(tok.text.clone());
+            if acc.len() == 2 {
+                phrases.push(format!("{} {}", acc[0], acc[1]));
+                break;
+            }
+        }
+        idx = idx.saturating_add(stride.max(1));
+    }
+
+    QuerySet { tokens, phrases }
+}
+
+// ----------------------------------------------------------------------
+// Read test (FTS only): MemTable query latency + post-flush consistency
+// ----------------------------------------------------------------------
+
+#[derive(Debug, Clone, Serialize)]
+struct ReadStats {
+    rows: usize,
+    /// Average across all queries (token + phrase).
+    mt_latency_avg_ms: f64,
+    mt_latency_p50_ms: f64,
+    mt_latency_p95_ms: f64,
+    consistency_mean: f64,
+    consistency_min: f64,
+    num_queries: usize,
+}
+
+async fn run_read_test(
+    target_uri: &str,
+    schema: Arc<ArrowSchema>,
+    base_texts: &[String],
+    ingest_texts: &[String],
+    queries: &QuerySet,
+    cfg: &Config,
+) -> lance_core::Result<ReadStats> {
+    println!(
+        "  read test: ingesting {} rows with auto-flush disabled",
+        ingest_texts.len()
+    );
+    build_base_if_absent(
+        target_uri,
+        schema.clone(),
+        base_texts,
+        cfg.batch_size,
+        true, // FTS index on the base, since this path is FTS-only.
+    )
+    .await?;
+    let dataset = Arc::new(Dataset::open(target_uri).await?);
+    let shard_id = Uuid::new_v4();
+    let writer_config = ShardWriterConfig {
+        shard_id,
+        shard_spec_id: 0,
+        durable_write: cfg.durable_write,
+        sync_indexed_write: true,
+        // Effectively disable auto-flush triggers so the MemTable holds
+        // the full ingest_texts.len() rows for the query phase.
+        max_memtable_size: 64 * 1024 * 1024 * 1024,
+        max_memtable_rows: ingest_texts.len().saturating_mul(2),
+        max_memtable_batches: 4_000_000,
+        max_wal_flush_interval: Some(Duration::from_millis(200)),
+        max_unflushed_memtable_bytes: usize::MAX / 2,
+        ..ShardWriterConfig::default()
+    };
+    let writer = dataset
+        .as_ref()
+        .mem_wal_writer(shard_id, writer_config)
+        .await?;
+
+    let id_offset: i64 = cfg.base_rows as i64;
+    let bs = cfg.batch_size;
+    let n = ingest_texts.len();
+    let total_batches = n.div_ceil(bs);
+    for i in 0..total_batches {
+        let lo = i * bs;
+        let hi = (lo + bs).min(n);
+        let batch = make_batch(
+            id_offset + lo as i64,
+            &ingest_texts[lo..hi],
+            schema.clone(),
+        );
+        writer.put(vec![batch]).await?;
+    }
+    let target_batch_pos = total_batches.saturating_sub(1);
+    loop {
+        let active = writer.active_memtable_ref().await?;
+        if active.index_store.max_indexed_batch_position() >= target_batch_pos {
+            break;
+        }
+        drop(active);
+        tokio::time::sleep(Duration::from_millis(50)).await;
+    }
+
+    // ----- MemTable phase -----
+    let active = writer.active_memtable_ref().await?;
+    let fts = active
+        .index_store
+        .get_fts(FTS_INDEX_NAME)
+        .ok_or_else(|| lance_core::Error::invalid_input("FTS mem index not found"))?;
+
+    let mut latencies_us: Vec<u128> = Vec::new();
+    let mut mt_top10: Vec<HashSet<i64>> = Vec::new();
+
+    let all_queries: Vec<(FtsQueryExpr, String)> = queries
+        .tokens
+        .iter()
+        .map(|t| (FtsQueryExpr::match_query(t.clone()), t.clone()))
+        .chain(
+            queries
+                .phrases
+                .iter()
+                .map(|p| (FtsQueryExpr::phrase(p.clone()), format!("\"{p}\""))),
+        )
+        .collect();
+
+    // Build a row_position -> id map by scanning the active batches.
+    // This is needed because the MemTable returns row_positions; the
+    // post-flush on-disk FTS returns row_ids that match the `id` column.
+    let mut row_to_id: HashMap<u64, i64> = HashMap::new();
+    for stored in active.batch_store.iter() {
+        let id_arr = stored
+            .data
+            .column_by_name("id")
+            .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
+            .ok_or_else(|| lance_core::Error::invalid_input("id col missing"))?;
+        for r in 0..id_arr.len() {
+            row_to_id.insert(stored.row_offset + r as u64, id_arr.value(r));
+        }
+    }
+
+    for (q, label) in &all_queries {
+        let opts = SearchOptions::new().with_limit(cfg.top_k);
+        let t0 = Instant::now();
+        let entries = fts.search_with_options(q, opts);
+        latencies_us.push(t0.elapsed().as_micros());
+        let mut ids = HashSet::with_capacity(cfg.top_k);
+        for e in entries.iter().take(cfg.top_k) {
+            if let Some(id) = row_to_id.get(&e.row_position) {
+                ids.insert(*id);
+            }
+        }
+        if mt_top10.len() < 3 {
+            println!(
+                "    [mt] {label}: {} hits, ids={:?}",
+                entries.len(),
+                ids.iter().take(3).collect::<Vec<_>>()
+            );
+        }
+        mt_top10.push(ids);
+    }
+    drop(active);
+
+    latencies_us.sort_unstable();
+    let avg_us =
+        latencies_us.iter().sum::<u128>() as f64 / latencies_us.len().max(1) as f64;
+    let p50 = latencies_us[latencies_us.len() / 2] as f64 / 1000.0;
+    let p95 = latencies_us[latencies_us.len() * 95 / 100] as f64 / 1000.0;
+
+    // ----- Force flush, then on-disk phase -----
+    println!("  read test: closing writer to force flush");
+    writer.close().await?;
+    let flushed_dataset = Dataset::open(target_uri).await?;
+
+    let mut consistencies: Vec<f64> = Vec::with_capacity(all_queries.len());
+    for ((q, label), mt_ids) in all_queries.iter().zip(mt_top10.iter()) {
+        let fts_query = match q {
+            FtsQueryExpr::Match { query, .. } => FullTextSearchQuery::new(query.clone()),
+            FtsQueryExpr::Phrase { query, .. } => {
+                FullTextSearchQuery::new(format!("\"{}\"", query))
+            }
+            _ => unreachable!("only match/phrase queries in this set"),
+        };
+        let mut scanner = flushed_dataset.scan();
+        scanner.full_text_search(fts_query)?;
+        scanner.limit(Some(cfg.top_k as i64), None)?;
+        scanner.project(&["id"])?;
+        let stream = scanner.try_into_stream().await?;
+        let batches: Vec<RecordBatch> = stream.try_collect().await?;
+        let mut disk_ids = HashSet::new();
+        for b in &batches {
+            let id_arr = b
+                .column_by_name("id")
+                .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
+                .ok_or_else(|| lance_core::Error::invalid_input("disk id col missing"))?;
+            for i in 0..id_arr.len() {
+                disk_ids.insert(id_arr.value(i));
+            }
+        }
+        let inter: usize = mt_ids.intersection(&disk_ids).count();
+        let denom = mt_ids.len().max(disk_ids.len()).max(1);
+        let cons = inter as f64 / denom as f64;
+        if consistencies.len() < 3 {
+            println!(
+                "    [disk] {label}: {} hits; mt={} disk={} ∩={} cons={:.3}",
+                disk_ids.len(),
+                mt_ids.len(),
+                disk_ids.len(),
+                inter,
+                cons
+            );
+        }
+        consistencies.push(cons);
+    }
+
+    let cons_mean = consistencies.iter().sum::<f64>() / consistencies.len().max(1) as f64;
+    let cons_min = consistencies
+        .iter()
+        .copied()
+        .fold(f64::INFINITY, f64::min);
+
+    Ok(ReadStats {
+        rows: n,
+        mt_latency_avg_ms: avg_us / 1000.0,
+        mt_latency_p50_ms: p50,
+        mt_latency_p95_ms: p95,
+        consistency_mean: cons_mean,
+        consistency_min: if cons_min.is_finite() { cons_min } else { 0.0 },
+        num_queries: all_queries.len(),
+    })
+}
+
+// ----------------------------------------------------------------------
+// Top-level orchestration
+// ----------------------------------------------------------------------
+
+#[derive(Debug, Clone, Serialize)]
+struct RunResult {
+    config_name: String,
+    max_memtable_rows: usize,
+    durable_write: bool,
+    fts_enabled: bool,
+    base_rows: usize,
+    ingest_rows: usize,
+    batch_size: usize,
+    ingest: IngestStats,
+    read: Option<ReadStats>,
+    timestamp_utc: String,
+}
+
+#[tokio::main(flavor = "multi_thread")]
+async fn main() -> lance_core::Result<()> {
+    let cfg = Config::from_env();
+    println!("=== mem_wal_fineweb_fts === config = {:?}", cfg);
+
+    let total_rows = cfg.base_rows + cfg.ingest_rows;
+    let texts = load_corpus(total_rows, &cfg.cache_dir).await?;
+    if texts.len() < total_rows {
+        return Err(lance_core::Error::io(format!(
+            "fineweb shards yielded only {} rows, need {}",
+            texts.len(),
+            total_rows
+        )));
+    }
+    let base_texts = &texts[..cfg.base_rows];
+    let ingest_texts = &texts[cfg.base_rows..cfg.base_rows + cfg.ingest_rows];
+
+    let schema = make_schema();
+
+    // Build query set once from the ingest slice (deterministic).
+    let sample_refs: Vec<&str> = ingest_texts.iter().take(50_000).map(|s| s.as_str()).collect();
+    let queries = build_query_set(&sample_refs, &cfg);
+    println!(
+        "query set: {} tokens + {} phrases",
+        queries.tokens.len(),
+        queries.phrases.len()
+    );
+
+    // Throughput sub-test: ingest 1M with the configured params.
+    println!("\n--- throughput sub-test ---");
+    let ingest_stats = ingest_via_shard_writer(
+        &cfg.ingest_uri(),
+        schema.clone(),
+        base_texts,
+        ingest_texts,
+        &cfg,
+        false, // auto-flush enabled (per max_memtable_rows)
+    )
+    .await?;
+    println!("throughput: {:.1} rows/s", ingest_stats.rows_per_sec);
+
+    // Read sub-test: only when FTS enabled and read test requested.
+    let read_stats = if cfg.with_read_test {
+        println!("\n--- read sub-test ---");
+        let n_for_read = cfg.max_memtable_rows.min(ingest_texts.len());
+        let read_ingest = &ingest_texts[..n_for_read];
+        Some(
+            run_read_test(
+                &cfg.read_test_uri(),
+                schema.clone(),
+                base_texts,
+                read_ingest,
+                &queries,
+                &cfg,
+            )
+            .await?,
+        )
+    } else {
+        None
+    };
+
+    let timestamp_utc = chrono::Utc::now().to_rfc3339();
+    let result = RunResult {
+        config_name: cfg.config_name.clone(),
+        max_memtable_rows: cfg.max_memtable_rows,
+        durable_write: cfg.durable_write,
+        fts_enabled: cfg.fts_enabled,
+        base_rows: cfg.base_rows,
+        ingest_rows: cfg.ingest_rows,
+        batch_size: cfg.batch_size,
+        ingest: ingest_stats,
+        read: read_stats,
+        timestamp_utc,
+    };
+    let json = serde_json::to_string_pretty(&result)
+        .map_err(|e| lance_core::Error::io(format!("serialize result: {}", e)))?;
+    if let Some(parent) = cfg.result_file.parent() {
+        if !parent.as_os_str().is_empty() {
+            std::fs::create_dir_all(parent).ok();
+        }
+    }
+    std::fs::write(&cfg.result_file, json.as_bytes())
+        .map_err(|e| lance_core::Error::io(format!("write result: {}", e)))?;
+    println!("\nwrote result to {}", cfg.result_file.display());
+    println!("=== DONE ===");
+    let _ = result; // silence unused with no read test
+    let _ = sample_refs;
+    let _ = BTreeMap::<String, String>::new();
+    Ok(())
+}
diff --git a/rust/lance/src/dataset/mem_wal.rs b/rust/lance/src/dataset/mem_wal.rs
index 3f2345d22f5..1cea7e8b9e5 100644
--- a/rust/lance/src/dataset/mem_wal.rs
+++ b/rust/lance/src/dataset/mem_wal.rs
@@ -34,7 +34,7 @@
 
 mod api;
 mod hnsw;
-mod index;
+pub mod index;
 mod manifest;
 pub mod memtable;
 pub mod scanner;

From d70daeb479fa55f381259a7d1a5bc7ac80bc551e Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 10 May 2026 18:21:19 -0700
Subject: [PATCH 04/37] fix(bench): replace post-ingest index-catchup spin with
 writer.close()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous spin on max_indexed_batch_position never terminated when
max_memtable_rows triggers auto-flushes during ingest: the counter is
reset on each new active memtable, so target_batch_pos = total_batches
- 1 is unreachable from the active generation.

Close the writer instead — it drains the final WAL flush and any
outstanding memtable flush; the inline sync_indexed_write covers the
per-put index updates. close() time is included in the measured
elapsed so configs with different flush cadences are compared
apples-to-apples.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index 066502b7a7a..d93dc625003 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -532,18 +532,19 @@ async fn ingest_via_shard_writer(
             );
         }
     }
-    // Wait for index update to catch up if sync_indexed_write didn't fully drain.
-    let target_batch_pos = total_batches.saturating_sub(1);
-    loop {
-        let active = writer.active_memtable_ref().await?;
-        if active.index_store.max_indexed_batch_position() >= target_batch_pos {
-            break;
-        }
-        drop(active);
-        tokio::time::sleep(Duration::from_millis(50)).await;
-    }
+    // Close the writer to drain the final WAL/memtable flush. With
+    // sync_indexed_write = true the inline index update is already
+    // complete, but auto-flush leaves background memtable->disk work
+    // outstanding that close() awaits. We deliberately include close()
+    // in the elapsed measurement so the reported throughput reflects
+    // "rows fully ingested + their pending flush", not just "puts
+    // returned". This is the apples-to-apples figure across configs
+    // with different `max_memtable_rows` (which drive the flush
+    // cadence). The previous post-ingest spin on
+    // `max_indexed_batch_position` was incorrect under auto-flush
+    // because the counter resets on each new active memtable.
+    writer.close().await?;
     let elapsed = start.elapsed();
-    drop(writer);
 
     put_latencies.sort_unstable();
     let p50 = put_latencies[put_latencies.len() / 2] as f64 / 1000.0;

From af62c36722c7046b5fe9374d45029f4b1ad8384f Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 00:43:34 -0700
Subject: [PATCH 05/37] bench: rework fineweb FTS bench to mirror the
 ShardWriter backpressure bench
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Rewrites mem_wal_fineweb_fts as a CLI-arg bench modeled on the upstream
mem_wal_shard_writer_backpressure bench: same Mode matrix (async/sync ×
index/no-index), same ShardWriter wiring, JSON output. Payload is real
HuggingFace FineWeb text and the maintained index is the in-memory FTS
index.

Splits the run into independent --phase write|read invocations so a
process never holds two ShardWriter lifecycles in sequence — that was
the deadlock in the first iteration. The driver runs every config under
a timeout watchdog so a hang costs one window, not days.
---
 bench/run_fineweb_fts.sh                  |  176 ++--
 rust/lance/benches/mem_wal_fineweb_fts.rs | 1165 ++++++++++-----------
 2 files changed, 618 insertions(+), 723 deletions(-)

diff --git a/bench/run_fineweb_fts.sh b/bench/run_fineweb_fts.sh
index 5709052e86e..c6dedd96cf9 100755
--- a/bench/run_fineweb_fts.sh
+++ b/bench/run_fineweb_fts.sh
@@ -1,121 +1,113 @@
 #!/usr/bin/env bash
-# Driver for the fineweb FTS benchmark.
+# Driver for the FineWeb FTS benchmark panel.
 #
-# Runs the 12 configs (3 memtable sizes × durable yes/no × FTS yes/no), saves
-# each result.json locally and to S3, and at the end prints a small summary.
+# Write panel : 12 configs = 4 modes (async/sync × idx/no-idx) × 3 memtable
+#               sizes (100k / 500k / 1M). Each config ingests 1M rows.
+# Read panel  : 6 configs  = 2 indexed modes × 3 memtable sizes. Each
+#               ingests `size` rows into an auto-flush-disabled MemTable,
+#               times the FTS queries, flushes, and replays on disk.
 #
-# Usage:
-#   ./bench/run_fineweb_fts.sh [run_id]
+# Every config runs as its own process under a `timeout` watchdog, so a
+# hang costs one timeout window, not days. result.json is uploaded to S3.
 #
-# Env vars (optional):
-#   DATASET_PREFIX   default: s3://jack-devland-build/bench/mem-fts-fineweb
-#   BENCH_BASE_ROWS  default: 1000000
-#   BENCH_INGEST_ROWS default: 1000000
-#   BENCH_BATCH_SIZE default: 1000
-#   AWS_DEFAULT_REGION default: us-east-1
+# Usage: ./bench/run_fineweb_fts.sh [run_id]
 
-set -euo pipefail
+set -uo pipefail
 
 cd "$(dirname "${BASH_SOURCE[0]}")/.."
 
 RUN_ID="${1:-$(date -u +%Y%m%dT%H%M%SZ)}"
 DATASET_PREFIX="${DATASET_PREFIX:-s3://jack-devland-build/bench/mem-fts-fineweb}"
-BENCH_BASE_ROWS="${BENCH_BASE_ROWS:-1000000}"
-BENCH_INGEST_ROWS="${BENCH_INGEST_ROWS:-1000000}"
-BENCH_BATCH_SIZE="${BENCH_BATCH_SIZE:-1000}"
+SEED_ROWS="${SEED_ROWS:-1000000}"
+BATCH_ROWS="${BATCH_ROWS:-1000}"
+CALLS="${CALLS:-1000}"
+CACHE_DIR="${CACHE_DIR:-/mnt/data/fineweb}"
+CONFIG_TIMEOUT="${CONFIG_TIMEOUT:-3600}"
 export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
 
 LOCAL_DIR="bench/results/${RUN_ID}"
 mkdir -p "$LOCAL_DIR"
 
-BIN="target/release/mem_wal_fineweb_fts"
-if [ ! -x "$BIN" ]; then
+BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+if [ -z "$BIN" ]; then
     echo "building bench binary..."
-    cargo build --release -p lance --bench mem_wal_fineweb_fts
-    # criterion-style bench output goes to deps/; resolve it.
-    BIN="$(ls -t target/release/deps/mem_wal_fineweb_fts-* | grep -v '\.d$' | head -1)"
+    cargo bench -p lance --bench mem_wal_fineweb_fts --no-run
+    BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
 fi
-echo "using bench binary: $BIN"
-
-CONFIGS=(
-    "100000 0 0"
-    "100000 0 1"
-    "100000 1 0"
-    "100000 1 1"
-    "500000 0 0"
-    "500000 0 1"
-    "500000 1 0"
-    "500000 1 1"
-    "1000000 0 0"
-    "1000000 0 1"
-    "1000000 1 0"
-    "1000000 1 1"
-)
-
-echo "=== Run $RUN_ID =="
-echo "  prefix: $DATASET_PREFIX"
-echo "  base_rows: $BENCH_BASE_ROWS  ingest_rows: $BENCH_INGEST_ROWS  batch_size: $BENCH_BATCH_SIZE"
+echo "bench binary: $BIN"
+echo "run id:       $RUN_ID"
 echo ""
 
-for cfg in "${CONFIGS[@]}"; do
-    read -r MT D F <<< "$cfg"
-    if [ "$MT" = "1000000" ]; then SZ="1M"; elif [ "$MT" = "500000" ]; then SZ="500k"; else SZ="100k"; fi
-    NAME="mt${SZ}_durable${D}_fts${F}"
-    OUT="$LOCAL_DIR/${NAME}.json"
-    LOG="$LOCAL_DIR/${NAME}.log"
-    echo ">>> $NAME"
-    if [ -f "$OUT" ]; then
-        echo "    result already exists, skipping"
-        continue
+run_one() {
+    local name="$1"; shift
+    local out="$LOCAL_DIR/${name}.json"
+    local log="$LOCAL_DIR/${name}.log"
+    echo ">>> $name"
+    if [ -f "$out" ]; then
+        echo "    already done, skipping"
+        return
     fi
-    set +e
-    BENCH_RUN_ID="$RUN_ID" \
-    DATASET_PREFIX="$DATASET_PREFIX" \
-    BENCH_MAX_MEMTABLE_ROWS="$MT" \
-    DURABLE_WRITE="$D" \
-    FTS_ENABLED="$F" \
-    BENCH_BASE_ROWS="$BENCH_BASE_ROWS" \
-    BENCH_INGEST_ROWS="$BENCH_INGEST_ROWS" \
-    BENCH_BATCH_SIZE="$BENCH_BATCH_SIZE" \
-    BENCH_CACHE_DIR="${BENCH_CACHE_DIR:-/mnt/data/fineweb}" \
-    RESULT_FILE="$OUT" \
-    "$BIN" --bench --nocapture 2>&1 | tee "$LOG"
-    RC=${PIPESTATUS[0]}
-    set -e
-    if [ "$RC" -ne 0 ]; then
-        echo "    !!! config failed (rc=$RC); see $LOG"
-    fi
-    # Upload to S3 alongside the dataset.
-    if [ -f "$OUT" ]; then
-        aws s3 cp "$OUT" "$DATASET_PREFIX/$RUN_ID/results/${NAME}.json" || true
-        aws s3 cp "$LOG" "$DATASET_PREFIX/$RUN_ID/results/${NAME}.log" || true
+    timeout "$CONFIG_TIMEOUT" "$BIN" --bench "$@" --output "$out" > "$log" 2>&1
+    local rc=$?
+    if [ "$rc" -eq 124 ]; then
+        echo "    !!! TIMED OUT after ${CONFIG_TIMEOUT}s"
+    elif [ "$rc" -ne 0 ]; then
+        echo "    !!! failed rc=$rc (see $log)"
+    else
+        echo "    ok"
     fi
+    [ -f "$out" ] && aws s3 cp "$out" "$DATASET_PREFIX/$RUN_ID/results/${name}.json" >/dev/null 2>&1
+    aws s3 cp "$log" "$DATASET_PREFIX/$RUN_ID/results/${name}.log" >/dev/null 2>&1
+}
+
+# ---- write panel: 4 modes × 3 sizes ----
+for mode in async_noidx async_idx sync_noidx sync_idx; do
+    for sz in 100000 500000 1000000; do
+        case "$sz" in
+            1000000) tag=1M ;;
+            500000)  tag=500k ;;
+            *)       tag=100k ;;
+        esac
+        run_one "write_${mode}_mt${tag}" \
+            --phase write --mode "$mode" \
+            --uri "$DATASET_PREFIX/$RUN_ID/w_${mode}_mt${tag}" \
+            --seed-rows "$SEED_ROWS" --batch-rows "$BATCH_ROWS" --calls "$CALLS" \
+            --max-memtable-rows "$sz" --cache-dir "$CACHE_DIR"
+    done
+done
+
+# ---- read panel: 2 indexed modes × 3 sizes ----
+for mode in async_idx sync_idx; do
+    for sz in 100000 500000 1000000; do
+        case "$sz" in
+            1000000) tag=1M ;;
+            500000)  tag=500k ;;
+            *)       tag=100k ;;
+        esac
+        run_one "read_${mode}_mt${tag}" \
+            --phase read --mode "$mode" \
+            --uri "$DATASET_PREFIX/$RUN_ID/r_${mode}_mt${tag}" \
+            --seed-rows "$SEED_ROWS" --batch-rows "$BATCH_ROWS" \
+            --read-rows "$sz" --cache-dir "$CACHE_DIR"
+    done
 done
 
 echo ""
 echo "=== summary ==="
-python3 - <<PY
-import glob, json, os
-results = []
-for p in sorted(glob.glob(os.path.join("$LOCAL_DIR", "*.json"))):
+python3 - "$LOCAL_DIR" <<'PY'
+import glob, json, os, sys
+d = sys.argv[1]
+print(f"{'config':28s} {'rows/s':>10} {'put_p99_ms':>11} {'mt_p95_ms':>10} {'cons_mean':>10}")
+for p in sorted(glob.glob(os.path.join(d, "*.json"))):
     try:
-        with open(p) as f: r = json.load(f)
-        results.append(r)
+        r = json.load(open(p))
     except Exception as e:
-        print(f"  failed to read {p}: {e}")
-
-print(f"{'config':30s} {'rows/s':>10} {'p95_ms':>7} {'mt_p95_ms':>10} {'cons_mean':>10}")
-for r in results:
-    name = r["config_name"]
-    tp = r["ingest"]["rows_per_sec"]
-    p95 = r["ingest"]["put_p95_ms"]
-    rd = r.get("read")
-    mt = rd["mt_latency_p95_ms"] if rd else 0
-    cm = rd["consistency_mean"] if rd else 0
-    print(f"{name:30s} {tp:>10.0f} {p95:>7.2f} {mt:>10.2f} {cm:>10.3f}")
+        print(f"  bad {p}: {e}"); continue
+    name = os.path.basename(p)[:-5]
+    if r.get("phase") == "write":
+        print(f"{name:28s} {r['throughput_rows_per_sec']:>10.0f} {r['put_p99_ms']:>11.2f} {'-':>10} {'-':>10}")
+    else:
+        print(f"{name:28s} {'-':>10} {'-':>11} {r['mt_latency_p95_ms']:>10.3f} {r['consistency_mean']:>10.3f}")
 PY
-
 echo ""
-echo "Results:"
-echo "  local: $LOCAL_DIR"
-echo "  s3:    $DATASET_PREFIX/$RUN_ID/results/"
+echo "results: $LOCAL_DIR  +  $DATASET_PREFIX/$RUN_ID/results/"
diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index d93dc625003..afeed61c5de 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -1,33 +1,44 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
-//! End-to-end benchmark for the SWMR FTS mem index using HuggingFace fineweb.
+//! MemWAL FTS benchmark on real FineWeb text.
 //!
-//! For a single configuration, this binary:
-//!  1. Downloads (and caches) one fineweb sample shard.
-//!  2. Slices `BENCH_BASE_ROWS + BENCH_INGEST_ROWS` rows from it.
-//!  3. (Once per `BENCH_RUN_ID`) writes the base 1M-row Lance dataset to
-//!     `${DATASET_PREFIX}/${BENCH_RUN_ID}/base/`.
-//!  4. Ingests `BENCH_INGEST_ROWS` rows through `ShardWriter` with the
-//!     configured `max_memtable_rows`, `DURABLE_WRITE`, and `FTS_ENABLED`.
-//!     Records write throughput.
-//!  5. If FTS is enabled and `--with-read-test` is set: ingests
-//!     `max_memtable_rows` rows into a fresh dataset with auto-flush
-//!     disabled, queries the MemTable for 150 prebuilt queries (latency),
-//!     forces a flush, queries the on-disk FTS for the same queries, and
-//!     reports the top-10 set overlap (the user-approved "consistency"
-//!     proxy for recall).
+//! FTS-specialized sibling of `mem_wal_shard_writer_backpressure`: same
+//! CLI-arg shape, same `Mode` matrix (async/sync × index/no-index), same
+//! `ShardWriter` wiring and JSON output, but the payload is real
+//! HuggingFace FineWeb `text` and the maintained index is the in-memory
+//! FTS index instead of IVF/PQ.
 //!
-//! Writes a structured `result.json` to `${RESULT_FILE}`. All paths,
-//! credentials, and tunables are env-var driven so the same binary drives
-//! all 12 configs from a shell loop.
+//! Two phases, selected with `--phase`; each invocation does exactly one
+//! phase so a process never holds two `ShardWriter` lifecycles (that is
+//! what deadlocked the first iteration of this bench):
 //!
-//! See `~/ai/analysis/lance/jack-MemTableFTSBetter/fineweb-fts-bench/DESIGN.md`.
+//!   --phase write   throughput panel: ingest `calls × batch-rows` rows
+//!                   through `ShardWriter`, report rows/s + latency
+//!                   percentiles.
+//!   --phase read    MemTable FTS read panel: ingest `read-rows` rows into
+//!                   an auto-flush-disabled MemTable, time the FTS queries
+//!                   against the live MemTable, force a flush, replay the
+//!                   queries against the on-disk FTS index, and report the
+//!                   per-query top-K overlap ("consistency").
+//!
+//! Example:
+//!
+//! ```bash
+//! AWS_DEFAULT_REGION=us-east-1 \
+//! cargo bench -p lance --bench mem_wal_fineweb_fts -- \
+//!   --phase write --mode async_idx \
+//!   --uri s3://jack-devland-build/bench/mem-fts-fineweb/run1/w_async_idx_mt100k \
+//!   --seed-rows 1000000 --batch-rows 1000 --calls 1000 \
+//!   --max-memtable-rows 100000 \
+//!   --cache-dir /mnt/data/fineweb --output result.json
+//! ```
 
 #![recursion_limit = "256"]
 #![allow(clippy::print_stdout, clippy::print_stderr)]
 
-use std::collections::{BTreeMap, HashMap, HashSet};
+use std::collections::{HashMap, HashSet};
+use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 
@@ -35,15 +46,16 @@ use arrow_array::{Array, ArrayRef, Int64Array, RecordBatch, RecordBatchIterator,
 use arrow_schema::{DataType, Field, Schema as ArrowSchema};
 use futures::TryStreamExt;
 use lance::dataset::mem_wal::index::{FtsQueryExpr, SearchOptions};
-use lance::dataset::mem_wal::write::ShardWriterConfig;
-use lance::dataset::mem_wal::{DatasetMemWalExt, MemWalConfig};
+use lance::dataset::mem_wal::{DatasetMemWalExt, MemWalConfig, ShardWriterConfig};
 use lance::dataset::{Dataset, WriteParams};
 use lance::index::DatasetIndexExt;
+use lance_core::Result;
 use lance_index::IndexType;
-use lance_index::scalar::{FullTextSearchQuery, ScalarIndexParams};
+use lance_index::scalar::FullTextSearchQuery;
 use lance_index::scalar::inverted::tokenizer::InvertedIndexParams;
+use lance_tokenizer::TokenStream;
 use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder;
-use serde::Serialize;
+use serde_json::json;
 use uuid::Uuid;
 
 const TEXT_COL: &str = "text";
@@ -53,133 +65,130 @@ const HF_API_LISTING: &str =
 const HF_FILE_BASE: &str = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/";
 
 // ----------------------------------------------------------------------
-// Configuration (env-driven)
+// Mode / Phase
 // ----------------------------------------------------------------------
 
-#[derive(Debug, Clone)]
-struct Config {
-    dataset_prefix: String,
-    run_id: String,
-    config_name: String,
-    max_memtable_rows: usize,
-    durable_write: bool,
-    fts_enabled: bool,
-    base_rows: usize,
-    ingest_rows: usize,
-    batch_size: usize,
-    cache_dir: std::path::PathBuf,
-    result_file: std::path::PathBuf,
-    /// When true, run the read-perf + consistency sub-test in addition to
-    /// the throughput test. Auto-disabled when `fts_enabled = false`.
-    with_read_test: bool,
-    /// How many high-frequency single-token queries to include.
-    num_token_queries: usize,
-    /// How many random 2-token phrase queries to include.
-    num_phrase_queries: usize,
-    /// Top-K used for read latency and consistency.
-    top_k: usize,
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Mode {
+    AsyncNoIndex,
+    AsyncIndexed,
+    SyncNoIndex,
+    SyncIndexed,
 }
 
-impl Config {
-    fn from_env() -> Self {
-        let dataset_prefix = std::env::var("DATASET_PREFIX")
-            .unwrap_or_else(|_| "/tmp/bench/mem_fts_fineweb".to_string());
-        let run_id = std::env::var("BENCH_RUN_ID").unwrap_or_else(|_| "dev".to_string());
-        let max_memtable_rows = env_usize("BENCH_MAX_MEMTABLE_ROWS", 100_000);
-        let durable_write = env_bool("DURABLE_WRITE", false);
-        let fts_enabled = env_bool("FTS_ENABLED", false);
-        let base_rows = env_usize("BENCH_BASE_ROWS", 1_000_000);
-        let ingest_rows = env_usize("BENCH_INGEST_ROWS", 1_000_000);
-        let batch_size = env_usize("BENCH_BATCH_SIZE", 1000);
-        let cache_dir = std::env::var("BENCH_CACHE_DIR")
-            .unwrap_or_else(|_| {
-                std::env::temp_dir()
-                    .join("mem_wal_fineweb_fts_cache")
-                    .to_string_lossy()
-                    .into_owned()
-            })
-            .into();
-        let result_file = std::env::var("RESULT_FILE")
-            .unwrap_or_else(|_| "result.json".to_string())
-            .into();
-        let with_read_test = env_bool("BENCH_WITH_READ_TEST", true) && fts_enabled;
-        let num_token_queries = env_usize("BENCH_NUM_TOKEN_QUERIES", 100);
-        let num_phrase_queries = env_usize("BENCH_NUM_PHRASE_QUERIES", 50);
-        let top_k = env_usize("BENCH_TOP_K", 10);
-
-        let config_name = format!(
-            "mt{}_durable{}_fts{}",
-            human_size(max_memtable_rows),
-            if durable_write { "1" } else { "0" },
-            if fts_enabled { "1" } else { "0" },
-        );
+impl Mode {
+    fn parse(value: &str) -> std::result::Result<Self, String> {
+        match value {
+            "async_noidx" => Ok(Self::AsyncNoIndex),
+            "async_idx" => Ok(Self::AsyncIndexed),
+            "sync_noidx" => Ok(Self::SyncNoIndex),
+            "sync_idx" => Ok(Self::SyncIndexed),
+            _ => Err(format!(
+                "unknown mode '{value}', expected async_noidx|async_idx|sync_noidx|sync_idx"
+            )),
+        }
+    }
 
-        Self {
-            dataset_prefix,
-            run_id,
-            config_name,
-            max_memtable_rows,
-            durable_write,
-            fts_enabled,
-            base_rows,
-            ingest_rows,
-            batch_size,
-            cache_dir,
-            result_file,
-            with_read_test,
-            num_token_queries,
-            num_phrase_queries,
-            top_k,
+    fn as_str(self) -> &'static str {
+        match self {
+            Self::AsyncNoIndex => "async_noidx",
+            Self::AsyncIndexed => "async_idx",
+            Self::SyncNoIndex => "sync_noidx",
+            Self::SyncIndexed => "sync_idx",
         }
     }
 
-    #[allow(dead_code)]
-    fn base_uri(&self) -> String {
-        format!("{}/{}/base", self.dataset_prefix, self.run_id)
+    /// FTS index maintained in the MemTable.
+    fn indexed(self) -> bool {
+        matches!(self, Self::AsyncIndexed | Self::SyncIndexed)
     }
 
-    fn ingest_uri(&self) -> String {
-        format!(
-            "{}/{}/ingest_{}",
-            self.dataset_prefix, self.run_id, self.config_name
-        )
+    /// Each `put` waits for WAL durability before returning.
+    fn durable_write(self) -> bool {
+        matches!(self, Self::SyncNoIndex | Self::SyncIndexed)
     }
 
-    fn read_test_uri(&self) -> String {
-        format!(
-            "{}/{}/readtest_{}",
-            self.dataset_prefix, self.run_id, self.config_name
-        )
+    /// Index update happens inline in `put` (only meaningful when indexed).
+    fn sync_indexed_write(self) -> bool {
+        matches!(self, Self::SyncIndexed)
     }
 }
 
-fn env_usize(key: &str, default: usize) -> usize {
-    std::env::var(key)
-        .ok()
-        .and_then(|s| s.parse().ok())
-        .unwrap_or(default)
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum Phase {
+    Write,
+    Read,
 }
 
-fn env_bool(key: &str, default: bool) -> bool {
-    match std::env::var(key).ok().as_deref() {
-        Some("1") | Some("yes") | Some("true") | Some("YES") | Some("TRUE") => true,
-        Some("0") | Some("no") | Some("false") | Some("NO") | Some("FALSE") => false,
-        _ => default,
+impl Phase {
+    fn parse(value: &str) -> std::result::Result<Self, String> {
+        match value {
+            "write" => Ok(Self::Write),
+            "read" => Ok(Self::Read),
+            _ => Err(format!("unknown phase '{value}', expected write|read")),
+        }
+    }
+
+    fn as_str(self) -> &'static str {
+        match self {
+            Self::Write => "write",
+            Self::Read => "read",
+        }
     }
 }
 
-fn human_size(n: usize) -> String {
-    if n % 1_000_000 == 0 {
-        format!("{}M", n / 1_000_000)
-    } else if n % 1_000 == 0 {
-        format!("{}k", n / 1_000)
-    } else {
-        n.to_string()
+// ----------------------------------------------------------------------
+// Args
+// ----------------------------------------------------------------------
+
+#[derive(Debug, Clone)]
+struct Args {
+    phase: Phase,
+    mode: Mode,
+    uri: Option<String>,
+    seed_rows: usize,
+    batch_rows: usize,
+    calls: usize,
+    read_rows: usize,
+    max_memtable_rows: Option<usize>,
+    max_memtable_size: usize,
+    max_unflushed_memtable_bytes: usize,
+    max_wal_flush_interval_ms: u64,
+    cache_dir: PathBuf,
+    num_token_queries: usize,
+    num_phrase_queries: usize,
+    top_k: usize,
+    tokio_threads: usize,
+    output: Option<PathBuf>,
+}
+
+impl Default for Args {
+    fn default() -> Self {
+        let threads = std::thread::available_parallelism().map_or(1, usize::from);
+        Self {
+            phase: Phase::Write,
+            mode: Mode::AsyncIndexed,
+            uri: None,
+            seed_rows: 1_000_000,
+            batch_rows: 1_000,
+            calls: 1_000,
+            read_rows: 100_000,
+            max_memtable_rows: None,
+            max_memtable_size: 16 * 1024 * 1024 * 1024,
+            max_unflushed_memtable_bytes: 8 * 1024 * 1024 * 1024,
+            max_wal_flush_interval_ms: 100,
+            cache_dir: std::env::temp_dir().join("mem_wal_fineweb_fts_cache"),
+            num_token_queries: 100,
+            num_phrase_queries: 50,
+            top_k: 10,
+            tokio_threads: threads,
+            output: None,
+        }
     }
 }
 
 // ----------------------------------------------------------------------
-// HF fineweb shard loading
+// HuggingFace FineWeb loading
 // ----------------------------------------------------------------------
 
 #[derive(serde::Deserialize)]
@@ -189,13 +198,13 @@ struct HfTreeEntry {
     path: String,
 }
 
-async fn list_shard_paths() -> lance_core::Result<Vec<String>> {
+async fn list_shard_paths() -> Result<Vec<String>> {
     let entries: Vec<HfTreeEntry> = reqwest::get(HF_API_LISTING)
         .await
-        .map_err(|e| lance_core::Error::io(format!("listing HTTP: {}", e)))?
+        .map_err(|e| lance_core::Error::io(format!("listing HTTP: {e}")))?
         .json()
         .await
-        .map_err(|e| lance_core::Error::io(format!("listing JSON: {}", e)))?;
+        .map_err(|e| lance_core::Error::io(format!("listing JSON: {e}")))?;
     let mut shards: Vec<String> = entries
         .into_iter()
         .filter(|e| e.kind == "file" && e.path.ends_with(".parquet"))
@@ -205,37 +214,35 @@ async fn list_shard_paths() -> lance_core::Result<Vec<String>> {
     Ok(shards)
 }
 
-async fn download_shard(rel_path: &str, dest: &std::path::Path) -> lance_core::Result<()> {
+async fn download_shard(rel_path: &str, dest: &std::path::Path) -> Result<()> {
     if dest.exists() {
         return Ok(());
     }
-    let url = format!("{}{}", HF_FILE_BASE, rel_path);
-    let max_attempts = 5;
-    for attempt in 1..=max_attempts {
-        println!(
-            "downloading {} (attempt {}/{}) ...",
-            rel_path, attempt, max_attempts
-        );
-        let result: lance_core::Result<bytes::Bytes> = async {
+    let url = format!("{HF_FILE_BASE}{rel_path}");
+    let tmp = dest.with_extension("part");
+    for attempt in 1..=5u32 {
+        println!("downloading {rel_path} (attempt {attempt}/5) ...");
+        let result: Result<bytes::Bytes> = async {
             let resp = reqwest::get(&url)
                 .await
-                .map_err(|e| lance_core::Error::io(format!("download HTTP: {}", e)))?;
+                .map_err(|e| lance_core::Error::io(format!("download HTTP: {e}")))?;
             if !resp.status().is_success() {
                 return Err(lance_core::Error::io(format!(
-                    "download {} → status {}",
-                    url,
+                    "download {url} -> status {}",
                     resp.status()
                 )));
             }
             resp.bytes()
                 .await
-                .map_err(|e| lance_core::Error::io(format!("read body: {}", e)))
+                .map_err(|e| lance_core::Error::io(format!("read body: {e}")))
         }
         .await;
         match result {
             Ok(bytes) => {
-                std::fs::write(dest, &bytes)
-                    .map_err(|e| lance_core::Error::io(format!("write: {}", e)))?;
+                std::fs::write(&tmp, &bytes)
+                    .map_err(|e| lance_core::Error::io(format!("write: {e}")))?;
+                std::fs::rename(&tmp, dest)
+                    .map_err(|e| lance_core::Error::io(format!("rename: {e}")))?;
                 println!(
                     "  wrote {:.1} MB to {}",
                     bytes.len() as f64 / 1024.0 / 1024.0,
@@ -243,13 +250,9 @@ async fn download_shard(rel_path: &str, dest: &std::path::Path) -> lance_core::R
                 );
                 return Ok(());
             }
-            Err(e) if attempt < max_attempts => {
-                let backoff = Duration::from_secs(2u64.pow(attempt as u32));
-                eprintln!(
-                    "  attempt {} failed: {}; retrying in {:?}",
-                    attempt, e, backoff
-                );
-                tokio::time::sleep(backoff).await;
+            Err(e) if attempt < 5 => {
+                eprintln!("  attempt {attempt} failed: {e}; retrying");
+                tokio::time::sleep(Duration::from_secs(2u64.pow(attempt))).await;
             }
             Err(e) => return Err(e),
         }
@@ -261,23 +264,22 @@ async fn read_shard_text(
     path: &std::path::Path,
     out: &mut Vec<String>,
     max_rows: usize,
-) -> lance_core::Result<usize> {
+) -> Result<usize> {
     let file = tokio::fs::File::open(path)
         .await
-        .map_err(|e| lance_core::Error::io(format!("open parquet: {}", e)))?;
+        .map_err(|e| lance_core::Error::io(format!("open parquet: {e}")))?;
     let builder = ParquetRecordBatchStreamBuilder::new(file)
         .await
-        .map_err(|e| lance_core::Error::io(format!("parquet builder: {}", e)))?;
+        .map_err(|e| lance_core::Error::io(format!("parquet builder: {e}")))?;
     let mut stream = builder
         .build()
-        .map_err(|e| lance_core::Error::io(format!("parquet stream: {}", e)))?;
-
+        .map_err(|e| lance_core::Error::io(format!("parquet stream: {e}")))?;
     let mut taken = 0usize;
     while taken < max_rows {
         let Some(rb) = stream
             .try_next()
             .await
-            .map_err(|e| lance_core::Error::io(format!("parquet read: {}", e)))?
+            .map_err(|e| lance_core::Error::io(format!("parquet read: {e}")))?
         else {
             break;
         };
@@ -287,7 +289,7 @@ async fn read_shard_text(
         let strs = col
             .as_any()
             .downcast_ref::<StringArray>()
-            .ok_or_else(|| lance_core::Error::io("text column not StringArray".to_string()))?;
+            .ok_or_else(|| lance_core::Error::io("text not StringArray".to_string()))?;
         for i in 0..strs.len() {
             if taken >= max_rows {
                 break;
@@ -302,44 +304,36 @@ async fn read_shard_text(
     Ok(taken)
 }
 
-async fn load_corpus(
-    needed_rows: usize,
-    cache_dir: &std::path::Path,
-) -> lance_core::Result<Vec<String>> {
+/// Load `needed_rows` text rows from cached FineWeb shards, downloading
+/// shards on demand.
+async fn load_corpus(needed_rows: usize, cache_dir: &std::path::Path) -> Result<Vec<String>> {
     std::fs::create_dir_all(cache_dir)
-        .map_err(|e| lance_core::Error::io(format!("mkdir cache: {}", e)))?;
+        .map_err(|e| lance_core::Error::io(format!("mkdir cache: {e}")))?;
     let shards = list_shard_paths().await?;
-    println!("fineweb sample/10BT has {} parquet shards", shards.len());
-
+    println!("fineweb sample/10BT: {} shards", shards.len());
     let mut buf: Vec<String> = Vec::with_capacity(needed_rows);
-    for rel_path in &shards {
+    for rel in &shards {
         if buf.len() >= needed_rows {
             break;
         }
-        let local_name = rel_path.rsplit('/').next().unwrap_or(rel_path);
-        let local = cache_dir.join(local_name);
-        download_shard(rel_path, &local).await?;
+        let name = rel.rsplit('/').next().unwrap_or(rel);
+        let local = cache_dir.join(name);
+        download_shard(rel, &local).await?;
         let want = needed_rows - buf.len();
         let got = read_shard_text(&local, &mut buf, want).await?;
-        println!(
-            "  shard {} → {} text rows (cumulative {})",
-            local_name,
-            got,
-            buf.len()
-        );
+        println!("  shard {name} -> {got} rows (cumulative {})", buf.len());
     }
     if buf.len() < needed_rows {
-        eprintln!(
-            "  warning: dataset exhausted at {} rows (asked {})",
-            buf.len(),
-            needed_rows
-        );
+        return Err(lance_core::Error::io(format!(
+            "fineweb yielded only {} rows, need {needed_rows}",
+            buf.len()
+        )));
     }
     Ok(buf)
 }
 
 // ----------------------------------------------------------------------
-// Schema + batch helpers
+// Schema / batches
 // ----------------------------------------------------------------------
 
 fn make_schema() -> Arc<ArrowSchema> {
@@ -348,251 +342,121 @@ fn make_schema() -> Arc<ArrowSchema> {
         "lance-schema:unenforced-primary-key".to_string(),
         "true".to_string(),
     );
-    let id = Field::new("id", DataType::Int64, false).with_metadata(id_meta);
     Arc::new(ArrowSchema::new(vec![
-        id,
+        Field::new("id", DataType::Int64, false).with_metadata(id_meta),
         Field::new(TEXT_COL, DataType::Utf8, true),
     ]))
 }
 
-fn make_batch(start_id: i64, texts: &[String], schema: Arc<ArrowSchema>) -> RecordBatch {
-    let n = texts.len();
-    let ids: Vec<i64> = (start_id..start_id + n as i64).collect();
+fn make_batch(schema: Arc<ArrowSchema>, start_id: i64, texts: &[&str]) -> RecordBatch {
+    let ids: Vec<i64> = (start_id..start_id + texts.len() as i64).collect();
     let id_arr: ArrayRef = Arc::new(Int64Array::from(ids));
-    let text_arr: ArrayRef = Arc::new(StringArray::from(texts.to_vec()));
+    let text_arr: ArrayRef = Arc::new(StringArray::from_iter_values(texts.iter().copied()));
     RecordBatch::try_new(schema, vec![id_arr, text_arr]).unwrap()
 }
 
-// ----------------------------------------------------------------------
-// Base dataset
-// ----------------------------------------------------------------------
-
-async fn build_base_if_absent(
-    base_uri: &str,
+/// Write a seed dataset of `seed_texts.len()` rows, optionally create the
+/// base FTS index, and initialize MemWAL.
+async fn build_seed_dataset(
+    uri: &str,
     schema: Arc<ArrowSchema>,
-    base_texts: &[String],
-    batch_size: usize,
-    fts_enabled: bool,
-) -> lance_core::Result<()> {
-    if Dataset::open(base_uri).await.is_ok() {
-        println!("base dataset already exists at {}, skipping build", base_uri);
-        return Ok(());
-    }
-    println!(
-        "building base dataset at {} ({} rows, batch_size {})",
-        base_uri,
-        base_texts.len(),
-        batch_size
-    );
-    let total = base_texts.len();
-    let mut batches = Vec::with_capacity(total.div_ceil(batch_size));
-    let mut start = 0usize;
-    while start < total {
-        let end = (start + batch_size).min(total);
-        batches.push(Ok(make_batch(
-            start as i64,
-            &base_texts[start..end],
-            schema.clone(),
-        )));
-        start = end;
+    seed_texts: &[String],
+    batch_rows: usize,
+    indexed: bool,
+) -> Result<f64> {
+    let start = Instant::now();
+    let mut batches = Vec::with_capacity(seed_texts.len().div_ceil(batch_rows));
+    let mut lo = 0usize;
+    while lo < seed_texts.len() {
+        let hi = (lo + batch_rows).min(seed_texts.len());
+        let slice: Vec<&str> = seed_texts[lo..hi].iter().map(|s| s.as_str()).collect();
+        batches.push(Ok(make_batch(schema.clone(), lo as i64, &slice)));
+        lo = hi;
     }
     let reader = RecordBatchIterator::new(batches.into_iter(), schema.clone());
-    let mut dataset = Dataset::write(reader, base_uri, Some(WriteParams::default())).await?;
-    if fts_enabled {
-        let fts_params = InvertedIndexParams::default();
+    let mut dataset = Dataset::write(reader, uri, Some(WriteParams::default())).await?;
+    if indexed {
         dataset
             .create_index(
                 &[TEXT_COL],
                 IndexType::Inverted,
                 Some(FTS_INDEX_NAME.to_string()),
-                &fts_params,
-                true,
-            )
-            .await?;
-    } else {
-        // Even when MemWAL is configured without FTS, we still need a
-        // BTree index on `id` so MemWAL has at least one maintained
-        // index to reference.
-        let pk_params = ScalarIndexParams::default();
-        dataset
-            .create_index(
-                &["id"],
-                IndexType::BTree,
-                Some("id_btree".to_string()),
-                &pk_params,
+                &InvertedIndexParams::default(),
                 true,
             )
             .await?;
     }
-    let maintained = if fts_enabled {
-        vec![FTS_INDEX_NAME.to_string()]
-    } else {
-        vec!["id_btree".to_string()]
-    };
     dataset
         .initialize_mem_wal(MemWalConfig {
             shard_spec: None,
-            maintained_indexes: maintained,
+            maintained_indexes: if indexed {
+                vec![FTS_INDEX_NAME.to_string()]
+            } else {
+                vec![]
+            },
         })
         .await?;
-    Ok(())
-}
-
-// ----------------------------------------------------------------------
-// Ingest
-// ----------------------------------------------------------------------
-
-#[derive(Debug, Clone, Serialize)]
-struct IngestStats {
-    rows: usize,
-    wall_seconds: f64,
-    rows_per_sec: f64,
-    /// p95 per-`put` latency in milliseconds.
-    put_p95_ms: f64,
-    put_p50_ms: f64,
-    put_max_ms: f64,
-    num_puts: usize,
+    Ok(start.elapsed().as_secs_f64())
 }
 
-async fn ingest_via_shard_writer(
-    target_uri: &str,
-    schema: Arc<ArrowSchema>,
-    base_texts: &[String],
-    ingest_texts: &[String],
-    cfg: &Config,
-    disable_auto_flush: bool,
-) -> lance_core::Result<IngestStats> {
-    // Build a fresh ingest dataset by cloning the base.
-    println!("preparing ingest dataset at {}", target_uri);
-    build_base_if_absent(
-        target_uri,
-        schema.clone(),
-        base_texts,
-        cfg.batch_size,
-        cfg.fts_enabled,
-    )
-    .await?;
-    let dataset = Arc::new(Dataset::open(target_uri).await?);
-
-    let shard_id = Uuid::new_v4();
-    let max_memtable_rows = if disable_auto_flush {
-        cfg.ingest_rows.saturating_mul(2).max(2_000_000)
+fn shard_writer_config(args: &Args, shard_id: Uuid, disable_auto_flush: bool) -> ShardWriterConfig {
+    let max_rows = if disable_auto_flush {
+        args.read_rows.saturating_mul(4).max(4_000_000)
     } else {
-        cfg.max_memtable_rows
+        args.max_memtable_rows.unwrap_or(usize::MAX / 2)
     };
-    let max_memtable_size = if disable_auto_flush {
-        16 * 1024 * 1024 * 1024 // 16 GiB
+    let mut config = ShardWriterConfig::new(shard_id)
+        .with_durable_write(args.mode.durable_write())
+        .with_sync_indexed_write(args.mode.sync_indexed_write())
+        .with_max_memtable_size(args.max_memtable_size)
+        .with_max_unflushed_memtable_bytes(args.max_unflushed_memtable_bytes)
+        .with_max_memtable_rows(max_rows)
+        .with_max_memtable_batches(max_rows.div_ceil(args.batch_rows).saturating_add(64));
+    if args.max_wal_flush_interval_ms == 0 {
+        config.max_wal_flush_interval = None;
     } else {
-        16 * 1024 * 1024 * 1024
-    };
-    let writer_config = ShardWriterConfig {
-        shard_id,
-        shard_spec_id: 0,
-        durable_write: cfg.durable_write,
-        sync_indexed_write: true,
-        max_memtable_size,
-        max_memtable_rows,
-        max_memtable_batches: 4_000_000,
-        max_wal_flush_interval: Some(Duration::from_millis(200)),
-        max_unflushed_memtable_bytes: usize::MAX / 2,
-        ..ShardWriterConfig::default()
-    };
-    let writer = dataset
-        .as_ref()
-        .mem_wal_writer(shard_id, writer_config)
-        .await?;
-
-    // Ingest IDs start above the base table's last id to keep PK unique.
-    let id_offset: i64 = cfg.base_rows as i64;
-    let n = ingest_texts.len();
-    let bs = cfg.batch_size;
-    let total_batches = n.div_ceil(bs);
+        config = config
+            .with_max_wal_flush_interval(Duration::from_millis(args.max_wal_flush_interval_ms));
+    }
+    config
+}
 
-    let mut put_latencies: Vec<u128> = Vec::with_capacity(total_batches);
-    let start = Instant::now();
-    for i in 0..total_batches {
-        let lo = i * bs;
-        let hi = (lo + bs).min(n);
-        let batch = make_batch(
-            id_offset + lo as i64,
-            &ingest_texts[lo..hi],
-            schema.clone(),
-        );
-        let put_t = Instant::now();
-        writer.put(vec![batch]).await?;
-        put_latencies.push(put_t.elapsed().as_micros());
-        if (i + 1) % 100 == 0 {
-            let so_far = start.elapsed().as_secs_f64();
-            let rate = (i + 1) as f64 * bs as f64 / so_far.max(1e-9);
-            println!(
-                "  ingest progress: {}/{} batches ({:.0} rows/s)",
-                i + 1,
-                total_batches,
-                rate
-            );
-        }
+fn percentile(values: &[f64], pct: f64) -> f64 {
+    if values.is_empty() {
+        return f64::NAN;
     }
-    // Close the writer to drain the final WAL/memtable flush. With
-    // sync_indexed_write = true the inline index update is already
-    // complete, but auto-flush leaves background memtable->disk work
-    // outstanding that close() awaits. We deliberately include close()
-    // in the elapsed measurement so the reported throughput reflects
-    // "rows fully ingested + their pending flush", not just "puts
-    // returned". This is the apples-to-apples figure across configs
-    // with different `max_memtable_rows` (which drive the flush
-    // cadence). The previous post-ingest spin on
-    // `max_indexed_batch_position` was incorrect under auto-flush
-    // because the counter resets on each new active memtable.
-    writer.close().await?;
-    let elapsed = start.elapsed();
-
-    put_latencies.sort_unstable();
-    let p50 = put_latencies[put_latencies.len() / 2] as f64 / 1000.0;
-    let p95 = put_latencies[put_latencies.len() * 95 / 100] as f64 / 1000.0;
-    let max = *put_latencies.iter().max().unwrap_or(&0) as f64 / 1000.0;
-    Ok(IngestStats {
-        rows: n,
-        wall_seconds: elapsed.as_secs_f64(),
-        rows_per_sec: n as f64 / elapsed.as_secs_f64().max(1e-9),
-        put_p50_ms: p50,
-        put_p95_ms: p95,
-        put_max_ms: max,
-        num_puts: total_batches,
-    })
+    let mut sorted = values.to_vec();
+    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+    let idx = ((pct / 100.0) * (sorted.len().saturating_sub(1)) as f64).round() as usize;
+    sorted[idx.min(sorted.len() - 1)]
 }
 
 // ----------------------------------------------------------------------
 // Query set
 // ----------------------------------------------------------------------
 
-#[derive(Debug, Clone, Serialize)]
 struct QuerySet {
     tokens: Vec<String>,
     phrases: Vec<String>,
 }
 
-fn build_query_set(sample_texts: &[&str], cfg: &Config) -> QuerySet {
-    use lance_tokenizer::TokenStream;
-    // ASCII stop-word-ish list used by the default English analyzer; we only
-    // need a coarse filter for query selection here.
-    const STOPWORDS: &[&str] = &[
-        "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "as", "by", "is",
-        "was", "are", "were", "be", "been", "being", "this", "that", "these", "those", "it", "its",
-        "but", "not", "no", "if", "then", "than", "so", "do", "does", "did", "have", "has", "had",
-        "will", "would", "should", "could", "can", "may", "might", "must", "i", "you", "he", "she",
-        "we", "they", "them", "his", "her", "their", "our", "us", "me", "my", "your", "him",
-    ];
+const STOPWORDS: &[&str] = &[
+    "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "as", "by", "is", "was",
+    "are", "were", "be", "been", "being", "this", "that", "these", "those", "it", "its", "but",
+    "not", "no", "if", "then", "than", "so", "do", "does", "did", "have", "has", "had", "will",
+    "would", "should", "could", "can", "may", "might", "must", "i", "you", "he", "she", "we",
+    "they", "them", "his", "her", "their", "our", "us", "me", "my", "your", "him", "at", "from",
+];
+
+fn build_query_set(sample: &[&str], args: &Args) -> QuerySet {
     let mut tokenizer = InvertedIndexParams::default()
         .build()
         .expect("default tokenizer builds");
     let mut freq: HashMap<String, u64> = HashMap::new();
-    for t in sample_texts.iter().take(50_000) {
+    for t in sample.iter().take(50_000) {
         let mut stream = tokenizer.token_stream_for_doc(t);
         while let Some(tok) = stream.next() {
-            if tok.text.len() < 3 || tok.text.len() > 24 {
-                continue;
-            }
-            if STOPWORDS.contains(&tok.text.as_str()) {
+            if tok.text.len() < 3 || tok.text.len() > 24 || STOPWORDS.contains(&tok.text.as_str()) {
                 continue;
             }
             *freq.entry(tok.text.clone()).or_default() += 1;
@@ -603,23 +467,17 @@ fn build_query_set(sample_texts: &[&str], cfg: &Config) -> QuerySet {
     let tokens: Vec<String> = by_freq
         .into_iter()
         .map(|(t, _)| t)
-        .take(cfg.num_token_queries)
+        .take(args.num_token_queries)
         .collect();
 
-    // Phrase queries: walk a deterministic stride of rows, take the first
-    // two consecutive non-stopword non-short tokens.
-    let mut phrases = Vec::with_capacity(cfg.num_phrase_queries);
-    let stride = sample_texts.len().max(1) / cfg.num_phrase_queries.max(1);
+    let mut phrases = Vec::with_capacity(args.num_phrase_queries);
+    let stride = (sample.len().max(1) / args.num_phrase_queries.max(1)).max(1);
     let mut idx = 0usize;
-    while phrases.len() < cfg.num_phrase_queries && idx < sample_texts.len() {
-        let t = sample_texts[idx];
-        let mut stream = tokenizer.token_stream_for_doc(t);
+    while phrases.len() < args.num_phrase_queries && idx < sample.len() {
+        let mut stream = tokenizer.token_stream_for_doc(sample[idx]);
         let mut acc: Vec<String> = Vec::new();
         while let Some(tok) = stream.next() {
-            if tok.text.len() < 3 || tok.text.len() > 24 {
-                continue;
-            }
-            if STOPWORDS.contains(&tok.text.as_str()) {
+            if tok.text.len() < 3 || tok.text.len() > 24 || STOPWORDS.contains(&tok.text.as_str()) {
                 continue;
             }
             acc.push(tok.text.clone());
@@ -628,326 +486,371 @@ fn build_query_set(sample_texts: &[&str], cfg: &Config) -> QuerySet {
                 break;
             }
         }
-        idx = idx.saturating_add(stride.max(1));
+        idx += stride;
     }
-
     QuerySet { tokens, phrases }
 }
 
 // ----------------------------------------------------------------------
-// Read test (FTS only): MemTable query latency + post-flush consistency
+// Write phase
 // ----------------------------------------------------------------------
 
-#[derive(Debug, Clone, Serialize)]
-struct ReadStats {
-    rows: usize,
-    /// Average across all queries (token + phrase).
-    mt_latency_avg_ms: f64,
-    mt_latency_p50_ms: f64,
-    mt_latency_p95_ms: f64,
-    consistency_mean: f64,
-    consistency_min: f64,
-    num_queries: usize,
-}
+async fn run_write(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_json::Value> {
+    let schema = make_schema();
+    let indexed = args.mode.indexed();
 
-async fn run_read_test(
-    target_uri: &str,
-    schema: Arc<ArrowSchema>,
-    base_texts: &[String],
-    ingest_texts: &[String],
-    queries: &QuerySet,
-    cfg: &Config,
-) -> lance_core::Result<ReadStats> {
-    println!(
-        "  read test: ingesting {} rows with auto-flush disabled",
-        ingest_texts.len()
-    );
-    build_base_if_absent(
-        target_uri,
-        schema.clone(),
-        base_texts,
-        cfg.batch_size,
-        true, // FTS index on the base, since this path is FTS-only.
-    )
-    .await?;
-    let dataset = Arc::new(Dataset::open(target_uri).await?);
-    let shard_id = Uuid::new_v4();
-    let writer_config = ShardWriterConfig {
-        shard_id,
-        shard_spec_id: 0,
-        durable_write: cfg.durable_write,
-        sync_indexed_write: true,
-        // Effectively disable auto-flush triggers so the MemTable holds
-        // the full ingest_texts.len() rows for the query phase.
-        max_memtable_size: 64 * 1024 * 1024 * 1024,
-        max_memtable_rows: ingest_texts.len().saturating_mul(2),
-        max_memtable_batches: 4_000_000,
-        max_wal_flush_interval: Some(Duration::from_millis(200)),
-        max_unflushed_memtable_bytes: usize::MAX / 2,
-        ..ShardWriterConfig::default()
-    };
-    let writer = dataset
-        .as_ref()
-        .mem_wal_writer(shard_id, writer_config)
-        .await?;
+    let seed = &corpus[..args.seed_rows.min(corpus.len())];
+    let setup_s = build_seed_dataset(uri, schema.clone(), seed, args.batch_rows, indexed).await?;
 
-    let id_offset: i64 = cfg.base_rows as i64;
-    let bs = cfg.batch_size;
-    let n = ingest_texts.len();
-    let total_batches = n.div_ceil(bs);
-    for i in 0..total_batches {
-        let lo = i * bs;
-        let hi = (lo + bs).min(n);
+    let dataset = Dataset::open(uri).await?;
+    let shard_id = Uuid::new_v4();
+    let config = shard_writer_config(args, shard_id, false);
+    let writer = dataset.mem_wal_writer(shard_id, config).await?;
+
+    // Ingest rows come from the corpus after the seed, cycled if needed.
+    let ingest_pool = &corpus[args.seed_rows.min(corpus.len())..];
+    let pool_len = ingest_pool.len().max(1);
+    let mut latencies_ms = Vec::with_capacity(args.calls);
+    let id_base = args.seed_rows as i64;
+
+    let puts_start = Instant::now();
+    for call in 0..args.calls {
+        let lo = (call * args.batch_rows) % pool_len;
+        let mut slice: Vec<&str> = Vec::with_capacity(args.batch_rows);
+        for j in 0..args.batch_rows {
+            slice.push(ingest_pool[(lo + j) % pool_len].as_str());
+        }
         let batch = make_batch(
-            id_offset + lo as i64,
-            &ingest_texts[lo..hi],
             schema.clone(),
+            id_base + (call * args.batch_rows) as i64,
+            &slice,
         );
+        let put_start = Instant::now();
         writer.put(vec![batch]).await?;
-    }
-    let target_batch_pos = total_batches.saturating_sub(1);
-    loop {
-        let active = writer.active_memtable_ref().await?;
-        if active.index_store.max_indexed_batch_position() >= target_batch_pos {
-            break;
+        latencies_ms.push(put_start.elapsed().as_secs_f64() * 1000.0);
+        if (call + 1) % 100 == 0 {
+            let rate = ((call + 1) * args.batch_rows) as f64 / puts_start.elapsed().as_secs_f64();
+            println!("  put {}/{} ({rate:.0} rows/s)", call + 1, args.calls);
         }
-        drop(active);
-        tokio::time::sleep(Duration::from_millis(50)).await;
     }
+    let elapsed_puts_s = puts_start.elapsed().as_secs_f64();
 
-    // ----- MemTable phase -----
-    let active = writer.active_memtable_ref().await?;
-    let fts = active
-        .index_store
-        .get_fts(FTS_INDEX_NAME)
-        .ok_or_else(|| lance_core::Error::invalid_input("FTS mem index not found"))?;
+    let close_start = Instant::now();
+    writer.close().await?;
+    let elapsed_close_s = close_start.elapsed().as_secs_f64();
+    let elapsed_total_s = puts_start.elapsed().as_secs_f64();
+
+    let rows = args.calls * args.batch_rows;
+    Ok(json!({
+        "phase": "write",
+        "mode": args.mode.as_str(),
+        "uri": uri,
+        "seed_rows": args.seed_rows,
+        "batch_rows": args.batch_rows,
+        "calls": args.calls,
+        "ingested_rows": rows,
+        "max_memtable_rows": args.max_memtable_rows,
+        "setup_seconds": setup_s,
+        "elapsed_puts_seconds": elapsed_puts_s,
+        "elapsed_close_seconds": elapsed_close_s,
+        "elapsed_total_seconds": elapsed_total_s,
+        // Throughput including the final close()/flush — comparable across
+        // configs with different flush cadences.
+        "throughput_rows_per_sec": rows as f64 / elapsed_total_s,
+        // Loop-only throughput (puts returned, flush may be outstanding).
+        "throughput_puts_rows_per_sec": rows as f64 / elapsed_puts_s,
+        "put_p50_ms": percentile(&latencies_ms, 50.0),
+        "put_p90_ms": percentile(&latencies_ms, 90.0),
+        "put_p99_ms": percentile(&latencies_ms, 99.0),
+        "put_max_ms": latencies_ms.iter().copied().fold(0.0_f64, f64::max),
+    }))
+}
+
+// ----------------------------------------------------------------------
+// Read phase
+// ----------------------------------------------------------------------
+
+async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_json::Value> {
+    let schema = make_schema();
 
-    let mut latencies_us: Vec<u128> = Vec::new();
-    let mut mt_top10: Vec<HashSet<i64>> = Vec::new();
+    // Seed dataset with an FTS base index (read phase is FTS-only).
+    let seed = &corpus[..args.seed_rows.min(corpus.len())];
+    let setup_s = build_seed_dataset(uri, schema.clone(), seed, args.batch_rows, true).await?;
+
+    let dataset = Dataset::open(uri).await?;
+    let shard_id = Uuid::new_v4();
+    // Auto-flush disabled so the MemTable holds the full read_rows.
+    let config = shard_writer_config(args, shard_id, true);
+    let writer = dataset.mem_wal_writer(shard_id, config).await?;
+
+    let ingest_pool = &corpus[args.seed_rows.min(corpus.len())..];
+    let n = args.read_rows.min(ingest_pool.len());
+    let id_base = args.seed_rows as i64;
+    let total_batches = n.div_ceil(args.batch_rows);
+    let ingest_start = Instant::now();
+    for b in 0..total_batches {
+        let lo = b * args.batch_rows;
+        let hi = (lo + args.batch_rows).min(n);
+        let slice: Vec<&str> = ingest_pool[lo..hi].iter().map(|s| s.as_str()).collect();
+        writer
+            .put(vec![make_batch(
+                schema.clone(),
+                id_base + lo as i64,
+                &slice,
+            )])
+            .await?;
+    }
+    let ingest_s = ingest_start.elapsed().as_secs_f64();
+    println!("  read phase: ingested {n} rows in {ingest_s:.1}s");
 
-    let all_queries: Vec<(FtsQueryExpr, String)> = queries
+    // Build the query set from the ingested slice.
+    let sample: Vec<&str> = ingest_pool[..n].iter().map(|s| s.as_str()).collect();
+    let queries = build_query_set(&sample, args);
+    let all_queries: Vec<FtsQueryExpr> = queries
         .tokens
         .iter()
-        .map(|t| (FtsQueryExpr::match_query(t.clone()), t.clone()))
+        .map(|t| FtsQueryExpr::match_query(t.clone()))
         .chain(
             queries
                 .phrases
                 .iter()
-                .map(|p| (FtsQueryExpr::phrase(p.clone()), format!("\"{p}\""))),
+                .map(|p| FtsQueryExpr::phrase(p.clone())),
         )
         .collect();
+    println!(
+        "  query set: {} tokens + {} phrases",
+        queries.tokens.len(),
+        queries.phrases.len()
+    );
 
-    // Build a row_position -> id map by scanning the active batches.
-    // This is needed because the MemTable returns row_positions; the
-    // post-flush on-disk FTS returns row_ids that match the `id` column.
+    // ---- MemTable read phase ----
+    let active = writer.active_memtable_ref().await?;
+    let fts = active
+        .index_store
+        .get_fts(FTS_INDEX_NAME)
+        .ok_or_else(|| lance_core::Error::invalid_input("FTS mem index missing"))?;
+
+    // row_position -> id, recovered from the MemTable batches.
     let mut row_to_id: HashMap<u64, i64> = HashMap::new();
     for stored in active.batch_store.iter() {
         let id_arr = stored
             .data
             .column_by_name("id")
             .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
-            .ok_or_else(|| lance_core::Error::invalid_input("id col missing"))?;
+            .ok_or_else(|| lance_core::Error::invalid_input("id column missing"))?;
         for r in 0..id_arr.len() {
             row_to_id.insert(stored.row_offset + r as u64, id_arr.value(r));
         }
     }
 
-    for (q, label) in &all_queries {
-        let opts = SearchOptions::new().with_limit(cfg.top_k);
+    let mut latencies_us = Vec::with_capacity(all_queries.len());
+    let mut mt_top: Vec<HashSet<i64>> = Vec::with_capacity(all_queries.len());
+    for q in &all_queries {
+        let opts = SearchOptions::new().with_limit(args.top_k);
         let t0 = Instant::now();
-        let entries = fts.search_with_options(q, opts);
-        latencies_us.push(t0.elapsed().as_micros());
-        let mut ids = HashSet::with_capacity(cfg.top_k);
-        for e in entries.iter().take(cfg.top_k) {
-            if let Some(id) = row_to_id.get(&e.row_position) {
-                ids.insert(*id);
-            }
-        }
-        if mt_top10.len() < 3 {
-            println!(
-                "    [mt] {label}: {} hits, ids={:?}",
-                entries.len(),
-                ids.iter().take(3).collect::<Vec<_>>()
-            );
-        }
-        mt_top10.push(ids);
+        let hits = fts.search_with_options(q, opts);
+        latencies_us.push(t0.elapsed().as_micros() as f64);
+        let ids: HashSet<i64> = hits
+            .iter()
+            .take(args.top_k)
+            .filter_map(|e| row_to_id.get(&e.row_position).copied())
+            .collect();
+        mt_top.push(ids);
     }
     drop(active);
 
-    latencies_us.sort_unstable();
-    let avg_us =
-        latencies_us.iter().sum::<u128>() as f64 / latencies_us.len().max(1) as f64;
-    let p50 = latencies_us[latencies_us.len() / 2] as f64 / 1000.0;
-    let p95 = latencies_us[latencies_us.len() * 95 / 100] as f64 / 1000.0;
+    let lat_ms: Vec<f64> = latencies_us.iter().map(|us| us / 1000.0).collect();
+    let mt_avg_ms = lat_ms.iter().sum::<f64>() / lat_ms.len().max(1) as f64;
 
-    // ----- Force flush, then on-disk phase -----
-    println!("  read test: closing writer to force flush");
+    // ---- Force flush, on-disk replay ----
+    let flush_start = Instant::now();
     writer.close().await?;
-    let flushed_dataset = Dataset::open(target_uri).await?;
-
-    let mut consistencies: Vec<f64> = Vec::with_capacity(all_queries.len());
-    for ((q, label), mt_ids) in all_queries.iter().zip(mt_top10.iter()) {
-        let fts_query = match q {
-            FtsQueryExpr::Match { query, .. } => FullTextSearchQuery::new(query.clone()),
-            FtsQueryExpr::Phrase { query, .. } => {
-                FullTextSearchQuery::new(format!("\"{}\"", query))
-            }
-            _ => unreachable!("only match/phrase queries in this set"),
+    let flush_s = flush_start.elapsed().as_secs_f64();
+    let flushed = Dataset::open(uri).await?;
+
+    let mut consistencies = Vec::with_capacity(all_queries.len());
+    for (q, mt_ids) in all_queries.iter().zip(mt_top.iter()) {
+        let query_str = match q {
+            FtsQueryExpr::Match { query, .. } => query.clone(),
+            FtsQueryExpr::Phrase { query, .. } => format!("\"{query}\""),
+            _ => unreachable!("only match/phrase queries are generated"),
         };
-        let mut scanner = flushed_dataset.scan();
-        scanner.full_text_search(fts_query)?;
-        scanner.limit(Some(cfg.top_k as i64), None)?;
+        let mut scanner = flushed.scan();
+        scanner.full_text_search(FullTextSearchQuery::new(query_str))?;
+        scanner.limit(Some(args.top_k as i64), None)?;
         scanner.project(&["id"])?;
         let stream = scanner.try_into_stream().await?;
         let batches: Vec<RecordBatch> = stream.try_collect().await?;
-        let mut disk_ids = HashSet::new();
+        let mut disk_ids: HashSet<i64> = HashSet::new();
         for b in &batches {
             let id_arr = b
                 .column_by_name("id")
                 .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
-                .ok_or_else(|| lance_core::Error::invalid_input("disk id col missing"))?;
+                .ok_or_else(|| lance_core::Error::invalid_input("disk id column missing"))?;
             for i in 0..id_arr.len() {
                 disk_ids.insert(id_arr.value(i));
             }
         }
         let inter: usize = mt_ids.intersection(&disk_ids).count();
         let denom = mt_ids.len().max(disk_ids.len()).max(1);
-        let cons = inter as f64 / denom as f64;
-        if consistencies.len() < 3 {
-            println!(
-                "    [disk] {label}: {} hits; mt={} disk={} ∩={} cons={:.3}",
-                disk_ids.len(),
-                mt_ids.len(),
-                disk_ids.len(),
-                inter,
-                cons
-            );
-        }
-        consistencies.push(cons);
+        consistencies.push(inter as f64 / denom as f64);
     }
-
     let cons_mean = consistencies.iter().sum::<f64>() / consistencies.len().max(1) as f64;
-    let cons_min = consistencies
-        .iter()
-        .copied()
-        .fold(f64::INFINITY, f64::min);
-
-    Ok(ReadStats {
-        rows: n,
-        mt_latency_avg_ms: avg_us / 1000.0,
-        mt_latency_p50_ms: p50,
-        mt_latency_p95_ms: p95,
-        consistency_mean: cons_mean,
-        consistency_min: if cons_min.is_finite() { cons_min } else { 0.0 },
-        num_queries: all_queries.len(),
-    })
+    let cons_min = consistencies.iter().copied().fold(1.0_f64, f64::min);
+
+    Ok(json!({
+        "phase": "read",
+        "mode": args.mode.as_str(),
+        "uri": uri,
+        "seed_rows": args.seed_rows,
+        "read_rows": n,
+        "max_memtable_rows": args.max_memtable_rows,
+        "setup_seconds": setup_s,
+        "ingest_seconds": ingest_s,
+        "flush_seconds": flush_s,
+        "num_queries": all_queries.len(),
+        "num_token_queries": queries.tokens.len(),
+        "num_phrase_queries": queries.phrases.len(),
+        "mt_latency_avg_ms": mt_avg_ms,
+        "mt_latency_p50_ms": percentile(&lat_ms, 50.0),
+        "mt_latency_p95_ms": percentile(&lat_ms, 95.0),
+        "mt_latency_p99_ms": percentile(&lat_ms, 99.0),
+        "consistency_mean": cons_mean,
+        "consistency_min": cons_min,
+    }))
 }
 
 // ----------------------------------------------------------------------
-// Top-level orchestration
+// CLI
 // ----------------------------------------------------------------------
 
-#[derive(Debug, Clone, Serialize)]
-struct RunResult {
-    config_name: String,
-    max_memtable_rows: usize,
-    durable_write: bool,
-    fts_enabled: bool,
-    base_rows: usize,
-    ingest_rows: usize,
-    batch_size: usize,
-    ingest: IngestStats,
-    read: Option<ReadStats>,
-    timestamp_utc: String,
+fn parse<T>(flag: &str, value: &str) -> Result<T>
+where
+    T: std::str::FromStr,
+    T::Err: std::fmt::Display,
+{
+    value
+        .parse()
+        .map_err(|e| lance_core::Error::invalid_input(format!("invalid {flag}: {value} ({e})")))
 }
 
-#[tokio::main(flavor = "multi_thread")]
-async fn main() -> lance_core::Result<()> {
-    let cfg = Config::from_env();
-    println!("=== mem_wal_fineweb_fts === config = {:?}", cfg);
-
-    let total_rows = cfg.base_rows + cfg.ingest_rows;
-    let texts = load_corpus(total_rows, &cfg.cache_dir).await?;
-    if texts.len() < total_rows {
-        return Err(lance_core::Error::io(format!(
-            "fineweb shards yielded only {} rows, need {}",
-            texts.len(),
-            total_rows
-        )));
+fn parse_args() -> Result<Args> {
+    let mut args = Args::default();
+    let mut iter = std::env::args().skip(1);
+    while let Some(flag) = iter.next() {
+        if flag == "--bench" {
+            continue;
+        }
+        let value = iter
+            .next()
+            .ok_or_else(|| lance_core::Error::invalid_input(format!("missing value for {flag}")))?;
+        match flag.as_str() {
+            "--phase" => {
+                args.phase = Phase::parse(&value).map_err(lance_core::Error::invalid_input)?
+            }
+            "--mode" => {
+                args.mode = Mode::parse(&value).map_err(lance_core::Error::invalid_input)?
+            }
+            "--uri" => args.uri = Some(value),
+            "--seed-rows" => args.seed_rows = parse(&flag, &value)?,
+            "--batch-rows" => args.batch_rows = parse(&flag, &value)?,
+            "--calls" => args.calls = parse(&flag, &value)?,
+            "--read-rows" => args.read_rows = parse(&flag, &value)?,
+            "--max-memtable-rows" => {
+                let v: usize = parse(&flag, &value)?;
+                args.max_memtable_rows = (v != 0).then_some(v);
+            }
+            "--max-memtable-size" => args.max_memtable_size = parse(&flag, &value)?,
+            "--max-unflushed-memtable-bytes" => {
+                args.max_unflushed_memtable_bytes = parse(&flag, &value)?
+            }
+            "--max-wal-flush-interval-ms" => args.max_wal_flush_interval_ms = parse(&flag, &value)?,
+            "--cache-dir" => args.cache_dir = PathBuf::from(value),
+            "--num-token-queries" => args.num_token_queries = parse(&flag, &value)?,
+            "--num-phrase-queries" => args.num_phrase_queries = parse(&flag, &value)?,
+            "--top-k" => args.top_k = parse(&flag, &value)?,
+            "--tokio-threads" => args.tokio_threads = parse(&flag, &value)?,
+            "--output" => args.output = Some(PathBuf::from(value)),
+            _ => {
+                return Err(lance_core::Error::invalid_input(format!(
+                    "unknown argument: {flag}"
+                )));
+            }
+        }
     }
-    let base_texts = &texts[..cfg.base_rows];
-    let ingest_texts = &texts[cfg.base_rows..cfg.base_rows + cfg.ingest_rows];
+    if args.batch_rows == 0 || args.calls == 0 || args.seed_rows == 0 {
+        return Err(lance_core::Error::invalid_input(
+            "seed-rows, batch-rows, calls must be > 0",
+        ));
+    }
+    Ok(args)
+}
 
-    let schema = make_schema();
+async fn run(args: Args) -> Result<()> {
+    let temp = if args.uri.is_none() {
+        Some(tempfile::tempdir().map_err(|e| lance_core::Error::io(format!("tempdir: {e}")))?)
+    } else {
+        None
+    };
+    let uri = match &args.uri {
+        Some(u) => u.clone(),
+        None => temp
+            .as_ref()
+            .unwrap()
+            .path()
+            .join("fineweb_fts.lance")
+            .display()
+            .to_string(),
+    };
 
-    // Build query set once from the ingest slice (deterministic).
-    let sample_refs: Vec<&str> = ingest_texts.iter().take(50_000).map(|s| s.as_str()).collect();
-    let queries = build_query_set(&sample_refs, &cfg);
     println!(
-        "query set: {} tokens + {} phrases",
-        queries.tokens.len(),
-        queries.phrases.len()
+        "bench=mem_wal_fineweb_fts phase={} mode={} uri={} seed_rows={} batch_rows={} calls={} read_rows={} max_memtable_rows={:?}",
+        args.phase.as_str(),
+        args.mode.as_str(),
+        uri,
+        args.seed_rows,
+        args.batch_rows,
+        args.calls,
+        args.read_rows,
+        args.max_memtable_rows,
     );
 
-    // Throughput sub-test: ingest 1M with the configured params.
-    println!("\n--- throughput sub-test ---");
-    let ingest_stats = ingest_via_shard_writer(
-        &cfg.ingest_uri(),
-        schema.clone(),
-        base_texts,
-        ingest_texts,
-        &cfg,
-        false, // auto-flush enabled (per max_memtable_rows)
-    )
-    .await?;
-    println!("throughput: {:.1} rows/s", ingest_stats.rows_per_sec);
-
-    // Read sub-test: only when FTS enabled and read test requested.
-    let read_stats = if cfg.with_read_test {
-        println!("\n--- read sub-test ---");
-        let n_for_read = cfg.max_memtable_rows.min(ingest_texts.len());
-        let read_ingest = &ingest_texts[..n_for_read];
-        Some(
-            run_read_test(
-                &cfg.read_test_uri(),
-                schema.clone(),
-                base_texts,
-                read_ingest,
-                &queries,
-                &cfg,
-            )
-            .await?,
-        )
-    } else {
-        None
+    let ingest_needed = match args.phase {
+        Phase::Write => args.calls * args.batch_rows,
+        Phase::Read => args.read_rows,
     };
-
-    let timestamp_utc = chrono::Utc::now().to_rfc3339();
-    let result = RunResult {
-        config_name: cfg.config_name.clone(),
-        max_memtable_rows: cfg.max_memtable_rows,
-        durable_write: cfg.durable_write,
-        fts_enabled: cfg.fts_enabled,
-        base_rows: cfg.base_rows,
-        ingest_rows: cfg.ingest_rows,
-        batch_size: cfg.batch_size,
-        ingest: ingest_stats,
-        read: read_stats,
-        timestamp_utc,
+    // Cap downloaded corpus; the write phase cycles the ingest pool if it
+    // needs more rows than were loaded.
+    let corpus_rows = (args.seed_rows + ingest_needed).min(2_000_000);
+    let corpus = load_corpus(corpus_rows, &args.cache_dir).await?;
+
+    let result = match args.phase {
+        Phase::Write => run_write(&args, &uri, &corpus).await?,
+        Phase::Read => run_read(&args, &uri, &corpus).await?,
     };
-    let json = serde_json::to_string_pretty(&result)
-        .map_err(|e| lance_core::Error::io(format!("serialize result: {}", e)))?;
-    if let Some(parent) = cfg.result_file.parent() {
-        if !parent.as_os_str().is_empty() {
+
+    let text = serde_json::to_string_pretty(&result)
+        .map_err(|e| lance_core::Error::io(format!("serialize: {e}")))?;
+    println!("{text}");
+    if let Some(path) = &args.output {
+        if let Some(parent) = path.parent()
+            && !parent.as_os_str().is_empty()
+        {
             std::fs::create_dir_all(parent).ok();
         }
+        std::fs::write(path, text.as_bytes())
+            .map_err(|e| lance_core::Error::io(format!("write {}: {e}", path.display())))?;
     }
-    std::fs::write(&cfg.result_file, json.as_bytes())
-        .map_err(|e| lance_core::Error::io(format!("write result: {}", e)))?;
-    println!("\nwrote result to {}", cfg.result_file.display());
     println!("=== DONE ===");
-    let _ = result; // silence unused with no read test
-    let _ = sample_refs;
-    let _ = BTreeMap::<String, String>::new();
     Ok(())
 }
+
+fn main() -> Result<()> {
+    let args = parse_args()?;
+    let runtime = tokio::runtime::Builder::new_multi_thread()
+        .worker_threads(args.tokio_threads.max(1))
+        .enable_all()
+        .build()
+        .map_err(|e| lance_core::Error::io(format!("build runtime: {e}")))?;
+    runtime.block_on(run(args))
+}

From c6e1e2278a74531b3e4dc92f6bb13ef696f3b9ff Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 02:29:14 -0700
Subject: [PATCH 06/37] fix(bench): wait for FTS index catchup before the
 read-phase queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

In async-index modes the FTS index is updated in the background
WAL-flush handler, so querying the MemTable right after the put loop
hit an empty index — consistency scored 0 because MemTable top-10 was
always empty. The read phase now waits for the IndexStore visibility
watermark to reach the last buffered batch before querying. Auto-flush
is disabled in the read phase so the watermark is a stable, monotonic
target. Reports the catchup time as index_catchup_seconds.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 29 +++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index afeed61c5de..affe902d8c5 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -603,6 +603,34 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
     let ingest_s = ingest_start.elapsed().as_secs_f64();
     println!("  read phase: ingested {n} rows in {ingest_s:.1}s");
 
+    // Wait for the FTS index to catch up before querying the MemTable. In
+    // async-index modes the index is updated in the background WAL-flush
+    // handler, so right after the put loop the FtsMemIndex is still empty.
+    // Auto-flush is disabled in the read phase, so the active memtable and
+    // its batch positions are stable and `max_indexed_batch_position` is a
+    // monotonic, meaningful target here (unlike the write phase).
+    let catchup_start = Instant::now();
+    let catchup_deadline = Duration::from_secs(900);
+    loop {
+        let active = writer.active_memtable_ref().await?;
+        let target = active.batch_store.max_buffered_batch_position();
+        let visible = active.index_store.max_visible_batch_position();
+        drop(active);
+        match target {
+            None => break,
+            Some(t) if visible >= t => break,
+            Some(_) => {}
+        }
+        if catchup_start.elapsed() > catchup_deadline {
+            return Err(lance_core::Error::io(
+                "FTS index did not catch up within 900s".to_string(),
+            ));
+        }
+        tokio::time::sleep(Duration::from_millis(100)).await;
+    }
+    let index_catchup_s = catchup_start.elapsed().as_secs_f64();
+    println!("  read phase: index caught up in {index_catchup_s:.1}s");
+
     // Build the query set from the ingested slice.
     let sample: Vec<&str> = ingest_pool[..n].iter().map(|s| s.as_str()).collect();
     let queries = build_query_set(&sample, args);
@@ -707,6 +735,7 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
         "max_memtable_rows": args.max_memtable_rows,
         "setup_seconds": setup_s,
         "ingest_seconds": ingest_s,
+        "index_catchup_seconds": index_catchup_s,
         "flush_seconds": flush_s,
         "num_queries": all_queries.len(),
         "num_token_queries": queries.tokens.len(),

From 6bfc14a5569306e3ed3edd93948bdca0de64ce69 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 03:41:10 -0700
Subject: [PATCH 07/37] fix(bench): force sync_indexed_write in the FTS read
 phase

The watermark-based catchup wait never completed for async-index mode
with auto-flush disabled. The read panel measures FTS read latency and
MemTable/on-disk consistency, both of which are properties of the
populated index and independent of whether ingestion indexed sync or
async. So the read phase now forces sync_indexed_write: the FTS index
is fully populated when the put loop returns and the query phase can
start immediately, no watermark polling.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 37 ++++-------------------
 1 file changed, 6 insertions(+), 31 deletions(-)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index affe902d8c5..64daa907488 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -579,8 +579,12 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
 
     let dataset = Dataset::open(uri).await?;
     let shard_id = Uuid::new_v4();
-    // Auto-flush disabled so the MemTable holds the full read_rows.
-    let config = shard_writer_config(args, shard_id, true);
+    // Auto-flush disabled so the MemTable holds the full read_rows, and
+    // sync_indexed_write forced on so the FTS index is fully populated
+    // when the put loop returns — the read panel measures FTS read
+    // latency and consistency, which are properties of the populated
+    // index and do not depend on whether ingestion was sync or async.
+    let config = shard_writer_config(args, shard_id, true).with_sync_indexed_write(true);
     let writer = dataset.mem_wal_writer(shard_id, config).await?;
 
     let ingest_pool = &corpus[args.seed_rows.min(corpus.len())..];
@@ -603,34 +607,6 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
     let ingest_s = ingest_start.elapsed().as_secs_f64();
     println!("  read phase: ingested {n} rows in {ingest_s:.1}s");
 
-    // Wait for the FTS index to catch up before querying the MemTable. In
-    // async-index modes the index is updated in the background WAL-flush
-    // handler, so right after the put loop the FtsMemIndex is still empty.
-    // Auto-flush is disabled in the read phase, so the active memtable and
-    // its batch positions are stable and `max_indexed_batch_position` is a
-    // monotonic, meaningful target here (unlike the write phase).
-    let catchup_start = Instant::now();
-    let catchup_deadline = Duration::from_secs(900);
-    loop {
-        let active = writer.active_memtable_ref().await?;
-        let target = active.batch_store.max_buffered_batch_position();
-        let visible = active.index_store.max_visible_batch_position();
-        drop(active);
-        match target {
-            None => break,
-            Some(t) if visible >= t => break,
-            Some(_) => {}
-        }
-        if catchup_start.elapsed() > catchup_deadline {
-            return Err(lance_core::Error::io(
-                "FTS index did not catch up within 900s".to_string(),
-            ));
-        }
-        tokio::time::sleep(Duration::from_millis(100)).await;
-    }
-    let index_catchup_s = catchup_start.elapsed().as_secs_f64();
-    println!("  read phase: index caught up in {index_catchup_s:.1}s");
-
     // Build the query set from the ingested slice.
     let sample: Vec<&str> = ingest_pool[..n].iter().map(|s| s.as_str()).collect();
     let queries = build_query_set(&sample, args);
@@ -735,7 +711,6 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
         "max_memtable_rows": args.max_memtable_rows,
         "setup_seconds": setup_s,
         "ingest_seconds": ingest_s,
-        "index_catchup_seconds": index_catchup_s,
         "flush_seconds": flush_s,
         "num_queries": all_queries.len(),
         "num_token_queries": queries.tokens.len(),

From 802d5d49ce85663a30c5ff20731c46b3a26e6a61 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 03:50:15 -0700
Subject: [PATCH 08/37] fix(bench): tiny read-phase seed + id prefilter for
 consistency
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The read-phase consistency scored 0 because the 1M-row seed polluted
the on-disk comparison: the MemTable FTS index covers only the ingested
rows, but `full_text_search` ranked over all 1.1M rows and its top-10
was dominated by seed rows the MemTable never held — disjoint by
construction.

The read phase now seeds with only READ_SEED_ROWS (1000) rows and
prefilters the on-disk query to `id >= READ_SEED_ROWS`, so both the
MemTable and on-disk top-10 cover exactly the ingested population and
BM25 stats are effectively identical.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 31 +++++++++++++++--------
 1 file changed, 21 insertions(+), 10 deletions(-)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index 64daa907488..3f03f9f3f64 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -60,6 +60,12 @@ use uuid::Uuid;
 
 const TEXT_COL: &str = "text";
 const FTS_INDEX_NAME: &str = "text_fts";
+/// Seed-row count for the read phase. Kept tiny so the on-disk corpus is
+/// effectively just the ingested rows: the MemTable FTS index covers only
+/// the ingested rows, so the on-disk `full_text_search` it is compared
+/// against must too. The on-disk query is additionally prefiltered to
+/// `id >= READ_SEED_ROWS` to drop these few seed rows entirely.
+const READ_SEED_ROWS: usize = 1000;
 const HF_API_LISTING: &str =
     "https://huggingface.co/api/datasets/HuggingFaceFW/fineweb/tree/main/sample/10BT";
 const HF_FILE_BASE: &str = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/";
@@ -573,8 +579,10 @@ async fn run_write(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_js
 async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_json::Value> {
     let schema = make_schema();
 
-    // Seed dataset with an FTS base index (read phase is FTS-only).
-    let seed = &corpus[..args.seed_rows.min(corpus.len())];
+    // Tiny seed dataset with an FTS base index (read phase is FTS-only).
+    // See READ_SEED_ROWS for why the seed is kept small.
+    let read_seed = READ_SEED_ROWS.min(corpus.len());
+    let seed = &corpus[..read_seed];
     let setup_s = build_seed_dataset(uri, schema.clone(), seed, args.batch_rows, true).await?;
 
     let dataset = Dataset::open(uri).await?;
@@ -587,9 +595,9 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
     let config = shard_writer_config(args, shard_id, true).with_sync_indexed_write(true);
     let writer = dataset.mem_wal_writer(shard_id, config).await?;
 
-    let ingest_pool = &corpus[args.seed_rows.min(corpus.len())..];
+    let ingest_pool = &corpus[read_seed..];
     let n = args.read_rows.min(ingest_pool.len());
-    let id_base = args.seed_rows as i64;
+    let id_base = read_seed as i64;
     let total_batches = n.div_ceil(args.batch_rows);
     let ingest_start = Instant::now();
     for b in 0..total_batches {
@@ -681,6 +689,9 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
         };
         let mut scanner = flushed.scan();
         scanner.full_text_search(FullTextSearchQuery::new(query_str))?;
+        // Restrict to the ingested rows so the on-disk comparison covers
+        // exactly the population the MemTable FTS index held.
+        scanner.filter(&format!("id >= {read_seed}"))?;
         scanner.limit(Some(args.top_k as i64), None)?;
         scanner.project(&["id"])?;
         let stream = scanner.try_into_stream().await?;
@@ -819,13 +830,13 @@ async fn run(args: Args) -> Result<()> {
         args.max_memtable_rows,
     );
 
-    let ingest_needed = match args.phase {
-        Phase::Write => args.calls * args.batch_rows,
-        Phase::Read => args.read_rows,
+    // Corpus size differs by phase: the write phase seeds with
+    // `seed_rows` and cycles the ingest pool; the read phase uses only a
+    // tiny `READ_SEED_ROWS` seed plus `read_rows` ingested rows.
+    let corpus_rows = match args.phase {
+        Phase::Write => (args.seed_rows + args.calls * args.batch_rows).min(2_000_000),
+        Phase::Read => (READ_SEED_ROWS + args.read_rows).min(2_000_000),
     };
-    // Cap downloaded corpus; the write phase cycles the ingest pool if it
-    // needs more rows than were loaded.
-    let corpus_rows = (args.seed_rows + ingest_needed).min(2_000_000);
     let corpus = load_corpus(corpus_rows, &args.cache_dir).await?;
 
     let result = match args.phase {

From c900c7bec3169c75f74d5912c24a1b48f9743c3e Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 04:10:56 -0700
Subject: [PATCH 09/37] fix(bench): query MemTable FTS via MemTableScanner,
 compare to reference dataset

The read-phase consistency was unmeasurable: (1) the post-MemWAL-flush
on-disk query saw nothing because flushed data lives in the MemWAL LSM
structure that a plain Dataset::scan does not traverse, and (2) the
manual FtsMemIndex row_position -> id mapping was mis-aligned.

The read phase now queries the MemTable through the production
MemTableScanner (returns the id column directly, no manual mapping) and
compares against a freshly built reference Lance dataset holding the
identical ingested rows + a normal FTS index. Both sides cover exactly
the same population with identical BM25 stats. Token queries are drawn
from mid-frequency terms so the top-10 is well-determined rather than a
high-frequency near-tie.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 200 ++++++++++++++--------
 1 file changed, 127 insertions(+), 73 deletions(-)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index 3f03f9f3f64..4e0322059c4 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -45,8 +45,7 @@ use std::time::{Duration, Instant};
 use arrow_array::{Array, ArrayRef, Int64Array, RecordBatch, RecordBatchIterator, StringArray};
 use arrow_schema::{DataType, Field, Schema as ArrowSchema};
 use futures::TryStreamExt;
-use lance::dataset::mem_wal::index::{FtsQueryExpr, SearchOptions};
-use lance::dataset::mem_wal::{DatasetMemWalExt, MemWalConfig, ShardWriterConfig};
+use lance::dataset::mem_wal::{DatasetMemWalExt, MemTableScanner, MemWalConfig, ShardWriterConfig};
 use lance::dataset::{Dataset, WriteParams};
 use lance::index::DatasetIndexExt;
 use lance_core::Result;
@@ -470,8 +469,15 @@ fn build_query_set(sample: &[&str], args: &Args) -> QuerySet {
     }
     let mut by_freq: Vec<(String, u64)> = freq.into_iter().collect();
     by_freq.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
+    // Skip the most-frequent tokens: terms like "all"/"one"/"time" appear
+    // in a huge fraction of docs, so thousands of documents tie on BM25
+    // and the top-10 is an unstable near-tie that does not meaningfully
+    // exercise FTS correctness. Mid-frequency terms have a well-determined
+    // top-10. The skip is capped so a small vocabulary still yields queries.
+    let skip = (by_freq.len() / 4).min(300);
     let tokens: Vec<String> = by_freq
         .into_iter()
+        .skip(skip)
         .map(|(t, _)| t)
         .take(args.num_token_queries)
         .collect();
@@ -615,99 +621,97 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
     let ingest_s = ingest_start.elapsed().as_secs_f64();
     println!("  read phase: ingested {n} rows in {ingest_s:.1}s");
 
-    // Build the query set from the ingested slice.
+    // Build the query set from the ingested slice. Each query is a
+    // (is_phrase, query_string) pair so the same set drives both the
+    // MemTableScanner and the reference Dataset scanner.
     let sample: Vec<&str> = ingest_pool[..n].iter().map(|s| s.as_str()).collect();
     let queries = build_query_set(&sample, args);
-    let all_queries: Vec<FtsQueryExpr> = queries
+    let num_token_queries = queries.tokens.len();
+    let num_phrase_queries = queries.phrases.len();
+    let all_queries: Vec<(bool, String)> = queries
         .tokens
         .iter()
-        .map(|t| FtsQueryExpr::match_query(t.clone()))
-        .chain(
-            queries
-                .phrases
-                .iter()
-                .map(|p| FtsQueryExpr::phrase(p.clone())),
-        )
+        .map(|t| (false, t.clone()))
+        .chain(queries.phrases.iter().map(|p| (true, p.clone())))
         .collect();
-    println!(
-        "  query set: {} tokens + {} phrases",
-        queries.tokens.len(),
-        queries.phrases.len()
-    );
+    println!("  query set: {num_token_queries} tokens + {num_phrase_queries} phrases");
 
-    // ---- MemTable read phase ----
+    // ---- MemTable read phase: query through the production MemTableScanner ----
+    // The scanner returns RecordBatches projected to `id`, so there is no
+    // need to map FtsMemIndex row positions back to ids by hand.
     let active = writer.active_memtable_ref().await?;
-    let fts = active
-        .index_store
-        .get_fts(FTS_INDEX_NAME)
-        .ok_or_else(|| lance_core::Error::invalid_input("FTS mem index missing"))?;
-
-    // row_position -> id, recovered from the MemTable batches.
-    let mut row_to_id: HashMap<u64, i64> = HashMap::new();
-    for stored in active.batch_store.iter() {
-        let id_arr = stored
-            .data
-            .column_by_name("id")
-            .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
-            .ok_or_else(|| lance_core::Error::invalid_input("id column missing"))?;
-        for r in 0..id_arr.len() {
-            row_to_id.insert(stored.row_offset + r as u64, id_arr.value(r));
-        }
-    }
+    let mt_batch_store = active.batch_store.clone();
+    let mt_index_store = active.index_store.clone();
+    let mt_schema = active.schema.clone();
+    drop(active);
 
     let mut latencies_us = Vec::with_capacity(all_queries.len());
     let mut mt_top: Vec<HashSet<i64>> = Vec::with_capacity(all_queries.len());
-    for q in &all_queries {
-        let opts = SearchOptions::new().with_limit(args.top_k);
+    for (qi, (is_phrase, q)) in all_queries.iter().enumerate() {
+        let mut scanner = MemTableScanner::new(
+            mt_batch_store.clone(),
+            mt_index_store.clone(),
+            mt_schema.clone(),
+        );
+        if *is_phrase {
+            scanner.full_text_phrase(TEXT_COL, q, 0);
+        } else {
+            scanner.full_text_search(TEXT_COL, q);
+        }
+        scanner.project(&["id"]);
+        scanner.limit(args.top_k, None);
         let t0 = Instant::now();
-        let hits = fts.search_with_options(q, opts);
+        let stream = scanner.try_into_stream().await?;
+        let batches: Vec<RecordBatch> = stream.try_collect().await?;
         latencies_us.push(t0.elapsed().as_micros() as f64);
-        let ids: HashSet<i64> = hits
-            .iter()
-            .take(args.top_k)
-            .filter_map(|e| row_to_id.get(&e.row_position).copied())
-            .collect();
+        let ids = collect_ids(&batches)?;
+        if qi < 5 {
+            println!(
+                "  [debug mt q{qi}] phrase={is_phrase} '{q}' -> {} ids {:?}",
+                ids.len(),
+                ids.iter().take(5).collect::<Vec<_>>()
+            );
+        }
         mt_top.push(ids);
     }
-    drop(active);
-
     let lat_ms: Vec<f64> = latencies_us.iter().map(|us| us / 1000.0).collect();
     let mt_avg_ms = lat_ms.iter().sum::<f64>() / lat_ms.len().max(1) as f64;
 
-    // ---- Force flush, on-disk replay ----
-    let flush_start = Instant::now();
-    writer.close().await?;
-    let flush_s = flush_start.elapsed().as_secs_f64();
-    let flushed = Dataset::open(uri).await?;
+    // ---- Reference: a plain on-disk dataset over the identical ingested
+    // rows + a normal FTS index. The MemTable FTS results are validated
+    // against this. No MemWAL flush is involved: the flushed data lives in
+    // the MemWAL LSM structure which a plain `Dataset::scan` does not see,
+    // so a separate reference dataset is the apples-to-apples comparison.
+    drop(writer);
+    let ref_uri = format!("{uri}_ref");
+    let ref_build_s =
+        build_reference_dataset(&ref_uri, schema.clone(), &sample, id_base, args.batch_rows)
+            .await?;
+    let ref_ds = Dataset::open(&ref_uri).await?;
 
     let mut consistencies = Vec::with_capacity(all_queries.len());
-    for (q, mt_ids) in all_queries.iter().zip(mt_top.iter()) {
-        let query_str = match q {
-            FtsQueryExpr::Match { query, .. } => query.clone(),
-            FtsQueryExpr::Phrase { query, .. } => format!("\"{query}\""),
-            _ => unreachable!("only match/phrase queries are generated"),
+    for (qi, ((is_phrase, q), mt_ids)) in all_queries.iter().zip(mt_top.iter()).enumerate() {
+        let query_str = if *is_phrase {
+            format!("\"{q}\"")
+        } else {
+            q.clone()
         };
-        let mut scanner = flushed.scan();
-        scanner.full_text_search(FullTextSearchQuery::new(query_str))?;
-        // Restrict to the ingested rows so the on-disk comparison covers
-        // exactly the population the MemTable FTS index held.
-        scanner.filter(&format!("id >= {read_seed}"))?;
+        let mut scanner = ref_ds.scan();
+        scanner.full_text_search(FullTextSearchQuery::new(query_str.clone()))?;
         scanner.limit(Some(args.top_k as i64), None)?;
         scanner.project(&["id"])?;
         let stream = scanner.try_into_stream().await?;
         let batches: Vec<RecordBatch> = stream.try_collect().await?;
-        let mut disk_ids: HashSet<i64> = HashSet::new();
-        for b in &batches {
-            let id_arr = b
-                .column_by_name("id")
-                .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
-                .ok_or_else(|| lance_core::Error::invalid_input("disk id column missing"))?;
-            for i in 0..id_arr.len() {
-                disk_ids.insert(id_arr.value(i));
-            }
-        }
+        let disk_ids = collect_ids(&batches)?;
         let inter: usize = mt_ids.intersection(&disk_ids).count();
         let denom = mt_ids.len().max(disk_ids.len()).max(1);
+        if qi < 5 {
+            println!(
+                "  [debug ref q{qi}] '{query_str}' -> ref={} mt={} inter={inter}",
+                disk_ids.len(),
+                mt_ids.len()
+            );
+        }
         consistencies.push(inter as f64 / denom as f64);
     }
     let cons_mean = consistencies.iter().sum::<f64>() / consistencies.len().max(1) as f64;
@@ -717,15 +721,14 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
         "phase": "read",
         "mode": args.mode.as_str(),
         "uri": uri,
-        "seed_rows": args.seed_rows,
         "read_rows": n,
         "max_memtable_rows": args.max_memtable_rows,
         "setup_seconds": setup_s,
         "ingest_seconds": ingest_s,
-        "flush_seconds": flush_s,
+        "ref_build_seconds": ref_build_s,
         "num_queries": all_queries.len(),
-        "num_token_queries": queries.tokens.len(),
-        "num_phrase_queries": queries.phrases.len(),
+        "num_token_queries": num_token_queries,
+        "num_phrase_queries": num_phrase_queries,
         "mt_latency_avg_ms": mt_avg_ms,
         "mt_latency_p50_ms": percentile(&lat_ms, 50.0),
         "mt_latency_p95_ms": percentile(&lat_ms, 95.0),
@@ -735,6 +738,57 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
     }))
 }
 
+/// Extract the `id` column values from a list of result batches.
+fn collect_ids(batches: &[RecordBatch]) -> Result<HashSet<i64>> {
+    let mut ids = HashSet::new();
+    for b in batches {
+        let id_arr = b
+            .column_by_name("id")
+            .and_then(|c| c.as_any().downcast_ref::<Int64Array>())
+            .ok_or_else(|| lance_core::Error::invalid_input("id column missing in result"))?;
+        for i in 0..id_arr.len() {
+            ids.insert(id_arr.value(i));
+        }
+    }
+    Ok(ids)
+}
+
+/// Build a plain Lance dataset (no MemWAL) over `texts` with ids starting
+/// at `id_base`, plus a normal FTS index. Used as the read-phase
+/// comparison reference.
+async fn build_reference_dataset(
+    ref_uri: &str,
+    schema: Arc<ArrowSchema>,
+    texts: &[&str],
+    id_base: i64,
+    batch_rows: usize,
+) -> Result<f64> {
+    let start = Instant::now();
+    let mut batches = Vec::with_capacity(texts.len().div_ceil(batch_rows));
+    let mut lo = 0usize;
+    while lo < texts.len() {
+        let hi = (lo + batch_rows).min(texts.len());
+        batches.push(Ok(make_batch(
+            schema.clone(),
+            id_base + lo as i64,
+            &texts[lo..hi],
+        )));
+        lo = hi;
+    }
+    let reader = RecordBatchIterator::new(batches.into_iter(), schema.clone());
+    let mut dataset = Dataset::write(reader, ref_uri, Some(WriteParams::default())).await?;
+    dataset
+        .create_index(
+            &[TEXT_COL],
+            IndexType::Inverted,
+            Some(FTS_INDEX_NAME.to_string()),
+            &InvertedIndexParams::default(),
+            true,
+        )
+        .await?;
+    Ok(start.elapsed().as_secs_f64())
+}
+
 // ----------------------------------------------------------------------
 // CLI
 // ----------------------------------------------------------------------

From bd295cd6ab64027d21e120ac5f77b11198ca4b65 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 04:35:49 -0700
Subject: [PATCH 10/37] fix(bench): force durable_write in the FTS read phase

The MemTableScanner snapshots the visibility watermark at plan time,
and that watermark only advances as the WAL becomes durable. Without
durable_write the read phase queried a partially-visible MemTable: the
async-mode read configs scored consistency ~0.03-0.23 while the
durable sync-mode ones scored ~0.72-0.75 over identical data. The read
phase now forces durable_write on so the scanner always sees the full
ingested MemTable.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index 4e0322059c4..4e35265bac7 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -593,12 +593,18 @@ async fn run_read(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_jso
 
     let dataset = Dataset::open(uri).await?;
     let shard_id = Uuid::new_v4();
-    // Auto-flush disabled so the MemTable holds the full read_rows, and
-    // sync_indexed_write forced on so the FTS index is fully populated
-    // when the put loop returns — the read panel measures FTS read
-    // latency and consistency, which are properties of the populated
-    // index and do not depend on whether ingestion was sync or async.
-    let config = shard_writer_config(args, shard_id, true).with_sync_indexed_write(true);
+    // Auto-flush disabled so the MemTable holds the full read_rows.
+    // sync_indexed_write + durable_write are both forced on: the read
+    // panel measures FTS read latency and consistency, which are
+    // properties of the *fully visible* MemTable. The MemTableScanner
+    // respects the visibility watermark, and that watermark only advances
+    // as the WAL becomes durable — so without durable_write the scanner
+    // would query a partially-visible MemTable and report garbage. Both
+    // are properties of ingestion, not of the read path, so forcing them
+    // does not bias the latency/consistency measurement.
+    let config = shard_writer_config(args, shard_id, true)
+        .with_sync_indexed_write(true)
+        .with_durable_write(true);
     let writer = dataset.mem_wal_writer(shard_id, config).await?;
 
     let ingest_pool = &corpus[read_seed..];

From d99cf7d0c46f02751fb3293f8ac19ec80b570b7f Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 08:55:09 -0700
Subject: [PATCH 11/37] bench: add --target-rows-per-sec pacing + backlog
 metrics to FTS write phase

Adds paced ingest to the write phase for a backpressure sweep, mirroring
the mem_wal_shard_writer_backpressure bench used for the HNSW vector
panel. The write phase now reports slow_puts_ge_1s/10s, the memtable +
pending-WAL backlog left when the put loop ends, and honors a paced
target so a sweep can find the max sustainable async FTS throughput
(the rate at which the flush/index pipeline keeps up without
accumulating backpressure).
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 36 +++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index 4e35265bac7..101442efa52 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -159,6 +159,9 @@ struct Args {
     max_memtable_size: usize,
     max_unflushed_memtable_bytes: usize,
     max_wal_flush_interval_ms: u64,
+    /// Paced ingest target for the write phase. `None` = unpaced (puts
+    /// issued as fast as possible). Used for the backpressure sweep.
+    target_rows_per_sec: Option<f64>,
     cache_dir: PathBuf,
     num_token_queries: usize,
     num_phrase_queries: usize,
@@ -182,6 +185,7 @@ impl Default for Args {
             max_memtable_size: 16 * 1024 * 1024 * 1024,
             max_unflushed_memtable_bytes: 8 * 1024 * 1024 * 1024,
             max_wal_flush_interval_ms: 100,
+            target_rows_per_sec: None,
             cache_dir: std::env::temp_dir().join("mem_wal_fineweb_fts_cache"),
             num_token_queries: 100,
             num_phrase_queries: 50,
@@ -544,14 +548,35 @@ async fn run_write(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_js
             let rate = ((call + 1) * args.batch_rows) as f64 / puts_start.elapsed().as_secs_f64();
             println!("  put {}/{} ({rate:.0} rows/s)", call + 1, args.calls);
         }
+        // Pace the ingest to `target_rows_per_sec` for the backpressure
+        // sweep: sleep so the (call+1)-th put completes no earlier than
+        // its scheduled time. The MemWAL writer's own backpressure can
+        // still delay a put past schedule — that delay is what the sweep
+        // is looking for.
+        if let Some(target) = args.target_rows_per_sec
+            && target > 0.0
+        {
+            let scheduled = ((call + 1) * args.batch_rows) as f64 / target;
+            let actual = puts_start.elapsed().as_secs_f64();
+            if scheduled > actual {
+                tokio::time::sleep(Duration::from_secs_f64(scheduled - actual)).await;
+            }
+        }
     }
     let elapsed_puts_s = puts_start.elapsed().as_secs_f64();
 
+    // Backlog still buffered when the put loop ends: if the writer kept
+    // up with the ingest rate this is small; a large backlog means the
+    // flush/index pipeline fell behind (accumulating backpressure).
+    let backlog = writer.memtable_stats().await.ok();
+
     let close_start = Instant::now();
     writer.close().await?;
     let elapsed_close_s = close_start.elapsed().as_secs_f64();
     let elapsed_total_s = puts_start.elapsed().as_secs_f64();
 
+    let slow_puts_1s = latencies_ms.iter().filter(|ms| **ms >= 1_000.0).count();
+    let slow_puts_10s = latencies_ms.iter().filter(|ms| **ms >= 10_000.0).count();
     let rows = args.calls * args.batch_rows;
     Ok(json!({
         "phase": "write",
@@ -562,10 +587,17 @@ async fn run_write(args: &Args, uri: &str, corpus: &[String]) -> Result<serde_js
         "calls": args.calls,
         "ingested_rows": rows,
         "max_memtable_rows": args.max_memtable_rows,
+        "target_rows_per_sec": args.target_rows_per_sec,
         "setup_seconds": setup_s,
         "elapsed_puts_seconds": elapsed_puts_s,
         "elapsed_close_seconds": elapsed_close_s,
         "elapsed_total_seconds": elapsed_total_s,
+        "slow_puts_ge_1s": slow_puts_1s,
+        "slow_puts_ge_10s": slow_puts_10s,
+        "backlog_memtable_rows": backlog.as_ref().map(|s| s.row_count),
+        "backlog_pending_wal_rows": backlog.as_ref().map(|s| s.pending_wal_row_count),
+        "backlog_pending_wal_mb":
+            backlog.as_ref().map(|s| s.pending_wal_estimated_bytes as f64 / 1.0e6),
         // Throughput including the final close()/flush — comparable across
         // configs with different flush cadences.
         "throughput_rows_per_sec": rows as f64 / elapsed_total_s,
@@ -840,6 +872,10 @@ fn parse_args() -> Result<Args> {
                 args.max_unflushed_memtable_bytes = parse(&flag, &value)?
             }
             "--max-wal-flush-interval-ms" => args.max_wal_flush_interval_ms = parse(&flag, &value)?,
+            "--target-rows-per-sec" => {
+                let v: f64 = parse(&flag, &value)?;
+                args.target_rows_per_sec = (v > 0.0).then_some(v);
+            }
             "--cache-dir" => args.cache_dir = PathBuf::from(value),
             "--num-token-queries" => args.num_token_queries = parse(&flag, &value)?,
             "--num-phrase-queries" => args.num_phrase_queries = parse(&flag, &value)?,

From efbf8e03367ceaeafa75dd974e9829bc9c2ad0bc Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 08:55:45 -0700
Subject: [PATCH 12/37] bench: add backpressure sweep driver for async FineWeb
 FTS writes

---
 bench/run_fineweb_fts_backpressure.sh | 93 +++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100755 bench/run_fineweb_fts_backpressure.sh

diff --git a/bench/run_fineweb_fts_backpressure.sh b/bench/run_fineweb_fts_backpressure.sh
new file mode 100755
index 00000000000..9dab40a2207
--- /dev/null
+++ b/bench/run_fineweb_fts_backpressure.sh
@@ -0,0 +1,93 @@
+#!/usr/bin/env bash
+# Backpressure sweep for async MemWAL writes on real FineWeb text.
+#
+# Mirrors the HNSW vector-index backpressure sweep
+# (analysis/lance/jack-mem-wal-hnsw/native-shard-writer-backpressure-*):
+# pace the ingest at increasing target rows/s and find the highest rate
+# the flush/index pipeline sustains without accumulating backpressure
+# (no puts >= 1s, small leftover backlog, bounded drain). Also runs one
+# unpaced cell per mode for the (non-sustainable) peak.
+#
+# Usage: ./bench/run_fineweb_fts_backpressure.sh [run_id]
+
+set -uo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+RUN_ID="${1:-bp-$(date -u +%Y%m%dT%H%M%SZ)}"
+DATASET_PREFIX="${DATASET_PREFIX:-s3://jack-devland-build/bench/mem-fts-fineweb}"
+SEED_ROWS="${SEED_ROWS:-10000}"      # small seed: backpressure test, seed size is irrelevant
+BATCH_ROWS="${BATCH_ROWS:-1000}"
+CALLS="${CALLS:-120}"                # 120k rows/cell — long enough for steady state
+CACHE_DIR="${CACHE_DIR:-/mnt/data/fineweb}"
+CONFIG_TIMEOUT="${CONFIG_TIMEOUT:-2400}"
+export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
+
+LOCAL_DIR="bench/results/${RUN_ID}"
+mkdir -p "$LOCAL_DIR"
+
+BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+if [ -z "$BIN" ]; then
+    cargo bench -p lance --bench mem_wal_fineweb_fts --no-run
+    BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+fi
+echo "bench binary: $BIN"
+echo "run id:       $RUN_ID"
+
+run_cell() {
+    local name="$1"; shift
+    local out="$LOCAL_DIR/${name}.json"
+    local log="$LOCAL_DIR/${name}.log"
+    echo ">>> $name"
+    if [ -f "$out" ]; then echo "    already done"; return; fi
+    timeout "$CONFIG_TIMEOUT" "$BIN" --bench --phase write "$@" --output "$out" > "$log" 2>&1
+    local rc=$?
+    if [ "$rc" -eq 124 ]; then echo "    !!! TIMED OUT"
+    elif [ "$rc" -ne 0 ]; then echo "    !!! failed rc=$rc"
+    else echo "    ok"; fi
+    [ -f "$out" ] && aws s3 cp "$out" "$DATASET_PREFIX/$RUN_ID/results/${name}.json" >/dev/null 2>&1
+    aws s3 cp "$log" "$DATASET_PREFIX/$RUN_ID/results/${name}.log" >/dev/null 2>&1
+}
+
+# mode -> list of paced targets (rows/s); 0 = unpaced peak
+sweep() {
+    local mode="$1"; shift
+    for tgt in "$@"; do
+        local extra=() label
+        if [ "$tgt" = "0" ]; then
+            label="${mode}_unpaced"
+        else
+            label="${mode}_t${tgt}"
+            extra=(--target-rows-per-sec "$tgt")
+        fi
+        run_cell "$label" \
+            --mode "$mode" \
+            --uri "$DATASET_PREFIX/$RUN_ID/bp_${label}" \
+            --seed-rows "$SEED_ROWS" --batch-rows "$BATCH_ROWS" --calls "$CALLS" \
+            --cache-dir "$CACHE_DIR" "${extra[@]}"
+    done
+}
+
+# FTS-indexed async: expected sustainable rate is modest, sweep low.
+sweep async_idx 500 1000 1500 2000 2500 3000 0
+# No-index async: much higher ceiling.
+sweep async_noidx 2000 4000 6000 8000 0
+
+echo ""
+echo "=== backpressure summary ==="
+python3 - "$LOCAL_DIR" <<'PY'
+import glob, json, os, sys
+d = sys.argv[1]
+rows = []
+for p in sorted(glob.glob(os.path.join(d, "*.json"))):
+    try: r = json.load(open(p))
+    except Exception: continue
+    rows.append((os.path.basename(p)[:-5], r))
+print(f"{'cell':26s} {'target':>8s} {'rows/s':>9s} {'puts_r/s':>10s} {'drain_s':>8s} {'slow>=1s':>9s} {'backlog_rows':>13s}")
+for name, r in rows:
+    tgt = r.get('target_rows_per_sec') or 0
+    print(f"{name:26s} {tgt:>8.0f} {r['throughput_rows_per_sec']:>9.0f} "
+          f"{r['throughput_puts_rows_per_sec']:>10.0f} {r['elapsed_close_seconds']:>8.1f} "
+          f"{r['slow_puts_ge_1s']:>9d} {str(r.get('backlog_memtable_rows')):>13s}")
+PY
+echo ""
+echo "results: $LOCAL_DIR  +  $DATASET_PREFIX/$RUN_ID/results/"

From e1a1c098d9b3e4674a1b416b9eeb3551bf444626 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 08:59:56 -0700
Subject: [PATCH 13/37] fix(bench): bound max_memtable_rows default to avoid
 petabyte preallocation

With no --max-memtable-rows the write-phase config defaulted to
usize::MAX/2, so max_memtable_batches = max_rows/batch_rows overflowed
into a ~9e15 Vec capacity and the writer aborted on a 664 PB
allocation. The default is now derived from the byte budget and
clamped to 16M rows.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index 101442efa52..70a2e9d8926 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -412,7 +412,13 @@ fn shard_writer_config(args: &Args, shard_id: Uuid, disable_auto_flush: bool) ->
     let max_rows = if disable_auto_flush {
         args.read_rows.saturating_mul(4).max(4_000_000)
     } else {
-        args.max_memtable_rows.unwrap_or(usize::MAX / 2)
+        // When no explicit row cap is given, derive a generous but
+        // BOUNDED cap from the byte budget. `usize::MAX/2` here would make
+        // `max_memtable_batches` (computed below) astronomically large and
+        // the writer would try to preallocate a petabyte-scale Vec.
+        args.max_memtable_rows.unwrap_or_else(|| {
+            (args.max_memtable_size / 2048).clamp(args.batch_rows.max(1), 16_000_000)
+        })
     };
     let mut config = ShardWriterConfig::new(shard_id)
         .with_durable_write(args.mode.durable_write())

From 3a23eecb3d87e6b95e51f0377157511f47381ac9 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 09:44:07 -0700
Subject: [PATCH 14/37] bench: add --index-type vector|fts to the ShardWriter
 backpressure bench

Centralizes the vector and FTS write-backpressure tests on one bench and
one driver: the paced-ingest / WAL-queue-sampling / skip-close
methodology is identical, and only --index-type selects which column is
indexed (vec via IVF/PQ, or text via the inverted index). This makes the
FTS backpressure numbers directly comparable to the HNSW vector sweep.

The fineweb-shaped `text` column is now generated as varied vocabulary
words instead of a single repeated character so the FTS index does
representative work; the byte size is unchanged so vector runs (where
text is inert payload) are unaffected.

run_shard_writer_backpressure.sh drives both index types via INDEX_TYPE.
---
 bench/run_fineweb_fts_backpressure.sh         |  93 ---------
 bench/run_shard_writer_backpressure.sh        | 115 +++++++++++
 .../mem_wal_shard_writer_backpressure.rs      | 188 +++++++++++++++++-
 3 files changed, 294 insertions(+), 102 deletions(-)
 delete mode 100755 bench/run_fineweb_fts_backpressure.sh
 create mode 100755 bench/run_shard_writer_backpressure.sh

diff --git a/bench/run_fineweb_fts_backpressure.sh b/bench/run_fineweb_fts_backpressure.sh
deleted file mode 100755
index 9dab40a2207..00000000000
--- a/bench/run_fineweb_fts_backpressure.sh
+++ /dev/null
@@ -1,93 +0,0 @@
-#!/usr/bin/env bash
-# Backpressure sweep for async MemWAL writes on real FineWeb text.
-#
-# Mirrors the HNSW vector-index backpressure sweep
-# (analysis/lance/jack-mem-wal-hnsw/native-shard-writer-backpressure-*):
-# pace the ingest at increasing target rows/s and find the highest rate
-# the flush/index pipeline sustains without accumulating backpressure
-# (no puts >= 1s, small leftover backlog, bounded drain). Also runs one
-# unpaced cell per mode for the (non-sustainable) peak.
-#
-# Usage: ./bench/run_fineweb_fts_backpressure.sh [run_id]
-
-set -uo pipefail
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
-
-RUN_ID="${1:-bp-$(date -u +%Y%m%dT%H%M%SZ)}"
-DATASET_PREFIX="${DATASET_PREFIX:-s3://jack-devland-build/bench/mem-fts-fineweb}"
-SEED_ROWS="${SEED_ROWS:-10000}"      # small seed: backpressure test, seed size is irrelevant
-BATCH_ROWS="${BATCH_ROWS:-1000}"
-CALLS="${CALLS:-120}"                # 120k rows/cell — long enough for steady state
-CACHE_DIR="${CACHE_DIR:-/mnt/data/fineweb}"
-CONFIG_TIMEOUT="${CONFIG_TIMEOUT:-2400}"
-export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
-
-LOCAL_DIR="bench/results/${RUN_ID}"
-mkdir -p "$LOCAL_DIR"
-
-BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
-if [ -z "$BIN" ]; then
-    cargo bench -p lance --bench mem_wal_fineweb_fts --no-run
-    BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
-fi
-echo "bench binary: $BIN"
-echo "run id:       $RUN_ID"
-
-run_cell() {
-    local name="$1"; shift
-    local out="$LOCAL_DIR/${name}.json"
-    local log="$LOCAL_DIR/${name}.log"
-    echo ">>> $name"
-    if [ -f "$out" ]; then echo "    already done"; return; fi
-    timeout "$CONFIG_TIMEOUT" "$BIN" --bench --phase write "$@" --output "$out" > "$log" 2>&1
-    local rc=$?
-    if [ "$rc" -eq 124 ]; then echo "    !!! TIMED OUT"
-    elif [ "$rc" -ne 0 ]; then echo "    !!! failed rc=$rc"
-    else echo "    ok"; fi
-    [ -f "$out" ] && aws s3 cp "$out" "$DATASET_PREFIX/$RUN_ID/results/${name}.json" >/dev/null 2>&1
-    aws s3 cp "$log" "$DATASET_PREFIX/$RUN_ID/results/${name}.log" >/dev/null 2>&1
-}
-
-# mode -> list of paced targets (rows/s); 0 = unpaced peak
-sweep() {
-    local mode="$1"; shift
-    for tgt in "$@"; do
-        local extra=() label
-        if [ "$tgt" = "0" ]; then
-            label="${mode}_unpaced"
-        else
-            label="${mode}_t${tgt}"
-            extra=(--target-rows-per-sec "$tgt")
-        fi
-        run_cell "$label" \
-            --mode "$mode" \
-            --uri "$DATASET_PREFIX/$RUN_ID/bp_${label}" \
-            --seed-rows "$SEED_ROWS" --batch-rows "$BATCH_ROWS" --calls "$CALLS" \
-            --cache-dir "$CACHE_DIR" "${extra[@]}"
-    done
-}
-
-# FTS-indexed async: expected sustainable rate is modest, sweep low.
-sweep async_idx 500 1000 1500 2000 2500 3000 0
-# No-index async: much higher ceiling.
-sweep async_noidx 2000 4000 6000 8000 0
-
-echo ""
-echo "=== backpressure summary ==="
-python3 - "$LOCAL_DIR" <<'PY'
-import glob, json, os, sys
-d = sys.argv[1]
-rows = []
-for p in sorted(glob.glob(os.path.join(d, "*.json"))):
-    try: r = json.load(open(p))
-    except Exception: continue
-    rows.append((os.path.basename(p)[:-5], r))
-print(f"{'cell':26s} {'target':>8s} {'rows/s':>9s} {'puts_r/s':>10s} {'drain_s':>8s} {'slow>=1s':>9s} {'backlog_rows':>13s}")
-for name, r in rows:
-    tgt = r.get('target_rows_per_sec') or 0
-    print(f"{name:26s} {tgt:>8.0f} {r['throughput_rows_per_sec']:>9.0f} "
-          f"{r['throughput_puts_rows_per_sec']:>10.0f} {r['elapsed_close_seconds']:>8.1f} "
-          f"{r['slow_puts_ge_1s']:>9d} {str(r.get('backlog_memtable_rows')):>13s}")
-PY
-echo ""
-echo "results: $LOCAL_DIR  +  $DATASET_PREFIX/$RUN_ID/results/"
diff --git a/bench/run_shard_writer_backpressure.sh b/bench/run_shard_writer_backpressure.sh
new file mode 100755
index 00000000000..15ad60f2382
--- /dev/null
+++ b/bench/run_shard_writer_backpressure.sh
@@ -0,0 +1,115 @@
+#!/usr/bin/env bash
+# Centralized MemWAL ShardWriter backpressure sweep — drives the
+# `mem_wal_shard_writer_backpressure` bench for BOTH the vector and the
+# FTS index. The methodology (paced async ingest, WAL-queue sampling,
+# skip-close) is identical; the only difference is `--index-type`, so the
+# vector and FTS numbers are directly comparable.
+#
+# Usage:
+#   INDEX_TYPE=fts    ./bench/run_shard_writer_backpressure.sh [run_id]
+#   INDEX_TYPE=vector ./bench/run_shard_writer_backpressure.sh [run_id]
+#
+# Finds the max sustainable async-indexed throughput: the highest paced
+# target where puts never block (slow>=1s == 0) and the WAL flush queue
+# does not accumulate (tail queue delta ~0).
+
+set -uo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+INDEX_TYPE="${INDEX_TYPE:-fts}"
+RUN_ID="${1:-bp-${INDEX_TYPE}-$(date -u +%Y%m%dT%H%M%SZ)}"
+DATASET_PREFIX="${DATASET_PREFIX:-s3://jack-devland-build/bench/mem-fts-fineweb}"
+
+# Shared knobs — identical to the HNSW vector backpressure sweep so the
+# two index types are measured the same way.
+ROWS="${ROWS:-500000}"
+SEED_ROWS="${SEED_ROWS:-100000}"
+BATCH_ROWS="${BATCH_ROWS:-1000}"
+VECTOR_DIM="${VECTOR_DIM:-1024}"
+THREADS="${THREADS:-64}"
+TOKIO_THREADS="${TOKIO_THREADS:-64}"
+MAX_MEMTABLE_SIZE="${MAX_MEMTABLE_SIZE:-17179869184}"          # 16 GiB
+MAX_UNFLUSHED_BYTES="${MAX_UNFLUSHED_BYTES:-34359738368}"      # 32 GiB
+MAX_WAL_BUFFER_SIZE="${MAX_WAL_BUFFER_SIZE:-52428800}"        # 50 MiB
+MAX_WAL_FLUSH_INTERVAL_MS="${MAX_WAL_FLUSH_INTERVAL_MS:-0}"
+SAMPLE_INTERVAL_MS="${SAMPLE_INTERVAL_MS:-1000}"
+CONFIG_TIMEOUT="${CONFIG_TIMEOUT:-2400}"
+export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
+
+# Paced async-indexed target sweep (rows/s). FTS indexing is heavier than
+# IVF/PQ so its sustainable ceiling is lower; the vector sweep can push
+# the same script higher via TARGETS=...
+case "$INDEX_TYPE" in
+    fts)    TARGETS_DEFAULT="500 1000 1500 2000 2500 3000 0" ;;
+    vector) TARGETS_DEFAULT="2000 4000 6000 8000 10000 0" ;;
+    *) echo "INDEX_TYPE must be fts|vector" >&2; exit 1 ;;
+esac
+TARGETS="${TARGETS:-$TARGETS_DEFAULT}"
+
+CALLS=$(( ROWS / BATCH_ROWS ))
+LOCAL_DIR="bench/results/${RUN_ID}"
+mkdir -p "$LOCAL_DIR"
+
+BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 \
+    -name 'mem_wal_shard_writer_backpressure-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+if [ -z "$BIN" ]; then
+    cargo bench -p lance --bench mem_wal_shard_writer_backpressure --no-run
+    BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 \
+        -name 'mem_wal_shard_writer_backpressure-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+fi
+echo "index_type=$INDEX_TYPE  bin=$BIN  run_id=$RUN_ID  rows=$ROWS"
+
+for tgt in $TARGETS; do
+    if [ "$tgt" = "0" ]; then label="async_idx_unpaced"; paced=()
+    else label="async_idx_t${tgt}"; paced=(--target-rows-per-sec "$tgt"); fi
+    out="$LOCAL_DIR/${label}.json"; log="$LOCAL_DIR/${label}.log"
+    echo ">>> $label"
+    if [ -f "$out" ]; then echo "    already done"; continue; fi
+    timeout "$CONFIG_TIMEOUT" "$BIN" --bench \
+        --mode async_idx --index-type "$INDEX_TYPE" --schema-shape fineweb \
+        --uri "$DATASET_PREFIX/$RUN_ID/bp_${label}" \
+        --seed-rows "$SEED_ROWS" --batch-rows "$BATCH_ROWS" --calls "$CALLS" \
+        --vector-dim "$VECTOR_DIM" \
+        --max-memtable-size "$MAX_MEMTABLE_SIZE" \
+        --max-unflushed-memtable-bytes "$MAX_UNFLUSHED_BYTES" \
+        --max-wal-buffer-size "$MAX_WAL_BUFFER_SIZE" \
+        --max-wal-flush-interval-ms "$MAX_WAL_FLUSH_INTERVAL_MS" \
+        --sample-interval-ms "$SAMPLE_INTERVAL_MS" \
+        --threads "$THREADS" --tokio-threads "$TOKIO_THREADS" \
+        --skip-close \
+        "${paced[@]}" --output "$out" > "$log" 2>&1
+    rc=$?
+    if [ "$rc" -eq 124 ]; then echo "    !!! TIMED OUT"
+    elif [ "$rc" -ne 0 ]; then echo "    !!! failed rc=$rc"
+    else echo "    ok"; fi
+    [ -f "$out" ] && aws s3 cp "$out" "$DATASET_PREFIX/$RUN_ID/results/${label}.json" >/dev/null 2>&1
+    aws s3 cp "$log" "$DATASET_PREFIX/$RUN_ID/results/${label}.log" >/dev/null 2>&1
+done
+
+echo ""
+echo "=== backpressure summary (index_type=$INDEX_TYPE) ==="
+python3 - "$LOCAL_DIR" "$ROWS" <<'PY'
+import glob, json, os, sys
+d, rows = sys.argv[1], int(sys.argv[2])
+print(f"{'cell':22s} {'target':>8s} {'rows/s':>9s} {'MB/s':>8s} {'p99_ms':>10s} "
+      f"{'slow>=1s':>9s} {'wal_q_end':>10s} {'wal_q_max':>10s} {'q_tail_delta':>13s}")
+for p in sorted(glob.glob(os.path.join(d, "*.json"))):
+    try: r = json.load(open(p))
+    except Exception: continue
+    name = os.path.basename(p)[:-5]
+    mem = r.get("final_memtable_stats") or {}
+    samples = r.get("samples") or []
+    q_end = int(mem.get("wal_queue_pending_rows") or 0)
+    q_max = max([s.get("wal_queue_pending_rows") or 0 for s in samples], default=0)
+    puts = [s for s in samples if s.get("phase") == "puts"
+            and isinstance(s.get("wal_queue_pending_rows"), int)]
+    tail = puts[-5:]
+    q_tail = (tail[-1]["wal_queue_pending_rows"] - tail[0]["wal_queue_pending_rows"]
+              if len(tail) >= 2 else 0)
+    tgt = r.get("target_rows_per_sec") or 0
+    print(f"{name:22s} {tgt:>8.0f} {r.get('throughput_rows_per_sec',0):>9.0f} "
+          f"{r.get('throughput_mb_per_sec',0):>8.2f} {r.get('p99_ms',0):>10.2f} "
+          f"{r.get('slow_puts_1s',0):>9d} {q_end:>10d} {q_max:>10d} {q_tail:>13d}")
+PY
+echo ""
+echo "results: $LOCAL_DIR  +  $DATASET_PREFIX/$RUN_ID/results/"
diff --git a/rust/lance/benches/mem_wal_shard_writer_backpressure.rs b/rust/lance/benches/mem_wal_shard_writer_backpressure.rs
index 1f28cf33619..1e9be110040 100644
--- a/rust/lance/benches/mem_wal_shard_writer_backpressure.rs
+++ b/rust/lance/benches/mem_wal_shard_writer_backpressure.rs
@@ -47,6 +47,7 @@ use lance::index::vector::VectorIndexParams;
 use lance_arrow::FixedSizeListArrayExt;
 use lance_core::Result;
 use lance_index::IndexType;
+use lance_index::scalar::inverted::tokenizer::InvertedIndexParams;
 use lance_index::vector::ivf::IvfBuildParams;
 use lance_index::vector::pq::builder::PQBuildParams;
 use lance_linalg::distance::DistanceType;
@@ -55,6 +56,8 @@ use uuid::Uuid;
 
 const VECTOR_COL: &str = "vec";
 const VECTOR_INDEX_NAME: &str = "vec_idx";
+const TEXT_COL: &str = "text";
+const FTS_INDEX_NAME: &str = "text_fts";
 const TEXT_BYTES: usize = 1_500;
 const ROW_BYTES_FINEWEB_SHAPE: usize = 5_760;
 const FINEWEB_FIXED_BYTES: usize = ROW_BYTES_FINEWEB_SHAPE - TEXT_BYTES - 1024 * size_of::<f32>();
@@ -103,6 +106,40 @@ impl Mode {
     }
 }
 
+/// Which index the MemTable maintains in the indexed (`*_idx`) modes.
+/// The backpressure methodology — paced ingest, WAL-queue sampling,
+/// skip-close — is identical for both; only the indexed column and the
+/// index built differ, so vector and FTS results are directly comparable.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum IndexKind {
+    Vector,
+    Fts,
+}
+
+impl IndexKind {
+    fn parse(value: &str) -> std::result::Result<Self, String> {
+        match value {
+            "vector" => Ok(Self::Vector),
+            "fts" => Ok(Self::Fts),
+            _ => Err(format!("unknown index-type '{value}', expected vector|fts")),
+        }
+    }
+
+    fn as_str(self) -> &'static str {
+        match self {
+            Self::Vector => "vector",
+            Self::Fts => "fts",
+        }
+    }
+
+    fn index_name(self) -> &'static str {
+        match self {
+            Self::Vector => VECTOR_INDEX_NAME,
+            Self::Fts => FTS_INDEX_NAME,
+        }
+    }
+}
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq)]
 enum SchemaShape {
     FineWeb,
@@ -142,6 +179,7 @@ impl SchemaShape {
 struct Args {
     uri: Option<String>,
     mode: Mode,
+    index_kind: IndexKind,
     schema_shape: SchemaShape,
     seed_rows: usize,
     batch_rows: usize,
@@ -172,6 +210,7 @@ impl Default for Args {
         Self {
             uri: None,
             mode: Mode::AsyncIndexed,
+            index_kind: IndexKind::Vector,
             schema_shape: SchemaShape::FineWeb,
             seed_rows: 100_000,
             batch_rows: 1_000,
@@ -237,9 +276,10 @@ async fn run(args: Args) -> Result<()> {
     };
 
     println!(
-        "bench=mem_wal_shard_writer_backpressure uri={} mode={} schema_shape={} seed_rows={} batch_rows={} calls={} vector_dim={} text_bytes={} row_bytes={} target_rows_per_sec={:?} max_memtable_size={} max_memtable_rows={} max_memtable_batches={} max_unflushed_memtable_bytes={} max_wal_buffer_size={} max_wal_flush_interval_ms={} rayon_threads={} tokio_threads={} skip_close={}",
+        "bench=mem_wal_shard_writer_backpressure uri={} mode={} index_type={} schema_shape={} seed_rows={} batch_rows={} calls={} vector_dim={} text_bytes={} row_bytes={} target_rows_per_sec={:?} max_memtable_size={} max_memtable_rows={} max_memtable_batches={} max_unflushed_memtable_bytes={} max_wal_buffer_size={} max_wal_flush_interval_ms={} rayon_threads={} tokio_threads={} skip_close={}",
         uri,
         args.mode.as_str(),
+        args.index_kind.as_str(),
         args.schema_shape.as_str(),
         args.seed_rows,
         args.batch_rows,
@@ -260,7 +300,6 @@ async fn run(args: Args) -> Result<()> {
     );
 
     let schema = schema_for_shape(args.schema_shape, args.vector_dim);
-    let text_value = "x".repeat(args.text_bytes);
 
     let setup_start = Instant::now();
     let seed_batch = make_batch(
@@ -270,21 +309,24 @@ async fn run(args: Args) -> Result<()> {
         0,
         args.seed_rows,
         args.vector_dim,
-        &text_value,
+        args.text_bytes,
     )?;
     let batches = RecordBatchIterator::new([Ok(seed_batch)], schema.clone());
     let mut dataset = Dataset::write(batches, &uri, Some(WriteParams::default())).await?;
 
     let index_start = Instant::now();
     if args.mode.indexed() {
-        create_base_vector_index(&mut dataset, &args).await?;
+        match args.index_kind {
+            IndexKind::Vector => create_base_vector_index(&mut dataset, &args).await?,
+            IndexKind::Fts => create_base_fts_index(&mut dataset).await?,
+        }
     }
     let index_setup_s = index_start.elapsed().as_secs_f64();
 
     dataset
         .initialize_mem_wal()
         .maintained_indexes(if args.mode.indexed() {
-            vec![VECTOR_INDEX_NAME.to_string()]
+            vec![args.index_kind.index_name().to_string()]
         } else {
             vec![]
         })
@@ -333,7 +375,7 @@ async fn run(args: Args) -> Result<()> {
             args.seed_rows + call * args.batch_rows,
             args.batch_rows,
             args.vector_dim,
-            &text_value,
+            args.text_bytes,
         )?;
         batch_build_s += batch_start.elapsed().as_secs_f64();
 
@@ -475,6 +517,7 @@ async fn run(args: Args) -> Result<()> {
     let output = json!({
         "uri": uri,
         "mode": args.mode.as_str(),
+        "index_type": args.index_kind.as_str(),
         "schema_shape": args.schema_shape.as_str(),
         "seed_rows": args.seed_rows,
         "batch_rows": args.batch_rows,
@@ -562,6 +605,19 @@ async fn create_base_vector_index(dataset: &mut Dataset, args: &Args) -> Result<
         .map(|_| ())
 }
 
+async fn create_base_fts_index(dataset: &mut Dataset) -> Result<()> {
+    dataset
+        .create_index(
+            &[TEXT_COL],
+            IndexType::Inverted,
+            Some(FTS_INDEX_NAME.to_string()),
+            &InvertedIndexParams::default(),
+            true,
+        )
+        .await
+        .map(|_| ())
+}
+
 fn push_sample(
     samples: &mut Vec<serde_json::Value>,
     phase: &'static str,
@@ -660,6 +716,104 @@ fn vector_only_schema(vector_dim: usize) -> Arc<ArrowSchema> {
     Arc::new(ArrowSchema::new(vec![id_field(), vector_field(vector_dim)]))
 }
 
+/// Deterministic pseudo-random text of roughly `target_bytes` bytes for
+/// row `row`. Drawn word-by-word from a small fixed vocabulary so the
+/// `text` column carries a realistic token distribution — essential for
+/// the FTS index to do representative work, and harmless for the vector
+/// runs where `text` is inert payload of the same size.
+fn gen_text(row: usize, target_bytes: usize) -> String {
+    const VOCAB: &[&str] = &[
+        "data",
+        "vector",
+        "search",
+        "index",
+        "query",
+        "memory",
+        "table",
+        "write",
+        "read",
+        "shard",
+        "stream",
+        "batch",
+        "flush",
+        "token",
+        "score",
+        "model",
+        "system",
+        "record",
+        "format",
+        "column",
+        "engine",
+        "result",
+        "filter",
+        "metric",
+        "latency",
+        "throughput",
+        "cache",
+        "buffer",
+        "segment",
+        "lance",
+        "arrow",
+        "schema",
+        "field",
+        "value",
+        "object",
+        "store",
+        "cloud",
+        "remote",
+        "local",
+        "build",
+        "merge",
+        "scan",
+        "rank",
+        "match",
+        "phrase",
+        "fuzzy",
+        "boolean",
+        "recall",
+        "corpus",
+        "document",
+        "passage",
+        "sentence",
+        "language",
+        "machine",
+        "learning",
+        "training",
+        "dataset",
+        "feature",
+        "embedding",
+        "cluster",
+        "graph",
+        "node",
+        "edge",
+        "path",
+        "weight",
+        "layer",
+        "tensor",
+        "kernel",
+    ];
+    // SplitMix64-ish deterministic generator seeded by the row index.
+    let mut state = (row as u64)
+        .wrapping_mul(0x9E37_79B9_7F4A_7C15)
+        .wrapping_add(1);
+    let mut next = || {
+        state = state.wrapping_add(0x9E37_79B9_7F4A_7C15);
+        let mut z = state;
+        z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
+        z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
+        z ^ (z >> 31)
+    };
+    let mut out = String::with_capacity(target_bytes + 16);
+    while out.len() < target_bytes {
+        if !out.is_empty() {
+            out.push(' ');
+        }
+        out.push_str(VOCAB[(next() as usize) % VOCAB.len()]);
+    }
+    out.truncate(target_bytes);
+    out
+}
+
 fn make_batch(
     shape: SchemaShape,
     schema: &ArrowSchema,
@@ -667,7 +821,7 @@ fn make_batch(
     start_row: usize,
     num_rows: usize,
     vector_dim: usize,
-    text_value: &str,
+    text_bytes: usize,
 ) -> Result<RecordBatch> {
     let ids = StringArray::from_iter_values(
         (0..num_rows).map(|i| format!("{id_prefix}-{:012}", start_row + i)),
@@ -697,7 +851,8 @@ fn make_batch(
         .map_err(Into::into);
     }
 
-    let text = StringArray::from_iter_values((0..num_rows).map(|_| text_value));
+    let text =
+        StringArray::from_iter_values((0..num_rows).map(|i| gen_text(start_row + i, text_bytes)));
     let dump = StringArray::from_iter_values((0..num_rows).map(|i| match (start_row + i) % 5 {
         0 => "CC-MAIN-2023-50",
         1 => "CC-MAIN-2024-10",
@@ -787,6 +942,10 @@ fn parse_args() -> Result<Args> {
             "--mode" => {
                 args.mode = Mode::parse(&value).map_err(lance_core::Error::invalid_input)?;
             }
+            "--index-type" => {
+                args.index_kind =
+                    IndexKind::parse(&value).map_err(lance_core::Error::invalid_input)?;
+            }
             "--schema-shape" => {
                 args.schema_shape =
                     SchemaShape::parse(&value).map_err(lance_core::Error::invalid_input)?;
@@ -861,12 +1020,23 @@ fn parse_args() -> Result<Args> {
             max_memtable_rows, args.batch_rows
         )));
     }
-    if args.mode.indexed() && args.vector_dim % args.num_sub_vectors != 0 {
+    if args.mode.indexed()
+        && args.index_kind == IndexKind::Vector
+        && args.vector_dim % args.num_sub_vectors != 0
+    {
         return Err(lance_core::Error::invalid_input(format!(
             "vector_dim must be divisible by num_sub_vectors for IVF_PQ: vector_dim={}, num_sub_vectors={}",
             args.vector_dim, args.num_sub_vectors
         )));
     }
+    if args.mode.indexed()
+        && args.index_kind == IndexKind::Fts
+        && args.schema_shape != SchemaShape::FineWeb
+    {
+        return Err(lance_core::Error::invalid_input(
+            "index-type=fts requires schema-shape=fineweb (it has the text column)",
+        ));
+    }
 
     Ok(args)
 }

From 95d2896797eca37425de2a8867863cea271283fe Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 12:00:50 -0700
Subject: [PATCH 15/37] bench: add Lance FtsMemIndex vs Apache Lucene FTS
 comparison
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mirrors the HNSW-vs-hnswlib comparison for full-text search:

- mem_wal_fts_bench.rs — Lance side. `gen` slices a FineWeb corpus and
  writes the shared inputs (corpus.txt, corpus_tok.txt, queries.txt,
  exact-BM25 truth.txt); `bench` builds a raw FtsMemIndex and measures
  build throughput, term/phrase query latency + QPS, and recall@k.
- lucene_fts_bench/LuceneFtsBench.java — Lucene reference, in-memory
  ByteBuffersDirectory + BM25Similarity(1.2,0.75), same inputs, same
  metrics, same JSON output shape.
- run_fts_compare.sh — builds Lucene from the local checkout (JDK 25),
  builds both benches, runs Run A (pre-tokenized, isolates index +
  scorer) and Run B (native analyzers) across a 100k/500k/1M sweep,
  reports both impls side by side plus their mutual top-k overlap.
---
 bench/lucene_fts_bench/LuceneFtsBench.java | Bin 0 -> 10756 bytes
 bench/run_fts_compare.sh                   | 131 ++++
 rust/lance/Cargo.toml                      |   4 +
 rust/lance/benches/mem_wal_fts_bench.rs    | 703 +++++++++++++++++++++
 4 files changed, 838 insertions(+)
 create mode 100644 bench/lucene_fts_bench/LuceneFtsBench.java
 create mode 100755 bench/run_fts_compare.sh
 create mode 100644 rust/lance/benches/mem_wal_fts_bench.rs

diff --git a/bench/lucene_fts_bench/LuceneFtsBench.java b/bench/lucene_fts_bench/LuceneFtsBench.java
new file mode 100644
index 0000000000000000000000000000000000000000..48bc91f0bd708f15a12ebdfd72b16962e8de903e
GIT binary patch
literal 10756
zcmb_i{chVv62HIm6jK2~s92)ywCN#E?4F$uG)3a1k=q^+7YZe=B*qj;bC-79T!4Fs
zd%}B?`_1ewKP1_4?=C<RlDj)QJ72$<S#59A=<Tz=4__rym1#A68LP}BvqY79bXY{w
zg&IBxckXPvX)lsgJ<W@&GMO*TiTY&rsD7oBg`!tcHdS<3nMGb|EaR1y4!x?dg(98A
zit-s5%zkk)qG`S?qB7BWM*sfjzvOeNW~x+7XcB3aCRjDgOC}}c?`e8r^lP<znZ@eU
zfTAo)^LburGC3_WeVHZ`N-`n!<MHUX))t{->79yVOa}{k5tS;&^rAAFiZYMEW=xYS
z`mj{X@nw{bXGV`FU}r&dm3?3~f@xkBl@84(L)#$FFJqIRxew=+Dih_0O$ll6NPj?+
zDoL?%lrfbg8)A#Ryz-(+_&gLGT^V&$%^(B)EGgC0U<%eQRivv@X-k9&m_?aa^948u
z#h8pLm)pgngv^LpxS}*N;QNZU=;ybifykjUpa7y#n*QyKiA3`#$+Qt+s87+<43Azv
zcu2xE%QhypOdy(wej2?wCeen8=5rPE>JQ1XNcU4w?T1={I$DZk3q_F?v$YG6qSwJM
zI-09JCPMTvx`=2vE$E3Qg%71Z*bALxpzb44z;Fnm#YsueUcRH@u&gqQ{xQMR8SU=i
zKS)G0`~7!!?trtrG(tK|^3Y0D#jJW&KE7`Fw=#ho*M`~fuUW}Mn^tAmbcV6;ZDba@
zOR+LZ8XlHqboDCHX2aNzkzS0{rjctFY@R&LvuRbrmrVHllbTjArI9Kx;Orar+%S0)
z+pTFYDg%prT+2Ey=V2sKBeZ%9**C9rqQgUe_@gD+8|I$mXDS;RsB&4edBb%0p9+6l
zKyMm*t3R`oHZbt0jKh(8_%n;)@G6UpDU%6%RrnO4<ODu==s(@Se4J0KC9CC`d$`Tq
z7l}&a+e{om&nmw4bPnWSn=G*@tmDy&FRW|F&X)+i;vBb_`!vsH$^3?sTI#K1Himi~
zdDhGg6t#-V=^{MJ^HfFI(=@7d<0I=dyVHN>X!~{IB8`!3bfY6U;%_CC-;(w%r;XTI
zU~^w*mgRLzq9=I)k~gWqok%CkBn8+TMn{I3k^gX`^w^riO{jR&DP&P?uZU4aHA#~x
zP18v0Rb2Ue=ME8qfx!d>o+O)7RL%qY`2Kf_2LAIbjpp@BtY&@sjHSd03I$1KD4~7Y
zd5r&_P!y&ro0|m<X%}C%w)|wGWJW>sKKVUV=T(&IfXVjz0A>VKVA}fxn7#Y^V<G)T
z{&hpxlD~dwG0ceY#Ikw-3Q_zD8IyL*QvvkpOR8|G>G`LrDolc0%W~aWnj??cr%YW^
zjZ9Atg1%+KAe{)h7)qs^-u)afg$6GJVE|{dJCYY9ClZ!4(d7a-NkJgV#XgmxKSh9b
z-+-qoP5T7?!YGbqmgA};8jLeR5zY4f`y?=u{DB+<sui7HCQt@veMlUTMG80plR~<`
zjfGDC9htZ(fyafGB^X4_|CeA}h)2k^_F+1LF3>7KFzE4Ky#X>jY}rV14RDCj2&;`|
zLDe`EtW6BX3!D&%=OeLH#w)wl2nLdDS}KMr;gJ>M^}w5fO>21=@Y=*8mBpUcwJLJr
zmKbAmT6y$#dxjW4ppG&`B-)>!UC4e36a9Qym@8(^TN%>I*O5W}qAAPGf<2nci!-GP
zn~|PNhKxf~lqT~GIe%>(n6FCW%r+F)Bkz}N*>zkLJLk7rJ-xq-$5}qs)pWttfXc?>
zo)?Q11M|!W^KRd>={jcBjJKjba#oZJU}fSQZNr!svx`7h8DwY=#Sl1vv?k&-p7okp
z7?M|*3oVmS$`enq;n+!8!<T|S?eEiW!>(KF$y*irw`McpC7i>=cKUu}yMaSoS-5uK
zO@p34iMSii>n)&r?&}!qqq6FNe)wjJlsO#z_~w^a&o+gbj+i9aIAn1KeUT?IEm2gp
z;G>*hTpKFf5t0mG#hwq`!QYDk^?bDh&+l!UWwFhbsI8Y(&#^W|(#hd$btOLWvZ&V&
zdCDOR-s+n{>I~CgW=72s{7`*sg%{qn+4+oD@9xz0%d32K^|iHb9#d!$V0U%p*%Lsd
z$U};?eR@}+>9AC6AMP>ehbRe=EKE}Mw#=tWYcv#C_2Q9EW=LjGc{A4FJR*dBx)<G}
zAE;-Cd{N$mKe$g^iTA7|I>7}QIR{>kJUWoFO-xo)!{xKzt-l`jI<zFV2nyd!vBwi#
zrvQpti=`c+Qjy<=Xx(sFB`KmqV1$h65bXza8xS6&l+S!M6fcL1b0YzL4T**kL$j-^
zYPht*b99bsI%Ra(T4}pI0NK$>Wo1KZgQu=8TvZ^4koB4l;b%`WlvS!eD0GubG@F(n
z%(S`#!EX3qHlUsGuMcP5rf{Gk{#YoIN0C>ZLQ*XLDa1B(Xgmj!<lB<RmNArEP8}0#
zV}VaWyl-AkLqJlK76&+i8%uLwm_%qC9*vEC8eM6lmSGlU`3c;|b0Dc&Yb&~3-#)-$
z2FzItd72b9#sbb^V<fTc7oiRP{H2}55+UuEGM0D+OU8fjyl`q9*8tQi*n(aXBS0jv
zM}zLn6PTcsq&0*H9}FN^GC+QbQW#Y`R|dy#+B7bCx0;RE36Il4-T|3ShzN>~mgmrZ
zk8@N9=r=Opj&PP>O}0ql8{zlS_S+42)FT&<Y9o0k-6e@4!0mln83^-2Wh{HgmT1XW
zgk-BES;w;M3<m6^%CGY?uQisIq%{)Tx*fnFu36mr&f@U#LEJ!KLHdXYP>nluLK3Gk
zOp&?WP&3F}0p5d}(6y7>a4vPgo^4`E#B1^m8~IU^71)>Og}O+g&E6>|$luoL=1i;b
z_16;&AbbO4vZ^jR9ER{2K&p7Z<I1hNLg9eM1HKNc2M{S`;6c5@2UTQU96J;4B&#N(
zK9-_mLnuXcrZvQy7-yXz1d!yeZYW{;XwQdTTepBE0T*A<p#PLysPxJws_sxShkXcq
zc5snd-y@vN265VyU}KFhdwh5r*j<@oEn!i<xHdo4pFN~;r6vv|1|V543K*M{y2+9}
zc@~s;;xdwh1KWKDAJ-Vx23u*+v1KbQn&j=&byl0oc0F?zvqHtVL>A($>r`*Gvvga*
ze`DIEhZJ4GT3Xdgw0#V+2+b1GT~T-px{S)@uqr6{>E%hkalSf2mx|3FA(|gx-SW^5
z%`#IYPv=m$Q%b+(`_hIA&Ife1nFWoKDTNJ7%GSC`FnNJ)JBJ7{SsOUvkGO02#mm@z
z^rm3Z!#Rrt1^K6??z~`+x<=aqeA;{k^=jF)8Zcc3R$4GZ*6d*4wOH?+!!@?ZaD-^W
zrW2jZ-2{^WNnVa*MS<oH7Hz~4gXGuZ$G3i`jzEBlUXmN4s<&69quj|O9Yet;4z>GG
z7s!lupd>EZhJX-IDySXkn4kNc<T|hgDEGq*qh&JvHA*Wa73csM_gFi5ZOBlZ(8oTt
z8S%ajPocHzgy0nFB>^ODtvfIDF2}B}0wFZ4TEVGhm72txDXzI1(*)n_A1uo6EcKN;
z5Gv5^#i;+RLs;O8<dcd|#OmK-*w-xXbYe8iui2t#(_W6Wg4#@OOK#9?ZNi*6uyez;
z`2cvjjc<Q;7y<*aJBeoajG?Y3obn)j9v-Y4GTUxrJ`OaSoXRTO{wpl5oQ!5LK(}R|
zW>PXaaXB-9z;L3CzTjlJVg{D!MP=grGV__5BL>?zX4?o@7yVTexo=gzO(48tDgM@z
zE;PwarxXPg*m5Y((yLaeb(<OVcizcuDXU~awqMhHv3A9l#xsxGljOyXBN=I6xB;kI
z;kS)hNI}||#$w<`v&9qFpZUSP+PDPOa9G`ddKSVu2)3FcA(U~^ff5iKA@HBsau~*9
z`;vXW;KacLv$w}<`6e0th8*R*S6a)dA%BnwtYrH_b(_wtgm{{7j%mM(>y2+{lJWG8
zDz><C$E^kr*gtmocx$Wq_=Qx8yYbJhSuOpplMt152)LDPk61I5Us+aS3<tD>RzzzH
zBzH2fhdPPtjqo@+w(L17?K3eQcL=hS+M|etey~eTSgi91zRhOM0kYZHxc42G$QVmh
z=1WA0C>`c_OQQmz6}d!>>z#hTEoW(he7Ke!Ta|D8+wr>b?_Sp$C_Q^AJccXuaKFJG
zu|+-fXkb{}mFaXqRtwX0gya@yan^W~&(Dus*HrtqsZEOA=eA+i{`V2GM!@|DUt+}c
z<?hFJ*k!X~6aC2AKqiI$Vu5Nd4(`#t8W?#}Gs3jk4L>%)v?)Xh@(eFoK>=2*^?sNV
zCP2(w0X1BE+Y?1tvqAQS?wCR61|<=vVQBB?%^Lo3McDk}0LR1Rz^>@V>Ea=$Lw3a1
zPH6YC6*mTph+=2F=liZcx7imVN)||BCj^1}<Y;>exc%z)UH0eg{L#bzaU&48aU%=V
zYRT7z47s*^220Te?#7VR_$vdPZ3jmxBRka<=OSH~7xH~af%_vf3u^1{;l2ckhg=f4
zpTDp@@88wZFxkHwdtl;6se{IP|8Ds0%)KD*(&Ivv@^xoMy_z&D9&T|yhBn3(FWQ|E
z-X1+{-U>b5H5kD{z~T6A>|WNWaJ(5KGUDexnEu_&1&z)p{+-iuBKzD$)3{~?=NYuy
zKeh<ucBS<X_j4xvu!s-iJ|Zs3X-5qmoCCo5=poi~T8Apowt?mSnBT#qh16fW2#Z%R
zXH{qu=I@=s-lUq(r#-&AIqmJ8x`=by8=UrV9Qkxt%PaGY_q#EMT$<@$&+NmRFi(3<
zkvtAzVx|4iC{spuXM+uEy+UOQx{uh_DYf-Q6zhp#2HrE{-!~9#>31BNv2boVv*_Ry
zy)wocpRi*v<@n2=)cV<ft$0b@Zl`ajXXmzvXS$AoXrFO_^B?N_-chymKP<$r#NHjg
z=8moX`5e%We~N{E@E?c<d=Xgpr_sB>kM3r9Zp6pCcfk@x>d;U(^O&&59hqWs6LBT6
zW6_+!(`OliF|I9>@$cgHcy*ZK(*7__+gFyjhZQnNM0f@>{7Qa55b(MhVjKX_l&?1_
jXs+~?XNd^-KT+BY)ePI)-jZGBuIpqY=Jm^+FL(X}wt+!p

literal 0
HcmV?d00001

diff --git a/bench/run_fts_compare.sh b/bench/run_fts_compare.sh
new file mode 100755
index 00000000000..e06b5c92274
--- /dev/null
+++ b/bench/run_fts_compare.sh
@@ -0,0 +1,131 @@
+#!/usr/bin/env bash
+# FTS comparison driver — Lance FtsMemIndex vs Apache Lucene.
+# Sibling of run_mem_wal_hnsw_compare.sh (Lance HNSW vs hnswlib).
+#
+# Builds both benches, generates one shared FineWeb corpus + query set per
+# size, runs each impl in Run A (pre-tokenized) and Run B (native
+# analyzers), and prints build/query/recall side by side.
+#
+# Usage: ./bench/run_fts_compare.sh [run_id]
+#
+# Env:
+#   SIZES        doc-count sweep (default "100000 500000 1000000")
+#   K            top-k (default 10)
+#   THREADS      query threads for the multi-thread QPS run
+#   LUCENE_DIR   Lucene checkout (default ~/oss/lucene)
+#   LUCENE_CP    pre-built Lucene classpath; if set, the Lucene build is skipped
+#   JAVA_HOME    JDK 25 home; if unset the script searches common locations
+#   CACHE_DIR    FineWeb shard cache (default /mnt/data/fineweb)
+
+set -uo pipefail
+cd "$(dirname "${BASH_SOURCE[0]}")/.."
+REPO_ROOT="$(pwd -P)"
+
+RUN_ID="${1:-fts-compare-$(date -u +%Y%m%dT%H%M%SZ)}"
+SIZES="${SIZES:-100000 500000 1000000}"
+K="${K:-10}"
+THREADS="${THREADS:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 8)}"
+LUCENE_DIR="${LUCENE_DIR:-$HOME/oss/lucene}"
+CACHE_DIR="${CACHE_DIR:-/mnt/data/fineweb}"
+WORK="${WORK:-/tmp/fts_compare/$RUN_ID}"
+RESULT_DIR="$REPO_ROOT/bench/results/$RUN_ID"
+mkdir -p "$WORK" "$RESULT_DIR"
+
+# ---- locate JDK 25 ----
+if [ -z "${JAVA_HOME:-}" ]; then
+    for cand in /usr/lib/jvm/java-25-* /usr/lib/jvm/jdk-25* /opt/jdk-25* \
+                "$HOME/.sdkman/candidates/java/25"*; do
+        [ -x "$cand/bin/java" ] && JAVA_HOME="$cand" && break
+    done
+fi
+if [ -z "${JAVA_HOME:-}" ] || [ ! -x "$JAVA_HOME/bin/java" ]; then
+    echo "ERROR: JDK 25 not found; set JAVA_HOME" >&2; exit 1
+fi
+JAVA="$JAVA_HOME/bin/java"
+JAVAC="$JAVA_HOME/bin/javac"
+echo "JDK: $($JAVA -version 2>&1 | head -1)"
+
+# ---- build Lucene classpath ----
+if [ -z "${LUCENE_CP:-}" ]; then
+    echo "=== Building Lucene jars ($LUCENE_DIR) ==="
+    ( cd "$LUCENE_DIR" && JAVA_HOME="$JAVA_HOME" ./gradlew -q \
+        :lucene:core:jar :lucene:analysis:common:jar ) || {
+        echo "ERROR: Lucene build failed" >&2; exit 1; }
+    CORE_JAR="$(find "$LUCENE_DIR/lucene/core/build/libs" -name 'lucene-core-*.jar' | head -1)"
+    ANALYSIS_JAR="$(find "$LUCENE_DIR/lucene/analysis/common/build/libs" -name 'lucene-analysis-common-*.jar' | head -1)"
+    LUCENE_CP="$CORE_JAR:$ANALYSIS_JAR"
+fi
+echo "Lucene classpath: $LUCENE_CP"
+
+# ---- build the Lance bench ----
+echo "=== Building Lance FTS bench ==="
+cargo bench -p lance --bench mem_wal_fts_bench --no-run
+LANCE_BIN="$(find "$REPO_ROOT/target/release/deps" -maxdepth 1 -type f -perm -111 \
+    -name 'mem_wal_fts_bench-*' ! -name '*.d' | sort | tail -1)"
+echo "Lance bench: $LANCE_BIN"
+
+# ---- compile the Lucene bench ----
+echo "=== Compiling Lucene FTS bench ==="
+"$JAVAC" -cp "$LUCENE_CP" -d "$WORK" \
+    "$REPO_ROOT/bench/lucene_fts_bench/LuceneFtsBench.java" \
+    || { echo "ERROR: javac failed" >&2; exit 1; }
+
+mutual_overlap() {  # $1=topk file A  $2=topk file B  $3=k
+    python3 - "$1" "$2" "$3" <<'PY'
+import sys
+a, b, k = sys.argv[1], sys.argv[2], int(sys.argv[3])
+la = [set(l.split()) for l in open(a)]
+lb = [set(l.split()) for l in open(b)]
+n = min(len(la), len(lb))
+if n == 0:
+    print("nan"); sys.exit()
+tot = sum(len(la[i] & lb[i]) / max(len(la[i] | lb[i]), 1) for i in range(n))
+print(f"{tot / n:.4f}")
+PY
+}
+
+echo ""
+for SIZE in $SIZES; do
+    DIR="$WORK/n$SIZE"
+    mkdir -p "$DIR"
+    echo "############ corpus size = $SIZE ############"
+    echo "--- generating shared corpus + queries ---"
+    "$LANCE_BIN" --bench gen --docs "$SIZE" --out-dir "$DIR" \
+        --cache-dir "$CACHE_DIR" --k "$K" > "$RESULT_DIR/gen_n$SIZE.log" 2>&1 || {
+        echo "  !!! gen failed (see gen_n$SIZE.log)"; continue; }
+
+    for RUN in a b; do
+        echo "--- run $RUN: lance ---"
+        "$LANCE_BIN" --bench bench --in-dir "$DIR" --run "$RUN" --k "$K" \
+            --threads "$THREADS" | tee "$RESULT_DIR/lance_n${SIZE}_run${RUN}.txt" \
+            | grep '^{' > "$RESULT_DIR/lance_n${SIZE}_run${RUN}.json"
+        echo "--- run $RUN: lucene ---"
+        "$JAVA" -cp "$LUCENE_CP:$WORK" LuceneFtsBench --in-dir "$DIR" --run "$RUN" \
+            --k "$K" --threads "$THREADS" | tee "$RESULT_DIR/lucene_n${SIZE}_run${RUN}.txt" \
+            | grep '^{' > "$RESULT_DIR/lucene_n${SIZE}_run${RUN}.json"
+        ov="$(mutual_overlap "$DIR/lance_fts_run${RUN}_topk.txt" \
+                             "$DIR/lucene_run${RUN}_topk.txt" "$K")"
+        echo "    lance<->lucene mutual top-$K overlap (run $RUN) = $ov"
+        echo "$ov" > "$RESULT_DIR/overlap_n${SIZE}_run${RUN}.txt"
+    done
+    echo ""
+done
+
+echo "=== summary ==="
+python3 - "$RESULT_DIR" "$K" <<'PY'
+import glob, json, os, sys
+d, k = sys.argv[1], sys.argv[2]
+print(f"{'size':>9} {'run':>4} {'impl':>10} {'build_dps':>11} {'q_p50_us':>10} "
+      f"{'q_p95_us':>10} {'qps_1t':>9} {'qps_nt':>10} {'term_rec':>9} {'phr_rec':>9}")
+for p in sorted(glob.glob(os.path.join(d, "*_n*_run*.json"))):
+    try: r = json.load(open(p))
+    except Exception: continue
+    print(f"{r['docs']:>9} {r['run']:>4} {r['impl']:>10} {r['build_docs_per_s']:>11.0f} "
+          f"{r['q_p50_us']:>10.1f} {r['q_p95_us']:>10.1f} {r['qps_1t']:>9.0f} "
+          f"{r['qps_nt']:>10.0f} {r['term_recall_at_k']:>9.3f} {r['phrase_recall_at_k']:>9.3f}")
+for p in sorted(glob.glob(os.path.join(d, "overlap_*.txt"))):
+    name = os.path.basename(p)[:-4]
+    print(f"  {name} = {open(p).read().strip()}")
+PY
+echo ""
+echo "results: $RESULT_DIR"
diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index 19eb1aa9c8a..9053738e657 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -240,5 +240,9 @@ harness = false
 name = "mem_wal_fineweb_fts"
 harness = false
 
+[[bench]]
+name = "mem_wal_fts_bench"
+harness = false
+
 [lints]
 workspace = true
diff --git a/rust/lance/benches/mem_wal_fts_bench.rs b/rust/lance/benches/mem_wal_fts_bench.rs
new file mode 100644
index 00000000000..4364fe053c9
--- /dev/null
+++ b/rust/lance/benches/mem_wal_fts_bench.rs
@@ -0,0 +1,703 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+//! Lance `FtsMemIndex` microbenchmark — the Lance side of the FTS-vs-Lucene
+//! comparison (sibling of `mem_wal_hnsw_bench` for the HNSW-vs-hnswlib one).
+//!
+//! Two sub-commands:
+//!
+//!   gen   — slice a FineWeb corpus and write the shared comparison inputs:
+//!             corpus.txt        one raw document per line
+//!             corpus_tok.txt    one canonically-tokenized document per line
+//!             queries.txt       <type>\t<raw>\t<tok> per line (term|phrase)
+//!             truth.txt         exact-BM25 top-k doc ids per query (Run A)
+//!           Both this bench and the Lucene bench consume these files, so the
+//!           inputs are bit-identical.
+//!
+//!   bench — build a `FtsMemIndex` over the corpus and measure build
+//!           throughput, term/phrase query latency + QPS, recall@k, memory.
+//!           --run a  pre-tokenized: corpus_tok.txt + a whitespace tokenizer
+//!                    (isolates the inverted index + BM25 scorer).
+//!           --run b  native: corpus.txt + the default Lance tokenizer.
+//!
+//! Emits one `result ...` human line and one JSON line tagged
+//! `impl=lance_fts`, matching the `mem_wal_hnsw_bench` output convention.
+
+#![recursion_limit = "256"]
+#![allow(clippy::print_stdout, clippy::print_stderr)]
+
+use std::collections::HashMap;
+use std::io::{BufRead, BufWriter, Write};
+use std::sync::Arc;
+use std::time::{Duration, Instant};
+
+use arrow_array::{Array, ArrayRef, Int64Array, RecordBatch, StringArray};
+use arrow_schema::{DataType, Field, Schema as ArrowSchema};
+use lance::dataset::mem_wal::index::{FtsMemIndex, FtsQueryExpr, SearchOptions};
+use lance_core::Result;
+use lance_index::scalar::inverted::tokenizer::InvertedIndexParams;
+use lance_tokenizer::TokenStream;
+use parquet::arrow::async_reader::ParquetRecordBatchStreamBuilder;
+use rayon::prelude::*;
+
+const TEXT_COL: &str = "text";
+const HF_API_LISTING: &str =
+    "https://huggingface.co/api/datasets/HuggingFaceFW/fineweb/tree/main/sample/10BT";
+const HF_FILE_BASE: &str = "https://huggingface.co/datasets/HuggingFaceFW/fineweb/resolve/main/";
+const BM25_K1: f64 = 1.2;
+const BM25_B: f64 = 0.75;
+
+// ----------------------------------------------------------------------
+// HuggingFace FineWeb loading (gen mode only)
+// ----------------------------------------------------------------------
+
+#[derive(serde::Deserialize)]
+struct HfTreeEntry {
+    #[serde(rename = "type")]
+    kind: String,
+    path: String,
+}
+
+async fn list_shard_paths() -> Result<Vec<String>> {
+    let entries: Vec<HfTreeEntry> = reqwest::get(HF_API_LISTING)
+        .await
+        .map_err(|e| lance_core::Error::io(format!("listing HTTP: {e}")))?
+        .json()
+        .await
+        .map_err(|e| lance_core::Error::io(format!("listing JSON: {e}")))?;
+    let mut shards: Vec<String> = entries
+        .into_iter()
+        .filter(|e| e.kind == "file" && e.path.ends_with(".parquet"))
+        .map(|e| e.path)
+        .collect();
+    shards.sort();
+    Ok(shards)
+}
+
+async fn download_shard(rel_path: &str, dest: &std::path::Path) -> Result<()> {
+    if dest.exists() {
+        return Ok(());
+    }
+    let url = format!("{HF_FILE_BASE}{rel_path}");
+    let tmp = dest.with_extension("part");
+    for attempt in 1..=5u32 {
+        println!("downloading {rel_path} (attempt {attempt}/5) ...");
+        let r: Result<bytes::Bytes> = async {
+            let resp = reqwest::get(&url)
+                .await
+                .map_err(|e| lance_core::Error::io(format!("HTTP: {e}")))?;
+            if !resp.status().is_success() {
+                return Err(lance_core::Error::io(format!("status {}", resp.status())));
+            }
+            resp.bytes()
+                .await
+                .map_err(|e| lance_core::Error::io(format!("body: {e}")))
+        }
+        .await;
+        match r {
+            Ok(bytes) => {
+                std::fs::write(&tmp, &bytes)
+                    .map_err(|e| lance_core::Error::io(format!("write: {e}")))?;
+                std::fs::rename(&tmp, dest)
+                    .map_err(|e| lance_core::Error::io(format!("rename: {e}")))?;
+                return Ok(());
+            }
+            Err(e) if attempt < 5 => {
+                eprintln!("  retry: {e}");
+                tokio::time::sleep(Duration::from_secs(2u64.pow(attempt))).await;
+            }
+            Err(e) => return Err(e),
+        }
+    }
+    unreachable!()
+}
+
+async fn read_shard_text(
+    path: &std::path::Path,
+    out: &mut Vec<String>,
+    max_rows: usize,
+) -> Result<usize> {
+    let file = tokio::fs::File::open(path)
+        .await
+        .map_err(|e| lance_core::Error::io(format!("open parquet: {e}")))?;
+    let mut stream = ParquetRecordBatchStreamBuilder::new(file)
+        .await
+        .map_err(|e| lance_core::Error::io(format!("parquet builder: {e}")))?
+        .build()
+        .map_err(|e| lance_core::Error::io(format!("parquet stream: {e}")))?;
+    let mut taken = 0usize;
+    use futures::TryStreamExt;
+    while taken < max_rows {
+        let Some(rb) = stream
+            .try_next()
+            .await
+            .map_err(|e| lance_core::Error::io(format!("parquet read: {e}")))?
+        else {
+            break;
+        };
+        let col = rb
+            .column_by_name("text")
+            .ok_or_else(|| lance_core::Error::io("text column missing".to_string()))?;
+        let strs = col
+            .as_any()
+            .downcast_ref::<StringArray>()
+            .ok_or_else(|| lance_core::Error::io("text not StringArray".to_string()))?;
+        for i in 0..strs.len() {
+            if taken >= max_rows {
+                break;
+            }
+            if !strs.is_null(i) {
+                out.push(strs.value(i).to_string());
+                taken += 1;
+            }
+        }
+    }
+    Ok(taken)
+}
+
+async fn load_fineweb(docs: usize, cache_dir: &std::path::Path) -> Result<Vec<String>> {
+    std::fs::create_dir_all(cache_dir)
+        .map_err(|e| lance_core::Error::io(format!("mkdir cache: {e}")))?;
+    let shards = list_shard_paths().await?;
+    let mut buf: Vec<String> = Vec::with_capacity(docs);
+    for rel in &shards {
+        if buf.len() >= docs {
+            break;
+        }
+        let name = rel.rsplit('/').next().unwrap_or(rel);
+        let local = cache_dir.join(name);
+        download_shard(rel, &local).await?;
+        let want = docs - buf.len();
+        let got = read_shard_text(&local, &mut buf, want).await?;
+        println!("  shard {name}: {got} docs (total {})", buf.len());
+    }
+    if buf.len() < docs {
+        return Err(lance_core::Error::io(format!(
+            "fineweb yielded {} docs, need {docs}",
+            buf.len()
+        )));
+    }
+    Ok(buf)
+}
+
+// ----------------------------------------------------------------------
+// gen
+// ----------------------------------------------------------------------
+
+fn one_line(s: &str) -> String {
+    s.replace(['\n', '\r', '\t'], " ")
+}
+
+/// Canonical tokenizer used to produce `corpus_tok.txt` and the `tok` query
+/// field. Both the Lance Run-A and Lucene Run-A indexes consume its output
+/// verbatim (via a whitespace analyzer), so the choice does not bias either
+/// side — it only fixes a single shared token stream.
+fn canonical_tokenize(
+    tokenizer: &mut dyn lance_index::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer,
+    text: &str,
+) -> Vec<String> {
+    let mut stream = tokenizer.token_stream_for_doc(text);
+    let mut out = Vec::new();
+    while let Some(t) = stream.next() {
+        out.push(t.text.clone());
+    }
+    out
+}
+
+async fn run_gen(args: &GenArgs) -> Result<()> {
+    let raw = load_fineweb(args.docs, &args.cache_dir).await?;
+    std::fs::create_dir_all(&args.out_dir)
+        .map_err(|e| lance_core::Error::io(format!("mkdir out: {e}")))?;
+
+    let mut tokenizer = InvertedIndexParams::default()
+        .build()
+        .expect("default tokenizer builds");
+
+    // corpus.txt + corpus_tok.txt
+    let corpus_path = args.out_dir.join("corpus.txt");
+    let tok_path = args.out_dir.join("corpus_tok.txt");
+    let mut corpus_w = BufWriter::new(
+        std::fs::File::create(&corpus_path)
+            .map_err(|e| lance_core::Error::io(format!("create corpus.txt: {e}")))?,
+    );
+    let mut tok_w = BufWriter::new(
+        std::fs::File::create(&tok_path)
+            .map_err(|e| lance_core::Error::io(format!("create corpus_tok.txt: {e}")))?,
+    );
+    let mut tok_docs: Vec<Vec<String>> = Vec::with_capacity(raw.len());
+    for doc in &raw {
+        writeln!(corpus_w, "{}", one_line(doc)).ok();
+        let toks = canonical_tokenize(tokenizer.as_mut(), doc);
+        writeln!(tok_w, "{}", toks.join(" ")).ok();
+        tok_docs.push(toks);
+    }
+    corpus_w.flush().ok();
+    tok_w.flush().ok();
+    println!("wrote {} docs to corpus.txt / corpus_tok.txt", raw.len());
+
+    // Term + phrase queries sampled from the canonical token stream.
+    let queries = build_query_set(&tok_docs, args);
+    let q_path = args.out_dir.join("queries.txt");
+    let mut q_w = BufWriter::new(
+        std::fs::File::create(&q_path)
+            .map_err(|e| lance_core::Error::io(format!("create queries.txt: {e}")))?,
+    );
+    for q in &queries {
+        // raw == tok here: queries are sampled from canonical tokens. The
+        // column is kept for format stability with the Lucene bench.
+        writeln!(q_w, "{}\t{}\t{}", q.kind, q.text, q.text).ok();
+    }
+    q_w.flush().ok();
+    println!("wrote {} queries to queries.txt", queries.len());
+
+    // Exact-BM25 ground truth over the pre-tokenized corpus (Run A).
+    let truth = exact_bm25_truth(&tok_docs, &queries, args.k);
+    let t_path = args.out_dir.join("truth.txt");
+    let mut t_w = BufWriter::new(
+        std::fs::File::create(&t_path)
+            .map_err(|e| lance_core::Error::io(format!("create truth.txt: {e}")))?,
+    );
+    for ids in &truth {
+        let line: Vec<String> = ids.iter().map(|i| i.to_string()).collect();
+        writeln!(t_w, "{}", line.join(" ")).ok();
+    }
+    t_w.flush().ok();
+    println!(
+        "wrote exact-BM25 truth for {} queries to truth.txt",
+        truth.len()
+    );
+    Ok(())
+}
+
+struct Query {
+    kind: &'static str, // "term" | "phrase"
+    text: String,       // term: one token; phrase: "a b"
+}
+
+const STOPWORDS: &[&str] = &[
+    "the", "a", "an", "and", "or", "of", "to", "in", "on", "for", "with", "as", "by", "is", "was",
+    "are", "were", "be", "this", "that", "it", "its", "but", "not", "no", "if", "from", "at",
+];
+
+fn build_query_set(tok_docs: &[Vec<String>], args: &GenArgs) -> Vec<Query> {
+    // Term queries: mid-frequency tokens (skip the most common — their
+    // top-k is an unstable near-tie, as the FineWeb panel showed).
+    let mut freq: HashMap<&str, u64> = HashMap::new();
+    for d in tok_docs {
+        for t in d {
+            if t.len() >= 3 && !STOPWORDS.contains(&t.as_str()) {
+                *freq.entry(t.as_str()).or_default() += 1;
+            }
+        }
+    }
+    let mut by_freq: Vec<(&str, u64)> = freq.into_iter().collect();
+    by_freq.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(b.0)));
+    let skip = (by_freq.len() / 4).min(300);
+    let mut queries: Vec<Query> = by_freq
+        .iter()
+        .skip(skip)
+        .take(args.num_term_queries)
+        .map(|(t, _)| Query {
+            kind: "term",
+            text: (*t).to_string(),
+        })
+        .collect();
+
+    // Phrase queries: two adjacent non-stopword tokens, sampled on a stride.
+    let stride = (tok_docs.len().max(1) / args.num_phrase_queries.max(1)).max(1);
+    let mut idx = 0usize;
+    while queries.iter().filter(|q| q.kind == "phrase").count() < args.num_phrase_queries
+        && idx < tok_docs.len()
+    {
+        let d = &tok_docs[idx];
+        for w in d.windows(2) {
+            if w[0].len() >= 3
+                && w[1].len() >= 3
+                && !STOPWORDS.contains(&w[0].as_str())
+                && !STOPWORDS.contains(&w[1].as_str())
+            {
+                queries.push(Query {
+                    kind: "phrase",
+                    text: format!("{} {}", w[0], w[1]),
+                });
+                break;
+            }
+        }
+        idx += stride;
+    }
+    queries
+}
+
+/// Brute-force exact BM25 top-k over the pre-tokenized corpus.
+fn exact_bm25_truth(tok_docs: &[Vec<String>], queries: &[Query], k: usize) -> Vec<Vec<usize>> {
+    let n = tok_docs.len() as f64;
+    let dl: Vec<f64> = tok_docs.iter().map(|d| d.len() as f64).collect();
+    let avgdl = dl.iter().sum::<f64>() / n.max(1.0);
+    // postings: token -> Vec<(doc, tf)>
+    let mut postings: HashMap<&str, Vec<(usize, u32)>> = HashMap::new();
+    for (doc, toks) in tok_docs.iter().enumerate() {
+        let mut tf: HashMap<&str, u32> = HashMap::new();
+        for t in toks {
+            *tf.entry(t.as_str()).or_default() += 1;
+        }
+        for (t, c) in tf {
+            postings.entry(t).or_default().push((doc, c));
+        }
+    }
+    let term_score = |tf: f64, df: f64, d: f64| -> f64 {
+        let idf = ((n - df + 0.5) / (df + 0.5) + 1.0).ln();
+        let num = tf * (BM25_K1 + 1.0);
+        let den = tf + BM25_K1 * (1.0 - BM25_B + BM25_B * (d / avgdl));
+        idf * (num / den)
+    };
+    queries
+        .iter()
+        .map(|q| {
+            let mut scores: HashMap<usize, f64> = HashMap::new();
+            if q.kind == "term" {
+                if let Some(pl) = postings.get(q.text.as_str()) {
+                    let df = pl.len() as f64;
+                    for &(doc, tf) in pl {
+                        *scores.entry(doc).or_default() += term_score(tf as f64, df, dl[doc]);
+                    }
+                }
+            } else {
+                let words: Vec<&str> = q.text.split(' ').collect();
+                // Phrase: docs where the two tokens occur adjacently.
+                for (doc, toks) in tok_docs.iter().enumerate() {
+                    let adjacent = toks
+                        .windows(2)
+                        .any(|w| w[0] == words[0] && w[1] == words[1]);
+                    if !adjacent {
+                        continue;
+                    }
+                    let mut s = 0.0;
+                    for w in &words {
+                        if let Some(pl) = postings.get(*w) {
+                            let df = pl.len() as f64;
+                            if let Some(&(_, tf)) = pl.iter().find(|&&(d, _)| d == doc) {
+                                s += term_score(tf as f64, df, dl[doc]);
+                            }
+                        }
+                    }
+                    scores.insert(doc, s);
+                }
+            }
+            let mut ranked: Vec<(usize, f64)> = scores.into_iter().collect();
+            ranked.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+            ranked.truncate(k);
+            ranked.into_iter().map(|(d, _)| d).collect()
+        })
+        .collect()
+}
+
+// ----------------------------------------------------------------------
+// bench
+// ----------------------------------------------------------------------
+
+fn read_lines(path: &std::path::Path) -> Result<Vec<String>> {
+    let f = std::fs::File::open(path)
+        .map_err(|e| lance_core::Error::io(format!("open {}: {e}", path.display())))?;
+    std::io::BufReader::new(f)
+        .lines()
+        .collect::<std::io::Result<Vec<_>>>()
+        .map_err(|e| lance_core::Error::io(format!("read {}: {e}", path.display())))
+}
+
+fn schema() -> Arc<ArrowSchema> {
+    Arc::new(ArrowSchema::new(vec![
+        Field::new("id", DataType::Int64, false),
+        Field::new(TEXT_COL, DataType::Utf8, true),
+    ]))
+}
+
+fn percentile(sorted: &[f64], pct: f64) -> f64 {
+    if sorted.is_empty() {
+        return f64::NAN;
+    }
+    let idx = ((pct / 100.0) * (sorted.len() - 1) as f64).round() as usize;
+    sorted[idx.min(sorted.len() - 1)]
+}
+
+fn run_bench(args: &BenchArgs) -> Result<()> {
+    let corpus_file = match args.run {
+        'a' => "corpus_tok.txt",
+        _ => "corpus.txt",
+    };
+    let docs = read_lines(&args.in_dir.join(corpus_file))?;
+    let query_lines = read_lines(&args.in_dir.join("queries.txt"))?;
+    let truth_lines = read_lines(&args.in_dir.join("truth.txt"))?;
+
+    // Run A indexes the already-tokenized text with a whitespace tokenizer;
+    // Run B indexes raw text with the default Lance tokenizer.
+    let params = if args.run == 'a' {
+        InvertedIndexParams::default()
+            .base_tokenizer("whitespace".to_string())
+            .lower_case(false)
+            .stem(false)
+            .remove_stop_words(false)
+            .with_position(true)
+    } else {
+        InvertedIndexParams::default().with_position(true)
+    };
+    let index = FtsMemIndex::with_params(0, TEXT_COL.to_string(), params);
+    let sch = schema();
+
+    // Build: insert in batches of 1000, doc id == line number == row position.
+    let build_start = Instant::now();
+    let batch_size = 1000;
+    let mut row = 0usize;
+    while row < docs.len() {
+        let end = (row + batch_size).min(docs.len());
+        let ids: Vec<i64> = (row as i64..end as i64).collect();
+        let texts: Vec<&str> = docs[row..end].iter().map(|s| s.as_str()).collect();
+        let batch = RecordBatch::try_new(
+            sch.clone(),
+            vec![
+                Arc::new(Int64Array::from(ids)) as ArrayRef,
+                Arc::new(StringArray::from(texts)) as ArrayRef,
+            ],
+        )
+        .map_err(|e| lance_core::Error::io(format!("batch: {e}")))?;
+        index.insert(&batch, row as u64)?;
+        row = end;
+    }
+    let build_s = build_start.elapsed().as_secs_f64();
+
+    // Parse queries.
+    struct Q {
+        kind: String,
+        raw: String,
+        tok: String,
+    }
+    let queries: Vec<Q> = query_lines
+        .iter()
+        .filter_map(|l| {
+            let mut p = l.splitn(3, '\t');
+            Some(Q {
+                kind: p.next()?.to_string(),
+                raw: p.next()?.to_string(),
+                tok: p.next()?.to_string(),
+            })
+        })
+        .collect();
+    let truth: Vec<Vec<usize>> = truth_lines
+        .iter()
+        .map(|l| {
+            l.split_whitespace()
+                .filter_map(|s| s.parse().ok())
+                .collect()
+        })
+        .collect();
+
+    let opts = SearchOptions::new().with_limit(args.k);
+    let make_query = |q: &Q| -> FtsQueryExpr {
+        let text = if args.run == 'a' { &q.tok } else { &q.raw };
+        if q.kind == "phrase" {
+            FtsQueryExpr::phrase(text.clone())
+        } else {
+            FtsQueryExpr::match_query(text.clone())
+        }
+    };
+
+    // Warm-up.
+    for q in &queries {
+        let _ = index.search_with_options(&make_query(q), opts.clone());
+    }
+
+    // Single-thread latency + collect top-k.
+    let mut latencies_us: Vec<f64> = Vec::with_capacity(queries.len());
+    let mut topk: Vec<Vec<usize>> = Vec::with_capacity(queries.len());
+    let st_start = Instant::now();
+    for q in &queries {
+        let t0 = Instant::now();
+        let hits = index.search_with_options(&make_query(q), opts.clone());
+        latencies_us.push(t0.elapsed().as_secs_f64() * 1.0e6);
+        let mut ids: Vec<(usize, f32)> = hits
+            .iter()
+            .map(|e| (e.row_position as usize, e.score))
+            .collect();
+        ids.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
+        ids.truncate(args.k);
+        topk.push(ids.into_iter().map(|(d, _)| d).collect());
+    }
+    let st_s = st_start.elapsed().as_secs_f64();
+    let qps_1t = queries.len() as f64 / st_s;
+
+    // Multi-thread QPS.
+    let reps = 4usize;
+    let mt_start = Instant::now();
+    let _: usize = (0..reps)
+        .into_par_iter()
+        .map(|_| {
+            queries
+                .par_iter()
+                .map(|q| {
+                    index
+                        .search_with_options(&make_query(q), opts.clone())
+                        .len()
+                })
+                .sum::<usize>()
+        })
+        .sum();
+    let mt_s = mt_start.elapsed().as_secs_f64();
+    let qps_nt = (queries.len() * reps) as f64 / mt_s;
+
+    // recall@k vs exact BM25 (Run A only — truth is over the canonical corpus).
+    let (mut term_recall, mut term_n) = (0.0f64, 0usize);
+    let (mut phrase_recall, mut phrase_n) = (0.0f64, 0usize);
+    if args.run == 'a' {
+        for (i, q) in queries.iter().enumerate() {
+            let t: std::collections::HashSet<usize> = truth
+                .get(i)
+                .map(|v| v.iter().copied().collect())
+                .unwrap_or_default();
+            if t.is_empty() {
+                continue;
+            }
+            let hit = topk[i].iter().filter(|d| t.contains(d)).count() as f64;
+            let r = hit / args.k as f64;
+            if q.kind == "phrase" {
+                phrase_recall += r;
+                phrase_n += 1;
+            } else {
+                term_recall += r;
+                term_n += 1;
+            }
+        }
+    }
+
+    // Write this impl's top-k for the driver's mutual-overlap computation.
+    let topk_path = args
+        .in_dir
+        .join(format!("lance_fts_run{}_topk.txt", args.run));
+    let mut tw = BufWriter::new(
+        std::fs::File::create(&topk_path)
+            .map_err(|e| lance_core::Error::io(format!("create topk: {e}")))?,
+    );
+    for ids in &topk {
+        let s: Vec<String> = ids.iter().map(|i| i.to_string()).collect();
+        writeln!(tw, "{}", s.join(" ")).ok();
+    }
+    tw.flush().ok();
+
+    latencies_us.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
+    let term_recall_v = if term_n > 0 {
+        term_recall / term_n as f64
+    } else {
+        f64::NAN
+    };
+    let phrase_recall_v = if phrase_n > 0 {
+        phrase_recall / phrase_n as f64
+    } else {
+        f64::NAN
+    };
+
+    println!(
+        "result impl=lance_fts run={} docs={} queries={} build_s={:.3} build_docs_per_s={:.0} \
+         q_p50_us={:.1} q_p95_us={:.1} qps_1t={:.0} qps_{}t={:.0} \
+         term_recall={:.4} phrase_recall={:.4} mem_mb={:.1}",
+        args.run,
+        docs.len(),
+        queries.len(),
+        build_s,
+        docs.len() as f64 / build_s,
+        percentile(&latencies_us, 50.0),
+        percentile(&latencies_us, 95.0),
+        qps_1t,
+        rayon::current_num_threads(),
+        qps_nt,
+        term_recall_v,
+        phrase_recall_v,
+        index.memory_usage() as f64 / 1.0e6,
+    );
+    println!(
+        "{{\"impl\":\"lance_fts\",\"run\":\"{}\",\"docs\":{},\"queries\":{},\"k\":{},\
+         \"build_s\":{:.4},\"build_docs_per_s\":{:.1},\
+         \"q_p50_us\":{:.2},\"q_p95_us\":{:.2},\"qps_1t\":{:.1},\"qps_nt\":{:.1},\
+         \"term_recall_at_k\":{:.4},\"phrase_recall_at_k\":{:.4},\"mem_bytes\":{}}}",
+        args.run,
+        docs.len(),
+        queries.len(),
+        args.k,
+        build_s,
+        docs.len() as f64 / build_s,
+        percentile(&latencies_us, 50.0),
+        percentile(&latencies_us, 95.0),
+        qps_1t,
+        qps_nt,
+        term_recall_v,
+        phrase_recall_v,
+        index.memory_usage(),
+    );
+    Ok(())
+}
+
+// ----------------------------------------------------------------------
+// CLI
+// ----------------------------------------------------------------------
+
+struct GenArgs {
+    docs: usize,
+    out_dir: std::path::PathBuf,
+    cache_dir: std::path::PathBuf,
+    num_term_queries: usize,
+    num_phrase_queries: usize,
+    k: usize,
+}
+
+struct BenchArgs {
+    in_dir: std::path::PathBuf,
+    run: char,
+    k: usize,
+}
+
+fn main() -> Result<()> {
+    let argv: Vec<String> = std::env::args().skip(1).collect();
+    let argv: Vec<&str> = argv
+        .iter()
+        .map(|s| s.as_str())
+        .filter(|s| *s != "--bench")
+        .collect();
+    let mode = argv.first().copied().unwrap_or("");
+    let get = |flag: &str, def: &str| -> String {
+        argv.iter()
+            .position(|a| *a == flag)
+            .and_then(|i| argv.get(i + 1))
+            .map(|s| s.to_string())
+            .unwrap_or_else(|| def.to_string())
+    };
+
+    match mode {
+        "gen" => {
+            let args = GenArgs {
+                docs: get("--docs", "1000000").parse().unwrap(),
+                out_dir: get("--out-dir", "/tmp/fts_compare").into(),
+                cache_dir: get("--cache-dir", "/tmp/fineweb_cache").into(),
+                num_term_queries: get("--num-term-queries", "100").parse().unwrap(),
+                num_phrase_queries: get("--num-phrase-queries", "50").parse().unwrap(),
+                k: get("--k", "10").parse().unwrap(),
+            };
+            let rt = tokio::runtime::Runtime::new()
+                .map_err(|e| lance_core::Error::io(format!("runtime: {e}")))?;
+            rt.block_on(run_gen(&args))
+        }
+        "bench" => {
+            let threads: usize = get("--threads", "0").parse().unwrap();
+            if threads > 0 {
+                let _ = rayon::ThreadPoolBuilder::new()
+                    .num_threads(threads)
+                    .build_global();
+            }
+            let args = BenchArgs {
+                in_dir: get("--in-dir", "/tmp/fts_compare").into(),
+                run: get("--run", "a").chars().next().unwrap_or('a'),
+                k: get("--k", "10").parse().unwrap(),
+            };
+            run_bench(&args)
+        }
+        other => Err(lance_core::Error::invalid_input(format!(
+            "usage: mem_wal_fts_bench (gen|bench) [...]; got '{other}'"
+        ))),
+    }
+}

From 5cac7293a0326c4d9786288bd65820db78c196f6 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 12:05:23 -0700
Subject: [PATCH 16/37] bench: chunk the Lucene MT QPS loop to match the Lance
 rayon harness
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

One Future per query made the multi-thread QPS dominated by thread-pool
submit overhead (smoke run: Lucene qps_nt 10k vs Lance 105k for cheap
queries). Submit one task per thread, each striding a slice of the
queries x reps work — the same shape as the Lance side's rayon loop.
---
 bench/lucene_fts_bench/LuceneFtsBench.java | Bin 10756 -> 11246 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/bench/lucene_fts_bench/LuceneFtsBench.java b/bench/lucene_fts_bench/LuceneFtsBench.java
index 48bc91f0bd708f15a12ebdfd72b16962e8de903e..03931ef40a4a3e686ac82f8df88d8f6b08247d07 100644
GIT binary patch
delta 666
zcmZWmu}T9$5bYGTvJr$3UL)R-<YHkLV_{(>NI<Y&bDLzzW%u0OBSw)}_y;F`h6w(M
zosE^1mHvXWyGabX#j-o|=DjyF_h(<{&+nCKvx!s5VVM~s=NfjPIZNx{tUG`%q!h9b
zhOUxU^b1uOm`rpzRF9y19dwA|vfx^9gDYsB8;q46;^XcCBWAk;iW3wkOj4Z@Dc=o<
zhOvCgG7^~6Xf{z2nNoQjLQ<NgPAF%S;NrNjg(i;TVkE4$>x3tXuNTq=^)~ifK9VW&
zZk1(O2W>>|8_%sKMqKkKE?=vw<!9w-Y6-E|g2lmX1cpAfT8tY{1@weu8M(5sA|<d5
z8kF#+Ov2JMnql2ka;35)I9(3AR1~9exFB{BO#SWHO)>ITmZ*FVJ*7hr7(029w&+1_
zeShv<qp07W`E?yi!`*SS!X*{Y)|K?RjUp0_(Tv8}S;WI^;&|TTGK=$9YnR{KTjg4H
h^XNb1_MdajX>HPV_7{Ci8s$#)aV2!|@>XqZ`~Ytfy$t{W

delta 149
zcmaDC-V(Cmj?CnxvO<%e%dQYAQm|DpuvREiuu&*VEhq-ECZCa1QLpAwfP%ErlG38o
zV!gza6pe!X{2aaF(xlwX5)BPa1zo$z?<K`#^oo=7i&9<klZ*9oQuESFGBh=<q1tOF
lt13*B(a6cqOV?B=EKM!S1e#QwS(OUZtp&Dl^Fak&ZUCogFu4E#


From a92d03462650f68f12cb2c29a91a472428cc4ea6 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 12:06:32 -0700
Subject: [PATCH 17/37] fix(bench): make Lucene MT thread count effectively
 final for the lambda

---
 bench/lucene_fts_bench/LuceneFtsBench.java | Bin 11246 -> 11283 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)

diff --git a/bench/lucene_fts_bench/LuceneFtsBench.java b/bench/lucene_fts_bench/LuceneFtsBench.java
index 03931ef40a4a3e686ac82f8df88d8f6b08247d07..ad2b75ff251859738b068fb81bd76b88dbefd6ec 100644
GIT binary patch
delta 61
zcmaDCJ~?8;b6M%M%)G=Lh0MGXg}jiAqSVBcVg*};5-@A>16gaPypYK<a&;hTw%kr2
KwK-INCJzAb3KsSN

delta 30
lcmbOn@h*JBbJ@w9awd!=lkMf|faHF;os1=$tK?_$006+83%URR


From 69aba109b67338e41892cf676ecac530f7ea3756 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 13:39:57 -0700
Subject: [PATCH 18/37] feat(inverted): expose in-memory wand_search entry
 point

Adds a pub `wand_search` to the inverted module: it builds PostingIterators
from caller-supplied PostingLists and runs the block-max WAND algorithm,
with a no-op row mask. The WAND machinery was pub(crate); this exposes it
for the in-memory FTS MemTable index, whose frozen segments will hold
postings as PostingList::Plain and need the same query primitive as the
on-disk path. Stage 1 of the segment-structured FtsMemIndex redesign.
---
 rust/lance-index/src/scalar/inverted.rs      |  1 +
 rust/lance-index/src/scalar/inverted/wand.rs | 96 ++++++++++++++++++++
 2 files changed, 97 insertions(+)

diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs
index 448861c5c0c..20cff663eaf 100644
--- a/rust/lance-index/src/scalar/inverted.rs
+++ b/rust/lance-index/src/scalar/inverted.rs
@@ -25,6 +25,7 @@ use lance_core::{Result, cache::LanceCache};
 pub use lance_tokenizer::Language;
 pub use scorer::MemBM25Scorer;
 pub use tokenizer::*;
+pub use wand::{DocCandidate, WandTerm, wand_search};
 
 use crate::scalar::inverted::query::{FtsSearchParams, Tokens};
 
diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs
index b06c75c0021..2e2132f5df6 100644
--- a/rust/lance-index/src/scalar/inverted/wand.rs
+++ b/rust/lance-index/src/scalar/inverted/wand.rs
@@ -442,6 +442,46 @@ pub struct DocCandidate {
     pub doc_length: u32,
 }
 
+/// One query term for [`wand_search`]: the token text, its id, the token's
+/// position in the query (used only by phrase queries), the query-side
+/// weight, and the term's posting list.
+pub type WandTerm = (String, u32, u32, f32, PostingList);
+
+/// In-memory block-max WAND search entry point.
+///
+/// Runs the same WAND algorithm as the on-disk inverted index query path,
+/// but over caller-supplied posting lists. This is the query primitive for
+/// the in-memory FTS MemTable index, whose frozen segments hold their
+/// postings as [`PostingList::Plain`]. Mirrors `Wand`'s on-disk usage; the
+/// row-address mask is a no-op (the in-memory index has no deletions).
+pub fn wand_search<S: Scorer>(
+    operator: Operator,
+    terms: Vec<WandTerm>,
+    docs: &DocSet,
+    scorer: S,
+    params: &FtsSearchParams,
+) -> Result<Vec<DocCandidate>> {
+    let num_doc = docs.len();
+    let postings = terms
+        .into_iter()
+        .map(|(token, token_id, position, query_weight, list)| {
+            PostingIterator::with_query_weight(
+                token,
+                token_id,
+                position,
+                query_weight,
+                list,
+                num_doc,
+            )
+        });
+    let mut wand = Wand::new(operator, postings, docs, scorer);
+    wand.search(
+        params,
+        Arc::new(RowAddrMask::default()),
+        &crate::metrics::NoOpMetricsCollector,
+    )
+}
+
 struct HeadPosting {
     // Iterators that are already positioned on or after the next candidate doc.
     // The heap is ordered by smallest doc id so the top element determines
@@ -1655,6 +1695,62 @@ mod tests {
         assert_eq!(result.len(), 0); // Should not panic
     }
 
+    // Proves the public `wand_search` entry point works over caller-built
+    // in-memory posting lists — the query primitive for the segmented
+    // FTS MemTable index. (BM25 ranking itself is covered by `test_wand`
+    // and the on-disk index tests.)
+    #[test]
+    fn test_wand_search_in_memory() {
+        let mut docs = DocSet::default();
+        for i in 0..10u64 {
+            docs.append(i, 1);
+        }
+        // term "a": docs 0,2,4,6,8 — term "b": docs 1,2,5,8 — union of 7.
+        let term_a = generate_posting_list(vec![0, 2, 4, 6, 8], 1.0, None, false);
+        let term_b = generate_posting_list(vec![1, 2, 5, 8], 1.0, None, false);
+        let union: std::collections::HashSet<u64> = [0, 1, 2, 4, 5, 6, 8].into_iter().collect();
+        let terms: Vec<WandTerm> = vec![
+            ("a".to_string(), 0, 0, 1.0, term_a),
+            ("b".to_string(), 1, 1, 1.0, term_b),
+        ];
+        let params = FtsSearchParams::default().with_limit(Some(3));
+        let scorer = IndexBM25Scorer::new(std::iter::empty());
+        let result = wand_search(Operator::Or, terms, &docs, scorer, &params).unwrap();
+        assert_eq!(result.len(), 3, "limit=3 should return 3 candidates");
+        for c in &result {
+            assert!(union.contains(&c.row_id), "id {} not in union", c.row_id);
+            assert!(!c.freqs.is_empty(), "candidate must record term freqs");
+        }
+
+        // Unlimited search returns exactly the union of the two posting lists.
+        let scorer = IndexBM25Scorer::new(std::iter::empty());
+        let all = wand_search(
+            Operator::Or,
+            vec![
+                (
+                    "a".to_string(),
+                    0,
+                    0,
+                    1.0,
+                    generate_posting_list(vec![0, 2, 4, 6, 8], 1.0, None, false),
+                ),
+                (
+                    "b".to_string(),
+                    1,
+                    1,
+                    1.0,
+                    generate_posting_list(vec![1, 2, 5, 8], 1.0, None, false),
+                ),
+            ],
+            &docs,
+            scorer,
+            &FtsSearchParams::default(),
+        )
+        .unwrap();
+        let got: std::collections::HashSet<u64> = all.iter().map(|c| c.row_id).collect();
+        assert_eq!(got, union, "OR search must return the posting-list union");
+    }
+
     #[test]
     fn test_posting_iterator_next_compressed_partition_point() {
         let mut docs = DocSet::default();

From beab662946be7fec890ac0ffa89076fdc92edaec Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 13:57:53 -0700
Subject: [PATCH 19/37] feat(mem_wal): add immutable FTS Partition type
 (redesign stage 2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds `Partition` + `PartitionBuilder` to the FTS mem index: an immutable
frozen slice of inserts holding per-term posting lists in the on-disk
`PostingList::Plain` form, queried with the shared block-max WAND
(`wand_search`). WAND reports partition-local doc ids; `Partition::search`
remaps them to MemTable row positions via the partition's `DocSet`.

Standalone and dead-code-allowed — wired into `FtsMemIndex` in the next
stage. Part of the partition-structured redesign that replaces the
O(corpus) per-batch-chunk query layout.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 242 ++++++++++++++++++++
 1 file changed, 242 insertions(+)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index 739ab21de2a..9b8061e11eb 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -1646,6 +1646,169 @@ impl FtsIndexConfig {
     }
 }
 
+// ============================================================================
+// Partition-structured index (redesign — see
+// ~/ai/analysis/lance/jack-MemTableFTSBetter/fts-segment-redesign/DESIGN.md)
+//
+// An immutable `Partition` holds a frozen slice of the MemTable's inserts as
+// on-disk-shaped posting lists, queried with the shared block-max WAND
+// (`wand_search`). The current per-batch-chunk layout makes queries
+// O(corpus); partitions make them ≈ O(matches) and flush 1:1 to a Lance FTS
+// on-disk partition. Wired into `FtsMemIndex` in a later stage; standalone
+// and dead-code-allowed until then.
+// ============================================================================
+
+use arrow_array::builder::{Int32Builder, ListBuilder};
+use arrow_buffer::ScalarBuffer;
+use lance_index::scalar::inverted::query::{FtsSearchParams, Operator};
+use lance_index::scalar::inverted::{
+    DocCandidate, DocSet, MemBM25Scorer, PlainPostingList, PostingList, TokenSet, WandTerm,
+    wand_search,
+};
+
+/// An immutable, frozen FTS partition.
+#[allow(dead_code)]
+struct Partition {
+    /// token text -> local token id (dense, 0-based).
+    tokens: TokenSet,
+    /// posting list per local token id.
+    postings: Vec<PostingList>,
+    /// local doc id -> (row position, token count).
+    docs: DocSet,
+}
+
+#[allow(dead_code)]
+impl Partition {
+    fn doc_count(&self) -> usize {
+        self.docs.len()
+    }
+
+    fn total_tokens(&self) -> u64 {
+        self.docs.total_tokens_num()
+    }
+
+    /// Number of docs in this partition containing `token`.
+    fn token_df(&self, token: &str) -> usize {
+        self.tokens
+            .get(token)
+            .map(|id| self.postings[id as usize].len())
+            .unwrap_or(0)
+    }
+
+    /// WAND-search this partition. `query` is `(token, query-position)` pairs,
+    /// already tokenized. Returns this partition's local candidates (row
+    /// positions); the caller merges across partitions and the tail.
+    fn search(
+        &self,
+        query: &[(String, u32)],
+        operator: Operator,
+        scorer: MemBM25Scorer,
+        params: &FtsSearchParams,
+    ) -> Result<Vec<DocCandidate>> {
+        let mut terms: Vec<WandTerm> = Vec::with_capacity(query.len());
+        for (token, position) in query {
+            match self.tokens.get(token) {
+                Some(token_id) => terms.push((
+                    token.clone(),
+                    token_id,
+                    *position,
+                    1.0,
+                    self.postings[token_id as usize].clone(),
+                )),
+                // Term absent here: an AND query can't match in this
+                // partition; an OR query just drops the term.
+                None if operator == Operator::And => return Ok(Vec::new()),
+                None => {}
+            }
+        }
+        if terms.is_empty() {
+            return Ok(Vec::new());
+        }
+        // WAND reports the partition-local doc id in `DocCandidate.row_id`;
+        // remap it to the MemTable row position via the partition's DocSet.
+        let mut hits = wand_search(operator, terms, &self.docs, scorer, params)?;
+        for c in &mut hits {
+            c.row_id = self.docs.row_id(c.row_id as u32);
+        }
+        Ok(hits)
+    }
+}
+
+/// Accumulates documents and freezes them into an immutable `Partition`.
+#[allow(dead_code)]
+struct PartitionBuilder {
+    tokens: TokenSet,
+    /// per local token id: `(doc_local_id, freq, positions)` postings.
+    postings: Vec<Vec<(u32, u32, Vec<u32>)>>,
+    docs: DocSet,
+}
+
+#[allow(dead_code)]
+impl PartitionBuilder {
+    fn new() -> Self {
+        Self {
+            tokens: TokenSet::default(),
+            postings: Vec::new(),
+            docs: DocSet::default(),
+        }
+    }
+
+    fn doc_count(&self) -> usize {
+        self.docs.len()
+    }
+
+    fn is_empty(&self) -> bool {
+        self.docs.len() == 0
+    }
+
+    /// Add one document: `terms` is `(token, freq, positions)` per distinct
+    /// token, `num_tokens` is the document length.
+    fn add_doc(&mut self, row_position: u64, num_tokens: u32, terms: Vec<(String, u32, Vec<u32>)>) {
+        let doc_id = self.docs.append(row_position, num_tokens);
+        for (token, freq, positions) in terms {
+            let token_id = self.tokens.add(token) as usize;
+            if token_id >= self.postings.len() {
+                self.postings.resize_with(token_id + 1, Vec::new);
+            }
+            self.postings[token_id].push((doc_id, freq, positions));
+        }
+    }
+
+    /// Freeze into an immutable `Partition`. Each token's postings are sorted
+    /// by doc id and converted to a `PlainPostingList`; `max_score` is left
+    /// `None` so WAND derives the per-term upper bound from `df` at query
+    /// time (global BM25 stats are not known to a single partition).
+    fn build(self) -> Partition {
+        let postings: Vec<PostingList> = self
+            .postings
+            .into_iter()
+            .map(|mut docs| {
+                docs.sort_by_key(|(doc_id, _, _)| *doc_id);
+                let row_ids: Vec<u64> = docs.iter().map(|(d, _, _)| *d as u64).collect();
+                let freqs: Vec<f32> = docs.iter().map(|(_, f, _)| *f as f32).collect();
+                let mut positions = ListBuilder::new(Int32Builder::new());
+                for (_, _, doc_positions) in &docs {
+                    for p in doc_positions {
+                        positions.values().append_value(*p as i32);
+                    }
+                    positions.append(true);
+                }
+                PostingList::Plain(PlainPostingList::new(
+                    ScalarBuffer::from(row_ids),
+                    ScalarBuffer::from(freqs),
+                    None,
+                    Some(positions.finish()),
+                ))
+            })
+            .collect();
+        Partition {
+            tokens: self.tokens,
+            postings,
+            docs: self.docs,
+        }
+    }
+}
+
 // ============================================================================
 // Tests
 // ============================================================================
@@ -2657,4 +2820,83 @@ mod tests {
         let err = index.insert(&batch, 0).unwrap_err();
         assert!(err.to_string().contains("only supports"), "{err}");
     }
+
+    // ===== Partition redesign (Stage 2) =====
+
+    fn build_test_partition() -> Partition {
+        // 4 docs at row positions 100..104.
+        let mut b = PartitionBuilder::new();
+        b.add_doc(
+            100,
+            2,
+            vec![
+                ("apple".to_string(), 1, vec![0]),
+                ("banana".to_string(), 1, vec![1]),
+            ],
+        );
+        b.add_doc(
+            101,
+            2,
+            vec![
+                ("apple".to_string(), 1, vec![0]),
+                ("cherry".to_string(), 1, vec![1]),
+            ],
+        );
+        b.add_doc(102, 1, vec![("banana".to_string(), 1, vec![0])]);
+        b.add_doc(
+            103,
+            3,
+            vec![
+                ("apple".to_string(), 2, vec![0, 1]),
+                ("date".to_string(), 1, vec![2]),
+            ],
+        );
+        b.build()
+    }
+
+    fn search_partition(p: &Partition, token: &str) -> Vec<u64> {
+        let mut token_docs = std::collections::HashMap::new();
+        token_docs.insert(token.to_string(), p.token_df(token));
+        let scorer = MemBM25Scorer::new(p.total_tokens(), p.doc_count(), token_docs);
+        let params = FtsSearchParams::default().with_limit(Some(10));
+        let hits = p
+            .search(&[(token.to_string(), 0)], Operator::Or, scorer, &params)
+            .unwrap();
+        let mut ids: Vec<u64> = hits.iter().map(|c| c.row_id).collect();
+        ids.sort_unstable();
+        ids
+    }
+
+    #[test]
+    fn test_partition_build_and_search() {
+        let p = build_test_partition();
+        assert_eq!(p.doc_count(), 4);
+        assert_eq!(p.total_tokens(), 8); // 2 + 2 + 1 + 3
+        assert_eq!(p.token_df("apple"), 3);
+        assert_eq!(p.token_df("banana"), 2);
+        assert_eq!(p.token_df("missing"), 0);
+
+        // WAND search must return the row positions containing each term.
+        assert_eq!(search_partition(&p, "apple"), vec![100, 101, 103]);
+        assert_eq!(search_partition(&p, "banana"), vec![100, 102]);
+        assert_eq!(search_partition(&p, "date"), vec![103]);
+        assert!(search_partition(&p, "missing").is_empty());
+    }
+
+    #[test]
+    fn test_partition_and_query_short_circuits_missing_term() {
+        let p = build_test_partition();
+        // "apple" present, "missing" absent -> AND yields nothing.
+        let scorer = MemBM25Scorer::new(p.total_tokens(), p.doc_count(), Default::default());
+        let params = FtsSearchParams::default().with_limit(Some(10));
+        let hits = p
+            .search(
+                &[("apple".to_string(), 0), ("missing".to_string(), 1)],
+                Operator::And,
+                scorer,
+                &params,
+            )
+            .unwrap();
+        assert!(hits.is_empty());
+    }
 }

From 50be021bf95a4285faa3c05046edcdc3d2b47991 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 15:52:12 -0700
Subject: [PATCH 20/37] perf(mem_wal): partition-structured FTS mem index
 (redesign stages 3-7)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replaces the per-batch-chunk `FtsMemIndex` layout — where every query walked
every chunk of every query term, making latency O(corpus) (p95 ~1.2s at 1M
docs) — with Lucene's segment model adapted to a Lance FTS partition.

The index is now a set of immutable `Partition`s plus a bounded mutable tail,
published together behind one `ArcSwap` (`IndexState`) so a freeze is observed
atomically. Each partition holds frozen, on-disk-shaped posting lists queried
with block-max WAND in ~O(matches); the writer freezes the tail into a new
partition past `freeze_threshold_rows` (50k) and merges partitions past a hard
cap. All scoring routes through one corpus-wide `MemBM25Scorer` so partition
and tail scores are comparable; the scorer and the scanned tail share a single
snapshot to keep BM25 stats consistent. Flush merges every partition and the
tail into one `InnerBuilder`. The public query API is unchanged.

Exposes `Scorer` from `lance-index`'s inverted module.
---
 rust/lance-index/src/scalar/inverted.rs     |    2 +-
 rust/lance/src/dataset/mem_wal/index/fts.rs | 1683 ++++++++++++-------
 2 files changed, 1114 insertions(+), 571 deletions(-)

diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs
index 20cff663eaf..399eabd3ff0 100644
--- a/rust/lance-index/src/scalar/inverted.rs
+++ b/rust/lance-index/src/scalar/inverted.rs
@@ -23,7 +23,7 @@ use datafusion::execution::SendableRecordBatchStream;
 pub use index::*;
 use lance_core::{Result, cache::LanceCache};
 pub use lance_tokenizer::Language;
-pub use scorer::MemBM25Scorer;
+pub use scorer::{MemBM25Scorer, Scorer};
 pub use tokenizer::*;
 pub use wand::{DocCandidate, WandTerm, wand_search};
 
diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index 9b8061e11eb..0a99a109986 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -18,18 +18,32 @@
 //!   (`doc_count`, `total_tokens`, per-term `df`) are mutually consistent
 //!   with the postings the reader walks.
 //!
-//! Visibility is published atomically by replacing a single `Snapshot` value
-//! via `ArcSwap`. The writer first installs all term chunks into the
-//! per-term `ArcSwap<TermSlice>` slots, then atomically swaps in a new
+//! # Partition-structured layout
+//!
+//! The index is a set of immutable [`Partition`]s plus a bounded mutable
+//! `tail` ([`TailIndex`]). Each partition holds a frozen slice of inserts as
+//! on-disk-shaped posting lists and is queried with block-max WAND in
+//! ≈O(matches). The tail accumulates recent inserts in a per-batch-chunk
+//! layout and is searched in place; the writer freezes it into a new
+//! partition once it crosses `freeze_threshold_rows`. The `{partitions, tail}`
+//! pair lives behind one [`ArcSwap`] ([`IndexState`]) so a freeze publishes
+//! atomically — a reader never sees a doc twice or misses one across a
+//! freeze.
+//!
+//! Within the tail, visibility is published atomically by replacing a single
+//! `Snapshot` via `ArcSwap`. The writer first installs all term chunks into
+//! the per-term `ArcSwap<TermSlice>` slots, then atomically swaps in a new
 //! `Snapshot` whose `visible_count` covers the new batch. Readers load the
 //! `Snapshot` first and filter every term chunk by `batch_position <
-//! snapshot.visible_count`.
+//! snapshot.visible_count`. Frozen partitions are fully visible by
+//! construction.
 //!
 //! # On-disk format
 //!
 //! At flush time we hand off to `lance_index::scalar::inverted::builder::InnerBuilder`
-//! via `to_index_builder_reversed`. The on-disk format is unchanged from
-//! Lance's existing inverted index.
+//! via `to_index_builder_reversed`, which merges every partition and the tail
+//! into one builder. The on-disk format is unchanged from Lance's existing
+//! inverted index.
 
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
@@ -37,12 +51,18 @@ use std::sync::Mutex;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
 use arc_swap::ArcSwap;
-use arrow_array::{Array, LargeStringArray, RecordBatch, StringArray, StringViewArray};
+use arrow_array::builder::{Int32Builder, ListBuilder};
+use arrow_array::{Array, Int32Array, LargeStringArray, RecordBatch, StringArray, StringViewArray};
+use arrow_buffer::ScalarBuffer;
 use arrow_schema::DataType;
 use crossbeam_skiplist::SkipMap;
 use lance_core::{Error, Result};
 use lance_index::scalar::InvertedIndexParams;
+use lance_index::scalar::inverted::query::{FtsSearchParams, Operator};
 use lance_index::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer;
+use lance_index::scalar::inverted::{
+    DocSet, MemBM25Scorer, PlainPostingList, PostingList, Scorer, TokenSet, WandTerm, wand_search,
+};
 use lance_tokenizer::TokenStream;
 
 use super::RowPosition;
@@ -577,6 +597,120 @@ impl<'a> Drop for PooledTokenizer<'a> {
 // FtsMemIndex
 // ============================================================================
 
+/// In-memory full-text search index. See module docs for the concurrency
+/// model and visibility contract.
+/// The bounded mutable accumulator: recent inserts since the last freeze, in
+/// the per-batch-chunk layout. Searched in place; frozen into a [`Partition`]
+/// once it crosses the freeze threshold.
+struct TailIndex {
+    /// Per-term posting slices. `Arc<str>` interns the term so a single
+    /// allocation backs every chunk that mentions it.
+    terms: SkipMap<Arc<str>, ArcSwap<TermSlice>>,
+    /// Atomically-swapped visibility snapshot.
+    snapshot: ArcSwap<Snapshot>,
+    /// Strictly-monotonic, dense, 0-based batch position counter. Dense
+    /// positions are required by the `batch_position < visible_count`
+    /// visibility filter; the tail therefore assigns its own.
+    next_batch_position: AtomicUsize,
+}
+
+impl TailIndex {
+    fn new() -> Arc<Self> {
+        Arc::new(Self {
+            terms: SkipMap::new(),
+            snapshot: ArcSwap::from(Snapshot::empty()),
+            next_batch_position: AtomicUsize::new(0),
+        })
+    }
+
+    fn snapshot(&self) -> Arc<Snapshot> {
+        self.snapshot.load_full()
+    }
+
+    fn doc_count(&self) -> u64 {
+        self.snapshot.load().cumulative_doc_count
+    }
+
+    fn visible_count(&self) -> usize {
+        self.snapshot.load().visible_count
+    }
+
+    /// Next dense batch position for an incoming batch.
+    fn next_position(&self) -> usize {
+        self.next_batch_position.fetch_add(1, Ordering::Relaxed)
+    }
+
+    /// Install one batch's term chunks then publish a new visibility snapshot.
+    fn append_batch(
+        &self,
+        batch_position: usize,
+        row_offset: u64,
+        rows: u32,
+        doc_lengths: Vec<u32>,
+        total_tokens: u64,
+        term_builders: HashMap<Arc<str>, BatchTermBuilder>,
+    ) {
+        for (term, builder) in term_builders {
+            let chunk = builder.build(batch_position);
+            let entry = self
+                .terms
+                .get_or_insert_with(term, TermSlice::empty_arc_swap);
+            let cur = entry.value().load();
+            entry.value().store(cur.with_chunk_appended(chunk));
+        }
+        let new_meta = Arc::new(BatchMeta {
+            batch_position,
+            row_offset,
+            doc_lengths,
+            rows,
+        });
+        let cur = self.snapshot.load();
+        let mut batches: Vec<Arc<BatchMeta>> = Vec::with_capacity(cur.batches.len() + 1);
+        batches.extend(cur.batches.iter().cloned());
+        batches.push(new_meta);
+        self.snapshot.store(Arc::new(Snapshot {
+            visible_count: cur.visible_count + 1,
+            batches: Arc::from(batches.into_boxed_slice()),
+            cumulative_doc_count: cur.cumulative_doc_count + rows as u64,
+            cumulative_total_tokens: cur.cumulative_total_tokens + total_tokens,
+        }));
+    }
+
+    fn memory_size(&self) -> usize {
+        let mut total = std::mem::size_of::<Self>();
+        for entry in self.terms.iter() {
+            let term: &Arc<str> = entry.key();
+            total += std::mem::size_of::<Arc<str>>() + term.len() + 32;
+            total += entry.value().load().memory_size();
+        }
+        total += self
+            .snapshot
+            .load()
+            .batches
+            .iter()
+            .map(|b| b.memory_size())
+            .sum::<usize>();
+        total
+    }
+}
+
+/// Atomically-published index state: the immutable partitions plus the live
+/// mutable tail. The pair lives behind one `ArcSwap` so a freeze (which both
+/// appends a partition and resets the tail) is observed atomically.
+struct IndexState {
+    partitions: Arc<[Arc<Partition>]>,
+    tail: Arc<TailIndex>,
+}
+
+impl IndexState {
+    fn empty() -> Arc<Self> {
+        Arc::new(Self {
+            partitions: Arc::from(Vec::<Arc<Partition>>::new().into_boxed_slice()),
+            tail: TailIndex::new(),
+        })
+    }
+}
+
 /// In-memory full-text search index. See module docs for the concurrency
 /// model and visibility contract.
 pub struct FtsMemIndex {
@@ -589,29 +723,22 @@ pub struct FtsMemIndex {
     /// can take `&self`. Single-writer assumption means this is uncontested.
     writer_tokenizer: Mutex<Box<dyn LanceTokenizer>>,
 
-    /// Per-term posting slices. `Arc<str>` interns the term so a single
-    /// allocation backs every chunk that mentions it.
-    terms: SkipMap<Arc<str>, ArcSwap<TermSlice>>,
-
-    /// Atomically-swapped visibility snapshot.
-    snapshot: ArcSwap<Snapshot>,
+    /// `{partitions, tail}` published atomically. The tail mutates in place
+    /// between freezes; the whole state is swapped on freeze.
+    state: ArcSwap<IndexState>,
 
-    /// Strictly-monotonic batch position counter, used by `insert` (the
-    /// no-explicit-position variant) to assign sequential ids. Reads of
-    /// this counter are not part of the visibility contract — callers
-    /// passing explicit positions to `insert_with_batch_position` must keep
-    /// these monotonic themselves.
-    next_batch_position: AtomicUsize,
+    /// The tail freezes into a partition once it reaches this many docs.
+    freeze_threshold_rows: usize,
 }
 
 impl std::fmt::Debug for FtsMemIndex {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let snap = self.snapshot.load();
+        let st = self.state.load();
         f.debug_struct("FtsMemIndex")
             .field("field_id", &self.field_id)
             .field("column_name", &self.column_name)
-            .field("visible_doc_count", &snap.cumulative_doc_count)
-            .field("visible_batch_count", &snap.visible_count)
+            .field("doc_count", &self.doc_count())
+            .field("partitions", &st.partitions.len())
             .field("params", &self.params)
             .finish()
     }
@@ -622,6 +749,16 @@ impl FtsMemIndex {
     /// short bursts of concurrent search calls without thrashing.
     const DEFAULT_TOKENIZER_POOL_CAP: usize = 8;
 
+    /// Default tail freeze threshold (docs). Analogous to Lucene's
+    /// `ramBufferSizeMB`: large enough that the per-query tail scan stays
+    /// cheap, small enough to bound it independent of corpus size.
+    const DEFAULT_FREEZE_THRESHOLD_ROWS: usize = 50_000;
+
+    /// Hard cap on live partitions: when a freeze would exceed it, all
+    /// partitions are merged into one. With the default freeze threshold this
+    /// only triggers past ~1.6M docs.
+    const MAX_PARTITIONS: usize = 32;
+
     /// Create a new FTS index for the given field with default parameters.
     pub fn new(field_id: i32, column_name: String) -> Self {
         Self::with_params(field_id, column_name, InvertedIndexParams::default())
@@ -638,12 +775,19 @@ impl FtsMemIndex {
             params,
             tokenizer_pool: Arc::new(pool),
             writer_tokenizer: Mutex::new(writer_tokenizer),
-            terms: SkipMap::new(),
-            snapshot: ArcSwap::from(Snapshot::empty()),
-            next_batch_position: AtomicUsize::new(0),
+            state: ArcSwap::from(IndexState::empty()),
+            freeze_threshold_rows: Self::DEFAULT_FREEZE_THRESHOLD_ROWS,
         }
     }
 
+    /// Override the tail freeze threshold. Used by tests to exercise the
+    /// multi-partition path with small inputs.
+    #[cfg(test)]
+    fn with_freeze_threshold_rows(mut self, rows: usize) -> Self {
+        self.freeze_threshold_rows = rows.max(1);
+        self
+    }
+
     pub fn field_id(&self) -> i32 {
         self.field_id
     }
@@ -656,48 +800,46 @@ impl FtsMemIndex {
         &self.params
     }
 
-    /// Number of visible documents.
+    /// Number of visible documents across all partitions and the tail.
     pub fn doc_count(&self) -> usize {
-        self.snapshot.load().cumulative_doc_count as usize
+        let st = self.state.load();
+        st.partitions.iter().map(|p| p.doc_count()).sum::<usize>() + st.tail.doc_count() as usize
     }
 
     /// Whether there are any visible documents.
     pub fn is_empty(&self) -> bool {
-        self.snapshot.load().visible_count == 0
+        let st = self.state.load();
+        st.partitions.is_empty() && st.tail.visible_count() == 0
     }
 
     /// Total number of visible (term, doc) postings.
-    ///
-    /// Sums posting counts only over chunks whose batch is visible per the
-    /// current snapshot, so this matches what readers can actually walk.
     pub fn entry_count(&self) -> usize {
-        let visible = self.snapshot.load().visible_count;
-        self.terms
+        let st = self.state.load_full();
+        let part: usize = st.partitions.iter().map(|p| p.entry_count()).sum();
+        let visible = st.tail.visible_count();
+        let tail: usize = st
+            .tail
+            .terms
             .iter()
             .map(|e| {
-                let slice = e.value().load();
-                slice
+                e.value()
+                    .load()
                     .chunks
                     .iter()
                     .filter(|c| c.batch_position < visible)
                     .map(|c| c.doc_count())
                     .sum::<usize>()
             })
-            .sum()
+            .sum();
+        part + tail
     }
 
     /// Estimated bytes of heap memory held by this index.
     pub fn memory_usage(&self) -> usize {
+        let st = self.state.load_full();
         let mut total = std::mem::size_of::<Self>();
-        for entry in self.terms.iter() {
-            // ~32 bytes for SkipMap node overhead plus the interned term itself.
-            let term: &Arc<str> = entry.key();
-            total += std::mem::size_of::<Arc<str>>() + term.len() + 32;
-            let slice = entry.value().load();
-            total += slice.memory_size();
-        }
-        let snap = self.snapshot.load();
-        total += snap.batches.iter().map(|b| b.memory_size()).sum::<usize>();
+        total += st.partitions.iter().map(|p| p.memory_size()).sum::<usize>();
+        total += st.tail.memory_size();
         total
     }
 
@@ -705,34 +847,43 @@ impl FtsMemIndex {
     // Insert
     // ------------------------------------------------------------------
 
-    /// Insert a batch using a sequentially-derived batch position.
+    /// Insert a batch into the mutable tail.
     pub fn insert(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> {
-        let batch_position = self.next_batch_position.fetch_add(1, Ordering::Relaxed);
-        self.insert_with_batch_position(batch, row_offset, batch_position)
+        self.insert_batch(batch, row_offset)
     }
 
-    /// Insert a batch with an explicit batch position. The position is part
-    /// of the visibility contract — see module docs.
+    /// Insert a batch. The explicit `batch_position` is accepted for API
+    /// compatibility but not used for placement: the tail assigns its own
+    /// dense, 0-based positions (required by the visibility filter).
     pub fn insert_with_batch_position(
         &self,
         batch: &RecordBatch,
         row_offset: u64,
-        batch_position: usize,
+        _batch_position: usize,
     ) -> Result<()> {
-        // Keep next_batch_position ahead of any explicit caller-supplied
-        // position so a later `insert` (no-position) doesn't collide.
-        let _ = self
-            .next_batch_position
-            .fetch_max(batch_position.saturating_add(1), Ordering::Relaxed);
+        self.insert_batch(batch, row_offset)
+    }
+
+    fn insert_batch(&self, batch: &RecordBatch, row_offset: u64) -> Result<()> {
+        let st = self.state.load_full();
+        let batch_position = st.tail.next_position();
 
         let Some(col_idx) = batch
             .schema()
             .column_with_name(&self.column_name)
             .map(|(idx, _)| idx)
         else {
-            // Column missing: nothing to index, but advance the snapshot so
-            // the caller's position counter stays in sync.
-            return self.publish_empty_batch(batch.num_rows() as u32, row_offset, batch_position);
+            // Column missing: nothing to index, but publish an empty batch so
+            // the tail's visibility counters keep up with the writer.
+            st.tail.append_batch(
+                batch_position,
+                row_offset,
+                batch.num_rows() as u32,
+                vec![0; batch.num_rows()],
+                0,
+                HashMap::new(),
+            );
+            return Ok(());
         };
 
         let column = batch.column(col_idx);
@@ -795,58 +946,41 @@ impl FtsMemIndex {
             }
         }
 
-        // Drop the tokenizer guard before any work that could be slow so we
-        // don't hold it across allocations.
+        // Drop the tokenizer guard before publishing so we don't hold it
+        // across the snapshot install.
         drop(tok_guard);
 
-        // Install per-term chunks first so any chunk a reader can possibly
-        // see (via a later snapshot store) is fully linked.
-        for (term, builder) in term_builders {
-            let chunk = builder.build(batch_position);
-            let entry = self
-                .terms
-                .get_or_insert_with(term, TermSlice::empty_arc_swap);
-            let cur = entry.value().load();
-            entry.value().store(cur.with_chunk_appended(chunk));
-        }
-
-        let new_meta = Arc::new(BatchMeta {
+        st.tail.append_batch(
             batch_position,
             row_offset,
+            batch.num_rows() as u32,
             doc_lengths,
-            rows: batch.num_rows() as u32,
-        });
-
-        self.publish_batch(new_meta, total_tokens);
-        Ok(())
-    }
+            total_tokens,
+            term_builders,
+        );
 
-    /// Publish a snapshot for an "empty" batch (e.g. column missing) so the
-    /// visibility counters keep up with the writer's batch_position stream.
-    fn publish_empty_batch(&self, rows: u32, row_offset: u64, batch_position: usize) -> Result<()> {
-        let meta = Arc::new(BatchMeta {
-            batch_position,
-            row_offset,
-            doc_lengths: vec![0; rows as usize],
-            rows,
-        });
-        self.publish_batch(meta, 0);
+        if st.tail.doc_count() >= self.freeze_threshold_rows as u64 {
+            self.freeze(&st);
+        }
         Ok(())
     }
 
-    fn publish_batch(&self, new_meta: Arc<BatchMeta>, batch_total_tokens: u64) {
-        let cur = self.snapshot.load();
-        let new_rows = new_meta.rows as u64;
-        let mut batches: Vec<Arc<BatchMeta>> = Vec::with_capacity(cur.batches.len() + 1);
-        batches.extend(cur.batches.iter().cloned());
-        batches.push(new_meta);
-        let new_snap = Snapshot {
-            visible_count: cur.visible_count + 1,
-            batches: Arc::from(batches.into_boxed_slice()),
-            cumulative_doc_count: cur.cumulative_doc_count + new_rows,
-            cumulative_total_tokens: cur.cumulative_total_tokens + batch_total_tokens,
+    /// Freeze the current tail into a new immutable partition and publish a
+    /// fresh empty tail. Only the writer calls this; readers snapshotting the
+    /// old `IndexState` keep a consistent view across the freeze.
+    fn freeze(&self, st: &IndexState) {
+        let Some(partition) = Partition::from_tail(&st.tail) else {
+            return;
         };
-        self.snapshot.store(Arc::new(new_snap));
+        let mut partitions: Vec<Arc<Partition>> = st.partitions.iter().cloned().collect();
+        partitions.push(Arc::new(partition));
+        if partitions.len() > Self::MAX_PARTITIONS {
+            partitions = vec![Arc::new(Partition::merge(&partitions))];
+        }
+        self.state.store(Arc::new(IndexState {
+            partitions: Arc::from(partitions.into_boxed_slice()),
+            tail: TailIndex::new(),
+        }));
     }
 
     // ------------------------------------------------------------------
@@ -859,145 +993,17 @@ impl FtsMemIndex {
     /// matching documents with BM25 scores. Result order is unspecified;
     /// use `search_with_options` for sorted/limited output.
     pub fn search(&self, term: &str) -> Vec<FtsEntry> {
-        let snap = self.snapshot.load_full();
-        self.search_with_snapshot(term, &snap)
-    }
-
-    fn search_with_snapshot(&self, term: &str, snap: &Arc<Snapshot>) -> Vec<FtsEntry> {
-        if snap.visible_count == 0 {
-            return Vec::new();
-        }
+        let st = self.state.load_full();
         let tokens = self.tokenize_for_search(term);
-        score_terms(snap, &self.terms, &tokens)
+        self.search_match(&st, &tokens, None)
     }
 
     /// Search for documents containing an exact phrase, optionally allowing
     /// `slop` intervening tokens between consecutive query tokens.
     pub fn search_phrase(&self, phrase: &str, slop: u32) -> Vec<FtsEntry> {
-        let snap = self.snapshot.load_full();
-        self.search_phrase_with_snapshot(phrase, slop, &snap)
-    }
-
-    fn search_phrase_with_snapshot(
-        &self,
-        phrase: &str,
-        slop: u32,
-        snap: &Arc<Snapshot>,
-    ) -> Vec<FtsEntry> {
-        if snap.visible_count == 0 {
-            return Vec::new();
-        }
-
+        let st = self.state.load_full();
         let tokens = self.tokenize_for_search(phrase);
-        if tokens.is_empty() {
-            return Vec::new();
-        }
-        if tokens.len() == 1 {
-            // Same shortcut as the previous implementation: a single-token
-            // phrase reduces to a regular term search.
-            return score_terms(snap, &self.terms, &tokens);
-        }
-
-        // Gather visible chunks per token.
-        let mut per_token_chunks: Vec<Vec<Arc<TermChunk>>> = Vec::with_capacity(tokens.len());
-        for tok in &tokens {
-            match self.terms.get(tok.as_str()) {
-                Some(entry) => {
-                    let slice = entry.value().load_full();
-                    let visible: Vec<Arc<TermChunk>> = slice
-                        .chunks
-                        .iter()
-                        .filter(|c| c.batch_position < snap.visible_count)
-                        .cloned()
-                        .collect();
-                    if visible.is_empty() {
-                        return Vec::new();
-                    }
-                    per_token_chunks.push(visible);
-                }
-                None => return Vec::new(),
-            }
-        }
-
-        // Per-term `df` for IDF.
-        let dfs: Vec<u32> = per_token_chunks
-            .iter()
-            .map(|chunks| chunks.iter().map(|c| c.doc_count() as u32).sum::<u32>())
-            .collect();
-
-        let n = snap.cumulative_doc_count as f32;
-        let avgdl = if n > 0.0 {
-            snap.cumulative_total_tokens as f32 / n
-        } else {
-            1.0
-        };
-
-        // Find candidate documents: rows that appear in every token's
-        // posting set. Start with the smallest token's docs to bound work.
-        let smallest_idx = (0..per_token_chunks.len())
-            .min_by_key(|&i| {
-                per_token_chunks[i]
-                    .iter()
-                    .map(|c| c.doc_count())
-                    .sum::<usize>()
-            })
-            .unwrap();
-        let candidate_chunks = &per_token_chunks[smallest_idx];
-
-        let mut results = Vec::new();
-        for chunk in candidate_chunks {
-            for (doc_idx, &row_position) in chunk.row_positions.iter().enumerate() {
-                let pos = chunk
-                    .positions
-                    .as_ref()
-                    .map(|p| p.doc_positions(doc_idx).to_vec());
-                let Some(pos) = pos else { continue };
-                let mut all_positions: Vec<Vec<u32>> = vec![Vec::new(); tokens.len()];
-                all_positions[smallest_idx] = pos;
-                let mut all_present = true;
-                let mut frequencies = vec![0u32; tokens.len()];
-                frequencies[smallest_idx] = chunk.frequencies[doc_idx];
-                for (ti, chunks) in per_token_chunks.iter().enumerate() {
-                    if ti == smallest_idx {
-                        continue;
-                    }
-                    match find_doc_in_chunks(chunks, row_position) {
-                        Some((c, doc_idx_other)) => {
-                            frequencies[ti] = c.frequencies[doc_idx_other];
-                            all_positions[ti] = c
-                                .positions
-                                .as_ref()
-                                .map(|p| p.doc_positions(doc_idx_other).to_vec())
-                                .unwrap_or_default();
-                        }
-                        None => {
-                            all_present = false;
-                            break;
-                        }
-                    }
-                }
-                if !all_present {
-                    continue;
-                }
-                if !phrase_matches(&all_positions, slop) {
-                    continue;
-                }
-
-                let dl = lookup_dl(snap, row_position).unwrap_or(1) as f32;
-                let mut score = 0.0f32;
-                for (ti, _tok) in tokens.iter().enumerate() {
-                    let tf = frequencies[ti] as f32;
-                    let df = dfs[ti] as f32;
-                    score += bm25_term_score(tf, df.max(1.0), n, dl, avgdl);
-                }
-                results.push(FtsEntry {
-                    row_position,
-                    score,
-                });
-            }
-        }
-
-        results
+        self.search_phrase_tokens(&st, &tokens, slop)
     }
 
     /// Expand a term to fuzzy matches within the specified edit distance.
@@ -1009,41 +1015,8 @@ impl FtsMemIndex {
         max_distance: u32,
         max_expansions: usize,
     ) -> Vec<(String, u32)> {
-        let snap = self.snapshot.load_full();
-        self.expand_fuzzy_with_snapshot(term, max_distance, max_expansions, &snap)
-    }
-
-    fn expand_fuzzy_with_snapshot(
-        &self,
-        term: &str,
-        max_distance: u32,
-        max_expansions: usize,
-        snap: &Arc<Snapshot>,
-    ) -> Vec<(String, u32)> {
-        let mut matches: Vec<(String, u32)> = Vec::new();
-
-        if max_distance == 0 {
-            if let Some(entry) = self.terms.get(term)
-                && has_visible_chunk(&entry.value().load(), snap.visible_count)
-            {
-                matches.push((term.to_string(), 0));
-            }
-            return matches;
-        }
-
-        for entry in self.terms.iter() {
-            let key: &Arc<str> = entry.key();
-            if !has_visible_chunk(&entry.value().load(), snap.visible_count) {
-                continue;
-            }
-            let dist = levenshtein_distance(term, key);
-            if dist <= max_distance {
-                matches.push((key.to_string(), dist));
-            }
-        }
-        matches.sort_by_key(|(_, d)| *d);
-        matches.truncate(max_expansions);
-        matches
+        let st = self.state.load_full();
+        self.expand_fuzzy_term(&st, term, max_distance, max_expansions)
     }
 
     /// Search for documents using fuzzy matching on each query token.
@@ -1053,65 +1026,178 @@ impl FtsMemIndex {
         fuzziness: Option<u32>,
         max_expansions: usize,
     ) -> Vec<FtsEntry> {
-        let snap = self.snapshot.load_full();
-        self.search_fuzzy_with_snapshot(query, fuzziness, max_expansions, &snap)
+        let st = self.state.load_full();
+        let tokens = self.tokenize_for_search(query);
+        self.search_fuzzy_tokens(&st, &tokens, fuzziness, max_expansions)
     }
 
-    fn search_fuzzy_with_snapshot(
+    /// BM25 OR-search over the query tokens: WAND each immutable partition,
+    /// scan the tail, all scored with one corpus-wide [`MemBM25Scorer`], and
+    /// the results merged. Each doc lives in exactly one partition or the
+    /// tail, so the merge is a plain concatenation.
+    fn search_match(
         &self,
-        query: &str,
-        fuzziness: Option<u32>,
-        max_expansions: usize,
-        snap: &Arc<Snapshot>,
+        st: &IndexState,
+        tokens: &[String],
+        limit: Option<usize>,
     ) -> Vec<FtsEntry> {
-        if snap.visible_count == 0 {
+        if tokens.is_empty() {
             return Vec::new();
         }
+        // Snapshot the tail once so the scorer's stats and the scanned tail
+        // postings are from the same visibility point.
+        let tail_snap = st.tail.snapshot();
+        let scorer = build_scorer(st, &tail_snap, tokens);
+        if scorer.num_docs() == 0 {
+            return Vec::new();
+        }
+        let mut results = Vec::new();
+        for p in st.partitions.iter() {
+            match p.search_match(tokens, Operator::Or, &scorer, limit) {
+                Ok(hits) => results.extend(hits),
+                Err(e) => log::warn!("FTS partition match search failed: {e}"),
+            }
+        }
+        if tail_snap.visible_count > 0 {
+            results.extend(score_terms(&tail_snap, &st.tail.terms, tokens, &scorer));
+        }
+        results
+    }
 
-        let tokens = self.tokenize_for_search(query);
+    fn search_phrase_tokens(&self, st: &IndexState, tokens: &[String], slop: u32) -> Vec<FtsEntry> {
         if tokens.is_empty() {
             return Vec::new();
         }
+        if tokens.len() == 1 {
+            // A single-token phrase reduces to a regular term search.
+            return self.search_match(st, tokens, None);
+        }
+        let tail_snap = st.tail.snapshot();
+        let scorer = build_scorer(st, &tail_snap, tokens);
+        if scorer.num_docs() == 0 {
+            return Vec::new();
+        }
+        let mut results = Vec::new();
+        for p in st.partitions.iter() {
+            match p.search_phrase(tokens, slop, &scorer) {
+                Ok(hits) => results.extend(hits),
+                Err(e) => log::warn!("FTS partition phrase search failed: {e}"),
+            }
+        }
+        if tail_snap.visible_count > 0 {
+            results.extend(phrase_search_tail(
+                &tail_snap,
+                &st.tail.terms,
+                tokens,
+                slop,
+                &scorer,
+            ));
+        }
+        results
+    }
 
+    fn search_fuzzy_tokens(
+        &self,
+        st: &IndexState,
+        tokens: &[String],
+        fuzziness: Option<u32>,
+        max_expansions: usize,
+    ) -> Vec<FtsEntry> {
+        if tokens.is_empty() {
+            return Vec::new();
+        }
         let mut expanded: Vec<String> = Vec::new();
-        for tok in &tokens {
+        let mut seen: HashSet<String> = HashSet::new();
+        for tok in tokens {
             let max_dist = fuzziness.unwrap_or_else(|| auto_fuzziness(tok));
-            for (matched, _) in self.expand_fuzzy_with_snapshot(tok, max_dist, max_expansions, snap)
-            {
-                expanded.push(matched);
+            for (matched, _) in self.expand_fuzzy_term(st, tok, max_dist, max_expansions) {
+                if seen.insert(matched.clone()) {
+                    expanded.push(matched);
+                }
             }
         }
         if expanded.is_empty() {
             return Vec::new();
         }
+        self.search_match(st, &expanded, None)
+    }
 
-        score_terms(snap, &self.terms, &expanded)
+    /// Expand `term` against the term dictionaries of every partition and the
+    /// visible tail.
+    fn expand_fuzzy_term(
+        &self,
+        st: &IndexState,
+        term: &str,
+        max_distance: u32,
+        max_expansions: usize,
+    ) -> Vec<(String, u32)> {
+        let tail_snap = st.tail.snapshot();
+        if max_distance == 0 {
+            let in_tail = st
+                .tail
+                .terms
+                .get(term)
+                .map(|e| has_visible_chunk(&e.value().load(), tail_snap.visible_count))
+                .unwrap_or(false);
+            let in_partition = st.partitions.iter().any(|p| p.contains_token(term));
+            return if in_tail || in_partition {
+                vec![(term.to_string(), 0)]
+            } else {
+                Vec::new()
+            };
+        }
+        let mut matches: Vec<(String, u32)> = Vec::new();
+        let mut seen: HashSet<String> = HashSet::new();
+        for entry in st.tail.terms.iter() {
+            if !has_visible_chunk(&entry.value().load(), tail_snap.visible_count) {
+                continue;
+            }
+            let key: &Arc<str> = entry.key();
+            let dist = levenshtein_distance(term, key);
+            if dist <= max_distance && seen.insert(key.to_string()) {
+                matches.push((key.to_string(), dist));
+            }
+        }
+        for p in st.partitions.iter() {
+            for key in p.tokens() {
+                let dist = levenshtein_distance(term, key);
+                if dist <= max_distance && seen.insert(key.to_string()) {
+                    matches.push((key.to_string(), dist));
+                }
+            }
+        }
+        matches.sort_by_key(|(_, d)| *d);
+        matches.truncate(max_expansions);
+        matches
     }
 
     /// Execute a query expression and return matching documents with scores.
     ///
     /// Snapshots the index state once at entry so the entire compound
     /// query — including every leaf invoked recursively from `Boolean` /
-    /// `Boost` — sees the same `Snapshot`. This preserves the per-batch
-    /// monotonic visibility contract for compound queries.
+    /// `Boost` — sees the same `{partitions, tail}` view. This preserves the
+    /// per-batch monotonic visibility contract for compound queries.
     pub fn search_query(&self, query: &FtsQueryExpr) -> Vec<FtsEntry> {
-        let snap = self.snapshot.load_full();
-        self.search_query_with_snapshot(query, &snap)
+        let st = self.state.load_full();
+        self.search_query_with_state(query, &st, None)
     }
 
-    fn search_query_with_snapshot(
+    fn search_query_with_state(
         &self,
         query: &FtsQueryExpr,
-        snap: &Arc<Snapshot>,
+        st: &IndexState,
+        limit: Option<usize>,
     ) -> Vec<FtsEntry> {
         match query {
             FtsQueryExpr::Match { query, boost } => {
-                let mut results = self.search_with_snapshot(query, snap);
+                let tokens = self.tokenize_for_search(query);
+                let mut results = self.search_match(st, &tokens, limit);
                 apply_boost(&mut results, *boost);
                 results
             }
             FtsQueryExpr::Phrase { query, slop, boost } => {
-                let mut results = self.search_phrase_with_snapshot(query, *slop, snap);
+                let tokens = self.tokenize_for_search(query);
+                let mut results = self.search_phrase_tokens(st, &tokens, *slop);
                 apply_boost(&mut results, *boost);
                 results
             }
@@ -1121,8 +1207,9 @@ impl FtsMemIndex {
                 max_expansions,
                 boost,
             } => {
+                let tokens = self.tokenize_for_search(query);
                 let mut results =
-                    self.search_fuzzy_with_snapshot(query, *fuzziness, *max_expansions, snap);
+                    self.search_fuzzy_tokens(st, &tokens, *fuzziness, *max_expansions);
                 apply_boost(&mut results, *boost);
                 results
             }
@@ -1130,12 +1217,12 @@ impl FtsMemIndex {
                 must,
                 should,
                 must_not,
-            } => self.search_boolean(must, should, must_not, snap),
+            } => self.search_boolean(must, should, must_not, st),
             FtsQueryExpr::Boost {
                 positive,
                 negative,
                 negative_boost,
-            } => self.search_boost(positive, negative.as_deref(), *negative_boost, snap),
+            } => self.search_boost(positive, negative.as_deref(), *negative_boost, st),
         }
     }
 
@@ -1145,8 +1232,8 @@ impl FtsMemIndex {
         query: &FtsQueryExpr,
         options: SearchOptions,
     ) -> Vec<FtsEntry> {
-        let snap = self.snapshot.load_full();
-        let mut results = self.search_query_with_snapshot(query, &snap);
+        let st = self.state.load_full();
+        let mut results = self.search_query_with_state(query, &st, options.limit);
         results.sort_by(|a, b| {
             b.score
                 .partial_cmp(&a.score)
@@ -1175,13 +1262,13 @@ impl FtsMemIndex {
         positive: &FtsQueryExpr,
         negative: Option<&FtsQueryExpr>,
         negative_boost: f32,
-        snap: &Arc<Snapshot>,
+        st: &IndexState,
     ) -> Vec<FtsEntry> {
-        let mut results = self.search_query_with_snapshot(positive, snap);
+        let mut results = self.search_query_with_state(positive, st, None);
         let Some(neg) = negative else {
             return results;
         };
-        let negative_results = self.search_query_with_snapshot(neg, snap);
+        let negative_results = self.search_query_with_state(neg, st, None);
         let negative_set: HashSet<RowPosition> = negative_results
             .into_iter()
             .map(|e| e.row_position)
@@ -1199,30 +1286,30 @@ impl FtsMemIndex {
         must: &[FtsQueryExpr],
         should: &[FtsQueryExpr],
         must_not: &[FtsQueryExpr],
-        snap: &Arc<Snapshot>,
+        st: &IndexState,
     ) -> Vec<FtsEntry> {
         let excluded: HashSet<RowPosition> = must_not
             .iter()
-            .flat_map(|q| self.search_query_with_snapshot(q, snap))
+            .flat_map(|q| self.search_query_with_state(q, st, None))
             .map(|e| e.row_position)
             .collect();
 
         let mut result_map: HashMap<RowPosition, f32> = if must.is_empty() {
             let mut map: HashMap<RowPosition, f32> = HashMap::new();
             for q in should {
-                for entry in self.search_query_with_snapshot(q, snap) {
+                for entry in self.search_query_with_state(q, st, None) {
                     *map.entry(entry.row_position).or_default() += entry.score;
                 }
             }
             map
         } else {
-            let first_results = self.search_query_with_snapshot(&must[0], snap);
+            let first_results = self.search_query_with_state(&must[0], st, None);
             let mut map: HashMap<RowPosition, f32> = first_results
                 .into_iter()
                 .map(|e| (e.row_position, e.score))
                 .collect();
             for q in must.iter().skip(1) {
-                let results = self.search_query_with_snapshot(q, snap);
+                let results = self.search_query_with_state(q, st, None);
                 let result_set: HashMap<RowPosition, f32> = results
                     .into_iter()
                     .map(|e| (e.row_position, e.score))
@@ -1233,7 +1320,7 @@ impl FtsMemIndex {
                     .collect();
             }
             for q in should {
-                for entry in self.search_query_with_snapshot(q, snap) {
+                for entry in self.search_query_with_state(q, st, None) {
                     if let Some(score) = map.get_mut(&entry.row_position) {
                         *score += entry.score;
                     }
@@ -1281,13 +1368,28 @@ impl FtsMemIndex {
         partition_id: u64,
         total_rows: usize,
     ) -> Result<lance_index::scalar::inverted::builder::InnerBuilder> {
+        use lance_index::scalar::inverted::PostingListBuilder;
         use lance_index::scalar::inverted::builder::{InnerBuilder, PositionRecorder};
-        use lance_index::scalar::inverted::{DocSet, PostingListBuilder, TokenSet};
 
-        let snap = self.snapshot.load_full();
+        let st = self.state.load_full();
         let with_position = self.params.has_positions();
+        let total_rows_u64 = total_rows as u64;
 
-        if snap.visible_count == 0 {
+        // Step 1: collect (original_pos, num_tokens) for every doc across all
+        // immutable partitions and the visible tail.
+        let mut all_docs: Vec<(u64, u32)> = Vec::new();
+        for p in st.partitions.iter() {
+            for (row_pos, num_tokens) in p.docs.iter() {
+                all_docs.push((*row_pos, *num_tokens));
+            }
+        }
+        let tail_snap = st.tail.snapshot();
+        for batch in tail_snap.batches.iter().take(tail_snap.visible_count) {
+            for i in 0..batch.rows as usize {
+                all_docs.push((batch.row_offset + i as u64, batch.doc_lengths[i]));
+            }
+        }
+        if all_docs.is_empty() {
             return Ok(InnerBuilder::new(
                 partition_id,
                 with_position,
@@ -1295,81 +1397,88 @@ impl FtsMemIndex {
             ));
         }
 
-        let total_rows_u64 = total_rows as u64;
-
-        // Step 1: collect (reversed_pos, num_tokens) for every visible row.
-        // We walk visible BatchMetas in order so the reverse-sort below has
-        // a small, dense input.
-        let mut entries: Vec<(u64, u32)> = Vec::new();
-        for batch in snap.batches.iter().take(snap.visible_count) {
-            for i in 0..batch.rows as usize {
-                let original_pos = batch.row_offset + i as u64;
-                if original_pos >= total_rows_u64 {
-                    return Err(Error::io(format!(
-                        "FTS flush: row position {} >= total_rows {}",
-                        original_pos, total_rows
-                    )));
-                }
-                let reversed = total_rows_u64 - original_pos - 1;
-                entries.push((reversed, batch.doc_lengths[i]));
+        // Step 2: assign doc_ids in ascending reversed-position order, so the
+        // on-disk layout matches MemWAL's newest-first flush convention.
+        let mut entries: Vec<(u64, u64, u32)> = Vec::with_capacity(all_docs.len());
+        for (original, num_tokens) in &all_docs {
+            if *original >= total_rows_u64 {
+                return Err(Error::io(format!(
+                    "FTS flush: row position {} >= total_rows {}",
+                    original, total_rows
+                )));
             }
+            entries.push((total_rows_u64 - original - 1, *original, *num_tokens));
         }
-        entries.sort_by_key(|(rev, _)| *rev);
-
-        // Step 2: assign doc_ids in ascending reversed-pos order.
+        entries.sort_by_key(|(rev, _, _)| *rev);
         let mut docs = DocSet::default();
         let mut original_to_doc_id: HashMap<u64, u32> = HashMap::with_capacity(entries.len());
-        for (rev, num_tokens) in &entries {
+        for (rev, original, num_tokens) in &entries {
             let doc_id = docs.append(*rev, *num_tokens);
-            // Recover the original position from the reversed position so
-            // we can map per-term postings without a second pass.
-            let original = total_rows_u64 - rev - 1;
-            original_to_doc_id.insert(original, doc_id);
-        }
-
-        // Step 3: walk terms in alphabetic (skip-list) order and emit
-        // posting lists into the builder.
-        let mut tokens = TokenSet::default();
-        let mut posting_lists: Vec<PostingListBuilder> = Vec::new();
-        for entry in self.terms.iter() {
-            let term: &Arc<str> = entry.key();
+            original_to_doc_id.insert(*original, doc_id);
+        }
+
+        // Step 3: merge per-term postings across every partition and the tail.
+        let mut term_postings: HashMap<String, Vec<(u32, u32, Option<Vec<u32>>)>> = HashMap::new();
+        for p in st.partitions.iter() {
+            for (token, posting) in p.tokens_with_postings() {
+                let plain = posting_as_plain(posting);
+                let bucket = term_postings.entry(token.to_string()).or_default();
+                for i in 0..plain.len() {
+                    let row_pos = p.docs.row_id(plain.row_ids[i] as u32);
+                    let Some(&doc_id) = original_to_doc_id.get(&row_pos) else {
+                        continue;
+                    };
+                    let pos = if with_position {
+                        Some(read_positions(plain, i))
+                    } else {
+                        None
+                    };
+                    bucket.push((doc_id, plain.frequencies[i] as u32, pos));
+                }
+            }
+        }
+        for entry in st.tail.terms.iter() {
+            let token: &Arc<str> = entry.key();
             let slice = entry.value().load();
-
-            // Collect per-doc postings filtered by visibility.
-            let mut docs_for_term: Vec<(u32, u32, Option<Vec<u32>>)> = Vec::new();
+            let bucket = term_postings.entry(token.to_string()).or_default();
             for chunk in &slice.chunks {
-                if chunk.batch_position >= snap.visible_count {
+                if chunk.batch_position >= tail_snap.visible_count {
                     continue;
                 }
                 for (i, row_position) in chunk.row_positions.iter().enumerate() {
                     let Some(&doc_id) = original_to_doc_id.get(row_position) else {
-                        // Should not happen: a chunk's batch_position is
-                        // visible only if its meta is visible, and the meta
-                        // contributed every row above.
                         continue;
                     };
                     let pos = if with_position {
-                        chunk
-                            .positions
-                            .as_ref()
-                            .map(|p| p.doc_positions(i).to_vec())
+                        Some(
+                            chunk
+                                .positions
+                                .as_ref()
+                                .map(|p| p.doc_positions(i).to_vec())
+                                .unwrap_or_default(),
+                        )
                     } else {
                         None
                     };
-                    docs_for_term.push((doc_id, chunk.frequencies[i], pos));
+                    bucket.push((doc_id, chunk.frequencies[i], pos));
                 }
             }
+        }
+
+        // Step 4: emit posting lists; tokens in sorted order for determinism.
+        let mut sorted_tokens: Vec<String> = term_postings.keys().cloned().collect();
+        sorted_tokens.sort_unstable();
+        let mut tokens = TokenSet::default();
+        let mut posting_lists: Vec<PostingListBuilder> = Vec::with_capacity(sorted_tokens.len());
+        for token in sorted_tokens {
+            let mut docs_for_term = term_postings.remove(&token).unwrap_or_default();
             if docs_for_term.is_empty() {
                 continue;
             }
             docs_for_term.sort_by_key(|(doc_id, _, _)| *doc_id);
-
-            let token_id = tokens.add(term.to_string()) as usize;
-            // PostingListBuilder vector is indexed by token_id. New token
-            // ids are assigned monotonically by `add`, so we just push.
+            let token_id = tokens.add(token) as usize;
             debug_assert_eq!(token_id, posting_lists.len());
             posting_lists.push(PostingListBuilder::new(with_position));
-
             let plb = &mut posting_lists[token_id];
             for (doc_id, freq, pos) in docs_for_term {
                 let recorder = if with_position {
@@ -1503,48 +1612,69 @@ fn find_doc_in_chunks(
     None
 }
 
-const BM25_K1: f32 = 1.2;
-const BM25_B: f32 = 0.75;
+/// Build a corpus-wide BM25 scorer over every partition and the tail, for the
+/// (deduplicated) set of query tokens. A single scorer makes partition-WAND
+/// scores and tail-scan scores directly comparable. `tail_snap` must be the
+/// *same* tail snapshot the caller scans, so the scorer's stats and the
+/// scanned postings stay mutually consistent.
+fn build_scorer(st: &IndexState, tail_snap: &Snapshot, tokens: &[String]) -> MemBM25Scorer {
+    let mut total_tokens = tail_snap.cumulative_total_tokens;
+    let mut num_docs = tail_snap.cumulative_doc_count as usize;
+    for p in st.partitions.iter() {
+        total_tokens += p.total_tokens();
+        num_docs += p.doc_count();
+    }
+    let mut token_docs: HashMap<String, usize> = HashMap::new();
+    for token in tokens {
+        if token_docs.contains_key(token) {
+            continue;
+        }
+        let mut df = tail_token_df(&st.tail.terms, token, tail_snap.visible_count);
+        for p in st.partitions.iter() {
+            df += p.token_df(token);
+        }
+        token_docs.insert(token.clone(), df);
+    }
+    MemBM25Scorer::new(total_tokens, num_docs, token_docs)
+}
 
-fn bm25_term_score(tf: f32, df: f32, n: f32, dl: f32, avgdl: f32) -> f32 {
-    let idf = ((n - df + 0.5) / (df + 0.5) + 1.0).ln();
-    let numerator = tf * (BM25_K1 + 1.0);
-    let denominator = tf + BM25_K1 * (1.0 - BM25_B + BM25_B * (dl / avgdl));
-    idf * (numerator / denominator)
+/// Number of visible tail docs containing `token`.
+fn tail_token_df(
+    terms: &SkipMap<Arc<str>, ArcSwap<TermSlice>>,
+    token: &str,
+    visible_count: usize,
+) -> usize {
+    match terms.get(token) {
+        Some(e) => e
+            .value()
+            .load()
+            .chunks
+            .iter()
+            .filter(|c| c.batch_position < visible_count)
+            .map(|c| c.doc_count())
+            .sum(),
+        None => 0,
+    }
 }
 
-/// Score a list of query tokens against the visible state. Each token
-/// contributes its BM25 term score to every document containing it.
+/// OR-score `tokens` against the visible tail, summing each token's BM25
+/// contribution per document. Uses the shared corpus-wide `scorer`.
 fn score_terms(
     snap: &Snapshot,
     terms: &SkipMap<Arc<str>, ArcSwap<TermSlice>>,
     tokens: &[String],
+    scorer: &MemBM25Scorer,
 ) -> Vec<FtsEntry> {
-    let n = snap.cumulative_doc_count as f32;
-    let avgdl = if n > 0.0 {
-        snap.cumulative_total_tokens as f32 / n
-    } else {
-        1.0
-    };
-
     let mut doc_scores: HashMap<RowPosition, f32> = HashMap::new();
-
     for token in tokens {
         let Some(entry) = terms.get(token.as_str()) else {
             continue;
         };
-        let slice = entry.value().load_full();
-        // df = total visible postings for this term.
-        let df: u32 = slice
-            .chunks
-            .iter()
-            .filter(|c| c.batch_position < snap.visible_count)
-            .map(|c| c.doc_count() as u32)
-            .sum();
-        if df == 0 {
+        let qw = scorer.query_weight(token);
+        if qw == 0.0 {
             continue;
         }
-        let df_f = df as f32;
+        let slice = entry.value().load_full();
         for chunk in &slice.chunks {
             if chunk.batch_position >= snap.visible_count {
                 continue;
@@ -1553,14 +1683,12 @@ fn score_terms(
                 continue;
             };
             for (i, &row_position) in chunk.row_positions.iter().enumerate() {
-                let dl = meta.dl(row_position).unwrap_or(1) as f32;
-                let tf = chunk.frequencies[i] as f32;
-                let score = bm25_term_score(tf, df_f, n, dl, avgdl);
+                let dl = meta.dl(row_position).unwrap_or(1);
+                let score = qw * scorer.doc_weight(chunk.frequencies[i], dl);
                 *doc_scores.entry(row_position).or_default() += score;
             }
         }
     }
-
     doc_scores
         .into_iter()
         .map(|(row_position, score)| FtsEntry {
@@ -1570,6 +1698,97 @@ fn score_terms(
         .collect()
 }
 
+/// Phrase-search the visible tail. Callers shortcut the single-token case, so
+/// `tokens.len() >= 2` here. Scored with the shared corpus-wide `scorer`.
+fn phrase_search_tail(
+    snap: &Snapshot,
+    terms: &SkipMap<Arc<str>, ArcSwap<TermSlice>>,
+    tokens: &[String],
+    slop: u32,
+    scorer: &MemBM25Scorer,
+) -> Vec<FtsEntry> {
+    // Gather visible chunks per query token; any missing token => no match.
+    let mut per_token_chunks: Vec<Vec<Arc<TermChunk>>> = Vec::with_capacity(tokens.len());
+    for tok in tokens {
+        match terms.get(tok.as_str()) {
+            Some(entry) => {
+                let slice = entry.value().load_full();
+                let visible: Vec<Arc<TermChunk>> = slice
+                    .chunks
+                    .iter()
+                    .filter(|c| c.batch_position < snap.visible_count)
+                    .cloned()
+                    .collect();
+                if visible.is_empty() {
+                    return Vec::new();
+                }
+                per_token_chunks.push(visible);
+            }
+            None => return Vec::new(),
+        }
+    }
+    // Drive the intersection from the rarest token to bound work.
+    let smallest_idx = (0..per_token_chunks.len())
+        .min_by_key(|&i| {
+            per_token_chunks[i]
+                .iter()
+                .map(|c| c.doc_count())
+                .sum::<usize>()
+        })
+        .unwrap();
+
+    let mut results = Vec::new();
+    for chunk in &per_token_chunks[smallest_idx] {
+        for (doc_idx, &row_position) in chunk.row_positions.iter().enumerate() {
+            let Some(pos) = chunk
+                .positions
+                .as_ref()
+                .map(|p| p.doc_positions(doc_idx).to_vec())
+            else {
+                continue;
+            };
+            let mut all_positions: Vec<Vec<u32>> = vec![Vec::new(); tokens.len()];
+            all_positions[smallest_idx] = pos;
+            let mut frequencies = vec![0u32; tokens.len()];
+            frequencies[smallest_idx] = chunk.frequencies[doc_idx];
+            let mut all_present = true;
+            for (ti, chunks) in per_token_chunks.iter().enumerate() {
+                if ti == smallest_idx {
+                    continue;
+                }
+                match find_doc_in_chunks(chunks, row_position) {
+                    Some((c, other_idx)) => {
+                        frequencies[ti] = c.frequencies[other_idx];
+                        all_positions[ti] = c
+                            .positions
+                            .as_ref()
+                            .map(|p| p.doc_positions(other_idx).to_vec())
+                            .unwrap_or_default();
+                    }
+                    None => {
+                        all_present = false;
+                        break;
+                    }
+                }
+            }
+            if !all_present || !phrase_matches(&all_positions, slop) {
+                continue;
+            }
+            let dl = lookup_dl(snap, row_position).unwrap_or(1);
+            let score: f32 = tokens
+                .iter()
+                .enumerate()
+                .map(|(ti, tok)| scorer.query_weight(tok) * scorer.doc_weight(frequencies[ti], dl))
+                .sum();
+            results.push(FtsEntry {
+                row_position,
+                score,
+            });
+        }
+    }
+    results
+}
+
 fn phrase_matches(positions: &[Vec<u32>], slop: u32) -> bool {
     if positions.is_empty() {
         return false;
@@ -1647,37 +1866,23 @@ impl FtsIndexConfig {
 }
 
 // ============================================================================
-// Partition-structured index (redesign — see
-// ~/ai/analysis/lance/jack-MemTableFTSBetter/fts-segment-redesign/DESIGN.md)
-//
-// An immutable `Partition` holds a frozen slice of the MemTable's inserts as
-// on-disk-shaped posting lists, queried with the shared block-max WAND
-// (`wand_search`). The current per-batch-chunk layout makes queries
-// O(corpus); partitions make them ≈ O(matches) and flush 1:1 to a Lance FTS
-// on-disk partition. Wired into `FtsMemIndex` in a later stage; standalone
-// and dead-code-allowed until then.
+// Immutable partition
 // ============================================================================
 
-use arrow_array::builder::{Int32Builder, ListBuilder};
-use arrow_buffer::ScalarBuffer;
-use lance_index::scalar::inverted::query::{FtsSearchParams, Operator};
-use lance_index::scalar::inverted::{
-    DocCandidate, DocSet, MemBM25Scorer, PlainPostingList, PostingList, TokenSet, WandTerm,
-    wand_search,
-};
-
-/// An immutable, frozen FTS partition.
-#[allow(dead_code)]
+/// An immutable, frozen FTS partition: a slice of the MemTable's inserts held
+/// as on-disk-shaped posting lists and queried with block-max WAND in
+/// ≈O(matches). Built by freezing the tail; flushed 1:1 into the Lance FTS
+/// on-disk format. See `DESIGN.md` in the redesign analysis directory.
 struct Partition {
-    /// token text -> local token id (dense, 0-based).
-    tokens: TokenSet,
-    /// posting list per local token id.
+    /// token text -> local token id (dense, 0-based; indexes `postings`).
+    token_ids: HashMap<Arc<str>, u32>,
+    /// posting list per local token id. In-memory partitions are always
+    /// `PostingList::Plain` and always carry positions.
     postings: Vec<PostingList>,
-    /// local doc id -> (row position, token count).
+    /// local doc id -> (MemTable row position, token count).
     docs: DocSet,
 }
 
-#[allow(dead_code)]
 impl Partition {
     fn doc_count(&self) -> usize {
         self.docs.len()
@@ -1687,125 +1892,310 @@ impl Partition {
         self.docs.total_tokens_num()
     }
 
+    fn entry_count(&self) -> usize {
+        self.postings
+            .iter()
+            .map(|p| posting_as_plain(p).len())
+            .sum()
+    }
+
+    fn contains_token(&self, token: &str) -> bool {
+        self.token_ids.contains_key(token)
+    }
+
+    fn tokens(&self) -> impl Iterator<Item = &Arc<str>> {
+        self.token_ids.keys()
+    }
+
+    fn tokens_with_postings(&self) -> impl Iterator<Item = (&Arc<str>, &PostingList)> {
+        self.token_ids
+            .iter()
+            .map(move |(t, &id)| (t, &self.postings[id as usize]))
+    }
+
     /// Number of docs in this partition containing `token`.
     fn token_df(&self, token: &str) -> usize {
-        self.tokens
+        self.token_ids
             .get(token)
-            .map(|id| self.postings[id as usize].len())
+            .map(|&id| posting_as_plain(&self.postings[id as usize]).len())
             .unwrap_or(0)
     }
 
-    /// WAND-search this partition. `query` is `(token, query-position)` pairs,
-    /// already tokenized. Returns this partition's local candidates (row
-    /// positions); the caller merges across partitions and the tail.
-    fn search(
+    fn memory_size(&self) -> usize {
+        let mut total = std::mem::size_of::<Self>();
+        for t in self.token_ids.keys() {
+            total += std::mem::size_of::<Arc<str>>() + t.len() + std::mem::size_of::<u32>() + 16;
+        }
+        for p in &self.postings {
+            let pl = posting_as_plain(p);
+            total += pl.row_ids.len() * std::mem::size_of::<u64>()
+                + pl.frequencies.len() * std::mem::size_of::<f32>()
+                + pl.positions
+                    .as_ref()
+                    .map(|a| a.get_array_memory_size())
+                    .unwrap_or(0);
+        }
+        total += self.docs.len() * (std::mem::size_of::<u64>() + std::mem::size_of::<u32>());
+        total
+    }
+
+    /// Freeze the visible contents of `tail` into a new partition. Returns
+    /// `None` if the tail has no visible docs.
+    fn from_tail(tail: &TailIndex) -> Option<Self> {
+        let snap = tail.snapshot();
+        if snap.visible_count == 0 {
+            return None;
+        }
+        // Assign dense local doc ids in row-position order.
+        let mut docs = DocSet::default();
+        let mut pos_to_doc: HashMap<u64, u32> = HashMap::new();
+        for batch in snap.batches.iter().take(snap.visible_count) {
+            for i in 0..batch.rows as usize {
+                let rp = batch.row_offset + i as u64;
+                let doc_id = docs.append(rp, batch.doc_lengths[i]);
+                pos_to_doc.insert(rp, doc_id);
+            }
+        }
+        let mut token_ids: HashMap<Arc<str>, u32> = HashMap::new();
+        let mut raw: Vec<Vec<(u32, u32, Vec<u32>)>> = Vec::new();
+        for entry in tail.terms.iter() {
+            let term: &Arc<str> = entry.key();
+            let slice = entry.value().load();
+            let mut docs_for_term: Vec<(u32, u32, Vec<u32>)> = Vec::new();
+            for chunk in &slice.chunks {
+                if chunk.batch_position >= snap.visible_count {
+                    continue;
+                }
+                for (i, rp) in chunk.row_positions.iter().enumerate() {
+                    let Some(&doc_id) = pos_to_doc.get(rp) else {
+                        continue;
+                    };
+                    let pos = chunk
+                        .positions
+                        .as_ref()
+                        .map(|p| p.doc_positions(i).to_vec())
+                        .unwrap_or_default();
+                    docs_for_term.push((doc_id, chunk.frequencies[i], pos));
+                }
+            }
+            if docs_for_term.is_empty() {
+                continue;
+            }
+            docs_for_term.sort_by_key(|(d, _, _)| *d);
+            let id = raw.len() as u32;
+            token_ids.insert(term.clone(), id);
+            raw.push(docs_for_term);
+        }
+        let postings = raw.into_iter().map(freeze_postings_one).collect();
+        Some(Self {
+            token_ids,
+            postings,
+            docs,
+        })
+    }
+
+    /// Merge several partitions into one. Local doc ids are reassigned by
+    /// concatenation, which keeps each merged per-token posting list sorted.
+    fn merge(parts: &[Arc<Self>]) -> Self {
+        let mut token_ids: HashMap<Arc<str>, u32> = HashMap::new();
+        let mut raw: Vec<Vec<(u32, u32, Vec<u32>)>> = Vec::new();
+        let mut docs = DocSet::default();
+        let mut doc_offset: u32 = 0;
+        for p in parts {
+            for (rp, nt) in p.docs.iter() {
+                docs.append(*rp, *nt);
+            }
+            for (token, &local_id) in &p.token_ids {
+                let merged_id = *token_ids.entry(token.clone()).or_insert_with(|| {
+                    let id = raw.len() as u32;
+                    raw.push(Vec::new());
+                    id
+                });
+                let plain = posting_as_plain(&p.postings[local_id as usize]);
+                for i in 0..plain.len() {
+                    raw[merged_id as usize].push((
+                        plain.row_ids[i] as u32 + doc_offset,
+                        plain.frequencies[i] as u32,
+                        read_positions(plain, i),
+                    ));
+                }
+            }
+            doc_offset += p.docs.len() as u32;
+        }
+        let postings = raw
+            .into_iter()
+            .map(|mut v| {
+                v.sort_by_key(|(d, _, _)| *d);
+                freeze_postings_one(v)
+            })
+            .collect();
+        Self {
+            token_ids,
+            postings,
+            docs,
+        }
+    }
+
+    /// WAND OR/AND-search; candidates scored with `scorer` and reported as
+    /// MemTable row positions.
+    fn search_match(
         &self,
-        query: &[(String, u32)],
+        tokens: &[String],
         operator: Operator,
-        scorer: MemBM25Scorer,
-        params: &FtsSearchParams,
-    ) -> Result<Vec<DocCandidate>> {
-        let mut terms: Vec<WandTerm> = Vec::with_capacity(query.len());
-        for (token, position) in query {
-            match self.tokens.get(token) {
-                Some(token_id) => terms.push((
-                    token.clone(),
-                    token_id,
-                    *position,
-                    1.0,
-                    self.postings[token_id as usize].clone(),
-                )),
-                // Term absent here: an AND query can't match in this
-                // partition; an OR query just drops the term.
+        scorer: &MemBM25Scorer,
+        limit: Option<usize>,
+    ) -> Result<Vec<FtsEntry>> {
+        let mut wand_terms: Vec<WandTerm> = Vec::new();
+        let mut tok_order: Vec<String> = Vec::new();
+        for (qpos, t) in tokens.iter().enumerate() {
+            match self.token_ids.get(t.as_str()) {
+                Some(&id) => {
+                    wand_terms.push((
+                        t.clone(),
+                        id,
+                        qpos as u32,
+                        1.0,
+                        self.postings[id as usize].clone(),
+                    ));
+                    tok_order.push(t.clone());
+                }
+                // Term absent: an AND query cannot match here; OR drops it.
                 None if operator == Operator::And => return Ok(Vec::new()),
                 None => {}
             }
         }
-        if terms.is_empty() {
+        if wand_terms.is_empty() {
             return Ok(Vec::new());
         }
-        // WAND reports the partition-local doc id in `DocCandidate.row_id`;
-        // remap it to the MemTable row position via the partition's DocSet.
-        let mut hits = wand_search(operator, terms, &self.docs, scorer, params)?;
-        for c in &mut hits {
-            c.row_id = self.docs.row_id(c.row_id as u32);
-        }
-        Ok(hits)
+        let params = FtsSearchParams::new().with_limit(limit);
+        let cands = wand_search(operator, wand_terms, &self.docs, scorer.clone(), &params)?;
+        Ok(cands
+            .into_iter()
+            .map(|c| {
+                let score: f32 = c
+                    .freqs
+                    .iter()
+                    .map(|(ti, f)| {
+                        scorer.query_weight(&tok_order[*ti as usize])
+                            * scorer.doc_weight(*f, c.doc_length)
+                    })
+                    .sum();
+                FtsEntry {
+                    row_position: self.docs.row_id(c.row_id as u32),
+                    score,
+                }
+            })
+            .collect())
     }
-}
 
-/// Accumulates documents and freezes them into an immutable `Partition`.
-#[allow(dead_code)]
-struct PartitionBuilder {
-    tokens: TokenSet,
-    /// per local token id: `(doc_local_id, freq, positions)` postings.
-    postings: Vec<Vec<(u32, u32, Vec<u32>)>>,
-    docs: DocSet,
-}
-
-#[allow(dead_code)]
-impl PartitionBuilder {
-    fn new() -> Self {
-        Self {
-            tokens: TokenSet::default(),
-            postings: Vec::new(),
-            docs: DocSet::default(),
+    /// Phrase-search: WAND-And finds docs containing every token, then token
+    /// positions are verified. `tokens.len() >= 2`.
+    fn search_phrase(
+        &self,
+        tokens: &[String],
+        slop: u32,
+        scorer: &MemBM25Scorer,
+    ) -> Result<Vec<FtsEntry>> {
+        let mut wand_terms: Vec<WandTerm> = Vec::new();
+        let mut token_id_order: Vec<u32> = Vec::new();
+        for (qpos, t) in tokens.iter().enumerate() {
+            match self.token_ids.get(t.as_str()) {
+                Some(&id) => {
+                    wand_terms.push((
+                        t.clone(),
+                        id,
+                        qpos as u32,
+                        1.0,
+                        self.postings[id as usize].clone(),
+                    ));
+                    token_id_order.push(id);
+                }
+                // A phrase needs every token present in this partition.
+                None => return Ok(Vec::new()),
+            }
         }
+        let params = FtsSearchParams::new();
+        let cands = wand_search(
+            Operator::And,
+            wand_terms,
+            &self.docs,
+            scorer.clone(),
+            &params,
+        )?;
+        let mut results = Vec::new();
+        for c in cands {
+            let local_doc = c.row_id; // partition-local doc id
+            let mut all_positions: Vec<Vec<u32>> = Vec::with_capacity(tokens.len());
+            let mut present = true;
+            for &tid in &token_id_order {
+                let plain = posting_as_plain(&self.postings[tid as usize]);
+                match plain.row_ids.binary_search(&local_doc) {
+                    Ok(idx) => all_positions.push(read_positions(plain, idx)),
+                    Err(_) => {
+                        present = false;
+                        break;
+                    }
+                }
+            }
+            if !present || !phrase_matches(&all_positions, slop) {
+                continue;
+            }
+            let score: f32 = c
+                .freqs
+                .iter()
+                .map(|(ti, f)| {
+                    scorer.query_weight(&tokens[*ti as usize]) * scorer.doc_weight(*f, c.doc_length)
+                })
+                .sum();
+            results.push(FtsEntry {
+                row_position: self.docs.row_id(local_doc as u32),
+                score,
+            });
+        }
+        Ok(results)
     }
+}
 
-    fn doc_count(&self) -> usize {
-        self.docs.len()
-    }
-
-    fn is_empty(&self) -> bool {
-        self.docs.len() == 0
-    }
+/// Convert per-token `(doc_id, freq, positions)` postings — sorted by doc id —
+/// into an on-disk-shaped `PlainPostingList`. `max_score` is left `None`;
+/// WAND derives per-term bounds from the posting list at query time.
+fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>) -> PostingList {
+    let row_ids: Vec<u64> = docs.iter().map(|(d, _, _)| *d as u64).collect();
+    let freqs: Vec<f32> = docs.iter().map(|(_, f, _)| *f as f32).collect();
+    let mut positions = ListBuilder::new(Int32Builder::new());
+    for (_, _, doc_positions) in &docs {
+        for p in doc_positions {
+            positions.values().append_value(*p as i32);
+        }
+        positions.append(true);
+    }
+    PostingList::Plain(PlainPostingList::new(
+        ScalarBuffer::from(row_ids),
+        ScalarBuffer::from(freqs),
+        None,
+        Some(positions.finish()),
+    ))
+}
 
-    /// Add one document: `terms` is `(token, freq, positions)` per distinct
-    /// token, `num_tokens` is the document length.
-    fn add_doc(&mut self, row_position: u64, num_tokens: u32, terms: Vec<(String, u32, Vec<u32>)>) {
-        let doc_id = self.docs.append(row_position, num_tokens);
-        for (token, freq, positions) in terms {
-            let token_id = self.tokens.add(token) as usize;
-            if token_id >= self.postings.len() {
-                self.postings.resize_with(token_id + 1, Vec::new);
-            }
-            self.postings[token_id].push((doc_id, freq, positions));
+/// In-memory FTS partitions only ever build `PostingList::Plain`.
+fn posting_as_plain(p: &PostingList) -> &PlainPostingList {
+    match p {
+        PostingList::Plain(pl) => pl,
+        PostingList::Compressed(_) => {
+            unreachable!("in-memory FTS partitions only build plain posting lists")
         }
     }
+}
 
-    /// Freeze into an immutable `Partition`. Each token's postings are sorted
-    /// by doc id and converted to a `PlainPostingList`; `max_score` is left
-    /// `None` so WAND derives the per-term upper bound from `df` at query
-    /// time (global BM25 stats are not known to a single partition).
-    fn build(self) -> Partition {
-        let postings: Vec<PostingList> = self
-            .postings
-            .into_iter()
-            .map(|mut docs| {
-                docs.sort_by_key(|(doc_id, _, _)| *doc_id);
-                let row_ids: Vec<u64> = docs.iter().map(|(d, _, _)| *d as u64).collect();
-                let freqs: Vec<f32> = docs.iter().map(|(_, f, _)| *f as f32).collect();
-                let mut positions = ListBuilder::new(Int32Builder::new());
-                for (_, _, doc_positions) in &docs {
-                    for p in doc_positions {
-                        positions.values().append_value(*p as i32);
-                    }
-                    positions.append(true);
-                }
-                PostingList::Plain(PlainPostingList::new(
-                    ScalarBuffer::from(row_ids),
-                    ScalarBuffer::from(freqs),
-                    None,
-                    Some(positions.finish()),
-                ))
-            })
-            .collect();
-        Partition {
-            tokens: self.tokens,
-            postings,
-            docs: self.docs,
-        }
+/// Read the position list for the `idx`-th posting as a `Vec<u32>`.
+fn read_positions(plain: &PlainPostingList, idx: usize) -> Vec<u32> {
+    match plain.positions(idx) {
+        Some(arr) => arr
+            .as_any()
+            .downcast_ref::<Int32Array>()
+            .map(|a| a.values().iter().map(|&v| v as u32).collect())
+            .unwrap_or_default(),
+        None => Vec::new(),
     }
 }
 
@@ -2821,82 +3211,235 @@ mod tests {
         assert!(err.to_string().contains("only supports"), "{err}");
     }
 
-    // ===== Partition redesign (Stage 2) =====
+    // ===== Partition-structured redesign =====
 
-    fn build_test_partition() -> Partition {
-        // 4 docs at row positions 100..104.
-        let mut b = PartitionBuilder::new();
-        b.add_doc(
-            100,
-            2,
-            vec![
-                ("apple".to_string(), 1, vec![0]),
-                ("banana".to_string(), 1, vec![1]),
-            ],
-        );
-        b.add_doc(
-            101,
-            2,
-            vec![
-                ("apple".to_string(), 1, vec![0]),
-                ("cherry".to_string(), 1, vec![1]),
-            ],
-        );
-        b.add_doc(102, 1, vec![("banana".to_string(), 1, vec![0])]);
-        b.add_doc(
-            103,
-            3,
-            vec![
-                ("apple".to_string(), 2, vec![0, 1]),
-                ("date".to_string(), 1, vec![2]),
-            ],
-        );
-        b.build()
+    /// Sorted row positions of a result set.
+    fn rows(mut entries: Vec<FtsEntry>) -> Vec<u64> {
+        entries.sort_by_key(|e| e.row_position);
+        entries.into_iter().map(|e| e.row_position).collect()
     }
 
-    fn search_partition(p: &Partition, token: &str) -> Vec<u64> {
-        let mut token_docs = std::collections::HashMap::new();
-        token_docs.insert(token.to_string(), p.token_df(token));
-        let scorer = MemBM25Scorer::new(p.total_tokens(), p.doc_count(), token_docs);
-        let params = FtsSearchParams::default().with_limit(Some(10));
-        let hits = p
-            .search(&[(token.to_string(), 0)], Operator::Or, scorer, &params)
-            .unwrap();
-        let mut ids: Vec<u64> = hits.iter().map(|c| c.row_id).collect();
-        ids.sort_unstable();
-        ids
+    /// An index whose 4 docs (row positions 100..104) have all been frozen
+    /// into a single partition (`freeze_threshold_rows = 1`).
+    fn build_test_partition_index() -> FtsMemIndex {
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(1);
+        let batch = RecordBatch::try_new(
+            create_test_schema(),
+            vec![
+                Arc::new(Int32Array::from(vec![0, 1, 2, 3])),
+                Arc::new(StringArray::from(vec![
+                    "apple banana",
+                    "apple cherry",
+                    "banana",
+                    "apple apple date",
+                ])),
+            ],
+        )
+        .unwrap();
+        index.insert(&batch, 100).unwrap();
+        index
     }
 
     #[test]
     fn test_partition_build_and_search() {
-        let p = build_test_partition();
+        let index = build_test_partition_index();
+        let st = index.state.load_full();
+        assert_eq!(st.partitions.len(), 1, "the batch should have frozen");
+        assert_eq!(st.tail.visible_count(), 0, "the tail should be empty");
+
+        let p = &st.partitions[0];
         assert_eq!(p.doc_count(), 4);
         assert_eq!(p.total_tokens(), 8); // 2 + 2 + 1 + 3
-        assert_eq!(p.token_df("apple"), 3);
-        assert_eq!(p.token_df("banana"), 2);
-        assert_eq!(p.token_df("missing"), 0);
+        // Term dictionary stores the *tokenized* form, so probe via the
+        // configured tokenizer rather than the raw word.
+        let tok = |w: &str| index.tokenize_for_search(w).pop().unwrap();
+        assert_eq!(p.token_df(&tok("apple")), 3);
+        assert_eq!(p.token_df(&tok("banana")), 2);
+        assert_eq!(p.token_df("definitely_missing"), 0);
 
-        // WAND search must return the row positions containing each term.
-        assert_eq!(search_partition(&p, "apple"), vec![100, 101, 103]);
-        assert_eq!(search_partition(&p, "banana"), vec![100, 102]);
-        assert_eq!(search_partition(&p, "date"), vec![103]);
-        assert!(search_partition(&p, "missing").is_empty());
+        // Searching the index must return the partition's row positions.
+        assert_eq!(rows(index.search("apple")), vec![100, 101, 103]);
+        assert_eq!(rows(index.search("banana")), vec![100, 102]);
+        assert_eq!(rows(index.search("date")), vec![103]);
+        assert!(index.search("definitely_missing").is_empty());
     }
 
     #[test]
     fn test_partition_and_query_short_circuits_missing_term() {
-        let p = build_test_partition();
-        // "apple" present, "missing" absent -> AND yields nothing.
-        let scorer = MemBM25Scorer::new(p.total_tokens(), p.doc_count(), Default::default());
-        let params = FtsSearchParams::default().with_limit(Some(10));
-        let hits = p
-            .search(
-                &[("apple".to_string(), 0), ("missing".to_string(), 1)],
-                Operator::And,
-                scorer,
-                &params,
-            )
+        let index = build_test_partition_index();
+        let st = index.state.load_full();
+        let p = &st.partitions[0];
+        let tail_snap = st.tail.snapshot();
+        let apple = index.tokenize_for_search("apple").pop().unwrap();
+        // "apple" is present -> an OR search over it matches.
+        let or_scorer = build_scorer(&st, &tail_snap, std::slice::from_ref(&apple));
+        let or_hits = p
+            .search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer, None)
+            .unwrap();
+        assert_eq!(or_hits.len(), 3);
+        // Adding an absent term to an AND query short-circuits to nothing.
+        let and_tokens = vec![apple, "definitely_missing".to_string()];
+        let and_scorer = build_scorer(&st, &tail_snap, &and_tokens);
+        let and_hits = p
+            .search_match(&and_tokens, Operator::And, &and_scorer, None)
             .unwrap();
-        assert!(hits.is_empty());
+        assert!(and_hits.is_empty());
+    }
+
+    #[test]
+    fn test_freeze_produces_multiple_partitions() {
+        // freeze_threshold_rows = 3 with 3-row batches => one partition per
+        // batch insert.
+        let schema = create_test_schema();
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(3);
+        for i in 0..5 {
+            index
+                .insert(&create_test_batch(&schema), (i * 100) as u64)
+                .unwrap();
+        }
+        let st = index.state.load_full();
+        assert_eq!(st.partitions.len(), 5, "every batch should have frozen");
+        assert_eq!(st.tail.visible_count(), 0);
+        assert_eq!(index.doc_count(), 15);
+
+        // "hello" appears in docs 0 and 2 of every batch; "world" in 0 and 1.
+        assert_eq!(
+            rows(index.search("hello")),
+            vec![0, 2, 100, 102, 200, 202, 300, 302, 400, 402]
+        );
+        assert_eq!(
+            rows(index.search("world")),
+            vec![0, 1, 100, 101, 200, 201, 300, 301, 400, 401]
+        );
+        assert!(index.search("definitely_missing").is_empty());
+        // Every score must be finite and positive.
+        for e in index.search("world") {
+            assert!(e.score.is_finite() && e.score > 0.0);
+        }
+    }
+
+    #[test]
+    fn test_search_spans_partitions_and_tail() {
+        // freeze_threshold_rows = 5, 3-row batches: batch 1 -> tail(3);
+        // batch 2 -> tail(6) freezes -> partition(6); batch 3 -> tail(3).
+        let schema = create_test_schema();
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(5);
+        index.insert(&create_test_batch(&schema), 0).unwrap();
+        index.insert(&create_test_batch(&schema), 100).unwrap();
+        index.insert(&create_test_batch(&schema), 200).unwrap();
+        let st = index.state.load_full();
+        assert_eq!(st.partitions.len(), 1, "the first two batches froze");
+        assert_eq!(st.tail.visible_count(), 1, "the third batch is in the tail");
+        assert_eq!(index.doc_count(), 9);
+
+        // Results must merge the partition (rows 0..200) and the tail (200..).
+        assert_eq!(rows(index.search("hello")), vec![0, 2, 100, 102, 200, 202]);
+        assert_eq!(rows(index.search("goodbye")), vec![1, 101, 201]);
+    }
+
+    #[test]
+    fn test_flush_merges_partitions_and_tail() {
+        // A frozen partition plus a live tail must all reach the builder.
+        let schema = create_test_schema();
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(5);
+        index.insert(&create_test_batch(&schema), 0).unwrap();
+        index.insert(&create_test_batch(&schema), 100).unwrap();
+        index.insert(&create_test_batch(&schema), 200).unwrap();
+        let st = index.state.load_full();
+        assert_eq!(st.partitions.len(), 1);
+        assert_eq!(st.tail.visible_count(), 1);
+
+        let builder = index.to_index_builder_reversed(7, 300).unwrap();
+        assert_eq!(builder.id(), 7);
+        // A non-empty builder proves the partition and the tail both reached
+        // the flush; end-to-end flush correctness is covered by the MemTable
+        // flush integration tests.
+        assert!(
+            !builder.is_empty(),
+            "9 docs across partition + tail must flush"
+        );
+    }
+
+    #[test]
+    fn test_phrase_across_partitions() {
+        let schema = create_test_schema();
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(3);
+        // Each batch has "hello world" at its row_offset row.
+        for i in 0..4 {
+            index
+                .insert(&create_test_batch(&schema), (i * 100) as u64)
+                .unwrap();
+        }
+        let st = index.state.load_full();
+        assert_eq!(st.partitions.len(), 4);
+        // The phrase must be found in every partition.
+        assert_eq!(
+            rows(index.search_phrase("hello world", 0)),
+            vec![0, 100, 200, 300]
+        );
+        // A phrase not present anywhere returns nothing.
+        assert!(index.search_phrase("world hello", 0).is_empty());
+    }
+
+    #[test]
+    fn test_merge_caps_partition_count() {
+        // freeze_threshold_rows = 1 freezes every 3-row batch; past
+        // MAX_PARTITIONS (32) the partitions are merged into one.
+        let schema = create_test_schema();
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(1);
+        for i in 0..40 {
+            index
+                .insert(&create_test_batch(&schema), (i * 100) as u64)
+                .unwrap();
+        }
+        let st = index.state.load_full();
+        assert!(
+            st.partitions.len() <= FtsMemIndex::MAX_PARTITIONS,
+            "partition count {} should be capped",
+            st.partitions.len()
+        );
+        assert_eq!(index.doc_count(), 120);
+        // Merge must preserve every posting: "hello" hits 2 docs per batch.
+        assert_eq!(index.search("hello").len(), 80);
+        for e in index.search("hello") {
+            assert!(e.score.is_finite() && e.score > 0.0);
+        }
+    }
+
+    #[test]
+    fn test_freeze_during_concurrent_search() {
+        // Searches running concurrently with freezes must never observe a
+        // doc twice or miss one: every result row is valid and scored.
+        let schema = create_test_schema();
+        let index =
+            Arc::new(FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(6));
+        let stop = Arc::new(AtomicBool::new(false));
+        let mut readers = Vec::new();
+        for _ in 0..4 {
+            let idx = index.clone();
+            let stop = stop.clone();
+            readers.push(std::thread::spawn(move || {
+                while !stop.load(Ordering::Relaxed) {
+                    for e in idx.search("world") {
+                        // "world" only ever appears in docs 0 and 1 of a
+                        // batch, whose row_offset is a multiple of 100.
+                        assert!(e.row_position % 100 < 2);
+                        assert!(e.score.is_finite() && e.score >= 0.0);
+                    }
+                }
+            }));
+        }
+        for i in 0..60 {
+            index
+                .insert(&create_test_batch(&schema), (i * 100) as u64)
+                .unwrap();
+        }
+        std::thread::sleep(std::time::Duration::from_millis(20));
+        stop.store(true, Ordering::Relaxed);
+        for r in readers {
+            r.join().unwrap();
+        }
+        assert_eq!(index.doc_count(), 180);
+        assert_eq!(index.search("world").len(), 120);
     }
 }

From 0809e9d8be0983cf915e8893811635db898da10d Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 16:16:12 -0700
Subject: [PATCH 21/37] fix(mem_wal): run per-partition FTS WAND unbounded to
 keep top-k exact
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Block-max WAND top-k pruning needs per-term `max_score` upper bounds, which
in-memory partition posting lists do not carry (`max_score` is `None`). A
WAND given a `limit` therefore pruned by a bound it could not compute and
silently dropped valid top-k docs — term_recall@10 collapsed from 1.0 to
~0.1 once the corpus exceeded the freeze threshold and partitions appeared.

Per-partition WAND now always runs unbounded; the scan is still O(matches)
(the partition layout, not WAND pruning, is what makes latency flat), and
`search_with_options` truncates to the real limit after merging. Adds a
regression test asserting a limited search over many partitions returns the
exact global top-k.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 97 ++++++++++++++-------
 1 file changed, 67 insertions(+), 30 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index 0a99a109986..cb32b148102 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -995,7 +995,7 @@ impl FtsMemIndex {
     pub fn search(&self, term: &str) -> Vec<FtsEntry> {
         let st = self.state.load_full();
         let tokens = self.tokenize_for_search(term);
-        self.search_match(&st, &tokens, None)
+        self.search_match(&st, &tokens)
     }
 
     /// Search for documents containing an exact phrase, optionally allowing
@@ -1035,12 +1035,7 @@ impl FtsMemIndex {
     /// scan the tail, all scored with one corpus-wide [`MemBM25Scorer`], and
     /// the results merged. Each doc lives in exactly one partition or the
     /// tail, so the merge is a plain concatenation.
-    fn search_match(
-        &self,
-        st: &IndexState,
-        tokens: &[String],
-        limit: Option<usize>,
-    ) -> Vec<FtsEntry> {
+    fn search_match(&self, st: &IndexState, tokens: &[String]) -> Vec<FtsEntry> {
         if tokens.is_empty() {
             return Vec::new();
         }
@@ -1053,7 +1048,7 @@ impl FtsMemIndex {
         }
         let mut results = Vec::new();
         for p in st.partitions.iter() {
-            match p.search_match(tokens, Operator::Or, &scorer, limit) {
+            match p.search_match(tokens, Operator::Or, &scorer) {
                 Ok(hits) => results.extend(hits),
                 Err(e) => log::warn!("FTS partition match search failed: {e}"),
             }
@@ -1070,7 +1065,7 @@ impl FtsMemIndex {
         }
         if tokens.len() == 1 {
             // A single-token phrase reduces to a regular term search.
-            return self.search_match(st, tokens, None);
+            return self.search_match(st, tokens);
         }
         let tail_snap = st.tail.snapshot();
         let scorer = build_scorer(st, &tail_snap, tokens);
@@ -1119,7 +1114,7 @@ impl FtsMemIndex {
         if expanded.is_empty() {
             return Vec::new();
         }
-        self.search_match(st, &expanded, None)
+        self.search_match(st, &expanded)
     }
 
     /// Expand `term` against the term dictionaries of every partition and the
@@ -1179,19 +1174,14 @@ impl FtsMemIndex {
     /// per-batch monotonic visibility contract for compound queries.
     pub fn search_query(&self, query: &FtsQueryExpr) -> Vec<FtsEntry> {
         let st = self.state.load_full();
-        self.search_query_with_state(query, &st, None)
+        self.search_query_with_state(query, &st)
     }
 
-    fn search_query_with_state(
-        &self,
-        query: &FtsQueryExpr,
-        st: &IndexState,
-        limit: Option<usize>,
-    ) -> Vec<FtsEntry> {
+    fn search_query_with_state(&self, query: &FtsQueryExpr, st: &IndexState) -> Vec<FtsEntry> {
         match query {
             FtsQueryExpr::Match { query, boost } => {
                 let tokens = self.tokenize_for_search(query);
-                let mut results = self.search_match(st, &tokens, limit);
+                let mut results = self.search_match(st, &tokens);
                 apply_boost(&mut results, *boost);
                 results
             }
@@ -1233,7 +1223,7 @@ impl FtsMemIndex {
         options: SearchOptions,
     ) -> Vec<FtsEntry> {
         let st = self.state.load_full();
-        let mut results = self.search_query_with_state(query, &st, options.limit);
+        let mut results = self.search_query_with_state(query, &st);
         results.sort_by(|a, b| {
             b.score
                 .partial_cmp(&a.score)
@@ -1264,11 +1254,11 @@ impl FtsMemIndex {
         negative_boost: f32,
         st: &IndexState,
     ) -> Vec<FtsEntry> {
-        let mut results = self.search_query_with_state(positive, st, None);
+        let mut results = self.search_query_with_state(positive, st);
         let Some(neg) = negative else {
             return results;
         };
-        let negative_results = self.search_query_with_state(neg, st, None);
+        let negative_results = self.search_query_with_state(neg, st);
         let negative_set: HashSet<RowPosition> = negative_results
             .into_iter()
             .map(|e| e.row_position)
@@ -1290,26 +1280,26 @@ impl FtsMemIndex {
     ) -> Vec<FtsEntry> {
         let excluded: HashSet<RowPosition> = must_not
             .iter()
-            .flat_map(|q| self.search_query_with_state(q, st, None))
+            .flat_map(|q| self.search_query_with_state(q, st))
             .map(|e| e.row_position)
             .collect();
 
         let mut result_map: HashMap<RowPosition, f32> = if must.is_empty() {
             let mut map: HashMap<RowPosition, f32> = HashMap::new();
             for q in should {
-                for entry in self.search_query_with_state(q, st, None) {
+                for entry in self.search_query_with_state(q, st) {
                     *map.entry(entry.row_position).or_default() += entry.score;
                 }
             }
             map
         } else {
-            let first_results = self.search_query_with_state(&must[0], st, None);
+            let first_results = self.search_query_with_state(&must[0], st);
             let mut map: HashMap<RowPosition, f32> = first_results
                 .into_iter()
                 .map(|e| (e.row_position, e.score))
                 .collect();
             for q in must.iter().skip(1) {
-                let results = self.search_query_with_state(q, st, None);
+                let results = self.search_query_with_state(q, st);
                 let result_set: HashMap<RowPosition, f32> = results
                     .into_iter()
                     .map(|e| (e.row_position, e.score))
@@ -1320,7 +1310,7 @@ impl FtsMemIndex {
                     .collect();
             }
             for q in should {
-                for entry in self.search_query_with_state(q, st, None) {
+                for entry in self.search_query_with_state(q, st) {
                     if let Some(score) = map.get_mut(&entry.row_position) {
                         *score += entry.score;
                     }
@@ -2038,12 +2028,15 @@ impl Partition {
 
     /// WAND OR/AND-search; candidates scored with `scorer` and reported as
     /// MemTable row positions.
+    /// WAND runs unbounded (no `limit`): block-max top-k pruning needs
+    /// per-term `max_score` upper bounds, which in-memory posting lists do
+    /// not carry, so a bounded WAND would drop valid top-k docs. The scan is
+    /// still O(matches); the caller truncates to the real limit after merge.
     fn search_match(
         &self,
         tokens: &[String],
         operator: Operator,
         scorer: &MemBM25Scorer,
-        limit: Option<usize>,
     ) -> Result<Vec<FtsEntry>> {
         let mut wand_terms: Vec<WandTerm> = Vec::new();
         let mut tok_order: Vec<String> = Vec::new();
@@ -2067,7 +2060,7 @@ impl Partition {
         if wand_terms.is_empty() {
             return Ok(Vec::new());
         }
-        let params = FtsSearchParams::new().with_limit(limit);
+        let params = FtsSearchParams::new();
         let cands = wand_search(operator, wand_terms, &self.docs, scorer.clone(), &params)?;
         Ok(cands
             .into_iter()
@@ -3274,14 +3267,14 @@ mod tests {
         // "apple" is present -> an OR search over it matches.
         let or_scorer = build_scorer(&st, &tail_snap, std::slice::from_ref(&apple));
         let or_hits = p
-            .search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer, None)
+            .search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer)
             .unwrap();
         assert_eq!(or_hits.len(), 3);
         // Adding an absent term to an AND query short-circuits to nothing.
         let and_tokens = vec![apple, "definitely_missing".to_string()];
         let and_scorer = build_scorer(&st, &tail_snap, &and_tokens);
         let and_hits = p
-            .search_match(&and_tokens, Operator::And, &and_scorer, None)
+            .search_match(&and_tokens, Operator::And, &and_scorer)
             .unwrap();
         assert!(and_hits.is_empty());
     }
@@ -3442,4 +3435,48 @@ mod tests {
         assert_eq!(index.doc_count(), 180);
         assert_eq!(index.search("world").len(), 120);
     }
+
+    #[test]
+    fn test_limited_search_returns_exact_top_k_across_partitions() {
+        // Regression: a limited `search_with_options` over many partitions
+        // must still return the *exact* global top-k. Per-partition WAND must
+        // not prune by a limit it cannot bound (in-memory posting lists carry
+        // no `max_score`), or valid top-k docs are silently dropped.
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(2);
+        // 8 docs (4 batches of 2) -> 4 partitions. "lion" appears with tf
+        // 1..=6, so the 6 matching docs have strictly distinct BM25 scores.
+        let texts = [
+            ["lion", "lion lion"],
+            ["lion lion lion", "lion lion lion lion"],
+            ["cat", "lion lion lion lion lion"],
+            ["lion lion lion lion lion lion", "dog"],
+        ];
+        for (b, pair) in texts.iter().enumerate() {
+            let batch = RecordBatch::try_new(
+                create_test_schema(),
+                vec![
+                    Arc::new(Int32Array::from(vec![0, 1])),
+                    Arc::new(StringArray::from(pair.to_vec())),
+                ],
+            )
+            .unwrap();
+            index.insert(&batch, (b * 100) as u64).unwrap();
+        }
+        assert_eq!(index.state.load_full().partitions.len(), 4);
+
+        // Exact full ranking via the unbounded search path.
+        let mut full = index.search("lion");
+        full.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
+        assert_eq!(full.len(), 6);
+
+        // The limited top-3 must equal the first 3 of the full ranking.
+        let limited = index.search_with_options(
+            &FtsQueryExpr::match_query("lion"),
+            SearchOptions::new().with_limit(3),
+        );
+        assert_eq!(limited.len(), 3);
+        let got: Vec<u64> = limited.iter().map(|e| e.row_position).collect();
+        let expected: Vec<u64> = full.iter().take(3).map(|e| e.row_position).collect();
+        assert_eq!(got, expected, "limited search must return the exact top-3");
+    }
 }

From 46b3b172b7770e21bcae3d7bc23048405804e3b2 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sat, 16 May 2026 16:27:19 -0700
Subject: [PATCH 22/37] fix(mem_wal): score FTS partitions by direct posting
 scan, not WAND
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Block-max WAND needs per-term `max_score` upper bounds to prune safely.
Frozen in-memory partition posting lists carry none (`max_score` is `None`),
so even an unbounded WAND mis-pruned and term_recall@10 collapsed from 1.0 to
~0.12 once the corpus crossed the freeze threshold.

`Partition::search_match` / `search_phrase` now scan the per-term merged
posting list directly: OR accumulates each token's BM25 contribution per doc,
phrase intersects from the rarest token and verifies positions. A partition
holds one merged list per term, so this is O(matches) — the per-batch-chunk
O(corpus) layout was the original problem, not the absence of WAND pruning.
All scoring still routes through the shared `MemBM25Scorer`, so partition and
tail scores stay identical. Block-max WAND can be reintroduced later once
partitions carry precomputed per-term score bounds.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 177 +++++++++-----------
 1 file changed, 76 insertions(+), 101 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index cb32b148102..e0ef9b5eded 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -58,10 +58,10 @@ use arrow_schema::DataType;
 use crossbeam_skiplist::SkipMap;
 use lance_core::{Error, Result};
 use lance_index::scalar::InvertedIndexParams;
-use lance_index::scalar::inverted::query::{FtsSearchParams, Operator};
+use lance_index::scalar::inverted::query::Operator;
 use lance_index::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer;
 use lance_index::scalar::inverted::{
-    DocSet, MemBM25Scorer, PlainPostingList, PostingList, Scorer, TokenSet, WandTerm, wand_search,
+    DocSet, MemBM25Scorer, PlainPostingList, PostingList, Scorer, TokenSet,
 };
 use lance_tokenizer::TokenStream;
 
@@ -1048,10 +1048,7 @@ impl FtsMemIndex {
         }
         let mut results = Vec::new();
         for p in st.partitions.iter() {
-            match p.search_match(tokens, Operator::Or, &scorer) {
-                Ok(hits) => results.extend(hits),
-                Err(e) => log::warn!("FTS partition match search failed: {e}"),
-            }
+            results.extend(p.search_match(tokens, Operator::Or, &scorer));
         }
         if tail_snap.visible_count > 0 {
             results.extend(score_terms(&tail_snap, &st.tail.terms, tokens, &scorer));
@@ -1074,10 +1071,7 @@ impl FtsMemIndex {
         }
         let mut results = Vec::new();
         for p in st.partitions.iter() {
-            match p.search_phrase(tokens, slop, &scorer) {
-                Ok(hits) => results.extend(hits),
-                Err(e) => log::warn!("FTS partition phrase search failed: {e}"),
-            }
+            results.extend(p.search_phrase(tokens, slop, &scorer));
         }
         if tail_snap.visible_count > 0 {
             results.extend(phrase_search_tail(
@@ -2026,105 +2020,91 @@ impl Partition {
         }
     }
 
-    /// WAND OR/AND-search; candidates scored with `scorer` and reported as
-    /// MemTable row positions.
-    /// WAND runs unbounded (no `limit`): block-max top-k pruning needs
-    /// per-term `max_score` upper bounds, which in-memory posting lists do
-    /// not carry, so a bounded WAND would drop valid top-k docs. The scan is
-    /// still O(matches); the caller truncates to the real limit after merge.
+    /// BM25 OR/AND-search by a direct posting-list scan. A partition holds one
+    /// merged posting list per term, so cost is O(matches) — not O(corpus) as
+    /// the old per-batch-chunk layout was. Block-max WAND pruning is not used:
+    /// it needs per-term `max_score` upper bounds that frozen in-memory
+    /// posting lists do not carry, and an unbounded WAND mis-prunes without
+    /// them. Each token occurrence in `tokens` contributes independently, so
+    /// scores match the tail's `score_terms`.
     fn search_match(
         &self,
         tokens: &[String],
         operator: Operator,
         scorer: &MemBM25Scorer,
-    ) -> Result<Vec<FtsEntry>> {
-        let mut wand_terms: Vec<WandTerm> = Vec::new();
-        let mut tok_order: Vec<String> = Vec::new();
-        for (qpos, t) in tokens.iter().enumerate() {
-            match self.token_ids.get(t.as_str()) {
+    ) -> Vec<FtsEntry> {
+        // doc -> (accumulated score, number of token-occurrence hits).
+        let mut doc_scores: HashMap<u32, (f32, u32)> = HashMap::new();
+        let mut any_present = false;
+        let mut all_present = true;
+        for token in tokens {
+            match self.token_ids.get(token.as_str()) {
                 Some(&id) => {
-                    wand_terms.push((
-                        t.clone(),
-                        id,
-                        qpos as u32,
-                        1.0,
-                        self.postings[id as usize].clone(),
-                    ));
-                    tok_order.push(t.clone());
+                    any_present = true;
+                    let plain = posting_as_plain(&self.postings[id as usize]);
+                    let qw = scorer.query_weight(token);
+                    for i in 0..plain.len() {
+                        let doc = plain.row_ids[i] as u32;
+                        let dl = self.docs.num_tokens(doc);
+                        let s = qw * scorer.doc_weight(plain.frequencies[i] as u32, dl);
+                        let e = doc_scores.entry(doc).or_insert((0.0, 0));
+                        e.0 += s;
+                        e.1 += 1;
+                    }
                 }
-                // Term absent: an AND query cannot match here; OR drops it.
-                None if operator == Operator::And => return Ok(Vec::new()),
-                None => {}
+                None => all_present = false,
             }
         }
-        if wand_terms.is_empty() {
-            return Ok(Vec::new());
+        if !any_present || (operator == Operator::And && !all_present) {
+            return Vec::new();
         }
-        let params = FtsSearchParams::new();
-        let cands = wand_search(operator, wand_terms, &self.docs, scorer.clone(), &params)?;
-        Ok(cands
+        let need = tokens.len() as u32;
+        doc_scores
             .into_iter()
-            .map(|c| {
-                let score: f32 = c
-                    .freqs
-                    .iter()
-                    .map(|(ti, f)| {
-                        scorer.query_weight(&tok_order[*ti as usize])
-                            * scorer.doc_weight(*f, c.doc_length)
-                    })
-                    .sum();
-                FtsEntry {
-                    row_position: self.docs.row_id(c.row_id as u32),
-                    score,
+            .filter_map(|(doc, (score, hits))| {
+                // AND requires the doc to be hit once per query-token slot.
+                if operator == Operator::And && hits < need {
+                    return None;
                 }
+                Some(FtsEntry {
+                    row_position: self.docs.row_id(doc),
+                    score,
+                })
             })
-            .collect())
+            .collect()
     }
 
-    /// Phrase-search: WAND-And finds docs containing every token, then token
-    /// positions are verified. `tokens.len() >= 2`.
-    fn search_phrase(
-        &self,
-        tokens: &[String],
-        slop: u32,
-        scorer: &MemBM25Scorer,
-    ) -> Result<Vec<FtsEntry>> {
-        let mut wand_terms: Vec<WandTerm> = Vec::new();
-        let mut token_id_order: Vec<u32> = Vec::new();
-        for (qpos, t) in tokens.iter().enumerate() {
-            match self.token_ids.get(t.as_str()) {
-                Some(&id) => {
-                    wand_terms.push((
-                        t.clone(),
-                        id,
-                        qpos as u32,
-                        1.0,
-                        self.postings[id as usize].clone(),
-                    ));
-                    token_id_order.push(id);
-                }
+    /// Phrase-search by intersecting posting lists: drive from the rarest
+    /// token, require every other token to contain the doc, and verify the
+    /// token positions satisfy the phrase. `tokens.len() >= 2`.
+    fn search_phrase(&self, tokens: &[String], slop: u32, scorer: &MemBM25Scorer) -> Vec<FtsEntry> {
+        let mut plains: Vec<&PlainPostingList> = Vec::with_capacity(tokens.len());
+        for token in tokens {
+            match self.token_ids.get(token.as_str()) {
+                Some(&id) => plains.push(posting_as_plain(&self.postings[id as usize])),
                 // A phrase needs every token present in this partition.
-                None => return Ok(Vec::new()),
+                None => return Vec::new(),
             }
         }
-        let params = FtsSearchParams::new();
-        let cands = wand_search(
-            Operator::And,
-            wand_terms,
-            &self.docs,
-            scorer.clone(),
-            &params,
-        )?;
+        let rarest = (0..plains.len()).min_by_key(|&i| plains[i].len()).unwrap();
         let mut results = Vec::new();
-        for c in cands {
-            let local_doc = c.row_id; // partition-local doc id
+        for i in 0..plains[rarest].len() {
+            let doc = plains[rarest].row_ids[i];
             let mut all_positions: Vec<Vec<u32>> = Vec::with_capacity(tokens.len());
+            let mut freqs: Vec<u32> = Vec::with_capacity(tokens.len());
             let mut present = true;
-            for &tid in &token_id_order {
-                let plain = posting_as_plain(&self.postings[tid as usize]);
-                match plain.row_ids.binary_search(&local_doc) {
-                    Ok(idx) => all_positions.push(read_positions(plain, idx)),
-                    Err(_) => {
+            for (ti, plain) in plains.iter().enumerate() {
+                let idx = if ti == rarest {
+                    Some(i)
+                } else {
+                    plain.row_ids.binary_search(&doc).ok()
+                };
+                match idx {
+                    Some(idx) => {
+                        freqs.push(plain.frequencies[idx] as u32);
+                        all_positions.push(read_positions(plain, idx));
+                    }
+                    None => {
                         present = false;
                         break;
                     }
@@ -2133,19 +2113,18 @@ impl Partition {
             if !present || !phrase_matches(&all_positions, slop) {
                 continue;
             }
-            let score: f32 = c
-                .freqs
+            let dl = self.docs.num_tokens(doc as u32);
+            let score: f32 = tokens
                 .iter()
-                .map(|(ti, f)| {
-                    scorer.query_weight(&tokens[*ti as usize]) * scorer.doc_weight(*f, c.doc_length)
-                })
+                .zip(&freqs)
+                .map(|(t, &f)| scorer.query_weight(t) * scorer.doc_weight(f, dl))
                 .sum();
             results.push(FtsEntry {
-                row_position: self.docs.row_id(local_doc as u32),
+                row_position: self.docs.row_id(doc as u32),
                 score,
             });
         }
-        Ok(results)
+        results
     }
 }
 
@@ -3266,16 +3245,12 @@ mod tests {
         let apple = index.tokenize_for_search("apple").pop().unwrap();
         // "apple" is present -> an OR search over it matches.
         let or_scorer = build_scorer(&st, &tail_snap, std::slice::from_ref(&apple));
-        let or_hits = p
-            .search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer)
-            .unwrap();
+        let or_hits = p.search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer);
         assert_eq!(or_hits.len(), 3);
         // Adding an absent term to an AND query short-circuits to nothing.
         let and_tokens = vec![apple, "definitely_missing".to_string()];
         let and_scorer = build_scorer(&st, &tail_snap, &and_tokens);
-        let and_hits = p
-            .search_match(&and_tokens, Operator::And, &and_scorer)
-            .unwrap();
+        let and_hits = p.search_match(&and_tokens, Operator::And, &and_scorer);
         assert!(and_hits.is_empty());
     }
 

From e096383a8acc779578a64f5041ccef3b4fabba23 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 11:41:56 -0700
Subject: [PATCH 23/37] perf(mem_wal): compact FTS partition posting lists,
 drop Arrow builder slack
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Partition posting lists were `PlainPostingList`s whose positions were each
built through a fresh Arrow `ListBuilder`/`Int32Builder`. Those builders
default-allocate 1024-element buffers, so the ~1M+ tiny per-term posting
lists across a large index carried ~10+ GB of pure capacity slack —
in-memory FTS footprint regressed ~2.8x (29 GB vs 10 GB at 1M docs).

Replaces the partition posting representation with `PartitionPosting`:
exact-sized plain `Vec`s, partition-local `u32` doc ids (not `u64`), `u32`
frequencies (not `f32`), and a CSR position layout. No Arrow builders, no
slack, smaller per-element. The OR/AND scan now accumulates into flat arrays
indexed by the dense local doc id instead of a per-posting `HashMap`.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 228 ++++++++++----------
 1 file changed, 112 insertions(+), 116 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index e0ef9b5eded..fb4c4d65cbd 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -51,18 +51,14 @@ use std::sync::Mutex;
 use std::sync::atomic::{AtomicUsize, Ordering};
 
 use arc_swap::ArcSwap;
-use arrow_array::builder::{Int32Builder, ListBuilder};
-use arrow_array::{Array, Int32Array, LargeStringArray, RecordBatch, StringArray, StringViewArray};
-use arrow_buffer::ScalarBuffer;
+use arrow_array::{Array, LargeStringArray, RecordBatch, StringArray, StringViewArray};
 use arrow_schema::DataType;
 use crossbeam_skiplist::SkipMap;
 use lance_core::{Error, Result};
 use lance_index::scalar::InvertedIndexParams;
 use lance_index::scalar::inverted::query::Operator;
 use lance_index::scalar::inverted::tokenizer::document_tokenizer::LanceTokenizer;
-use lance_index::scalar::inverted::{
-    DocSet, MemBM25Scorer, PlainPostingList, PostingList, Scorer, TokenSet,
-};
+use lance_index::scalar::inverted::{DocSet, MemBM25Scorer, Scorer, TokenSet};
 use lance_tokenizer::TokenStream;
 
 use super::RowPosition;
@@ -1405,19 +1401,18 @@ impl FtsMemIndex {
         let mut term_postings: HashMap<String, Vec<(u32, u32, Option<Vec<u32>>)>> = HashMap::new();
         for p in st.partitions.iter() {
             for (token, posting) in p.tokens_with_postings() {
-                let plain = posting_as_plain(posting);
                 let bucket = term_postings.entry(token.to_string()).or_default();
-                for i in 0..plain.len() {
-                    let row_pos = p.docs.row_id(plain.row_ids[i] as u32);
+                for i in 0..posting.len() {
+                    let row_pos = p.docs.row_id(posting.doc_ids[i]);
                     let Some(&doc_id) = original_to_doc_id.get(&row_pos) else {
                         continue;
                     };
                     let pos = if with_position {
-                        Some(read_positions(plain, i))
+                        Some(posting.positions(i).to_vec())
                     } else {
                         None
                     };
-                    bucket.push((doc_id, plain.frequencies[i] as u32, pos));
+                    bucket.push((doc_id, posting.freqs[i], pos));
                 }
             }
         }
@@ -1773,11 +1768,11 @@ fn phrase_search_tail(
     results
 }
 
-fn phrase_matches(positions: &[Vec<u32>], slop: u32) -> bool {
+fn phrase_matches<T: AsRef<[u32]>>(positions: &[T], slop: u32) -> bool {
     if positions.is_empty() {
         return false;
     }
-    for &first_pos in &positions[0] {
+    for &first_pos in positions[0].as_ref() {
         if phrase_from_position(positions, first_pos, slop) {
             return true;
         }
@@ -1785,12 +1780,13 @@ fn phrase_matches(positions: &[Vec<u32>], slop: u32) -> bool {
     false
 }
 
-fn phrase_from_position(positions: &[Vec<u32>], first_pos: u32, slop: u32) -> bool {
+fn phrase_from_position<T: AsRef<[u32]>>(positions: &[T], first_pos: u32, slop: u32) -> bool {
     let mut expected = first_pos;
     for token_positions in positions.iter().skip(1) {
         let min = expected.saturating_add(1);
         let max = expected.saturating_add(1 + slop);
         match token_positions
+            .as_ref()
             .iter()
             .filter(|&&p| p >= min && p <= max)
             .min()
@@ -1853,16 +1849,44 @@ impl FtsIndexConfig {
 // Immutable partition
 // ============================================================================
 
+/// A frozen per-term posting list inside a [`Partition`]. Exact-sized plain
+/// `Vec`s — no Arrow builder capacity slack — with partition-local `u32` doc
+/// ids. Positions use a CSR layout: posting `i` owns
+/// `pos_data[pos_offsets[i]..pos_offsets[i + 1]]`.
+struct PartitionPosting {
+    /// partition-local doc ids, ascending.
+    doc_ids: Vec<u32>,
+    /// term frequency per posting.
+    freqs: Vec<u32>,
+    /// `doc_ids.len() + 1` offsets into `pos_data`.
+    pos_offsets: Vec<u32>,
+    pos_data: Vec<u32>,
+}
+
+impl PartitionPosting {
+    fn len(&self) -> usize {
+        self.doc_ids.len()
+    }
+
+    fn positions(&self, i: usize) -> &[u32] {
+        &self.pos_data[self.pos_offsets[i] as usize..self.pos_offsets[i + 1] as usize]
+    }
+
+    fn heap_size(&self) -> usize {
+        (self.doc_ids.len() + self.freqs.len() + self.pos_offsets.len() + self.pos_data.len())
+            * std::mem::size_of::<u32>()
+    }
+}
+
 /// An immutable, frozen FTS partition: a slice of the MemTable's inserts held
-/// as on-disk-shaped posting lists and queried with block-max WAND in
-/// ≈O(matches). Built by freezing the tail; flushed 1:1 into the Lance FTS
-/// on-disk format. See `DESIGN.md` in the redesign analysis directory.
+/// as compact per-term posting lists, queried by a direct O(matches) scan.
+/// Built by freezing the tail; flushed 1:1 into the Lance FTS on-disk format.
+/// See `DESIGN.md` in the redesign analysis directory.
 struct Partition {
     /// token text -> local token id (dense, 0-based; indexes `postings`).
     token_ids: HashMap<Arc<str>, u32>,
-    /// posting list per local token id. In-memory partitions are always
-    /// `PostingList::Plain` and always carry positions.
-    postings: Vec<PostingList>,
+    /// posting list per local token id.
+    postings: Vec<PartitionPosting>,
     /// local doc id -> (MemTable row position, token count).
     docs: DocSet,
 }
@@ -1877,10 +1901,7 @@ impl Partition {
     }
 
     fn entry_count(&self) -> usize {
-        self.postings
-            .iter()
-            .map(|p| posting_as_plain(p).len())
-            .sum()
+        self.postings.iter().map(|p| p.len()).sum()
     }
 
     fn contains_token(&self, token: &str) -> bool {
@@ -1891,7 +1912,7 @@ impl Partition {
         self.token_ids.keys()
     }
 
-    fn tokens_with_postings(&self) -> impl Iterator<Item = (&Arc<str>, &PostingList)> {
+    fn tokens_with_postings(&self) -> impl Iterator<Item = (&Arc<str>, &PartitionPosting)> {
         self.token_ids
             .iter()
             .map(move |(t, &id)| (t, &self.postings[id as usize]))
@@ -1901,7 +1922,7 @@ impl Partition {
     fn token_df(&self, token: &str) -> usize {
         self.token_ids
             .get(token)
-            .map(|&id| posting_as_plain(&self.postings[id as usize]).len())
+            .map(|&id| self.postings[id as usize].len())
             .unwrap_or(0)
     }
 
@@ -1910,16 +1931,8 @@ impl Partition {
         for t in self.token_ids.keys() {
             total += std::mem::size_of::<Arc<str>>() + t.len() + std::mem::size_of::<u32>() + 16;
         }
-        for p in &self.postings {
-            let pl = posting_as_plain(p);
-            total += pl.row_ids.len() * std::mem::size_of::<u64>()
-                + pl.frequencies.len() * std::mem::size_of::<f32>()
-                + pl.positions
-                    .as_ref()
-                    .map(|a| a.get_array_memory_size())
-                    .unwrap_or(0);
-        }
-        total += self.docs.len() * (std::mem::size_of::<u64>() + std::mem::size_of::<u32>());
+        total += self.postings.iter().map(|p| p.heap_size()).sum::<usize>();
+        total += self.docs.len() * (std::mem::size_of::<u64>() + 2 * std::mem::size_of::<u32>());
         total
     }
 
@@ -1995,12 +2008,12 @@ impl Partition {
                     raw.push(Vec::new());
                     id
                 });
-                let plain = posting_as_plain(&p.postings[local_id as usize]);
-                for i in 0..plain.len() {
+                let posting = &p.postings[local_id as usize];
+                for i in 0..posting.len() {
                     raw[merged_id as usize].push((
-                        plain.row_ids[i] as u32 + doc_offset,
-                        plain.frequencies[i] as u32,
-                        read_positions(plain, i),
+                        posting.doc_ids[i] + doc_offset,
+                        posting.freqs[i],
+                        posting.positions(i).to_vec(),
                     ));
                 }
             }
@@ -2033,23 +2046,24 @@ impl Partition {
         operator: Operator,
         scorer: &MemBM25Scorer,
     ) -> Vec<FtsEntry> {
-        // doc -> (accumulated score, number of token-occurrence hits).
-        let mut doc_scores: HashMap<u32, (f32, u32)> = HashMap::new();
+        // Partition-local doc ids are dense 0..doc_count, so accumulate into
+        // flat arrays — no per-posting hashing.
+        let n = self.docs.len();
+        let mut scores = vec![0.0f32; n];
+        let mut hits = vec![0u32; n];
         let mut any_present = false;
         let mut all_present = true;
         for token in tokens {
             match self.token_ids.get(token.as_str()) {
                 Some(&id) => {
                     any_present = true;
-                    let plain = posting_as_plain(&self.postings[id as usize]);
+                    let posting = &self.postings[id as usize];
                     let qw = scorer.query_weight(token);
-                    for i in 0..plain.len() {
-                        let doc = plain.row_ids[i] as u32;
-                        let dl = self.docs.num_tokens(doc);
-                        let s = qw * scorer.doc_weight(plain.frequencies[i] as u32, dl);
-                        let e = doc_scores.entry(doc).or_insert((0.0, 0));
-                        e.0 += s;
-                        e.1 += 1;
+                    for i in 0..posting.len() {
+                        let doc = posting.doc_ids[i] as usize;
+                        let dl = self.docs.num_tokens(posting.doc_ids[i]);
+                        scores[doc] += qw * scorer.doc_weight(posting.freqs[i], dl);
+                        hits[doc] += 1;
                     }
                 }
                 None => all_present = false,
@@ -2058,51 +2072,52 @@ impl Partition {
         if !any_present || (operator == Operator::And && !all_present) {
             return Vec::new();
         }
+        // AND requires the doc to be hit once per query-token slot.
         let need = tokens.len() as u32;
-        doc_scores
-            .into_iter()
-            .filter_map(|(doc, (score, hits))| {
-                // AND requires the doc to be hit once per query-token slot.
-                if operator == Operator::And && hits < need {
-                    return None;
-                }
-                Some(FtsEntry {
-                    row_position: self.docs.row_id(doc),
-                    score,
-                })
-            })
-            .collect()
+        let mut results = Vec::new();
+        for doc in 0..n {
+            if hits[doc] == 0 || (operator == Operator::And && hits[doc] < need) {
+                continue;
+            }
+            results.push(FtsEntry {
+                row_position: self.docs.row_id(doc as u32),
+                score: scores[doc],
+            });
+        }
+        results
     }
 
     /// Phrase-search by intersecting posting lists: drive from the rarest
     /// token, require every other token to contain the doc, and verify the
     /// token positions satisfy the phrase. `tokens.len() >= 2`.
     fn search_phrase(&self, tokens: &[String], slop: u32, scorer: &MemBM25Scorer) -> Vec<FtsEntry> {
-        let mut plains: Vec<&PlainPostingList> = Vec::with_capacity(tokens.len());
+        let mut postings: Vec<&PartitionPosting> = Vec::with_capacity(tokens.len());
         for token in tokens {
             match self.token_ids.get(token.as_str()) {
-                Some(&id) => plains.push(posting_as_plain(&self.postings[id as usize])),
+                Some(&id) => postings.push(&self.postings[id as usize]),
                 // A phrase needs every token present in this partition.
                 None => return Vec::new(),
             }
         }
-        let rarest = (0..plains.len()).min_by_key(|&i| plains[i].len()).unwrap();
+        let rarest = (0..postings.len())
+            .min_by_key(|&i| postings[i].len())
+            .unwrap();
         let mut results = Vec::new();
-        for i in 0..plains[rarest].len() {
-            let doc = plains[rarest].row_ids[i];
-            let mut all_positions: Vec<Vec<u32>> = Vec::with_capacity(tokens.len());
+        for i in 0..postings[rarest].len() {
+            let doc = postings[rarest].doc_ids[i];
+            let mut all_positions: Vec<&[u32]> = Vec::with_capacity(tokens.len());
             let mut freqs: Vec<u32> = Vec::with_capacity(tokens.len());
             let mut present = true;
-            for (ti, plain) in plains.iter().enumerate() {
+            for (ti, posting) in postings.iter().enumerate() {
                 let idx = if ti == rarest {
                     Some(i)
                 } else {
-                    plain.row_ids.binary_search(&doc).ok()
+                    posting.doc_ids.binary_search(&doc).ok()
                 };
                 match idx {
                     Some(idx) => {
-                        freqs.push(plain.frequencies[idx] as u32);
-                        all_positions.push(read_positions(plain, idx));
+                        freqs.push(posting.freqs[idx]);
+                        all_positions.push(posting.positions(idx));
                     }
                     None => {
                         present = false;
@@ -2113,14 +2128,14 @@ impl Partition {
             if !present || !phrase_matches(&all_positions, slop) {
                 continue;
             }
-            let dl = self.docs.num_tokens(doc as u32);
+            let dl = self.docs.num_tokens(doc);
             let score: f32 = tokens
                 .iter()
                 .zip(&freqs)
                 .map(|(t, &f)| scorer.query_weight(t) * scorer.doc_weight(f, dl))
                 .sum();
             results.push(FtsEntry {
-                row_position: self.docs.row_id(doc as u32),
+                row_position: self.docs.row_id(doc),
                 score,
             });
         }
@@ -2128,46 +2143,27 @@ impl Partition {
     }
 }
 
-/// Convert per-token `(doc_id, freq, positions)` postings — sorted by doc id —
-/// into an on-disk-shaped `PlainPostingList`. `max_score` is left `None`;
-/// WAND derives per-term bounds from the posting list at query time.
-fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>) -> PostingList {
-    let row_ids: Vec<u64> = docs.iter().map(|(d, _, _)| *d as u64).collect();
-    let freqs: Vec<f32> = docs.iter().map(|(_, f, _)| *f as f32).collect();
-    let mut positions = ListBuilder::new(Int32Builder::new());
-    for (_, _, doc_positions) in &docs {
-        for p in doc_positions {
-            positions.values().append_value(*p as i32);
-        }
-        positions.append(true);
-    }
-    PostingList::Plain(PlainPostingList::new(
-        ScalarBuffer::from(row_ids),
-        ScalarBuffer::from(freqs),
-        None,
-        Some(positions.finish()),
-    ))
-}
-
-/// In-memory FTS partitions only ever build `PostingList::Plain`.
-fn posting_as_plain(p: &PostingList) -> &PlainPostingList {
-    match p {
-        PostingList::Plain(pl) => pl,
-        PostingList::Compressed(_) => {
-            unreachable!("in-memory FTS partitions only build plain posting lists")
-        }
-    }
-}
-
-/// Read the position list for the `idx`-th posting as a `Vec<u32>`.
-fn read_positions(plain: &PlainPostingList, idx: usize) -> Vec<u32> {
-    match plain.positions(idx) {
-        Some(arr) => arr
-            .as_any()
-            .downcast_ref::<Int32Array>()
-            .map(|a| a.values().iter().map(|&v| v as u32).collect())
-            .unwrap_or_default(),
-        None => Vec::new(),
+/// Convert `(doc_id, freq, positions)` triples — already sorted by doc id —
+/// into a compact, exact-sized [`PartitionPosting`].
+fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>) -> PartitionPosting {
+    let n = docs.len();
+    let total_pos: usize = docs.iter().map(|(_, _, p)| p.len()).sum();
+    let mut doc_ids = Vec::with_capacity(n);
+    let mut freqs = Vec::with_capacity(n);
+    let mut pos_offsets = Vec::with_capacity(n + 1);
+    let mut pos_data = Vec::with_capacity(total_pos);
+    pos_offsets.push(0);
+    for (d, f, positions) in docs {
+        doc_ids.push(d);
+        freqs.push(f);
+        pos_data.extend_from_slice(&positions);
+        pos_offsets.push(pos_data.len() as u32);
+    }
+    PartitionPosting {
+        doc_ids,
+        freqs,
+        pos_offsets,
+        pos_data,
     }
 }
 

From bde61dd693bcf6bdee171143f39a9657ad4ae5f9 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 11:56:44 -0700
Subject: [PATCH 24/37] perf(mem_wal): prune FTS partition top-k queries with
 block-max WAND
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A limited query previously scored every matching doc in every partition —
O(matches), so p50 grew with the corpus and trailed Lucene by ~24x. Each
`PartitionPosting` now carries `(max_freq, min_dl)`, which yields a sound
per-term BM25 score upper bound (`doc_weight` is monotone in both). With that
bound, `Partition::wand_match` runs block-max WAND: it maintains a top-k heap
and skips any doc whose cumulative term upper bounds cannot reach the k-th
score. This is exact — the earlier WAND attempt mis-pruned only because the
posting lists carried no bound at all.

WAND runs for OR queries that carry a result limit (`search_with_options`);
unbounded `search` and AND still take the exact direct scan. Adds a test
asserting WAND top-k equals the exact full-scan top-k across partitions.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 325 +++++++++++++++++---
 1 file changed, 275 insertions(+), 50 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index fb4c4d65cbd..2b2fda7de4b 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -45,7 +45,8 @@
 //! into one builder. The on-disk format is unchanged from Lance's existing
 //! inverted index.
 
-use std::collections::{HashMap, HashSet};
+use std::cmp::Reverse;
+use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::sync::Arc;
 use std::sync::Mutex;
 use std::sync::atomic::{AtomicUsize, Ordering};
@@ -991,7 +992,7 @@ impl FtsMemIndex {
     pub fn search(&self, term: &str) -> Vec<FtsEntry> {
         let st = self.state.load_full();
         let tokens = self.tokenize_for_search(term);
-        self.search_match(&st, &tokens)
+        self.search_match(&st, &tokens, None)
     }
 
     /// Search for documents containing an exact phrase, optionally allowing
@@ -1031,7 +1032,12 @@ impl FtsMemIndex {
     /// scan the tail, all scored with one corpus-wide [`MemBM25Scorer`], and
     /// the results merged. Each doc lives in exactly one partition or the
     /// tail, so the merge is a plain concatenation.
-    fn search_match(&self, st: &IndexState, tokens: &[String]) -> Vec<FtsEntry> {
+    fn search_match(
+        &self,
+        st: &IndexState,
+        tokens: &[String],
+        limit: Option<usize>,
+    ) -> Vec<FtsEntry> {
         if tokens.is_empty() {
             return Vec::new();
         }
@@ -1044,7 +1050,7 @@ impl FtsMemIndex {
         }
         let mut results = Vec::new();
         for p in st.partitions.iter() {
-            results.extend(p.search_match(tokens, Operator::Or, &scorer));
+            results.extend(p.search_match(tokens, Operator::Or, &scorer, limit));
         }
         if tail_snap.visible_count > 0 {
             results.extend(score_terms(&tail_snap, &st.tail.terms, tokens, &scorer));
@@ -1058,7 +1064,7 @@ impl FtsMemIndex {
         }
         if tokens.len() == 1 {
             // A single-token phrase reduces to a regular term search.
-            return self.search_match(st, tokens);
+            return self.search_match(st, tokens, None);
         }
         let tail_snap = st.tail.snapshot();
         let scorer = build_scorer(st, &tail_snap, tokens);
@@ -1104,7 +1110,7 @@ impl FtsMemIndex {
         if expanded.is_empty() {
             return Vec::new();
         }
-        self.search_match(st, &expanded)
+        self.search_match(st, &expanded, None)
     }
 
     /// Expand `term` against the term dictionaries of every partition and the
@@ -1164,14 +1170,22 @@ impl FtsMemIndex {
     /// per-batch monotonic visibility contract for compound queries.
     pub fn search_query(&self, query: &FtsQueryExpr) -> Vec<FtsEntry> {
         let st = self.state.load_full();
-        self.search_query_with_state(query, &st)
+        self.search_query_with_state(query, &st, None)
     }
 
-    fn search_query_with_state(&self, query: &FtsQueryExpr, st: &IndexState) -> Vec<FtsEntry> {
+    /// `limit` is the caller's top-k, threaded down so a top-level `Match`
+    /// leaf can prune with WAND. Compound branches (`Boolean`/`Boost`) need
+    /// their children's full result sets, so they pass `None` downward.
+    fn search_query_with_state(
+        &self,
+        query: &FtsQueryExpr,
+        st: &IndexState,
+        limit: Option<usize>,
+    ) -> Vec<FtsEntry> {
         match query {
             FtsQueryExpr::Match { query, boost } => {
                 let tokens = self.tokenize_for_search(query);
-                let mut results = self.search_match(st, &tokens);
+                let mut results = self.search_match(st, &tokens, limit);
                 apply_boost(&mut results, *boost);
                 results
             }
@@ -1213,7 +1227,7 @@ impl FtsMemIndex {
         options: SearchOptions,
     ) -> Vec<FtsEntry> {
         let st = self.state.load_full();
-        let mut results = self.search_query_with_state(query, &st);
+        let mut results = self.search_query_with_state(query, &st, options.limit);
         results.sort_by(|a, b| {
             b.score
                 .partial_cmp(&a.score)
@@ -1244,11 +1258,11 @@ impl FtsMemIndex {
         negative_boost: f32,
         st: &IndexState,
     ) -> Vec<FtsEntry> {
-        let mut results = self.search_query_with_state(positive, st);
+        let mut results = self.search_query_with_state(positive, st, None);
         let Some(neg) = negative else {
             return results;
         };
-        let negative_results = self.search_query_with_state(neg, st);
+        let negative_results = self.search_query_with_state(neg, st, None);
         let negative_set: HashSet<RowPosition> = negative_results
             .into_iter()
             .map(|e| e.row_position)
@@ -1270,26 +1284,26 @@ impl FtsMemIndex {
     ) -> Vec<FtsEntry> {
         let excluded: HashSet<RowPosition> = must_not
             .iter()
-            .flat_map(|q| self.search_query_with_state(q, st))
+            .flat_map(|q| self.search_query_with_state(q, st, None))
             .map(|e| e.row_position)
             .collect();
 
         let mut result_map: HashMap<RowPosition, f32> = if must.is_empty() {
             let mut map: HashMap<RowPosition, f32> = HashMap::new();
             for q in should {
-                for entry in self.search_query_with_state(q, st) {
+                for entry in self.search_query_with_state(q, st, None) {
                     *map.entry(entry.row_position).or_default() += entry.score;
                 }
             }
             map
         } else {
-            let first_results = self.search_query_with_state(&must[0], st);
+            let first_results = self.search_query_with_state(&must[0], st, None);
             let mut map: HashMap<RowPosition, f32> = first_results
                 .into_iter()
                 .map(|e| (e.row_position, e.score))
                 .collect();
             for q in must.iter().skip(1) {
-                let results = self.search_query_with_state(q, st);
+                let results = self.search_query_with_state(q, st, None);
                 let result_set: HashMap<RowPosition, f32> = results
                     .into_iter()
                     .map(|e| (e.row_position, e.score))
@@ -1300,7 +1314,7 @@ impl FtsMemIndex {
                     .collect();
             }
             for q in should {
-                for entry in self.search_query_with_state(q, st) {
+                for entry in self.search_query_with_state(q, st, None) {
                     if let Some(score) = map.get_mut(&entry.row_position) {
                         *score += entry.score;
                     }
@@ -1861,6 +1875,11 @@ struct PartitionPosting {
     /// `doc_ids.len() + 1` offsets into `pos_data`.
     pos_offsets: Vec<u32>,
     pos_data: Vec<u32>,
+    /// Largest `freq` and smallest doc length over this list. `doc_weight` is
+    /// increasing in freq and decreasing in doc length, so `doc_weight(max_freq,
+    /// min_dl)` is a valid per-term score upper bound for WAND pruning.
+    max_freq: u32,
+    min_dl: u32,
 }
 
 impl PartitionPosting {
@@ -1872,6 +1891,11 @@ impl PartitionPosting {
         &self.pos_data[self.pos_offsets[i] as usize..self.pos_offsets[i + 1] as usize]
     }
 
+    /// Upper bound on this term's BM25 doc-side weight, given the scorer.
+    fn max_doc_weight(&self, scorer: &MemBM25Scorer) -> f32 {
+        scorer.doc_weight(self.max_freq, self.min_dl)
+    }
+
     fn heap_size(&self) -> usize {
         (self.doc_ids.len() + self.freqs.len() + self.pos_offsets.len() + self.pos_data.len())
             * std::mem::size_of::<u32>()
@@ -1983,7 +2007,10 @@ impl Partition {
             token_ids.insert(term.clone(), id);
             raw.push(docs_for_term);
         }
-        let postings = raw.into_iter().map(freeze_postings_one).collect();
+        let postings = raw
+            .into_iter()
+            .map(|v| freeze_postings_one(v, &docs))
+            .collect();
         Some(Self {
             token_ids,
             postings,
@@ -2023,7 +2050,7 @@ impl Partition {
             .into_iter()
             .map(|mut v| {
                 v.sort_by_key(|(d, _, _)| *d);
-                freeze_postings_one(v)
+                freeze_postings_one(v, &docs)
             })
             .collect();
         Self {
@@ -2033,49 +2060,61 @@ impl Partition {
         }
     }
 
-    /// BM25 OR/AND-search by a direct posting-list scan. A partition holds one
-    /// merged posting list per term, so cost is O(matches) — not O(corpus) as
-    /// the old per-batch-chunk layout was. Block-max WAND pruning is not used:
-    /// it needs per-term `max_score` upper bounds that frozen in-memory
-    /// posting lists do not carry, and an unbounded WAND mis-prunes without
-    /// them. Each token occurrence in `tokens` contributes independently, so
-    /// scores match the tail's `score_terms`.
+    /// BM25 OR/AND-search of the partition. With an OR operator and a result
+    /// limit, prunes with block-max WAND; otherwise runs a direct O(matches)
+    /// scan. Every result is scored with `scorer`, so scores are identical
+    /// regardless of which path runs (and identical to the tail's
+    /// `score_terms`).
     fn search_match(
         &self,
         tokens: &[String],
         operator: Operator,
         scorer: &MemBM25Scorer,
+        limit: Option<usize>,
     ) -> Vec<FtsEntry> {
-        // Partition-local doc ids are dense 0..doc_count, so accumulate into
-        // flat arrays — no per-posting hashing.
-        let n = self.docs.len();
-        let mut scores = vec![0.0f32; n];
-        let mut hits = vec![0u32; n];
-        let mut any_present = false;
+        // Resolve present query tokens to (local token id, query weight).
+        // Repeated query tokens are kept so scores match `score_terms`.
+        let mut terms: Vec<(usize, f32)> = Vec::with_capacity(tokens.len());
         let mut all_present = true;
         for token in tokens {
             match self.token_ids.get(token.as_str()) {
-                Some(&id) => {
-                    any_present = true;
-                    let posting = &self.postings[id as usize];
-                    let qw = scorer.query_weight(token);
-                    for i in 0..posting.len() {
-                        let doc = posting.doc_ids[i] as usize;
-                        let dl = self.docs.num_tokens(posting.doc_ids[i]);
-                        scores[doc] += qw * scorer.doc_weight(posting.freqs[i], dl);
-                        hits[doc] += 1;
-                    }
-                }
+                Some(&id) => terms.push((id as usize, scorer.query_weight(token))),
                 None => all_present = false,
             }
         }
-        if !any_present || (operator == Operator::And && !all_present) {
+        if terms.is_empty() || (operator == Operator::And && !all_present) {
             return Vec::new();
         }
-        // AND requires the doc to be hit once per query-token slot.
-        let need = tokens.len() as u32;
+        match limit {
+            Some(k) if k > 0 && operator == Operator::Or => self.wand_match(&terms, scorer, k),
+            _ => self.scan_match(&terms, operator, tokens.len() as u32, scorer),
+        }
+    }
+
+    /// Exact direct scan: accumulate every match into flat arrays indexed by
+    /// the dense local doc id. Used for unbounded and AND queries.
+    fn scan_match(
+        &self,
+        terms: &[(usize, f32)],
+        operator: Operator,
+        need: u32,
+        scorer: &MemBM25Scorer,
+    ) -> Vec<FtsEntry> {
+        let n = self.docs.len();
+        let mut scores = vec![0.0f32; n];
+        let mut hits = vec![0u32; n];
+        for &(id, qw) in terms {
+            let posting = &self.postings[id];
+            for i in 0..posting.len() {
+                let doc = posting.doc_ids[i] as usize;
+                let dl = self.docs.num_tokens(posting.doc_ids[i]);
+                scores[doc] += qw * scorer.doc_weight(posting.freqs[i], dl);
+                hits[doc] += 1;
+            }
+        }
         let mut results = Vec::new();
         for doc in 0..n {
+            // AND requires the doc to be hit once per query-token slot.
             if hits[doc] == 0 || (operator == Operator::And && hits[doc] < need) {
                 continue;
             }
@@ -2087,6 +2126,90 @@ impl Partition {
         results
     }
 
+    /// Block-max WAND top-k over an OR query. Exact: each term's
+    /// `(max_freq, min_dl)` gives a sound score upper bound, so the algorithm
+    /// only skips docs that provably cannot enter the top-`k`.
+    fn wand_match(
+        &self,
+        terms: &[(usize, f32)],
+        scorer: &MemBM25Scorer,
+        k: usize,
+    ) -> Vec<FtsEntry> {
+        let mut cursors: Vec<WandCursor> = terms
+            .iter()
+            .map(|&(id, qw)| {
+                let posting = &self.postings[id];
+                WandCursor {
+                    posting,
+                    idx: 0,
+                    qw,
+                    ub: qw * posting.max_doc_weight(scorer),
+                }
+            })
+            .collect();
+        // Min-heap of the current top-k by score.
+        let mut heap: BinaryHeap<Reverse<ScoredDoc>> = BinaryHeap::with_capacity(k + 1);
+        // theta = score of the weakest doc still in the top-k (-inf until full).
+        let mut theta = f32::NEG_INFINITY;
+        loop {
+            cursors.retain(|c| c.doc().is_some());
+            if cursors.is_empty() {
+                break;
+            }
+            cursors.sort_by_key(|c| c.doc().unwrap());
+            // Pivot: first cursor whose cumulative upper bound exceeds theta.
+            let mut acc = 0.0f32;
+            let mut pivot = None;
+            for (i, c) in cursors.iter().enumerate() {
+                acc += c.ub;
+                if acc > theta {
+                    pivot = Some(i);
+                    break;
+                }
+            }
+            let Some(pivot) = pivot else {
+                break; // no remaining doc can reach theta
+            };
+            let pivot_doc = cursors[pivot].doc().unwrap();
+            if cursors[0].doc().unwrap() == pivot_doc {
+                // Every cursor positioned at pivot_doc contributes; score it.
+                let dl = self.docs.num_tokens(pivot_doc);
+                let mut score = 0.0f32;
+                for c in cursors.iter_mut() {
+                    if c.doc() == Some(pivot_doc) {
+                        score += c.qw * scorer.doc_weight(c.posting.freqs[c.idx], dl);
+                        c.idx += 1;
+                    }
+                }
+                if heap.len() < k {
+                    heap.push(Reverse(ScoredDoc {
+                        score,
+                        doc: pivot_doc,
+                    }));
+                    if heap.len() == k {
+                        theta = heap.peek().unwrap().0.score;
+                    }
+                } else if score > theta {
+                    heap.pop();
+                    heap.push(Reverse(ScoredDoc {
+                        score,
+                        doc: pivot_doc,
+                    }));
+                    theta = heap.peek().unwrap().0.score;
+                }
+            } else {
+                // A cursor before the pivot trails pivot_doc; skip it forward.
+                cursors[0].skip_to(pivot_doc);
+            }
+        }
+        heap.into_iter()
+            .map(|Reverse(sd)| FtsEntry {
+                row_position: self.docs.row_id(sd.doc),
+                score: sd.score,
+            })
+            .collect()
+    }
+
     /// Phrase-search by intersecting posting lists: drive from the rarest
     /// token, require every other token to contain the doc, and verify the
     /// token positions satisfy the phrase. `tokens.len() >= 2`.
@@ -2143,9 +2266,57 @@ impl Partition {
     }
 }
 
+/// A scored doc id, ordered by score then doc id (`total_cmp`, so a stray
+/// non-finite score cannot panic the heap).
+struct ScoredDoc {
+    score: f32,
+    doc: u32,
+}
+
+impl PartialEq for ScoredDoc {
+    fn eq(&self, other: &Self) -> bool {
+        self.cmp(other) == std::cmp::Ordering::Equal
+    }
+}
+impl Eq for ScoredDoc {}
+impl PartialOrd for ScoredDoc {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+impl Ord for ScoredDoc {
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        self.score
+            .total_cmp(&other.score)
+            .then(self.doc.cmp(&other.doc))
+    }
+}
+
+/// A cursor over one term's posting list, used by [`Partition::wand_match`].
+struct WandCursor<'a> {
+    posting: &'a PartitionPosting,
+    idx: usize,
+    qw: f32,
+    /// Upper bound on this term's contribution to any doc's score.
+    ub: f32,
+}
+
+impl WandCursor<'_> {
+    fn doc(&self) -> Option<u32> {
+        self.posting.doc_ids.get(self.idx).copied()
+    }
+
+    /// Advance to the first posting with `doc_id >= target`.
+    fn skip_to(&mut self, target: u32) {
+        let rest = &self.posting.doc_ids[self.idx..];
+        self.idx += rest.partition_point(|&d| d < target);
+    }
+}
+
 /// Convert `(doc_id, freq, positions)` triples — already sorted by doc id —
-/// into a compact, exact-sized [`PartitionPosting`].
-fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>) -> PartitionPosting {
+/// into a compact, exact-sized [`PartitionPosting`]. `docset` supplies doc
+/// lengths for the WAND score-bound metadata.
+fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>, docset: &DocSet) -> PartitionPosting {
     let n = docs.len();
     let total_pos: usize = docs.iter().map(|(_, _, p)| p.len()).sum();
     let mut doc_ids = Vec::with_capacity(n);
@@ -2153,9 +2324,13 @@ fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>) -> PartitionPosting {
     let mut pos_offsets = Vec::with_capacity(n + 1);
     let mut pos_data = Vec::with_capacity(total_pos);
     pos_offsets.push(0);
+    let mut max_freq = 0u32;
+    let mut min_dl = u32::MAX;
     for (d, f, positions) in docs {
         doc_ids.push(d);
         freqs.push(f);
+        max_freq = max_freq.max(f);
+        min_dl = min_dl.min(docset.num_tokens(d));
         pos_data.extend_from_slice(&positions);
         pos_offsets.push(pos_data.len() as u32);
     }
@@ -2164,6 +2339,8 @@ fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>) -> PartitionPosting {
         freqs,
         pos_offsets,
         pos_data,
+        max_freq,
+        min_dl: min_dl.max(1),
     }
 }
 
@@ -3241,12 +3418,12 @@ mod tests {
         let apple = index.tokenize_for_search("apple").pop().unwrap();
         // "apple" is present -> an OR search over it matches.
         let or_scorer = build_scorer(&st, &tail_snap, std::slice::from_ref(&apple));
-        let or_hits = p.search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer);
+        let or_hits = p.search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer, None);
         assert_eq!(or_hits.len(), 3);
         // Adding an absent term to an AND query short-circuits to nothing.
         let and_tokens = vec![apple, "definitely_missing".to_string()];
         let and_scorer = build_scorer(&st, &tail_snap, &and_tokens);
-        let and_hits = p.search_match(&and_tokens, Operator::And, &and_scorer);
+        let and_hits = p.search_match(&and_tokens, Operator::And, &and_scorer, None);
         assert!(and_hits.is_empty());
     }
 
@@ -3450,4 +3627,52 @@ mod tests {
         let expected: Vec<u64> = full.iter().take(3).map(|e| e.row_position).collect();
         assert_eq!(got, expected, "limited search must return the exact top-3");
     }
+
+    #[test]
+    fn test_wand_topk_matches_exact_full_scan() {
+        // WAND must produce the exact same top-k as an unbounded scan, at a
+        // scale where pruning genuinely engages. 40 docs across ~8 partitions;
+        // "alpha" appears with tf 1..=40, giving 40 strictly distinct scores.
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(5);
+        for i in 0..40u64 {
+            let text = "alpha ".repeat((i + 1) as usize);
+            let batch = RecordBatch::try_new(
+                create_test_schema(),
+                vec![
+                    Arc::new(Int32Array::from(vec![0])),
+                    Arc::new(StringArray::from(vec![text.as_str()])),
+                ],
+            )
+            .unwrap();
+            index.insert(&batch, i).unwrap();
+        }
+        assert!(index.state.load_full().partitions.len() >= 2);
+
+        // Exact ranking from the unbounded path.
+        let mut full = index.search("alpha");
+        assert_eq!(full.len(), 40);
+        full.sort_by(|a, b| {
+            b.score
+                .partial_cmp(&a.score)
+                .unwrap()
+                .then(a.row_position.cmp(&b.row_position))
+        });
+
+        // For several k, WAND-pruned top-k must equal the exact top-k.
+        for k in [1usize, 3, 10, 25, 40, 50] {
+            let limited = index.search_with_options(
+                &FtsQueryExpr::match_query("alpha"),
+                SearchOptions::new().with_limit(k),
+            );
+            let expect_len = k.min(40);
+            assert_eq!(limited.len(), expect_len, "k={k}");
+            let got: Vec<u64> = limited.iter().map(|e| e.row_position).collect();
+            let expected: Vec<u64> = full
+                .iter()
+                .take(expect_len)
+                .map(|e| e.row_position)
+                .collect();
+            assert_eq!(got, expected, "WAND top-{k} must equal the exact top-{k}");
+        }
+    }
 }

From 9e7e2ba69953c57d665807e92274ac5abe24c6ba Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 15:55:31 -0700
Subject: [PATCH 25/37] perf(mem_wal): byte-compress FTS partition postings,
 dedup term dictionary
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Frozen partitions held uncompressed `Vec<u32>` posting lists plus a per-
partition `HashMap<Arc<str>,u32>` term dictionary duplicated across every
partition — ~4 GB at 1M docs (~5x Lucene).

Postings are now stored in three shared per-partition buffers as 128-doc
blocks: VByte + delta doc-id gaps, VByte frequencies, and VByte delta-encoded
positions. A `PostingCursor` decodes a block at a time and `skip_to` jumps
whole blocks via `BlockMeta` without decoding — so WAND still block-skips and
the exact O(matches) scan path is unchanged. Per-term overhead drops to one
`PostingRef`. The term dictionary becomes a sorted `Box<[Arc<str>]>`
(binary-searched, no hashmap node overhead) whose strings are shared across
partitions via an index-wide `TermInterner`.

Flush is unaffected — `to_index_builder_reversed` decodes postings through
the same cursor, reverses, and rebuilds the Lance FTS on-disk format as
before. Adds VByte and multi-block cursor round-trip tests.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 710 +++++++++++++-------
 1 file changed, 479 insertions(+), 231 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index 2b2fda7de4b..d4ce618e5a1 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -708,6 +708,24 @@ impl IndexState {
     }
 }
 
+/// Deduplicates term `Arc<str>`s across partitions: a term that appears in
+/// many partitions is then backed by a single string allocation.
+#[derive(Default)]
+struct TermInterner {
+    seen: Mutex<HashSet<Arc<str>>>,
+}
+
+impl TermInterner {
+    fn intern(&self, term: &Arc<str>) -> Arc<str> {
+        let mut seen = self.seen.lock().expect("term interner poisoned");
+        if let Some(existing) = seen.get(term.as_ref()) {
+            return existing.clone();
+        }
+        seen.insert(term.clone());
+        term.clone()
+    }
+}
+
 /// In-memory full-text search index. See module docs for the concurrency
 /// model and visibility contract.
 pub struct FtsMemIndex {
@@ -724,6 +742,9 @@ pub struct FtsMemIndex {
     /// between freezes; the whole state is swapped on freeze.
     state: ArcSwap<IndexState>,
 
+    /// Shared term-string interner for frozen partitions.
+    term_interner: TermInterner,
+
     /// The tail freezes into a partition once it reaches this many docs.
     freeze_threshold_rows: usize,
 }
@@ -773,6 +794,7 @@ impl FtsMemIndex {
             tokenizer_pool: Arc::new(pool),
             writer_tokenizer: Mutex::new(writer_tokenizer),
             state: ArcSwap::from(IndexState::empty()),
+            term_interner: TermInterner::default(),
             freeze_threshold_rows: Self::DEFAULT_FREEZE_THRESHOLD_ROWS,
         }
     }
@@ -966,7 +988,7 @@ impl FtsMemIndex {
     /// fresh empty tail. Only the writer calls this; readers snapshotting the
     /// old `IndexState` keep a consistent view across the freeze.
     fn freeze(&self, st: &IndexState) {
-        let Some(partition) = Partition::from_tail(&st.tail) else {
+        let Some(partition) = Partition::from_tail(&st.tail, &self.term_interner) else {
             return;
         };
         let mut partitions: Vec<Arc<Partition>> = st.partitions.iter().cloned().collect();
@@ -1414,19 +1436,22 @@ impl FtsMemIndex {
         // Step 3: merge per-term postings across every partition and the tail.
         let mut term_postings: HashMap<String, Vec<(u32, u32, Option<Vec<u32>>)>> = HashMap::new();
         for p in st.partitions.iter() {
-            for (token, posting) in p.tokens_with_postings() {
-                let bucket = term_postings.entry(token.to_string()).or_default();
-                for i in 0..posting.len() {
-                    let row_pos = p.docs.row_id(posting.doc_ids[i]);
-                    let Some(&doc_id) = original_to_doc_id.get(&row_pos) else {
-                        continue;
-                    };
-                    let pos = if with_position {
-                        Some(posting.positions(i).to_vec())
-                    } else {
-                        None
-                    };
-                    bucket.push((doc_id, posting.freqs[i], pos));
+            for term_id in 0..p.terms.len() as u32 {
+                let bucket = term_postings
+                    .entry(p.terms[term_id as usize].to_string())
+                    .or_default();
+                let mut cursor = PostingCursor::new(p, term_id);
+                while let Some(local_doc) = cursor.doc() {
+                    let row_pos = p.docs.row_id(local_doc);
+                    if let Some(&doc_id) = original_to_doc_id.get(&row_pos) {
+                        let pos = if with_position {
+                            Some(cursor.positions().to_vec())
+                        } else {
+                            None
+                        };
+                        bucket.push((doc_id, cursor.freq(), pos));
+                    }
+                    cursor.advance();
                 }
             }
         }
@@ -1860,61 +1885,160 @@ impl FtsIndexConfig {
 }
 
 // ============================================================================
-// Immutable partition
+// Immutable partition (compressed posting storage)
 // ============================================================================
 
-/// A frozen per-term posting list inside a [`Partition`]. Exact-sized plain
-/// `Vec`s — no Arrow builder capacity slack — with partition-local `u32` doc
-/// ids. Positions use a CSR layout: posting `i` owns
-/// `pos_data[pos_offsets[i]..pos_offsets[i + 1]]`.
-struct PartitionPosting {
-    /// partition-local doc ids, ascending.
-    doc_ids: Vec<u32>,
-    /// term frequency per posting.
-    freqs: Vec<u32>,
-    /// `doc_ids.len() + 1` offsets into `pos_data`.
-    pos_offsets: Vec<u32>,
-    pos_data: Vec<u32>,
-    /// Largest `freq` and smallest doc length over this list. `doc_weight` is
-    /// increasing in freq and decreasing in doc length, so `doc_weight(max_freq,
-    /// min_dl)` is a valid per-term score upper bound for WAND pruning.
-    max_freq: u32,
-    min_dl: u32,
-}
+/// Docs per posting block. Blocks bound WAND skip granularity and carry the
+/// per-block score bound; a block's payload is self-delimiting.
+const POSTING_BLOCK: usize = 128;
 
-impl PartitionPosting {
-    fn len(&self) -> usize {
-        self.doc_ids.len()
+/// Append `v` to `buf` as unsigned LEB128 (VByte).
+fn vbyte_put(buf: &mut Vec<u8>, mut v: u32) {
+    while v >= 0x80 {
+        buf.push((v as u8) | 0x80);
+        v >>= 7;
     }
+    buf.push(v as u8);
+}
 
-    fn positions(&self, i: usize) -> &[u32] {
-        &self.pos_data[self.pos_offsets[i] as usize..self.pos_offsets[i + 1] as usize]
+/// Read one unsigned LEB128 value from `buf` at `*pos`, advancing `*pos`.
+fn vbyte_get(buf: &[u8], pos: &mut usize) -> u32 {
+    let mut v = 0u32;
+    let mut shift = 0u32;
+    loop {
+        let b = buf[*pos];
+        *pos += 1;
+        v |= ((b & 0x7f) as u32) << shift;
+        if b & 0x80 == 0 {
+            break;
+        }
+        shift += 7;
     }
+    v
+}
 
-    /// Upper bound on this term's BM25 doc-side weight, given the scorer.
-    fn max_doc_weight(&self, scorer: &MemBM25Scorer) -> f32 {
-        scorer.doc_weight(self.max_freq, self.min_dl)
-    }
+/// Number of docs in block `b` (0-based within a term) of a `doc_count`-long
+/// posting list.
+fn block_len(doc_count: u32, b: u32) -> usize {
+    (doc_count as usize - b as usize * POSTING_BLOCK).min(POSTING_BLOCK)
+}
 
-    fn heap_size(&self) -> usize {
-        (self.doc_ids.len() + self.freqs.len() + self.pos_offsets.len() + self.pos_data.len())
-            * std::mem::size_of::<u32>()
-    }
+/// Per-128-doc-block metadata — enough to skip and score-bound a block
+/// without decoding its payload.
+#[derive(Clone, Copy)]
+struct BlockMeta {
+    /// first / last doc id in the block (block doc ids are ascending).
+    first_doc: u32,
+    last_doc: u32,
+    /// start offset of the block's doc/freq payload in `doc_freq_data`.
+    df_offset: u32,
+    /// start offset of the block's position payload in `pos_data`.
+    pos_offset: u32,
+}
+
+/// Per-term locator into a partition's shared posting buffers.
+#[derive(Clone, Copy)]
+struct PostingRef {
+    /// index of the term's first block in `block_meta`.
+    block_start: u32,
+    /// number of blocks the term spans.
+    block_count: u32,
+    /// number of docs (postings) for the term.
+    doc_count: u32,
+    /// largest frequency / smallest doc length over the whole term: the WAND
+    /// per-term upper bound is `query_weight * doc_weight(max_freq, min_dl)`.
+    max_freq: u32,
+    min_dl: u32,
 }
 
-/// An immutable, frozen FTS partition: a slice of the MemTable's inserts held
-/// as compact per-term posting lists, queried by a direct O(matches) scan.
-/// Built by freezing the tail; flushed 1:1 into the Lance FTS on-disk format.
-/// See `DESIGN.md` in the redesign analysis directory.
+/// An immutable, frozen FTS partition. Posting lists are byte-compressed
+/// (VByte + delta, 128-doc blocks) into three shared buffers, so per-term
+/// overhead is one `PostingRef`. See `compress-fts-partition-memory/DESIGN.md`.
 struct Partition {
-    /// token text -> local token id (dense, 0-based; indexes `postings`).
-    token_ids: HashMap<Arc<str>, u32>,
-    /// posting list per local token id.
-    postings: Vec<PartitionPosting>,
+    /// term texts, sorted; the index is the local term id. Interned, so the
+    /// string bytes are shared across partitions.
+    terms: Box<[Arc<str>]>,
+    /// per term, parallel to `terms`.
+    postings: Box<[PostingRef]>,
+    /// per-block metadata for every term's blocks, concatenated.
+    block_meta: Box<[BlockMeta]>,
+    /// VByte(doc-id gaps) then VByte(freqs) for every block, concatenated.
+    doc_freq_data: Box<[u8]>,
+    /// per doc per block: VByte(count) then VByte(delta positions), concatenated.
+    pos_data: Box<[u8]>,
     /// local doc id -> (MemTable row position, token count).
     docs: DocSet,
 }
 
+/// Build a partition from `(term, sorted (doc, freq, positions))` entries.
+/// Each term's docs must already be sorted ascending by doc id.
+fn build_partition(
+    mut entries: Vec<(Arc<str>, Vec<(u32, u32, Vec<u32>)>)>,
+    docs: DocSet,
+) -> Partition {
+    entries.sort_by(|a, b| a.0.cmp(&b.0));
+    let mut terms = Vec::with_capacity(entries.len());
+    let mut postings = Vec::with_capacity(entries.len());
+    let mut block_meta: Vec<BlockMeta> = Vec::new();
+    let mut doc_freq_data: Vec<u8> = Vec::new();
+    let mut pos_data: Vec<u8> = Vec::new();
+    for (term, docs_for_term) in entries {
+        let doc_count = docs_for_term.len() as u32;
+        let block_start = block_meta.len() as u32;
+        let mut term_max_freq = 0u32;
+        let mut term_min_dl = u32::MAX;
+        for chunk in docs_for_term.chunks(POSTING_BLOCK) {
+            let df_offset = doc_freq_data.len() as u32;
+            let pos_offset = pos_data.len() as u32;
+            // doc-id gaps for docs[1..] (docs[0] is `first_doc`).
+            let mut prev = chunk[0].0;
+            for &(d, _, _) in &chunk[1..] {
+                vbyte_put(&mut doc_freq_data, d - prev);
+                prev = d;
+            }
+            // frequencies.
+            let mut blk_max_freq = 0u32;
+            for &(_, f, _) in chunk {
+                vbyte_put(&mut doc_freq_data, f);
+                blk_max_freq = blk_max_freq.max(f);
+            }
+            // positions: count + delta-encoded positions per doc.
+            for &(d, _, ref positions) in chunk {
+                vbyte_put(&mut pos_data, positions.len() as u32);
+                let mut prev_p = 0u32;
+                for &p in positions {
+                    vbyte_put(&mut pos_data, p - prev_p);
+                    prev_p = p;
+                }
+                term_min_dl = term_min_dl.min(docs.num_tokens(d));
+            }
+            block_meta.push(BlockMeta {
+                first_doc: chunk[0].0,
+                last_doc: chunk[chunk.len() - 1].0,
+                df_offset,
+                pos_offset,
+            });
+            term_max_freq = term_max_freq.max(blk_max_freq);
+        }
+        postings.push(PostingRef {
+            block_start,
+            block_count: block_meta.len() as u32 - block_start,
+            doc_count,
+            max_freq: term_max_freq,
+            min_dl: term_min_dl.max(1),
+        });
+        terms.push(term);
+    }
+    Partition {
+        terms: terms.into_boxed_slice(),
+        postings: postings.into_boxed_slice(),
+        block_meta: block_meta.into_boxed_slice(),
+        doc_freq_data: doc_freq_data.into_boxed_slice(),
+        pos_data: pos_data.into_boxed_slice(),
+        docs,
+    }
+}
+
 impl Partition {
     fn doc_count(&self) -> usize {
         self.docs.len()
@@ -1925,44 +2049,50 @@ impl Partition {
     }
 
     fn entry_count(&self) -> usize {
-        self.postings.iter().map(|p| p.len()).sum()
+        self.postings.iter().map(|p| p.doc_count as usize).sum()
     }
 
-    fn contains_token(&self, token: &str) -> bool {
-        self.token_ids.contains_key(token)
+    /// Local term id of `token`, via binary search over the sorted `terms`.
+    fn term_id(&self, token: &str) -> Option<u32> {
+        self.terms
+            .binary_search_by(|t| t.as_ref().cmp(token))
+            .ok()
+            .map(|i| i as u32)
     }
 
-    fn tokens(&self) -> impl Iterator<Item = &Arc<str>> {
-        self.token_ids.keys()
+    fn contains_token(&self, token: &str) -> bool {
+        self.term_id(token).is_some()
     }
 
-    fn tokens_with_postings(&self) -> impl Iterator<Item = (&Arc<str>, &PartitionPosting)> {
-        self.token_ids
-            .iter()
-            .map(move |(t, &id)| (t, &self.postings[id as usize]))
+    fn tokens(&self) -> impl Iterator<Item = &Arc<str>> {
+        self.terms.iter()
     }
 
     /// Number of docs in this partition containing `token`.
     fn token_df(&self, token: &str) -> usize {
-        self.token_ids
-            .get(token)
-            .map(|&id| self.postings[id as usize].len())
+        self.term_id(token)
+            .map(|id| self.postings[id as usize].doc_count as usize)
             .unwrap_or(0)
     }
 
     fn memory_size(&self) -> usize {
-        let mut total = std::mem::size_of::<Self>();
-        for t in self.token_ids.keys() {
-            total += std::mem::size_of::<Arc<str>>() + t.len() + std::mem::size_of::<u32>() + 16;
-        }
-        total += self.postings.iter().map(|p| p.heap_size()).sum::<usize>();
-        total += self.docs.len() * (std::mem::size_of::<u64>() + 2 * std::mem::size_of::<u32>());
-        total
+        std::mem::size_of::<Self>()
+            + self
+                .terms
+                .iter()
+                .map(|t| std::mem::size_of::<Arc<str>>() + t.len())
+                .sum::<usize>()
+            + self.postings.len() * std::mem::size_of::<PostingRef>()
+            + self.block_meta.len() * std::mem::size_of::<BlockMeta>()
+            + self.doc_freq_data.len()
+            + self.pos_data.len()
+            + self.docs.len() * (std::mem::size_of::<u64>() + 2 * std::mem::size_of::<u32>())
     }
 
     /// Freeze the visible contents of `tail` into a new partition. Returns
-    /// `None` if the tail has no visible docs.
-    fn from_tail(tail: &TailIndex) -> Option<Self> {
+    /// `None` if the tail has no visible docs. Terms are interned so their
+    /// bytes are shared with other partitions.
+    fn from_tail(tail: &TailIndex, interner: &TermInterner) -> Option<Self> {
         let snap = tail.snapshot();
         if snap.visible_count == 0 {
             return None;
@@ -1977,10 +2107,8 @@ impl Partition {
                 pos_to_doc.insert(rp, doc_id);
             }
         }
-        let mut token_ids: HashMap<Arc<str>, u32> = HashMap::new();
-        let mut raw: Vec<Vec<(u32, u32, Vec<u32>)>> = Vec::new();
+        let mut entries: Vec<(Arc<str>, Vec<(u32, u32, Vec<u32>)>)> = Vec::new();
         for entry in tail.terms.iter() {
-            let term: &Arc<str> = entry.key();
             let slice = entry.value().load();
             let mut docs_for_term: Vec<(u32, u32, Vec<u32>)> = Vec::new();
             for chunk in &slice.chunks {
@@ -2003,61 +2131,33 @@ impl Partition {
                 continue;
             }
             docs_for_term.sort_by_key(|(d, _, _)| *d);
-            let id = raw.len() as u32;
-            token_ids.insert(term.clone(), id);
-            raw.push(docs_for_term);
+            entries.push((interner.intern(entry.key()), docs_for_term));
         }
-        let postings = raw
-            .into_iter()
-            .map(|v| freeze_postings_one(v, &docs))
-            .collect();
-        Some(Self {
-            token_ids,
-            postings,
-            docs,
-        })
+        Some(build_partition(entries, docs))
     }
 
     /// Merge several partitions into one. Local doc ids are reassigned by
-    /// concatenation, which keeps each merged per-token posting list sorted.
+    /// concatenation, which keeps each merged per-term posting list sorted.
     fn merge(parts: &[Arc<Self>]) -> Self {
-        let mut token_ids: HashMap<Arc<str>, u32> = HashMap::new();
-        let mut raw: Vec<Vec<(u32, u32, Vec<u32>)>> = Vec::new();
+        let mut merged: HashMap<Arc<str>, Vec<(u32, u32, Vec<u32>)>> = HashMap::new();
         let mut docs = DocSet::default();
         let mut doc_offset: u32 = 0;
         for p in parts {
             for (rp, nt) in p.docs.iter() {
                 docs.append(*rp, *nt);
             }
-            for (token, &local_id) in &p.token_ids {
-                let merged_id = *token_ids.entry(token.clone()).or_insert_with(|| {
-                    let id = raw.len() as u32;
-                    raw.push(Vec::new());
-                    id
-                });
-                let posting = &p.postings[local_id as usize];
-                for i in 0..posting.len() {
-                    raw[merged_id as usize].push((
-                        posting.doc_ids[i] + doc_offset,
-                        posting.freqs[i],
-                        posting.positions(i).to_vec(),
-                    ));
+            for term_id in 0..p.terms.len() as u32 {
+                let bucket = merged.entry(p.terms[term_id as usize].clone()).or_default();
+                let mut cursor = PostingCursor::new(p, term_id);
+                while let Some(doc) = cursor.doc() {
+                    let positions = cursor.positions().to_vec();
+                    bucket.push((doc + doc_offset, cursor.freq(), positions));
+                    cursor.advance();
                 }
             }
             doc_offset += p.docs.len() as u32;
         }
-        let postings = raw
-            .into_iter()
-            .map(|mut v| {
-                v.sort_by_key(|(d, _, _)| *d);
-                freeze_postings_one(v, &docs)
-            })
-            .collect();
-        Self {
-            token_ids,
-            postings,
-            docs,
-        }
+        build_partition(merged.into_iter().collect(), docs)
     }
 
     /// BM25 OR/AND-search of the partition. With an OR operator and a result
@@ -2072,13 +2172,13 @@ impl Partition {
         scorer: &MemBM25Scorer,
         limit: Option<usize>,
     ) -> Vec<FtsEntry> {
-        // Resolve present query tokens to (local token id, query weight).
+        // Resolve present query tokens to (local term id, query weight).
         // Repeated query tokens are kept so scores match `score_terms`.
-        let mut terms: Vec<(usize, f32)> = Vec::with_capacity(tokens.len());
+        let mut terms: Vec<(u32, f32)> = Vec::with_capacity(tokens.len());
         let mut all_present = true;
         for token in tokens {
-            match self.token_ids.get(token.as_str()) {
-                Some(&id) => terms.push((id as usize, scorer.query_weight(token))),
+            match self.term_id(token) {
+                Some(id) => terms.push((id, scorer.query_weight(token))),
                 None => all_present = false,
             }
         }
@@ -2095,7 +2195,7 @@ impl Partition {
     /// the dense local doc id. Used for unbounded and AND queries.
     fn scan_match(
         &self,
-        terms: &[(usize, f32)],
+        terms: &[(u32, f32)],
         operator: Operator,
         need: u32,
         scorer: &MemBM25Scorer,
@@ -2104,12 +2204,12 @@ impl Partition {
         let mut scores = vec![0.0f32; n];
         let mut hits = vec![0u32; n];
         for &(id, qw) in terms {
-            let posting = &self.postings[id];
-            for i in 0..posting.len() {
-                let doc = posting.doc_ids[i] as usize;
-                let dl = self.docs.num_tokens(posting.doc_ids[i]);
-                scores[doc] += qw * scorer.doc_weight(posting.freqs[i], dl);
-                hits[doc] += 1;
+            let mut cursor = PostingCursor::new(self, id);
+            while let Some(doc) = cursor.doc() {
+                let dl = self.docs.num_tokens(doc);
+                scores[doc as usize] += qw * scorer.doc_weight(cursor.freq(), dl);
+                hits[doc as usize] += 1;
+                cursor.advance();
             }
         }
         let mut results = Vec::new();
@@ -2129,21 +2229,15 @@ impl Partition {
     /// Block-max WAND top-k over an OR query. Exact: each term's
     /// `(max_freq, min_dl)` gives a sound score upper bound, so the algorithm
     /// only skips docs that provably cannot enter the top-`k`.
-    fn wand_match(
-        &self,
-        terms: &[(usize, f32)],
-        scorer: &MemBM25Scorer,
-        k: usize,
-    ) -> Vec<FtsEntry> {
-        let mut cursors: Vec<WandCursor> = terms
+    fn wand_match(&self, terms: &[(u32, f32)], scorer: &MemBM25Scorer, k: usize) -> Vec<FtsEntry> {
+        let mut lanes: Vec<WandLane> = terms
             .iter()
             .map(|&(id, qw)| {
-                let posting = &self.postings[id];
-                WandCursor {
-                    posting,
-                    idx: 0,
+                let pref = &self.postings[id as usize];
+                WandLane {
+                    cursor: PostingCursor::new(self, id),
                     qw,
-                    ub: qw * posting.max_doc_weight(scorer),
+                    ub: qw * scorer.doc_weight(pref.max_freq, pref.min_dl),
                 }
             })
             .collect();
@@ -2152,16 +2246,16 @@ impl Partition {
         // theta = score of the weakest doc still in the top-k (-inf until full).
         let mut theta = f32::NEG_INFINITY;
         loop {
-            cursors.retain(|c| c.doc().is_some());
-            if cursors.is_empty() {
+            lanes.retain(|l| l.cursor.doc().is_some());
+            if lanes.is_empty() {
                 break;
             }
-            cursors.sort_by_key(|c| c.doc().unwrap());
-            // Pivot: first cursor whose cumulative upper bound exceeds theta.
+            lanes.sort_by_key(|l| l.cursor.doc().unwrap());
+            // Pivot: first lane whose cumulative upper bound exceeds theta.
             let mut acc = 0.0f32;
             let mut pivot = None;
-            for (i, c) in cursors.iter().enumerate() {
-                acc += c.ub;
+            for (i, l) in lanes.iter().enumerate() {
+                acc += l.ub;
                 if acc > theta {
                     pivot = Some(i);
                     break;
@@ -2170,15 +2264,15 @@ impl Partition {
             let Some(pivot) = pivot else {
                 break; // no remaining doc can reach theta
             };
-            let pivot_doc = cursors[pivot].doc().unwrap();
-            if cursors[0].doc().unwrap() == pivot_doc {
-                // Every cursor positioned at pivot_doc contributes; score it.
+            let pivot_doc = lanes[pivot].cursor.doc().unwrap();
+            if lanes[0].cursor.doc().unwrap() == pivot_doc {
+                // Every lane positioned at pivot_doc contributes; score it.
                 let dl = self.docs.num_tokens(pivot_doc);
                 let mut score = 0.0f32;
-                for c in cursors.iter_mut() {
-                    if c.doc() == Some(pivot_doc) {
-                        score += c.qw * scorer.doc_weight(c.posting.freqs[c.idx], dl);
-                        c.idx += 1;
+                for l in lanes.iter_mut() {
+                    if l.cursor.doc() == Some(pivot_doc) {
+                        score += l.qw * scorer.doc_weight(l.cursor.freq(), dl);
+                        l.cursor.advance();
                     }
                 }
                 if heap.len() < k {
@@ -2198,8 +2292,8 @@ impl Partition {
                     theta = heap.peek().unwrap().0.score;
                 }
             } else {
-                // A cursor before the pivot trails pivot_doc; skip it forward.
-                cursors[0].skip_to(pivot_doc);
+                // A lane before the pivot trails pivot_doc; skip it forward.
+                lanes[0].cursor.skip_to(pivot_doc);
             }
         }
         heap.into_iter()
@@ -2214,53 +2308,51 @@ impl Partition {
     /// token, require every other token to contain the doc, and verify the
     /// token positions satisfy the phrase. `tokens.len() >= 2`.
     fn search_phrase(&self, tokens: &[String], slop: u32, scorer: &MemBM25Scorer) -> Vec<FtsEntry> {
-        let mut postings: Vec<&PartitionPosting> = Vec::with_capacity(tokens.len());
+        let mut term_ids: Vec<u32> = Vec::with_capacity(tokens.len());
         for token in tokens {
-            match self.token_ids.get(token.as_str()) {
-                Some(&id) => postings.push(&self.postings[id as usize]),
+            match self.term_id(token) {
+                Some(id) => term_ids.push(id),
                 // A phrase needs every token present in this partition.
                 None => return Vec::new(),
             }
         }
-        let rarest = (0..postings.len())
-            .min_by_key(|&i| postings[i].len())
+        let rarest = (0..term_ids.len())
+            .min_by_key(|&i| self.postings[term_ids[i] as usize].doc_count)
             .unwrap();
+        let mut cursors: Vec<PostingCursor> = term_ids
+            .iter()
+            .map(|&id| PostingCursor::new(self, id))
+            .collect();
         let mut results = Vec::new();
-        for i in 0..postings[rarest].len() {
-            let doc = postings[rarest].doc_ids[i];
-            let mut all_positions: Vec<&[u32]> = Vec::with_capacity(tokens.len());
-            let mut freqs: Vec<u32> = Vec::with_capacity(tokens.len());
+        while let Some(doc) = cursors[rarest].cursor_doc() {
+            let mut all_positions: Vec<Vec<u32>> = vec![Vec::new(); tokens.len()];
+            let mut freqs = vec![0u32; tokens.len()];
             let mut present = true;
-            for (ti, posting) in postings.iter().enumerate() {
-                let idx = if ti == rarest {
-                    Some(i)
+            for ti in 0..tokens.len() {
+                if ti != rarest {
+                    cursors[ti].skip_to(doc);
+                }
+                if cursors[ti].doc() == Some(doc) {
+                    freqs[ti] = cursors[ti].freq();
+                    all_positions[ti] = cursors[ti].positions().to_vec();
                 } else {
-                    posting.doc_ids.binary_search(&doc).ok()
-                };
-                match idx {
-                    Some(idx) => {
-                        freqs.push(posting.freqs[idx]);
-                        all_positions.push(posting.positions(idx));
-                    }
-                    None => {
-                        present = false;
-                        break;
-                    }
+                    present = false;
+                    break;
                 }
             }
-            if !present || !phrase_matches(&all_positions, slop) {
-                continue;
+            if present && phrase_matches(&all_positions, slop) {
+                let dl = self.docs.num_tokens(doc);
+                let score: f32 = tokens
+                    .iter()
+                    .zip(&freqs)
+                    .map(|(t, &f)| scorer.query_weight(t) * scorer.doc_weight(f, dl))
+                    .sum();
+                results.push(FtsEntry {
+                    row_position: self.docs.row_id(doc),
+                    score,
+                });
             }
-            let dl = self.docs.num_tokens(doc);
-            let score: f32 = tokens
-                .iter()
-                .zip(&freqs)
-                .map(|(t, &f)| scorer.query_weight(t) * scorer.doc_weight(f, dl))
-                .sum();
-            results.push(FtsEntry {
-                row_position: self.docs.row_id(doc),
-                score,
-            });
+            cursors[rarest].advance();
         }
         results
     }
@@ -2292,55 +2384,154 @@ impl Ord for ScoredDoc {
     }
 }
 
-/// A cursor over one term's posting list, used by [`Partition::wand_match`].
-struct WandCursor<'a> {
-    posting: &'a PartitionPosting,
-    idx: usize,
+/// One WAND lane: a posting cursor plus its query weight and score bound.
+struct WandLane<'a> {
+    cursor: PostingCursor<'a>,
     qw: f32,
     /// Upper bound on this term's contribution to any doc's score.
     ub: f32,
 }
 
-impl WandCursor<'_> {
+/// A decoding cursor over one term's compressed posting list. Decodes a
+/// 128-doc block at a time; `skip_to` jumps whole blocks via `BlockMeta`
+/// without decoding them.
+struct PostingCursor<'a> {
+    part: &'a Partition,
+    pref: PostingRef,
+    /// 0-based block index within the term; `== block_count` once exhausted.
+    block: u32,
+    /// the block decoded into `docs`/`freqs` (`u32::MAX` = none).
+    decoded: u32,
+    docs: Vec<u32>,
+    freqs: Vec<u32>,
+    /// the block decoded into `positions` (`u32::MAX` = none).
+    pos_decoded: u32,
+    positions: Vec<Vec<u32>>,
+    /// index within the current block.
+    i: usize,
+}
+
+impl<'a> PostingCursor<'a> {
+    fn new(part: &'a Partition, term_id: u32) -> Self {
+        let pref = part.postings[term_id as usize];
+        let mut cursor = Self {
+            part,
+            pref,
+            block: 0,
+            decoded: u32::MAX,
+            docs: Vec::new(),
+            freqs: Vec::new(),
+            pos_decoded: u32::MAX,
+            positions: Vec::new(),
+            i: 0,
+        };
+        if pref.block_count > 0 {
+            cursor.decode_doc_freq(0);
+        }
+        cursor
+    }
+
+    fn decode_doc_freq(&mut self, block: u32) {
+        if self.decoded == block {
+            return;
+        }
+        let bm = self.part.block_meta[(self.pref.block_start + block) as usize];
+        let n = block_len(self.pref.doc_count, block);
+        self.docs.clear();
+        self.freqs.clear();
+        let mut pos = bm.df_offset as usize;
+        let mut d = bm.first_doc;
+        self.docs.push(d);
+        for _ in 1..n {
+            d += vbyte_get(&self.part.doc_freq_data, &mut pos);
+            self.docs.push(d);
+        }
+        for _ in 0..n {
+            self.freqs
+                .push(vbyte_get(&self.part.doc_freq_data, &mut pos));
+        }
+        self.decoded = block;
+    }
+
+    fn decode_positions(&mut self, block: u32) {
+        if self.pos_decoded == block {
+            return;
+        }
+        let bm = self.part.block_meta[(self.pref.block_start + block) as usize];
+        let n = block_len(self.pref.doc_count, block);
+        self.positions.clear();
+        let mut pos = bm.pos_offset as usize;
+        for _ in 0..n {
+            let cnt = vbyte_get(&self.part.pos_data, &mut pos);
+            let mut v = Vec::with_capacity(cnt as usize);
+            let mut last = 0u32;
+            for _ in 0..cnt {
+                last += vbyte_get(&self.part.pos_data, &mut pos);
+                v.push(last);
+            }
+            self.positions.push(v);
+        }
+        self.pos_decoded = block;
+    }
+
+    /// Current doc id, or `None` once the list is exhausted.
     fn doc(&self) -> Option<u32> {
-        self.posting.doc_ids.get(self.idx).copied()
+        if self.block >= self.pref.block_count {
+            return None;
+        }
+        self.docs.get(self.i).copied()
+    }
+
+    /// `doc()` under a `&mut` receiver — for use as a loop condition while the
+    /// cursor is borrowed mutably elsewhere.
+    fn cursor_doc(&mut self) -> Option<u32> {
+        self.doc()
+    }
+
+    /// Frequency of the current posting.
+    fn freq(&self) -> u32 {
+        self.freqs[self.i]
+    }
+
+    /// Positions of the current posting (decoded lazily).
+    fn positions(&mut self) -> &[u32] {
+        let block = self.block;
+        self.decode_positions(block);
+        &self.positions[self.i]
+    }
+
+    /// Step to the next posting.
+    fn advance(&mut self) {
+        self.i += 1;
+        if self.i >= self.docs.len() {
+            self.block += 1;
+            self.i = 0;
+            if self.block < self.pref.block_count {
+                self.decode_doc_freq(self.block);
+            }
+        }
     }
 
-    /// Advance to the first posting with `doc_id >= target`.
+    /// Advance to the first posting with `doc_id >= target` (or exhaust),
+    /// skipping whole blocks via `BlockMeta` without decoding them.
     fn skip_to(&mut self, target: u32) {
-        let rest = &self.posting.doc_ids[self.idx..];
-        self.idx += rest.partition_point(|&d| d < target);
-    }
-}
-
-/// Convert `(doc_id, freq, positions)` triples — already sorted by doc id —
-/// into a compact, exact-sized [`PartitionPosting`]. `docset` supplies doc
-/// lengths for the WAND score-bound metadata.
-fn freeze_postings_one(docs: Vec<(u32, u32, Vec<u32>)>, docset: &DocSet) -> PartitionPosting {
-    let n = docs.len();
-    let total_pos: usize = docs.iter().map(|(_, _, p)| p.len()).sum();
-    let mut doc_ids = Vec::with_capacity(n);
-    let mut freqs = Vec::with_capacity(n);
-    let mut pos_offsets = Vec::with_capacity(n + 1);
-    let mut pos_data = Vec::with_capacity(total_pos);
-    pos_offsets.push(0);
-    let mut max_freq = 0u32;
-    let mut min_dl = u32::MAX;
-    for (d, f, positions) in docs {
-        doc_ids.push(d);
-        freqs.push(f);
-        max_freq = max_freq.max(f);
-        min_dl = min_dl.min(docset.num_tokens(d));
-        pos_data.extend_from_slice(&positions);
-        pos_offsets.push(pos_data.len() as u32);
-    }
-    PartitionPosting {
-        doc_ids,
-        freqs,
-        pos_offsets,
-        pos_data,
-        max_freq,
-        min_dl: min_dl.max(1),
+        if self.doc().is_some_and(|d| d >= target) {
+            return;
+        }
+        while self.block < self.pref.block_count {
+            let bm = self.part.block_meta[(self.pref.block_start + self.block) as usize];
+            if bm.last_doc >= target {
+                break;
+            }
+            self.block += 1;
+            self.i = 0;
+        }
+        if self.block >= self.pref.block_count {
+            return;
+        }
+        self.decode_doc_freq(self.block);
+        // `last_doc >= target`, so this block holds a doc >= target.
+        self.i += self.docs[self.i..].partition_point(|&d| d < target);
     }
 }
 
@@ -3675,4 +3866,61 @@ mod tests {
             assert_eq!(got, expected, "WAND top-{k} must equal the exact top-{k}");
         }
     }
+
+    #[test]
+    fn test_vbyte_roundtrip() {
+        let vals = [0u32, 1, 127, 128, 300, 16_383, 16_384, 70_000, u32::MAX];
+        let mut buf = Vec::new();
+        for &v in &vals {
+            vbyte_put(&mut buf, v);
+        }
+        let mut pos = 0;
+        for &v in &vals {
+            assert_eq!(vbyte_get(&buf, &mut pos), v);
+        }
+        assert_eq!(pos, buf.len(), "every byte consumed");
+    }
+
+    #[test]
+    fn test_posting_cursor_codec_roundtrip() {
+        // One term spanning 3 blocks (300 docs), varied freqs + positions.
+        let mut docs = DocSet::default();
+        let mut docs_for_term: Vec<(u32, u32, Vec<u32>)> = Vec::new();
+        for d in 0..300u32 {
+            docs.append(1000 + d as u64, (d % 7) + 1);
+            let freq = (d % 5) + 1;
+            let positions: Vec<u32> = (0..freq).map(|p| p * 3).collect();
+            docs_for_term.push((d, freq, positions));
+        }
+        let part = build_partition(vec![(Arc::from("term"), docs_for_term.clone())], docs);
+
+        // Full iteration must reproduce the input exactly.
+        let mut c = PostingCursor::new(&part, 0);
+        for (d, f, pos) in &docs_for_term {
+            assert_eq!(c.doc(), Some(*d));
+            assert_eq!(c.freq(), *f);
+            assert_eq!(c.positions(), pos.as_slice());
+            c.advance();
+        }
+        assert_eq!(c.doc(), None, "cursor exhausted after the last doc");
+
+        // skip_to must land correctly across and within block boundaries.
+        for &(target, expect) in &[
+            (5u32, Some(5u32)), // within block 0
+            (128, Some(128)),   // exact block-1 boundary
+            (200, Some(200)),   // within block 1
+            (299, Some(299)),   // last doc
+            (300, None),        // past the end
+        ] {
+            let mut c = PostingCursor::new(&part, 0);
+            c.skip_to(target);
+            assert_eq!(c.doc(), expect, "skip_to({target})");
+        }
+        // Sequential skips on one cursor.
+        let mut c = PostingCursor::new(&part, 0);
+        c.skip_to(100);
+        assert_eq!(c.doc(), Some(100));
+        c.skip_to(260);
+        assert_eq!(c.doc(), Some(260));
+    }
 }

From d6bea038ae6a76003aba0619cfa23b1ef75dbeaa Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 16:29:44 -0700
Subject: [PATCH 26/37] perf(mem_wal): bit-pack FTS posting doc ids and freqs
 instead of VByte
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

VByte's byte-at-a-time decode loop made compressed-partition queries far
slower than the uncompressed layout — p95 at 1M docs rose 4.7 ms to 24.7 ms.

Doc ids and frequencies in each 128-doc block are now fixed-width bit-packed
(doc ids as `doc - first_doc`, both at the block's minimal bit width, recorded
in `BlockMeta`). Decoding a block is now a tight shift/mask loop instead of a
branchy VByte scan, restoring query latency while keeping the footprint
compressed. Positions stay VByte (decoded lazily, off the hot path).

Adds a bit-pack round-trip test covering zero/sub-byte/byte-crossing/full
widths.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 150 ++++++++++++++++----
 1 file changed, 124 insertions(+), 26 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index d4ce618e5a1..6895a09834d 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -1923,6 +1923,65 @@ fn block_len(doc_count: u32, b: u32) -> usize {
     (doc_count as usize - b as usize * POSTING_BLOCK).min(POSTING_BLOCK)
 }
 
+/// Bit width needed to represent `v` (0 for `v == 0`).
+fn bit_width(v: u32) -> u8 {
+    (32 - v.leading_zeros()) as u8
+}
+
+/// Pack `values` (each `< 2^width`) into `buf` as little-endian `width`-bit
+/// fields. `width == 0` writes nothing.
+fn bitpack_put(buf: &mut Vec<u8>, values: &[u32], width: u8) {
+    if width == 0 {
+        return;
+    }
+    let mut acc: u64 = 0;
+    let mut bits: u32 = 0;
+    for &v in values {
+        acc |= (v as u64) << bits;
+        bits += width as u32;
+        while bits >= 8 {
+            buf.push(acc as u8);
+            acc >>= 8;
+            bits -= 8;
+        }
+    }
+    if bits > 0 {
+        buf.push(acc as u8);
+    }
+}
+
+/// Unpack `n` little-endian `width`-bit values starting at `buf[start]`,
+/// appending them to `out`.
+fn bitpack_get(buf: &[u8], start: usize, n: usize, width: u8, out: &mut Vec<u32>) {
+    if width == 0 {
+        out.resize(out.len() + n, 0);
+        return;
+    }
+    let mask: u64 = if width >= 32 {
+        u32::MAX as u64
+    } else {
+        (1u64 << width) - 1
+    };
+    let mut acc: u64 = 0;
+    let mut bits: u32 = 0;
+    let mut byte = start;
+    for _ in 0..n {
+        while bits < width as u32 {
+            acc |= (buf[byte] as u64) << bits;
+            byte += 1;
+            bits += 8;
+        }
+        out.push((acc & mask) as u32);
+        acc >>= width;
+        bits -= width as u32;
+    }
+}
+
+/// Number of bytes a `bitpack_put` of `n` values of `width` bits occupies.
+fn bitpack_len(n: usize, width: u8) -> usize {
+    (n * width as usize).div_ceil(8)
+}
+
 /// Per-128-doc-block metadata — enough to skip and score-bound a block
 /// without decoding its payload.
 #[derive(Clone, Copy)]
@@ -1934,6 +1993,9 @@ struct BlockMeta {
     df_offset: u32,
     /// start offset of the block's position payload in `pos_data`.
     pos_offset: u32,
+    /// bit width of the packed doc-id deltas (from `first_doc`) and freqs.
+    doc_width: u8,
+    freq_width: u8,
 }
 
 /// Per-term locator into a partition's shared posting buffers.
@@ -1990,19 +2052,18 @@ fn build_partition(
         for chunk in docs_for_term.chunks(POSTING_BLOCK) {
             let df_offset = doc_freq_data.len() as u32;
             let pos_offset = pos_data.len() as u32;
-            // doc-id gaps for docs[1..] (docs[0] is `first_doc`).
-            let mut prev = chunk[0].0;
-            for &(d, _, _) in &chunk[1..] {
-                vbyte_put(&mut doc_freq_data, d - prev);
-                prev = d;
-            }
-            // frequencies.
-            let mut blk_max_freq = 0u32;
-            for &(_, f, _) in chunk {
-                vbyte_put(&mut doc_freq_data, f);
-                blk_max_freq = blk_max_freq.max(f);
-            }
-            // positions: count + delta-encoded positions per doc.
+            let first_doc = chunk[0].0;
+            let last_doc = chunk[chunk.len() - 1].0;
+            // doc ids: bit-pack `doc - first_doc` at a fixed block width.
+            let doc_width = bit_width(last_doc - first_doc);
+            let doc_deltas: Vec<u32> = chunk.iter().map(|&(d, _, _)| d - first_doc).collect();
+            bitpack_put(&mut doc_freq_data, &doc_deltas, doc_width);
+            // frequencies: bit-pack at a fixed block width.
+            let blk_max_freq = chunk.iter().map(|&(_, f, _)| f).max().unwrap_or(0);
+            let freq_width = bit_width(blk_max_freq);
+            let freqs: Vec<u32> = chunk.iter().map(|&(_, f, _)| f).collect();
+            bitpack_put(&mut doc_freq_data, &freqs, freq_width);
+            // positions: VByte count + delta-encoded positions per doc.
             for &(d, _, ref positions) in chunk {
                 vbyte_put(&mut pos_data, positions.len() as u32);
                 let mut prev_p = 0u32;
@@ -2013,10 +2074,12 @@ fn build_partition(
                 term_min_dl = term_min_dl.min(docs.num_tokens(d));
             }
             block_meta.push(BlockMeta {
-                first_doc: chunk[0].0,
-                last_doc: chunk[chunk.len() - 1].0,
+                first_doc,
+                last_doc,
                 df_offset,
                 pos_offset,
+                doc_width,
+                freq_width,
             });
             term_max_freq = term_max_freq.max(blk_max_freq);
         }
@@ -2439,17 +2502,27 @@ impl<'a> PostingCursor<'a> {
         let n = block_len(self.pref.doc_count, block);
         self.docs.clear();
         self.freqs.clear();
-        let mut pos = bm.df_offset as usize;
-        let mut d = bm.first_doc;
-        self.docs.push(d);
-        for _ in 1..n {
-            d += vbyte_get(&self.part.doc_freq_data, &mut pos);
-            self.docs.push(d);
-        }
-        for _ in 0..n {
-            self.freqs
-                .push(vbyte_get(&self.part.doc_freq_data, &mut pos));
-        }
+        // doc ids: bit-packed `doc - first_doc`.
+        let df_start = bm.df_offset as usize;
+        bitpack_get(
+            &self.part.doc_freq_data,
+            df_start,
+            n,
+            bm.doc_width,
+            &mut self.docs,
+        );
+        for d in &mut self.docs {
+            *d += bm.first_doc;
+        }
+        // frequencies follow the doc-id block.
+        let freq_start = df_start + bitpack_len(n, bm.doc_width);
+        bitpack_get(
+            &self.part.doc_freq_data,
+            freq_start,
+            n,
+            bm.freq_width,
+            &mut self.freqs,
+        );
         self.decoded = block;
     }
 
@@ -3881,6 +3954,31 @@ mod tests {
         assert_eq!(pos, buf.len(), "every byte consumed");
     }
 
+    #[test]
+    fn test_bitpack_roundtrip() {
+        // Cover width 0 (all-zero), sub-byte, byte-crossing, and full widths.
+        let cases: [(Vec<u32>, u8); 6] = [
+            (vec![0u32; 5], 0),
+            (vec![0, 1, 0, 1, 1, 0, 1], 1),
+            (vec![3, 0, 7, 5, 2, 6, 1, 4], 3),
+            (vec![200, 17, 255, 0, 130], 8),
+            (vec![1000, 5, 65535, 42], 16),
+            (vec![1, u32::MAX, 0, 123_456_789], 32),
+        ];
+        for (values, width) in &cases {
+            let mut buf = Vec::new();
+            bitpack_put(&mut buf, values, *width);
+            assert_eq!(
+                buf.len(),
+                bitpack_len(values.len(), *width),
+                "width={width}"
+            );
+            let mut out = Vec::new();
+            bitpack_get(&buf, 0, values.len(), *width, &mut out);
+            assert_eq!(&out, values, "width={width}");
+        }
+    }
+
     #[test]
     fn test_posting_cursor_codec_roundtrip() {
         // One term spanning 3 blocks (300 docs), varied freqs + positions.

From ee88e8c6d099362a9c075061071284d877367f08 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 17:35:53 -0700
Subject: [PATCH 27/37] perf(mem_wal): cross-partition shared-threshold WAND
 for FTS top-k
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each partition's WAND previously ran with its own top-k heap, cold-starting
the pruning threshold at -inf. At 1M docs (~20 partitions) that re-paid the
"fill the heap to raise the threshold" warm-up ~20x, leaving FTS query p50
~2.5x Lucene even on the uncompressed layout.

Limited (top-k) OR searches now feed a single shared `TopK` across the whole
index: the tail is scanned first to warm the threshold, then each partition's
`wand_into` prunes against the shared, monotonically rising threshold — so a
partition processed late skips far more. Still exact: the per-term
`doc_weight(max_freq, min_dl)` upper bound is sound, and partitions hold
disjoint docs so the shared heap collects the true global top-k. Unbounded
`search` and AND keep the exact O(matches) scan.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 186 ++++++++++++--------
 1 file changed, 117 insertions(+), 69 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index 6895a09834d..c432a1d1921 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -1050,10 +1050,12 @@ impl FtsMemIndex {
         self.search_fuzzy_tokens(&st, &tokens, fuzziness, max_expansions)
     }
 
-    /// BM25 OR-search over the query tokens: WAND each immutable partition,
-    /// scan the tail, all scored with one corpus-wide [`MemBM25Scorer`], and
-    /// the results merged. Each doc lives in exactly one partition or the
-    /// tail, so the merge is a plain concatenation.
+    /// BM25 OR-search over the query tokens, scored with one corpus-wide
+    /// [`MemBM25Scorer`]. With a result limit, all partitions and the tail
+    /// feed a single shared top-k heap: the tail is scanned first to warm the
+    /// pruning threshold, then each partition's WAND prunes against the
+    /// shared rising threshold (instead of every partition cold-starting).
+    /// Without a limit, an exact O(matches) scan across partitions + tail.
     fn search_match(
         &self,
         st: &IndexState,
@@ -1070,14 +1072,32 @@ impl FtsMemIndex {
         if scorer.num_docs() == 0 {
             return Vec::new();
         }
-        let mut results = Vec::new();
-        for p in st.partitions.iter() {
-            results.extend(p.search_match(tokens, Operator::Or, &scorer, limit));
-        }
-        if tail_snap.visible_count > 0 {
-            results.extend(score_terms(&tail_snap, &st.tail.terms, tokens, &scorer));
+        match limit {
+            Some(k) if k > 0 => {
+                let mut topk = TopK::new(k);
+                // Scan the tail first so the shared threshold is warm before
+                // the partition WANDs run.
+                if tail_snap.visible_count > 0 {
+                    for e in score_terms(&tail_snap, &st.tail.terms, tokens, &scorer) {
+                        topk.offer(e.score, e.row_position);
+                    }
+                }
+                for p in st.partitions.iter() {
+                    p.wand_into(tokens, &scorer, &mut topk);
+                }
+                topk.into_entries()
+            }
+            _ => {
+                let mut results = Vec::new();
+                for p in st.partitions.iter() {
+                    results.extend(p.search_match(tokens, Operator::Or, &scorer));
+                }
+                if tail_snap.visible_count > 0 {
+                    results.extend(score_terms(&tail_snap, &st.tail.terms, tokens, &scorer));
+                }
+                results
+            }
         }
-        results
     }
 
     fn search_phrase_tokens(&self, st: &IndexState, tokens: &[String], slop: u32) -> Vec<FtsEntry> {
@@ -2223,17 +2243,14 @@ impl Partition {
         build_partition(merged.into_iter().collect(), docs)
     }
 
-    /// BM25 OR/AND-search of the partition. With an OR operator and a result
-    /// limit, prunes with block-max WAND; otherwise runs a direct O(matches)
-    /// scan. Every result is scored with `scorer`, so scores are identical
-    /// regardless of which path runs (and identical to the tail's
-    /// `score_terms`).
+    /// Exact O(matches) BM25 OR/AND-search of the partition by direct posting
+    /// scan. The pruned top-k path is `wand_into`; this is the unbounded and
+    /// AND path. Scored with `scorer`, so scores match `score_terms`.
     fn search_match(
         &self,
         tokens: &[String],
         operator: Operator,
         scorer: &MemBM25Scorer,
-        limit: Option<usize>,
     ) -> Vec<FtsEntry> {
         // Resolve present query tokens to (local term id, query weight).
         // Repeated query tokens are kept so scores match `score_terms`.
@@ -2248,10 +2265,7 @@ impl Partition {
         if terms.is_empty() || (operator == Operator::And && !all_present) {
             return Vec::new();
         }
-        match limit {
-            Some(k) if k > 0 && operator == Operator::Or => self.wand_match(&terms, scorer, k),
-            _ => self.scan_match(&terms, operator, tokens.len() as u32, scorer),
-        }
+        self.scan_match(&terms, operator, tokens.len() as u32, scorer)
     }
 
     /// Exact direct scan: accumulate every match into flat arrays indexed by
@@ -2289,31 +2303,35 @@ impl Partition {
         results
     }
 
-    /// Block-max WAND top-k over an OR query. Exact: each term's
-    /// `(max_freq, min_dl)` gives a sound score upper bound, so the algorithm
-    /// only skips docs that provably cannot enter the top-`k`.
-    fn wand_match(&self, terms: &[(u32, f32)], scorer: &MemBM25Scorer, k: usize) -> Vec<FtsEntry> {
-        let mut lanes: Vec<WandLane> = terms
-            .iter()
-            .map(|&(id, qw)| {
+    /// WAND top-k over an OR query, contributing into the caller's shared
+    /// [`TopK`]. Exact: each term's `(max_freq, min_dl)` gives a sound score
+    /// upper bound, so docs that provably cannot beat the shared threshold
+    /// are skipped. Because the threshold is shared across all partitions and
+    /// the tail, a partition processed late prunes against an already-warm
+    /// threshold instead of cold-starting.
+    fn wand_into(&self, tokens: &[String], scorer: &MemBM25Scorer, topk: &mut TopK) {
+        let mut lanes: Vec<WandLane> = Vec::with_capacity(tokens.len());
+        for token in tokens {
+            if let Some(id) = self.term_id(token) {
                 let pref = &self.postings[id as usize];
-                WandLane {
+                let qw = scorer.query_weight(token);
+                lanes.push(WandLane {
                     cursor: PostingCursor::new(self, id),
                     qw,
                     ub: qw * scorer.doc_weight(pref.max_freq, pref.min_dl),
-                }
-            })
-            .collect();
-        // Min-heap of the current top-k by score.
-        let mut heap: BinaryHeap<Reverse<ScoredDoc>> = BinaryHeap::with_capacity(k + 1);
-        // theta = score of the weakest doc still in the top-k (-inf until full).
-        let mut theta = f32::NEG_INFINITY;
+                });
+            }
+        }
+        if lanes.is_empty() {
+            return;
+        }
         loop {
             lanes.retain(|l| l.cursor.doc().is_some());
             if lanes.is_empty() {
                 break;
             }
             lanes.sort_by_key(|l| l.cursor.doc().unwrap());
+            let theta = topk.threshold();
             // Pivot: first lane whose cumulative upper bound exceeds theta.
             let mut acc = 0.0f32;
             let mut pivot = None;
@@ -2338,33 +2356,12 @@ impl Partition {
                         l.cursor.advance();
                     }
                 }
-                if heap.len() < k {
-                    heap.push(Reverse(ScoredDoc {
-                        score,
-                        doc: pivot_doc,
-                    }));
-                    if heap.len() == k {
-                        theta = heap.peek().unwrap().0.score;
-                    }
-                } else if score > theta {
-                    heap.pop();
-                    heap.push(Reverse(ScoredDoc {
-                        score,
-                        doc: pivot_doc,
-                    }));
-                    theta = heap.peek().unwrap().0.score;
-                }
+                topk.offer(score, self.docs.row_id(pivot_doc));
             } else {
                 // A lane before the pivot trails pivot_doc; skip it forward.
                 lanes[0].cursor.skip_to(pivot_doc);
             }
         }
-        heap.into_iter()
-            .map(|Reverse(sd)| FtsEntry {
-                row_position: self.docs.row_id(sd.doc),
-                score: sd.score,
-            })
-            .collect()
     }
 
     /// Phrase-search by intersecting posting lists: drive from the rarest
@@ -2421,29 +2418,80 @@ impl Partition {
     }
 }
 
-/// A scored doc id, ordered by score then doc id (`total_cmp`, so a stray
-/// non-finite score cannot panic the heap).
-struct ScoredDoc {
+/// A scored MemTable row, ordered by score then row position (`total_cmp`,
+/// so a stray non-finite score cannot panic the heap).
+struct ScoredEntry {
     score: f32,
-    doc: u32,
+    row_position: u64,
 }
 
-impl PartialEq for ScoredDoc {
+impl PartialEq for ScoredEntry {
     fn eq(&self, other: &Self) -> bool {
         self.cmp(other) == std::cmp::Ordering::Equal
     }
 }
-impl Eq for ScoredDoc {}
-impl PartialOrd for ScoredDoc {
+impl Eq for ScoredEntry {}
+impl PartialOrd for ScoredEntry {
     fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
         Some(self.cmp(other))
     }
 }
-impl Ord for ScoredDoc {
+impl Ord for ScoredEntry {
     fn cmp(&self, other: &Self) -> std::cmp::Ordering {
         self.score
             .total_cmp(&other.score)
-            .then(self.doc.cmp(&other.doc))
+            .then(self.row_position.cmp(&other.row_position))
+    }
+}
+
+/// A bounded top-k collector shared across all partitions and the tail of a
+/// single query. `threshold()` is the score of the weakest entry once full —
+/// the WAND pruning bound — and rises monotonically as entries are offered.
+struct TopK {
+    k: usize,
+    heap: BinaryHeap<Reverse<ScoredEntry>>,
+}
+
+impl TopK {
+    fn new(k: usize) -> Self {
+        Self {
+            k,
+            heap: BinaryHeap::with_capacity(k + 1),
+        }
+    }
+
+    /// Score a doc must beat to enter the top-k (`-inf` until the heap fills).
+    fn threshold(&self) -> f32 {
+        if self.heap.len() >= self.k {
+            self.heap.peek().unwrap().0.score
+        } else {
+            f32::NEG_INFINITY
+        }
+    }
+
+    fn offer(&mut self, score: f32, row_position: u64) {
+        if self.heap.len() < self.k {
+            self.heap.push(Reverse(ScoredEntry {
+                score,
+                row_position,
+            }));
+        } else if score > self.heap.peek().unwrap().0.score {
+            self.heap.pop();
+            self.heap.push(Reverse(ScoredEntry {
+                score,
+                row_position,
+            }));
+        }
+    }
+
+    fn into_entries(self) -> Vec<FtsEntry> {
+        self.heap
+            .into_iter()
+            .map(|Reverse(e)| FtsEntry {
+                row_position: e.row_position,
+                score: e.score,
+            })
+            .collect()
     }
 }
 
@@ -3682,12 +3730,12 @@ mod tests {
         let apple = index.tokenize_for_search("apple").pop().unwrap();
         // "apple" is present -> an OR search over it matches.
         let or_scorer = build_scorer(&st, &tail_snap, std::slice::from_ref(&apple));
-        let or_hits = p.search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer, None);
+        let or_hits = p.search_match(std::slice::from_ref(&apple), Operator::Or, &or_scorer);
         assert_eq!(or_hits.len(), 3);
         // Adding an absent term to an AND query short-circuits to nothing.
         let and_tokens = vec![apple, "definitely_missing".to_string()];
         let and_scorer = build_scorer(&st, &tail_snap, &and_tokens);
-        let and_hits = p.search_match(&and_tokens, Operator::And, &and_scorer, None);
+        let and_hits = p.search_match(&and_tokens, Operator::And, &and_scorer);
         assert!(and_hits.is_empty());
     }
 

From fd8edaf9037b8d07649fff700419768a5e46f9c4 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 18:24:20 -0700
Subject: [PATCH 28/37] perf(mem_wal): block-max WAND for FTS top-k pruning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-list WAND bound (`doc_weight(max_freq over the whole posting list,
...)`) is so loose for common terms that the pivot never drops below the
threshold — WAND degenerated into a full scan, so neither the shared
threshold nor a faster codec could help.

Each `BlockMeta` now carries the block's `max_freq`, and `wand_into` is a
block-max WAND: the per-list bound still selects the pivot doc, then a
per-block bound — summed over all lanes for the blocks covering the pivot —
decides whether the whole `[pivot_doc, min block end]` region can be skipped
without decoding or scoring it. The bound is sound (block max_freq and term
min_dl over-estimate every doc in the block), so the top-k stays exact.
`PostingCursor` gains shallow `block_max_freq_for` / `block_end_for` lookups
(binary search over `BlockMeta`, no payload decode).

Adds a 600-doc/2-partition test asserting the WAND top-k score multiset
equals the exact full-scan top-k across several k.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 153 ++++++++++++++++----
 1 file changed, 128 insertions(+), 25 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index c432a1d1921..b21b32e2de4 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -2009,6 +2009,8 @@ struct BlockMeta {
     /// first / last doc id in the block (block doc ids are ascending).
     first_doc: u32,
     last_doc: u32,
+    /// largest frequency in the block — the block-max WAND score bound.
+    max_freq: u32,
     /// start offset of the block's doc/freq payload in `doc_freq_data`.
     df_offset: u32,
     /// start offset of the block's position payload in `pos_data`.
@@ -2096,6 +2098,7 @@ fn build_partition(
             block_meta.push(BlockMeta {
                 first_doc,
                 last_doc,
+                max_freq: blk_max_freq,
                 df_offset,
                 pos_offset,
                 doc_width,
@@ -2303,12 +2306,18 @@ impl Partition {
         results
     }
 
-    /// WAND top-k over an OR query, contributing into the caller's shared
-    /// [`TopK`]. Exact: each term's `(max_freq, min_dl)` gives a sound score
-    /// upper bound, so docs that provably cannot beat the shared threshold
-    /// are skipped. Because the threshold is shared across all partitions and
-    /// the tail, a partition processed late prunes against an already-warm
-    /// threshold instead of cold-starting.
+    /// Block-max WAND top-k over an OR query, contributing into the caller's
+    /// shared [`TopK`]. Exact. Two sound bounds drive pruning:
+    ///
+    /// - the per-*list* bound `doc_weight(list max_freq, min_dl)` selects the
+    ///   pivot doc (always valid for any future doc);
+    /// - the per-*block* bound `doc_weight(block max_freq, min_dl)`, summed
+    ///   over all lanes for the blocks covering the pivot doc, decides whether
+    ///   the whole `[pivot_doc, min block end]` region can be skipped without
+    ///   decoding or scoring it.
+    ///
+    /// The threshold is shared across all partitions and the tail, so a
+    /// partition processed late prunes against an already-warm threshold.
     fn wand_into(&self, tokens: &[String], scorer: &MemBM25Scorer, topk: &mut TopK) {
         let mut lanes: Vec<WandLane> = Vec::with_capacity(tokens.len());
         for token in tokens {
@@ -2318,7 +2327,8 @@ impl Partition {
                 lanes.push(WandLane {
                     cursor: PostingCursor::new(self, id),
                     qw,
-                    ub: qw * scorer.doc_weight(pref.max_freq, pref.min_dl),
+                    min_dl: pref.min_dl,
+                    list_ub: qw * scorer.doc_weight(pref.max_freq, pref.min_dl),
                 });
             }
         }
@@ -2332,11 +2342,11 @@ impl Partition {
             }
             lanes.sort_by_key(|l| l.cursor.doc().unwrap());
             let theta = topk.threshold();
-            // Pivot: first lane whose cumulative upper bound exceeds theta.
+            // Pivot: first lane whose cumulative per-list bound exceeds theta.
             let mut acc = 0.0f32;
             let mut pivot = None;
             for (i, l) in lanes.iter().enumerate() {
-                acc += l.ub;
+                acc += l.list_ub;
                 if acc > theta {
                     pivot = Some(i);
                     break;
@@ -2346,21 +2356,38 @@ impl Partition {
                 break; // no remaining doc can reach theta
             };
             let pivot_doc = lanes[pivot].cursor.doc().unwrap();
-            if lanes[0].cursor.doc().unwrap() == pivot_doc {
-                // Every lane positioned at pivot_doc contributes; score it.
-                let dl = self.docs.num_tokens(pivot_doc);
-                let mut score = 0.0f32;
-                for l in lanes.iter_mut() {
-                    if l.cursor.doc() == Some(pivot_doc) {
-                        score += l.qw * scorer.doc_weight(l.cursor.freq(), dl);
-                        l.cursor.advance();
-                    }
-                }
-                topk.offer(score, self.docs.row_id(pivot_doc));
-            } else {
-                // A lane before the pivot trails pivot_doc; skip it forward.
+            if lanes[0].cursor.doc().unwrap() != pivot_doc {
+                // A lane before the pivot trails pivot_doc; align it first.
                 lanes[0].cursor.skip_to(pivot_doc);
+                continue;
+            }
+            // Block-max check: an upper bound on every doc in
+            // `[pivot_doc, region_end]` is the sum, over all lanes, of each
+            // lane's max contribution from the block covering pivot_doc.
+            // `region_end` is the smallest such block end, so within the
+            // region every lane stays in the same block and the bound holds.
+            let mut block_ub = 0.0f32;
+            let mut region_end = u32::MAX;
+            for l in &lanes {
+                let bmf = l.cursor.block_max_freq_for(pivot_doc);
+                block_ub += l.qw * scorer.doc_weight(bmf, l.min_dl);
+                region_end = region_end.min(l.cursor.block_end_for(pivot_doc));
+            }
+            if block_ub <= theta {
+                // No doc in the region can beat theta — skip the whole region.
+                lanes[0].cursor.skip_to(region_end.saturating_add(1));
+                continue;
+            }
+            // pivot_doc may win; score it.
+            let dl = self.docs.num_tokens(pivot_doc);
+            let mut score = 0.0f32;
+            for l in lanes.iter_mut() {
+                if l.cursor.doc() == Some(pivot_doc) {
+                    score += l.qw * scorer.doc_weight(l.cursor.freq(), dl);
+                    l.cursor.advance();
+                }
             }
+            topk.offer(score, self.docs.row_id(pivot_doc));
         }
     }
 
@@ -2495,12 +2522,14 @@ impl TopK {
     }
 }
 
-/// One WAND lane: a posting cursor plus its query weight and score bound.
+/// One WAND lane: a posting cursor plus its query weight and score bounds.
 struct WandLane<'a> {
     cursor: PostingCursor<'a>,
     qw: f32,
-    /// Upper bound on this term's contribution to any doc's score.
-    ub: f32,
+    /// Smallest doc length over the term — input to the per-block bound.
+    min_dl: u32,
+    /// Per-list contribution upper bound, for pivot selection.
+    list_ub: f32,
 }
 
 /// A decoding cursor over one term's compressed posting list. Decodes a
@@ -2654,6 +2683,38 @@ impl<'a> PostingCursor<'a> {
         // `last_doc >= target`, so this block holds a doc >= target.
         self.i += self.docs[self.i..].partition_point(|&d| d < target);
     }
+
+    /// The term's `BlockMeta` slice.
+    fn blocks(&self) -> &[BlockMeta] {
+        let start = self.pref.block_start as usize;
+        &self.part.block_meta[start..start + self.pref.block_count as usize]
+    }
+
+    /// Index (within the term) of the first block that could hold `target` —
+    /// the first block with `last_doc >= target`. Shallow: binary search over
+    /// `BlockMeta`, no payload decode.
+    fn shallow_block(&self, target: u32) -> Option<usize> {
+        let blocks = self.blocks();
+        let idx = blocks.partition_point(|b| b.last_doc < target);
+        (idx < blocks.len()).then_some(idx)
+    }
+
+    /// Largest frequency in the block that would hold `target` (a sound,
+    /// possibly loose, per-block score-bound input). 0 if `target` is past
+    /// the term's last block.
+    fn block_max_freq_for(&self, target: u32) -> u32 {
+        self.shallow_block(target)
+            .map(|b| self.blocks()[b].max_freq)
+            .unwrap_or(0)
+    }
+
+    /// Last doc id of the block that would hold `target` (`u32::MAX` if past
+    /// the last block) — the upper end of a skippable region.
+    fn block_end_for(&self, target: u32) -> u32 {
+        self.shallow_block(target)
+            .map(|b| self.blocks()[b].last_doc)
+            .unwrap_or(u32::MAX)
+    }
 }
 
 // ============================================================================
@@ -3988,6 +4049,48 @@ mod tests {
         }
     }
 
+    #[test]
+    fn test_block_max_wand_exact_at_scale() {
+        // 600 docs across 2 partitions; "alpha" in every doc with a frequency
+        // that cycles, so per-block maxima vary and the block-max skip path
+        // genuinely engages. The WAND top-k must still equal the exact
+        // full-scan top-k.
+        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(300);
+        for i in 0..600u64 {
+            let tf = (i % 17) + 1;
+            let text = format!("{}filler", "alpha ".repeat(tf as usize));
+            let batch = RecordBatch::try_new(
+                create_test_schema(),
+                vec![
+                    Arc::new(Int32Array::from(vec![0])),
+                    Arc::new(StringArray::from(vec![text.as_str()])),
+                ],
+            )
+            .unwrap();
+            index.insert(&batch, i).unwrap();
+        }
+        assert!(index.state.load_full().partitions.len() >= 2);
+
+        // Exact ranking via the unbounded scan path.
+        let mut full = index.search("alpha");
+        assert_eq!(full.len(), 600);
+        full.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
+
+        // For several k, the WAND top-k's score multiset must equal the exact
+        // top-k's (row ties make the row set non-unique, but scores are not).
+        for k in [1usize, 10, 50, 200] {
+            let limited = index.search_with_options(
+                &FtsQueryExpr::match_query("alpha"),
+                SearchOptions::new().with_limit(k),
+            );
+            assert_eq!(limited.len(), k, "k={k}");
+            let mut got: Vec<f32> = limited.iter().map(|e| e.score).collect();
+            got.sort_by(|a, b| b.partial_cmp(a).unwrap());
+            let expected: Vec<f32> = full.iter().take(k).map(|e| e.score).collect();
+            assert_eq!(got, expected, "WAND top-{k} scores must match exact");
+        }
+    }
+
     #[test]
     fn test_vbyte_roundtrip() {
         let vals = [0u32, 1, 127, 128, 300, 16_383, 16_384, 70_000, u32::MAX];

From 2094b7a96d44359901f9c483af0d01ddab97a81c Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 20:13:09 -0700
Subject: [PATCH 29/37] Revert "perf(mem_wal): block-max WAND for FTS top-k
 pruning"

This reverts commit 0aa620c9cbd1731e9957ad7e348564bd2f7fe1d7.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 153 ++++----------------
 1 file changed, 25 insertions(+), 128 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index b21b32e2de4..c432a1d1921 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -2009,8 +2009,6 @@ struct BlockMeta {
     /// first / last doc id in the block (block doc ids are ascending).
     first_doc: u32,
     last_doc: u32,
-    /// largest frequency in the block — the block-max WAND score bound.
-    max_freq: u32,
     /// start offset of the block's doc/freq payload in `doc_freq_data`.
     df_offset: u32,
     /// start offset of the block's position payload in `pos_data`.
@@ -2098,7 +2096,6 @@ fn build_partition(
             block_meta.push(BlockMeta {
                 first_doc,
                 last_doc,
-                max_freq: blk_max_freq,
                 df_offset,
                 pos_offset,
                 doc_width,
@@ -2306,18 +2303,12 @@ impl Partition {
         results
     }
 
-    /// Block-max WAND top-k over an OR query, contributing into the caller's
-    /// shared [`TopK`]. Exact. Two sound bounds drive pruning:
-    ///
-    /// - the per-*list* bound `doc_weight(list max_freq, min_dl)` selects the
-    ///   pivot doc (always valid for any future doc);
-    /// - the per-*block* bound `doc_weight(block max_freq, min_dl)`, summed
-    ///   over all lanes for the blocks covering the pivot doc, decides whether
-    ///   the whole `[pivot_doc, min block end]` region can be skipped without
-    ///   decoding or scoring it.
-    ///
-    /// The threshold is shared across all partitions and the tail, so a
-    /// partition processed late prunes against an already-warm threshold.
+    /// WAND top-k over an OR query, contributing into the caller's shared
+    /// [`TopK`]. Exact: each term's `(max_freq, min_dl)` gives a sound score
+    /// upper bound, so docs that provably cannot beat the shared threshold
+    /// are skipped. Because the threshold is shared across all partitions and
+    /// the tail, a partition processed late prunes against an already-warm
+    /// threshold instead of cold-starting.
     fn wand_into(&self, tokens: &[String], scorer: &MemBM25Scorer, topk: &mut TopK) {
         let mut lanes: Vec<WandLane> = Vec::with_capacity(tokens.len());
         for token in tokens {
@@ -2327,8 +2318,7 @@ impl Partition {
                 lanes.push(WandLane {
                     cursor: PostingCursor::new(self, id),
                     qw,
-                    min_dl: pref.min_dl,
-                    list_ub: qw * scorer.doc_weight(pref.max_freq, pref.min_dl),
+                    ub: qw * scorer.doc_weight(pref.max_freq, pref.min_dl),
                 });
             }
         }
@@ -2342,11 +2332,11 @@ impl Partition {
             }
             lanes.sort_by_key(|l| l.cursor.doc().unwrap());
             let theta = topk.threshold();
-            // Pivot: first lane whose cumulative per-list bound exceeds theta.
+            // Pivot: first lane whose cumulative upper bound exceeds theta.
             let mut acc = 0.0f32;
             let mut pivot = None;
             for (i, l) in lanes.iter().enumerate() {
-                acc += l.list_ub;
+                acc += l.ub;
                 if acc > theta {
                     pivot = Some(i);
                     break;
@@ -2356,38 +2346,21 @@ impl Partition {
                 break; // no remaining doc can reach theta
             };
             let pivot_doc = lanes[pivot].cursor.doc().unwrap();
-            if lanes[0].cursor.doc().unwrap() != pivot_doc {
-                // A lane before the pivot trails pivot_doc; align it first.
-                lanes[0].cursor.skip_to(pivot_doc);
-                continue;
-            }
-            // Block-max check: an upper bound on every doc in
-            // `[pivot_doc, region_end]` is the sum, over all lanes, of each
-            // lane's max contribution from the block covering pivot_doc.
-            // `region_end` is the smallest such block end, so within the
-            // region every lane stays in the same block and the bound holds.
-            let mut block_ub = 0.0f32;
-            let mut region_end = u32::MAX;
-            for l in &lanes {
-                let bmf = l.cursor.block_max_freq_for(pivot_doc);
-                block_ub += l.qw * scorer.doc_weight(bmf, l.min_dl);
-                region_end = region_end.min(l.cursor.block_end_for(pivot_doc));
-            }
-            if block_ub <= theta {
-                // No doc in the region can beat theta — skip the whole region.
-                lanes[0].cursor.skip_to(region_end.saturating_add(1));
-                continue;
-            }
-            // pivot_doc may win; score it.
-            let dl = self.docs.num_tokens(pivot_doc);
-            let mut score = 0.0f32;
-            for l in lanes.iter_mut() {
-                if l.cursor.doc() == Some(pivot_doc) {
-                    score += l.qw * scorer.doc_weight(l.cursor.freq(), dl);
-                    l.cursor.advance();
+            if lanes[0].cursor.doc().unwrap() == pivot_doc {
+                // Every lane positioned at pivot_doc contributes; score it.
+                let dl = self.docs.num_tokens(pivot_doc);
+                let mut score = 0.0f32;
+                for l in lanes.iter_mut() {
+                    if l.cursor.doc() == Some(pivot_doc) {
+                        score += l.qw * scorer.doc_weight(l.cursor.freq(), dl);
+                        l.cursor.advance();
+                    }
                 }
+                topk.offer(score, self.docs.row_id(pivot_doc));
+            } else {
+                // A lane before the pivot trails pivot_doc; skip it forward.
+                lanes[0].cursor.skip_to(pivot_doc);
             }
-            topk.offer(score, self.docs.row_id(pivot_doc));
         }
     }
 
@@ -2522,14 +2495,12 @@ impl TopK {
     }
 }
 
-/// One WAND lane: a posting cursor plus its query weight and score bounds.
+/// One WAND lane: a posting cursor plus its query weight and score bound.
 struct WandLane<'a> {
     cursor: PostingCursor<'a>,
     qw: f32,
-    /// Smallest doc length over the term — input to the per-block bound.
-    min_dl: u32,
-    /// Per-list contribution upper bound, for pivot selection.
-    list_ub: f32,
+    /// Upper bound on this term's contribution to any doc's score.
+    ub: f32,
 }
 
 /// A decoding cursor over one term's compressed posting list. Decodes a
@@ -2683,38 +2654,6 @@ impl<'a> PostingCursor<'a> {
         // `last_doc >= target`, so this block holds a doc >= target.
         self.i += self.docs[self.i..].partition_point(|&d| d < target);
     }
-
-    /// The term's `BlockMeta` slice.
-    fn blocks(&self) -> &[BlockMeta] {
-        let start = self.pref.block_start as usize;
-        &self.part.block_meta[start..start + self.pref.block_count as usize]
-    }
-
-    /// Index (within the term) of the first block that could hold `target` —
-    /// the first block with `last_doc >= target`. Shallow: binary search over
-    /// `BlockMeta`, no payload decode.
-    fn shallow_block(&self, target: u32) -> Option<usize> {
-        let blocks = self.blocks();
-        let idx = blocks.partition_point(|b| b.last_doc < target);
-        (idx < blocks.len()).then_some(idx)
-    }
-
-    /// Largest frequency in the block that would hold `target` (a sound,
-    /// possibly loose, per-block score-bound input). 0 if `target` is past
-    /// the term's last block.
-    fn block_max_freq_for(&self, target: u32) -> u32 {
-        self.shallow_block(target)
-            .map(|b| self.blocks()[b].max_freq)
-            .unwrap_or(0)
-    }
-
-    /// Last doc id of the block that would hold `target` (`u32::MAX` if past
-    /// the last block) — the upper end of a skippable region.
-    fn block_end_for(&self, target: u32) -> u32 {
-        self.shallow_block(target)
-            .map(|b| self.blocks()[b].last_doc)
-            .unwrap_or(u32::MAX)
-    }
 }
 
 // ============================================================================
@@ -4049,48 +3988,6 @@ mod tests {
         }
     }
 
-    #[test]
-    fn test_block_max_wand_exact_at_scale() {
-        // 600 docs across 2 partitions; "alpha" in every doc with a frequency
-        // that cycles, so per-block maxima vary and the block-max skip path
-        // genuinely engages. The WAND top-k must still equal the exact
-        // full-scan top-k.
-        let index = FtsMemIndex::new(1, "description".to_string()).with_freeze_threshold_rows(300);
-        for i in 0..600u64 {
-            let tf = (i % 17) + 1;
-            let text = format!("{}filler", "alpha ".repeat(tf as usize));
-            let batch = RecordBatch::try_new(
-                create_test_schema(),
-                vec![
-                    Arc::new(Int32Array::from(vec![0])),
-                    Arc::new(StringArray::from(vec![text.as_str()])),
-                ],
-            )
-            .unwrap();
-            index.insert(&batch, i).unwrap();
-        }
-        assert!(index.state.load_full().partitions.len() >= 2);
-
-        // Exact ranking via the unbounded scan path.
-        let mut full = index.search("alpha");
-        assert_eq!(full.len(), 600);
-        full.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
-
-        // For several k, the WAND top-k's score multiset must equal the exact
-        // top-k's (row ties make the row set non-unique, but scores are not).
-        for k in [1usize, 10, 50, 200] {
-            let limited = index.search_with_options(
-                &FtsQueryExpr::match_query("alpha"),
-                SearchOptions::new().with_limit(k),
-            );
-            assert_eq!(limited.len(), k, "k={k}");
-            let mut got: Vec<f32> = limited.iter().map(|e| e.score).collect();
-            got.sort_by(|a, b| b.partial_cmp(a).unwrap());
-            let expected: Vec<f32> = full.iter().take(k).map(|e| e.score).collect();
-            assert_eq!(got, expected, "WAND top-{k} scores must match exact");
-        }
-    }
-
     #[test]
     fn test_vbyte_roundtrip() {
         let vals = [0u32, 1, 127, 128, 300, 16_383, 16_384, 70_000, u32::MAX];

From 75c512b295fbb44bba09590677d6a17d52380870 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 20:20:22 -0700
Subject: [PATCH 30/37] perf(mem_wal): SIMD-decode FTS posting blocks with
 BitPacker4x
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compression cut the in-memory FTS footprint 2.7x but VByte/scalar block
decoding regressed query p95 at 1M docs from ~4.7 ms to ~25 ms — and the
FineWeb top-k queries turned out to be scan-bound, not prune-bound, so WAND
pruning could not recover it. Decode speed is the lever.

Full 128-doc posting blocks (doc ids and frequencies) are now packed and
unpacked with `bitpacking`'s SIMD `BitPacker4x`; the partial final block of
each term keeps the scalar codec. The byte layout and `BlockMeta` widths are
unchanged, so the two codecs interoperate per block. Positions stay VByte
(decoded lazily, off the hot path).

Adds the `bitpacking` crate dependency.
---
 Cargo.lock                                  |  1 +
 Cargo.toml                                  |  1 +
 rust/lance/Cargo.toml                       |  1 +
 rust/lance/src/dataset/mem_wal/index/fts.rs | 55 ++++++++++++++++++---
 4 files changed, 52 insertions(+), 6 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 655cb2f123f..f2d73d833b4 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4343,6 +4343,7 @@ dependencies = [
  "aws-credential-types",
  "aws-sdk-dynamodb",
  "aws-sdk-s3",
+ "bitpacking",
  "byteorder",
  "bytes",
  "chrono",
diff --git a/Cargo.toml b/Cargo.toml
index 693cffe5266..d856c5a822d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -100,6 +100,7 @@ half = { "version" = "2.1", default-features = false, features = [
     "std",
 ] }
 lance-bitpacking = { version = "=7.0.0-beta.14", path = "./rust/compression/bitpacking" }
+bitpacking = "0.9"
 bitvec = "1"
 bytes = "1.11.1"
 byteorder = "1.5"
diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index 9053738e657..6a31d01dda8 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -42,6 +42,7 @@ arrow-schema = { workspace = true }
 arrow-select = { workspace = true }
 async-recursion.workspace = true
 async-trait.workspace = true
+bitpacking = { workspace = true }
 byteorder.workspace = true
 bytes.workspace = true
 chrono.workspace = true
diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index c432a1d1921..953e1aba079 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -54,6 +54,7 @@ use std::sync::atomic::{AtomicUsize, Ordering};
 use arc_swap::ArcSwap;
 use arrow_array::{Array, LargeStringArray, RecordBatch, StringArray, StringViewArray};
 use arrow_schema::DataType;
+use bitpacking::{BitPacker, BitPacker4x};
 use crossbeam_skiplist::SkipMap;
 use lance_core::{Error, Result};
 use lance_index::scalar::InvertedIndexParams;
@@ -1997,11 +1998,48 @@ fn bitpack_get(buf: &[u8], start: usize, n: usize, width: u8, out: &mut Vec<u32>
     }
 }
 
-/// Number of bytes a `bitpack_put` of `n` values of `width` bits occupies.
+/// Number of bytes a width-`width` bit-packed block of `n` values occupies.
+/// Identical for the scalar and `BitPacker4x` codecs.
 fn bitpack_len(n: usize, width: u8) -> usize {
     (n * width as usize).div_ceil(8)
 }
 
+/// Bit-pack `values` into `buf`: SIMD `BitPacker4x` for a full 128-element
+/// block (the common-term hot path), scalar `bitpack_put` for a shorter
+/// final block.
+fn pack_block(bp: &BitPacker4x, buf: &mut Vec<u8>, values: &[u32], width: u8) {
+    if values.len() == POSTING_BLOCK && width > 0 {
+        let mut input = [0u32; POSTING_BLOCK];
+        input.copy_from_slice(values);
+        let mut out = [0u8; POSTING_BLOCK * 4];
+        let n = bp.compress(&input, &mut out, width);
+        buf.extend_from_slice(&out[..n]);
+    } else {
+        bitpack_put(buf, values, width);
+    }
+}
+
+/// Unpack `n` values of `width` bits from `buf[start..]` into `out` (cleared
+/// first) — SIMD for a full 128-element block, scalar otherwise.
+fn unpack_block(
+    bp: &BitPacker4x,
+    buf: &[u8],
+    start: usize,
+    n: usize,
+    width: u8,
+    out: &mut Vec<u32>,
+) {
+    out.clear();
+    if n == POSTING_BLOCK && width > 0 {
+        let mut decoded = [0u32; POSTING_BLOCK];
+        let bytes = bitpack_len(POSTING_BLOCK, width);
+        bp.decompress(&buf[start..start + bytes], &mut decoded, width);
+        out.extend_from_slice(&decoded);
+    } else {
+        bitpack_get(buf, start, n, width, out);
+    }
+}
+
 /// Per-128-doc-block metadata — enough to skip and score-bound a block
 /// without decoding its payload.
 #[derive(Clone, Copy)]
@@ -2059,6 +2097,7 @@ fn build_partition(
     docs: DocSet,
 ) -> Partition {
     entries.sort_by(|a, b| a.0.cmp(&b.0));
+    let bp = BitPacker4x::new();
     let mut terms = Vec::with_capacity(entries.len());
     let mut postings = Vec::with_capacity(entries.len());
     let mut block_meta: Vec<BlockMeta> = Vec::new();
@@ -2077,12 +2116,12 @@ fn build_partition(
             // doc ids: bit-pack `doc - first_doc` at a fixed block width.
             let doc_width = bit_width(last_doc - first_doc);
             let doc_deltas: Vec<u32> = chunk.iter().map(|&(d, _, _)| d - first_doc).collect();
-            bitpack_put(&mut doc_freq_data, &doc_deltas, doc_width);
+            pack_block(&bp, &mut doc_freq_data, &doc_deltas, doc_width);
             // frequencies: bit-pack at a fixed block width.
             let blk_max_freq = chunk.iter().map(|&(_, f, _)| f).max().unwrap_or(0);
             let freq_width = bit_width(blk_max_freq);
             let freqs: Vec<u32> = chunk.iter().map(|&(_, f, _)| f).collect();
-            bitpack_put(&mut doc_freq_data, &freqs, freq_width);
+            pack_block(&bp, &mut doc_freq_data, &freqs, freq_width);
             // positions: VByte count + delta-encoded positions per doc.
             for &(d, _, ref positions) in chunk {
                 vbyte_put(&mut pos_data, positions.len() as u32);
@@ -2520,6 +2559,8 @@ struct PostingCursor<'a> {
     positions: Vec<Vec<u32>>,
     /// index within the current block.
     i: usize,
+    /// SIMD bit-(un)packer for full 128-doc blocks.
+    bp: BitPacker4x,
 }
 
 impl<'a> PostingCursor<'a> {
@@ -2535,6 +2576,7 @@ impl<'a> PostingCursor<'a> {
             pos_decoded: u32::MAX,
             positions: Vec::new(),
             i: 0,
+            bp: BitPacker4x::new(),
         };
         if pref.block_count > 0 {
             cursor.decode_doc_freq(0);
@@ -2549,10 +2591,10 @@ impl<'a> PostingCursor<'a> {
         let bm = self.part.block_meta[(self.pref.block_start + block) as usize];
         let n = block_len(self.pref.doc_count, block);
         self.docs.clear();
-        self.freqs.clear();
         // doc ids: bit-packed `doc - first_doc`.
         let df_start = bm.df_offset as usize;
-        bitpack_get(
+        unpack_block(
+            &self.bp,
             &self.part.doc_freq_data,
             df_start,
             n,
@@ -2564,7 +2606,8 @@ impl<'a> PostingCursor<'a> {
         }
         // frequencies follow the doc-id block.
         let freq_start = df_start + bitpack_len(n, bm.doc_width);
-        bitpack_get(
+        unpack_block(
+            &self.bp,
             &self.part.doc_freq_data,
             freq_start,
             n,

From 8810a774d9f3da9f72851c28035adb9328bccdc2 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Sun, 17 May 2026 20:53:32 -0700
Subject: [PATCH 31/37] test(mem_wal): split FTS bench latency by query type

Reports term vs phrase p50/p95 separately so a latency regression can be
attributed to one query class instead of the blended percentile.
---
 rust/lance/benches/mem_wal_fts_bench.rs | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/rust/lance/benches/mem_wal_fts_bench.rs b/rust/lance/benches/mem_wal_fts_bench.rs
index 4364fe053c9..9500bc3dba1 100644
--- a/rust/lance/benches/mem_wal_fts_bench.rs
+++ b/rust/lance/benches/mem_wal_fts_bench.rs
@@ -593,6 +593,29 @@ fn run_bench(args: &BenchArgs) -> Result<()> {
         f64::NAN
     };
 
+    // Per-query-type latency split — `latencies_us` is parallel to `queries`.
+    let mut term_lat: Vec<f64> = Vec::new();
+    let mut phrase_lat: Vec<f64> = Vec::new();
+    for (q, &lat) in queries.iter().zip(&latencies_us) {
+        if q.kind == "phrase" {
+            phrase_lat.push(lat);
+        } else {
+            term_lat.push(lat);
+        }
+    }
+    term_lat.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    phrase_lat.sort_by(|a, b| a.partial_cmp(b).unwrap());
+    let pct = |v: &[f64], p: f64| if v.is_empty() { f64::NAN } else { percentile(v, p) };
+    println!(
+        "result split: term_p50={:.1} term_p95={:.1} ({} q) | phrase_p50={:.1} phrase_p95={:.1} ({} q)",
+        pct(&term_lat, 50.0),
+        pct(&term_lat, 95.0),
+        term_lat.len(),
+        pct(&phrase_lat, 50.0),
+        pct(&phrase_lat, 95.0),
+        phrase_lat.len(),
+    );
+
     println!(
         "result impl=lance_fts run={} docs={} queries={} build_s={:.3} build_docs_per_s={:.0} \
          q_p50_us={:.1} q_p95_us={:.1} qps_1t={:.0} qps_{}t={:.0} \

From 6014e033edcea814f56faa58645344f4f9a15a22 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Mon, 18 May 2026 00:56:24 -0700
Subject: [PATCH 32/37] perf(mem_wal): random-access compressed FTS positions
 for fast phrase
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Diagnosis (term/phrase latency split): term queries were already fast
(p95 ~0.6 ms, ahead of Lucene) — the blended p95 of ~25 ms was entirely
phrase queries. `search_phrase` calls `positions()` for the candidate docs
of the rare driving term, but each call VByte-decoded the *whole* 128-doc
block's positions to use one — ~128x wasted work, scattered across blocks.

A document's position count equals its term frequency, so no per-doc count
needs storing: each block's positions are now one bit-packed delta stream,
and doc `i`'s slice is located from the freq prefix sum. `bitpack_read_at`
reads that slice directly — phrase decodes one document's positions, never
the block. This also drops the per-doc count bytes, so the footprint is
slightly smaller. Replaces the VByte position codec entirely.
---
 rust/lance/src/dataset/mem_wal/index/fts.rs | 158 ++++++++++++--------
 1 file changed, 96 insertions(+), 62 deletions(-)

diff --git a/rust/lance/src/dataset/mem_wal/index/fts.rs b/rust/lance/src/dataset/mem_wal/index/fts.rs
index 953e1aba079..535b9c53fbd 100644
--- a/rust/lance/src/dataset/mem_wal/index/fts.rs
+++ b/rust/lance/src/dataset/mem_wal/index/fts.rs
@@ -1913,31 +1913,6 @@ impl FtsIndexConfig {
 /// per-block score bound; a block's payload is self-delimiting.
 const POSTING_BLOCK: usize = 128;
 
-/// Append `v` to `buf` as unsigned LEB128 (VByte).
-fn vbyte_put(buf: &mut Vec<u8>, mut v: u32) {
-    while v >= 0x80 {
-        buf.push((v as u8) | 0x80);
-        v >>= 7;
-    }
-    buf.push(v as u8);
-}
-
-/// Read one unsigned LEB128 value from `buf` at `*pos`, advancing `*pos`.
-fn vbyte_get(buf: &[u8], pos: &mut usize) -> u32 {
-    let mut v = 0u32;
-    let mut shift = 0u32;
-    loop {
-        let b = buf[*pos];
-        *pos += 1;
-        v |= ((b & 0x7f) as u32) << shift;
-        if b & 0x80 == 0 {
-            break;
-        }
-        shift += 7;
-    }
-    v
-}
-
 /// Number of docs in block `b` (0-based within a term) of a `doc_count`-long
 /// posting list.
 fn block_len(doc_count: u32, b: u32) -> usize {
@@ -2040,6 +2015,41 @@ fn unpack_block(
     }
 }
 
+/// Random-access read of `n` width-`width` values at logical index `s` from a
+/// bit-packed stream whose first value starts at byte `base` of `buf`. Used to
+/// decode one document's positions without touching the rest of the block.
+fn bitpack_read_at(buf: &[u8], base: usize, s: usize, n: usize, width: u8, out: &mut Vec<u32>) {
+    if n == 0 {
+        return;
+    }
+    if width == 0 {
+        out.resize(out.len() + n, 0);
+        return;
+    }
+    let w = width as u32;
+    let mask: u64 = if w >= 32 {
+        u32::MAX as u64
+    } else {
+        (1u64 << w) - 1
+    };
+    let start_bit = s * width as usize;
+    let mut byte = base + start_bit / 8;
+    let skip = (start_bit % 8) as u32;
+    let mut acc = (buf[byte] as u64) >> skip;
+    let mut bits = 8 - skip;
+    byte += 1;
+    for _ in 0..n {
+        while bits < w {
+            acc |= (buf[byte] as u64) << bits;
+            byte += 1;
+            bits += 8;
+        }
+        out.push((acc & mask) as u32);
+        acc >>= w;
+        bits -= w;
+    }
+}
+
 /// Per-128-doc-block metadata — enough to skip and score-bound a block
 /// without decoding its payload.
 #[derive(Clone, Copy)]
@@ -2054,6 +2064,8 @@ struct BlockMeta {
     /// bit width of the packed doc-id deltas (from `first_doc`) and freqs.
     doc_width: u8,
     freq_width: u8,
+    /// bit width of the packed position deltas.
+    pos_width: u8,
 }
 
 /// Per-term locator into a partition's shared posting buffers.
@@ -2122,16 +2134,20 @@ fn build_partition(
             let freq_width = bit_width(blk_max_freq);
             let freqs: Vec<u32> = chunk.iter().map(|&(_, f, _)| f).collect();
             pack_block(&bp, &mut doc_freq_data, &freqs, freq_width);
-            // positions: VByte count + delta-encoded positions per doc.
+            // positions: one bit-packed delta stream for the whole block.
+            // A doc's position count equals its frequency, so no count is
+            // stored — doc `i`'s slice is found from the freq prefix sum.
+            let mut pos_deltas: Vec<u32> = Vec::new();
             for &(d, _, ref positions) in chunk {
-                vbyte_put(&mut pos_data, positions.len() as u32);
                 let mut prev_p = 0u32;
                 for &p in positions {
-                    vbyte_put(&mut pos_data, p - prev_p);
+                    pos_deltas.push(p - prev_p);
                     prev_p = p;
                 }
                 term_min_dl = term_min_dl.min(docs.num_tokens(d));
             }
+            let pos_width = bit_width(pos_deltas.iter().copied().max().unwrap_or(0));
+            bitpack_put(&mut pos_data, &pos_deltas, pos_width);
             block_meta.push(BlockMeta {
                 first_doc,
                 last_doc,
@@ -2139,6 +2155,7 @@ fn build_partition(
                 pos_offset,
                 doc_width,
                 freq_width,
+                pos_width,
             });
             term_max_freq = term_max_freq.max(blk_max_freq);
         }
@@ -2554,9 +2571,13 @@ struct PostingCursor<'a> {
     decoded: u32,
     docs: Vec<u32>,
     freqs: Vec<u32>,
-    /// the block decoded into `positions` (`u32::MAX` = none).
-    pos_decoded: u32,
-    positions: Vec<Vec<u32>>,
+    /// freq prefix sum of the decoded block (`len == freqs.len() + 1`); the
+    /// block it was computed for is `prefix_block`. Indexes the position
+    /// stream for random per-doc access.
+    prefix: Vec<u32>,
+    prefix_block: u32,
+    /// scratch for the most recently decoded doc's positions.
+    pos_scratch: Vec<u32>,
     /// index within the current block.
     i: usize,
     /// SIMD bit-(un)packer for full 128-doc blocks.
@@ -2573,8 +2594,9 @@ impl<'a> PostingCursor<'a> {
             decoded: u32::MAX,
             docs: Vec::new(),
             freqs: Vec::new(),
-            pos_decoded: u32::MAX,
-            positions: Vec::new(),
+            prefix: Vec::new(),
+            prefix_block: u32::MAX,
+            pos_scratch: Vec::new(),
             i: 0,
             bp: BitPacker4x::new(),
         };
@@ -2617,25 +2639,19 @@ impl<'a> PostingCursor<'a> {
         self.decoded = block;
     }
 
-    fn decode_positions(&mut self, block: u32) {
-        if self.pos_decoded == block {
+    /// Ensure `prefix` holds the freq prefix sum of the current block.
+    fn ensure_prefix(&mut self) {
+        if self.prefix_block == self.block {
             return;
         }
-        let bm = self.part.block_meta[(self.pref.block_start + block) as usize];
-        let n = block_len(self.pref.doc_count, block);
-        self.positions.clear();
-        let mut pos = bm.pos_offset as usize;
-        for _ in 0..n {
-            let cnt = vbyte_get(&self.part.pos_data, &mut pos);
-            let mut v = Vec::with_capacity(cnt as usize);
-            let mut last = 0u32;
-            for _ in 0..cnt {
-                last += vbyte_get(&self.part.pos_data, &mut pos);
-                v.push(last);
-            }
-            self.positions.push(v);
+        self.prefix.clear();
+        self.prefix.push(0);
+        let mut sum = 0u32;
+        for &f in &self.freqs {
+            sum += f;
+            self.prefix.push(sum);
         }
-        self.pos_decoded = block;
+        self.prefix_block = self.block;
     }
 
     /// Current doc id, or `None` once the list is exhausted.
@@ -2657,11 +2673,29 @@ impl<'a> PostingCursor<'a> {
         self.freqs[self.i]
     }
 
-    /// Positions of the current posting (decoded lazily).
+    /// Positions of the current posting — decoded on demand for this one
+    /// document only (random access into the block's position stream).
     fn positions(&mut self) -> &[u32] {
-        let block = self.block;
-        self.decode_positions(block);
-        &self.positions[self.i]
+        self.ensure_prefix();
+        let bm = self.part.block_meta[(self.pref.block_start + self.block) as usize];
+        let s = self.prefix[self.i] as usize;
+        let n = self.prefix[self.i + 1] as usize - s;
+        self.pos_scratch.clear();
+        bitpack_read_at(
+            &self.part.pos_data,
+            bm.pos_offset as usize,
+            s,
+            n,
+            bm.pos_width,
+            &mut self.pos_scratch,
+        );
+        // un-delta in place.
+        let mut last = 0u32;
+        for d in &mut self.pos_scratch {
+            last += *d;
+            *d = last;
+        }
+        &self.pos_scratch
     }
 
     /// Step to the next posting.
@@ -4032,17 +4066,17 @@ mod tests {
     }
 
     #[test]
-    fn test_vbyte_roundtrip() {
-        let vals = [0u32, 1, 127, 128, 300, 16_383, 16_384, 70_000, u32::MAX];
+    fn test_bitpack_read_at_random_access() {
+        // Pack a stream, then random-access arbitrary [s, s+n) ranges.
+        let vals: Vec<u32> = (0..200u32).map(|i| (i * 7 + 3) % 1000).collect();
+        let width = bit_width(*vals.iter().max().unwrap());
         let mut buf = Vec::new();
-        for &v in &vals {
-            vbyte_put(&mut buf, v);
-        }
-        let mut pos = 0;
-        for &v in &vals {
-            assert_eq!(vbyte_get(&buf, &mut pos), v);
+        bitpack_put(&mut buf, &vals, width);
+        for &(s, n) in &[(0usize, 5usize), (1, 1), (63, 10), (130, 70), (199, 1)] {
+            let mut out = Vec::new();
+            bitpack_read_at(&buf, 0, s, n, width, &mut out);
+            assert_eq!(out, &vals[s..s + n], "s={s} n={n}");
         }
-        assert_eq!(pos, buf.len(), "every byte consumed");
     }
 
     #[test]

From eb8812241d832035ddad5b551bad4b6480db3824 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Mon, 18 May 2026 01:30:47 -0700
Subject: [PATCH 33/37] fix(bench): pick the freshest FTS bench binary by mtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

run_fts_compare.sh selected the bench binary with `sort | tail -1` — a
lexical sort by Cargo's build hash. When a dependency change rotates that
hash, an older binary can sort last, so the harness silently ran stale code.
Now it clears old binaries before building and selects by modification time.
---
 bench/run_fts_compare.sh                | 7 ++++++-
 rust/lance/benches/mem_wal_fts_bench.rs | 8 +++++++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/bench/run_fts_compare.sh b/bench/run_fts_compare.sh
index e06b5c92274..1374127464c 100755
--- a/bench/run_fts_compare.sh
+++ b/bench/run_fts_compare.sh
@@ -59,9 +59,14 @@ echo "Lucene classpath: $LUCENE_CP"
 
 # ---- build the Lance bench ----
 echo "=== Building Lance FTS bench ==="
+# Drop stale binaries so the freshest build is unambiguous, then pick the
+# most-recently-modified one — a lexical sort by build hash can otherwise
+# pick a stale binary after a dependency change rotates the hash.
+rm -f "$REPO_ROOT"/target/release/deps/mem_wal_fts_bench-*
 cargo bench -p lance --bench mem_wal_fts_bench --no-run
 LANCE_BIN="$(find "$REPO_ROOT/target/release/deps" -maxdepth 1 -type f -perm -111 \
-    -name 'mem_wal_fts_bench-*' ! -name '*.d' | sort | tail -1)"
+    -name 'mem_wal_fts_bench-*' ! -name '*.d' -printf '%T@ %p\n' \
+    | sort -nr | head -1 | cut -d' ' -f2-)"
 echo "Lance bench: $LANCE_BIN"
 
 # ---- compile the Lucene bench ----
diff --git a/rust/lance/benches/mem_wal_fts_bench.rs b/rust/lance/benches/mem_wal_fts_bench.rs
index 9500bc3dba1..0175a0deaa0 100644
--- a/rust/lance/benches/mem_wal_fts_bench.rs
+++ b/rust/lance/benches/mem_wal_fts_bench.rs
@@ -605,7 +605,13 @@ fn run_bench(args: &BenchArgs) -> Result<()> {
     }
     term_lat.sort_by(|a, b| a.partial_cmp(b).unwrap());
     phrase_lat.sort_by(|a, b| a.partial_cmp(b).unwrap());
-    let pct = |v: &[f64], p: f64| if v.is_empty() { f64::NAN } else { percentile(v, p) };
+    let pct = |v: &[f64], p: f64| {
+        if v.is_empty() {
+            f64::NAN
+        } else {
+            percentile(v, p)
+        }
+    };
     println!(
         "result split: term_p50={:.1} term_p95={:.1} ({} q) | phrase_p50={:.1} phrase_p95={:.1} ({} q)",
         pct(&term_lat, 50.0),

From d7da5e5f3bb49e2e7c3fb453cdde95b6b11d8f03 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Mon, 18 May 2026 17:28:45 -0700
Subject: [PATCH 34/37] chore(bench): adapt fineweb FTS bench to the
 initialize_mem_wal builder API

`initialize_mem_wal` is now a builder (`.maintained_indexes(...).execute()`)
rather than taking a `MemWalConfig`; update the benchmark accordingly after
rebasing onto main.
---
 rust/lance/benches/mem_wal_fineweb_fts.rs | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal_fineweb_fts.rs
index 70a2e9d8926..51f23a0d600 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal_fineweb_fts.rs
@@ -45,7 +45,7 @@ use std::time::{Duration, Instant};
 use arrow_array::{Array, ArrayRef, Int64Array, RecordBatch, RecordBatchIterator, StringArray};
 use arrow_schema::{DataType, Field, Schema as ArrowSchema};
 use futures::TryStreamExt;
-use lance::dataset::mem_wal::{DatasetMemWalExt, MemTableScanner, MemWalConfig, ShardWriterConfig};
+use lance::dataset::mem_wal::{DatasetMemWalExt, MemTableScanner, ShardWriterConfig};
 use lance::dataset::{Dataset, WriteParams};
 use lance::index::DatasetIndexExt;
 use lance_core::Result;
@@ -396,14 +396,13 @@ async fn build_seed_dataset(
             .await?;
     }
     dataset
-        .initialize_mem_wal(MemWalConfig {
-            shard_spec: None,
-            maintained_indexes: if indexed {
-                vec![FTS_INDEX_NAME.to_string()]
-            } else {
-                vec![]
-            },
+        .initialize_mem_wal()
+        .maintained_indexes(if indexed {
+            vec![FTS_INDEX_NAME.to_string()]
+        } else {
+            vec![]
         })
+        .execute()
         .await?;
     Ok(start.elapsed().as_secs_f64())
 }

From a0c357a7a154595a0a653a71baccb57679da7da9 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Mon, 18 May 2026 17:48:22 -0700
Subject: [PATCH 35/37] refactor(mem_wal): drop unused lance-index WAND
 additions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The FTS MemTable index runs its own block-max WAND over its compressed
in-memory posting layout, so the `wand_search` / `WandTerm` entry point
added to `lance-index` early in this work is now dead. Revert it and the
`wand` re-export — `lance-index` is left with a single one-identifier
change: `Scorer` joins `MemBM25Scorer` in the `inverted` re-export so the
in-memory index can call the scorer's trait methods.
---
 rust/lance-index/src/scalar/inverted.rs      |  1 -
 rust/lance-index/src/scalar/inverted/wand.rs | 96 --------------------
 2 files changed, 97 deletions(-)

diff --git a/rust/lance-index/src/scalar/inverted.rs b/rust/lance-index/src/scalar/inverted.rs
index 399eabd3ff0..fc716b0474a 100644
--- a/rust/lance-index/src/scalar/inverted.rs
+++ b/rust/lance-index/src/scalar/inverted.rs
@@ -25,7 +25,6 @@ use lance_core::{Result, cache::LanceCache};
 pub use lance_tokenizer::Language;
 pub use scorer::{MemBM25Scorer, Scorer};
 pub use tokenizer::*;
-pub use wand::{DocCandidate, WandTerm, wand_search};
 
 use crate::scalar::inverted::query::{FtsSearchParams, Tokens};
 
diff --git a/rust/lance-index/src/scalar/inverted/wand.rs b/rust/lance-index/src/scalar/inverted/wand.rs
index 2e2132f5df6..b06c75c0021 100644
--- a/rust/lance-index/src/scalar/inverted/wand.rs
+++ b/rust/lance-index/src/scalar/inverted/wand.rs
@@ -442,46 +442,6 @@ pub struct DocCandidate {
     pub doc_length: u32,
 }
 
-/// One query term for [`wand_search`]: the token text, its id, the token's
-/// position in the query (used only by phrase queries), the query-side
-/// weight, and the term's posting list.
-pub type WandTerm = (String, u32, u32, f32, PostingList);
-
-/// In-memory block-max WAND search entry point.
-///
-/// Runs the same WAND algorithm as the on-disk inverted index query path,
-/// but over caller-supplied posting lists. This is the query primitive for
-/// the in-memory FTS MemTable index, whose frozen segments hold their
-/// postings as [`PostingList::Plain`]. Mirrors `Wand`'s on-disk usage; the
-/// row-address mask is a no-op (the in-memory index has no deletions).
-pub fn wand_search<S: Scorer>(
-    operator: Operator,
-    terms: Vec<WandTerm>,
-    docs: &DocSet,
-    scorer: S,
-    params: &FtsSearchParams,
-) -> Result<Vec<DocCandidate>> {
-    let num_doc = docs.len();
-    let postings = terms
-        .into_iter()
-        .map(|(token, token_id, position, query_weight, list)| {
-            PostingIterator::with_query_weight(
-                token,
-                token_id,
-                position,
-                query_weight,
-                list,
-                num_doc,
-            )
-        });
-    let mut wand = Wand::new(operator, postings, docs, scorer);
-    wand.search(
-        params,
-        Arc::new(RowAddrMask::default()),
-        &crate::metrics::NoOpMetricsCollector,
-    )
-}
-
 struct HeadPosting {
     // Iterators that are already positioned on or after the next candidate doc.
     // The heap is ordered by smallest doc id so the top element determines
@@ -1695,62 +1655,6 @@ mod tests {
         assert_eq!(result.len(), 0); // Should not panic
     }
 
-    // Proves the public `wand_search` entry point works over caller-built
-    // in-memory posting lists — the query primitive for the segmented
-    // FTS MemTable index. (BM25 ranking itself is covered by `test_wand`
-    // and the on-disk index tests.)
-    #[test]
-    fn test_wand_search_in_memory() {
-        let mut docs = DocSet::default();
-        for i in 0..10u64 {
-            docs.append(i, 1);
-        }
-        // term "a": docs 0,2,4,6,8 — term "b": docs 1,2,5,8 — union of 7.
-        let term_a = generate_posting_list(vec![0, 2, 4, 6, 8], 1.0, None, false);
-        let term_b = generate_posting_list(vec![1, 2, 5, 8], 1.0, None, false);
-        let union: std::collections::HashSet<u64> = [0, 1, 2, 4, 5, 6, 8].into_iter().collect();
-        let terms: Vec<WandTerm> = vec![
-            ("a".to_string(), 0, 0, 1.0, term_a),
-            ("b".to_string(), 1, 1, 1.0, term_b),
-        ];
-        let params = FtsSearchParams::default().with_limit(Some(3));
-        let scorer = IndexBM25Scorer::new(std::iter::empty());
-        let result = wand_search(Operator::Or, terms, &docs, scorer, &params).unwrap();
-        assert_eq!(result.len(), 3, "limit=3 should return 3 candidates");
-        for c in &result {
-            assert!(union.contains(&c.row_id), "id {} not in union", c.row_id);
-            assert!(!c.freqs.is_empty(), "candidate must record term freqs");
-        }
-
-        // Unlimited search returns exactly the union of the two posting lists.
-        let scorer = IndexBM25Scorer::new(std::iter::empty());
-        let all = wand_search(
-            Operator::Or,
-            vec![
-                (
-                    "a".to_string(),
-                    0,
-                    0,
-                    1.0,
-                    generate_posting_list(vec![0, 2, 4, 6, 8], 1.0, None, false),
-                ),
-                (
-                    "b".to_string(),
-                    1,
-                    1,
-                    1.0,
-                    generate_posting_list(vec![1, 2, 5, 8], 1.0, None, false),
-                ),
-            ],
-            &docs,
-            scorer,
-            &FtsSearchParams::default(),
-        )
-        .unwrap();
-        let got: std::collections::HashSet<u64> = all.iter().map(|c| c.row_id).collect();
-        assert_eq!(got, union, "OR search must return the posting-list union");
-    }
-
     #[test]
     fn test_posting_iterator_next_compressed_partition_point() {
         let mut docs = DocSet::default();

From d182f3c949e6efb095909e23130f888961a27507 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Mon, 18 May 2026 18:25:09 -0700
Subject: [PATCH 36/37] chore: sync python/Cargo.lock with the new bitpacking
 dependency

The `lance` crate gained a `bitpacking` dependency; the separate pylance
lockfile must list it so CI's `--locked` lint passes.
---
 python/Cargo.lock | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/Cargo.lock b/python/Cargo.lock
index c71f0f09b8c..c3d1fe07fa5 100644
--- a/python/Cargo.lock
+++ b/python/Cargo.lock
@@ -3993,6 +3993,7 @@ dependencies = [
  "async_cell",
  "aws-credential-types",
  "aws-sdk-dynamodb",
+ "bitpacking",
  "byteorder",
  "bytes",
  "chrono",

From 9f62f87a26e2813c152ea1bc414f8808a40baca4 Mon Sep 17 00:00:00 2001
From: Heng Ge <geheng001@gmail.com>
Date: Mon, 18 May 2026 22:21:02 -0700
Subject: [PATCH 37/37] chore(bench): move mem_wal benches under
 benches/mem_wal and scrub infra references

Relocate the FTS bench files out of the top-level bench/ folder into
rust/lance/benches/mem_wal/, with an fts/ subfolder for FTS-specific
benchmarks and drivers. Replace hard-coded S3 buckets, instance paths,
and AWS region exports in the driver scripts and bench doc comments
with generic placeholders and tmpdir-based defaults.
---
 rust/lance/Cargo.toml                         |   2 ++
 .../benches/mem_wal/fts}/LuceneFtsBench.java  | Bin
 .../{ => mem_wal/fts}/mem_wal_fineweb_fts.rs  |   5 ++--
 .../{ => mem_wal/fts}/mem_wal_fts_bench.rs    |   0
 .../benches/mem_wal/fts}/run_fineweb_fts.sh   |  26 +++++++++++-------
 .../benches/mem_wal/fts}/run_fts_compare.sh   |  26 ++++++++++--------
 .../mem_wal}/run_shard_writer_backpressure.sh |  21 ++++++++------
 .../mem_wal_shard_writer_backpressure.rs      |   3 +-
 8 files changed, 49 insertions(+), 34 deletions(-)
 rename {bench/lucene_fts_bench => rust/lance/benches/mem_wal/fts}/LuceneFtsBench.java (100%)
 rename rust/lance/benches/{ => mem_wal/fts}/mem_wal_fineweb_fts.rs (99%)
 rename rust/lance/benches/{ => mem_wal/fts}/mem_wal_fts_bench.rs (100%)
 rename {bench => rust/lance/benches/mem_wal/fts}/run_fineweb_fts.sh (78%)
 rename {bench => rust/lance/benches/mem_wal/fts}/run_fts_compare.sh (87%)
 rename {bench => rust/lance/benches/mem_wal}/run_shard_writer_backpressure.sh (86%)

diff --git a/rust/lance/Cargo.toml b/rust/lance/Cargo.toml
index 6a31d01dda8..9f055fc9154 100644
--- a/rust/lance/Cargo.toml
+++ b/rust/lance/Cargo.toml
@@ -239,10 +239,12 @@ harness = false
 
 [[bench]]
 name = "mem_wal_fineweb_fts"
+path = "benches/mem_wal/fts/mem_wal_fineweb_fts.rs"
 harness = false
 
 [[bench]]
 name = "mem_wal_fts_bench"
+path = "benches/mem_wal/fts/mem_wal_fts_bench.rs"
 harness = false
 
 [lints]
diff --git a/bench/lucene_fts_bench/LuceneFtsBench.java b/rust/lance/benches/mem_wal/fts/LuceneFtsBench.java
similarity index 100%
rename from bench/lucene_fts_bench/LuceneFtsBench.java
rename to rust/lance/benches/mem_wal/fts/LuceneFtsBench.java
diff --git a/rust/lance/benches/mem_wal_fineweb_fts.rs b/rust/lance/benches/mem_wal/fts/mem_wal_fineweb_fts.rs
similarity index 99%
rename from rust/lance/benches/mem_wal_fineweb_fts.rs
rename to rust/lance/benches/mem_wal/fts/mem_wal_fineweb_fts.rs
index 51f23a0d600..42df3e63fb1 100644
--- a/rust/lance/benches/mem_wal_fineweb_fts.rs
+++ b/rust/lance/benches/mem_wal/fts/mem_wal_fineweb_fts.rs
@@ -25,13 +25,12 @@
 //! Example:
 //!
 //! ```bash
-//! AWS_DEFAULT_REGION=us-east-1 \
 //! cargo bench -p lance --bench mem_wal_fineweb_fts -- \
 //!   --phase write --mode async_idx \
-//!   --uri s3://jack-devland-build/bench/mem-fts-fineweb/run1/w_async_idx_mt100k \
+//!   --uri /tmp/mem-fts-fineweb/run1/w_async_idx_mt100k \
 //!   --seed-rows 1000000 --batch-rows 1000 --calls 1000 \
 //!   --max-memtable-rows 100000 \
-//!   --cache-dir /mnt/data/fineweb --output result.json
+//!   --cache-dir /tmp/fineweb-cache --output result.json
 //! ```
 
 #![recursion_limit = "256"]
diff --git a/rust/lance/benches/mem_wal_fts_bench.rs b/rust/lance/benches/mem_wal/fts/mem_wal_fts_bench.rs
similarity index 100%
rename from rust/lance/benches/mem_wal_fts_bench.rs
rename to rust/lance/benches/mem_wal/fts/mem_wal_fts_bench.rs
diff --git a/bench/run_fineweb_fts.sh b/rust/lance/benches/mem_wal/fts/run_fineweb_fts.sh
similarity index 78%
rename from bench/run_fineweb_fts.sh
rename to rust/lance/benches/mem_wal/fts/run_fineweb_fts.sh
index c6dedd96cf9..8956ecb91eb 100755
--- a/bench/run_fineweb_fts.sh
+++ b/rust/lance/benches/mem_wal/fts/run_fineweb_fts.sh
@@ -8,31 +8,37 @@
 #               times the FTS queries, flushes, and replays on disk.
 #
 # Every config runs as its own process under a `timeout` watchdog, so a
-# hang costs one timeout window, not days. result.json is uploaded to S3.
+# hang costs one timeout window, not days. When DATASET_PREFIX points at
+# object storage, each result.json is also uploaded there.
 #
-# Usage: ./bench/run_fineweb_fts.sh [run_id]
+# Usage: rust/lance/benches/mem_wal/fts/run_fineweb_fts.sh [run_id]
+#
+# Env:
+#   DATASET_PREFIX  scratch dataset location (default <tmpdir>/mem-fts-fineweb)
+#   CACHE_DIR       FineWeb shard download cache (default <tmpdir>/lance-fineweb-cache)
 
 set -uo pipefail
 
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+REPO_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+cd "$REPO_ROOT"
 
 RUN_ID="${1:-$(date -u +%Y%m%dT%H%M%SZ)}"
-DATASET_PREFIX="${DATASET_PREFIX:-s3://jack-devland-build/bench/mem-fts-fineweb}"
+DATASET_PREFIX="${DATASET_PREFIX:-${TMPDIR:-/tmp}/mem-fts-fineweb}"
 SEED_ROWS="${SEED_ROWS:-1000000}"
 BATCH_ROWS="${BATCH_ROWS:-1000}"
 CALLS="${CALLS:-1000}"
-CACHE_DIR="${CACHE_DIR:-/mnt/data/fineweb}"
+CACHE_DIR="${CACHE_DIR:-${TMPDIR:-/tmp}/lance-fineweb-cache}"
 CONFIG_TIMEOUT="${CONFIG_TIMEOUT:-3600}"
-export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
 
-LOCAL_DIR="bench/results/${RUN_ID}"
-mkdir -p "$LOCAL_DIR"
+LOCAL_DIR="$REPO_ROOT/target/fineweb-fts-results/${RUN_ID}"
+mkdir -p "$LOCAL_DIR" "$CACHE_DIR"
 
-BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' -printf '%T@ %p\n' 2>/dev/null | sort -nr | head -1 | cut -d' ' -f2-)"
 if [ -z "$BIN" ]; then
     echo "building bench binary..."
     cargo bench -p lance --bench mem_wal_fineweb_fts --no-run
-    BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+    BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 -name 'mem_wal_fineweb_fts-*' ! -name '*.d' -printf '%T@ %p\n' 2>/dev/null | sort -nr | head -1 | cut -d' ' -f2-)"
 fi
 echo "bench binary: $BIN"
 echo "run id:       $RUN_ID"
diff --git a/bench/run_fts_compare.sh b/rust/lance/benches/mem_wal/fts/run_fts_compare.sh
similarity index 87%
rename from bench/run_fts_compare.sh
rename to rust/lance/benches/mem_wal/fts/run_fts_compare.sh
index 1374127464c..ab5617762bc 100755
--- a/bench/run_fts_compare.sh
+++ b/rust/lance/benches/mem_wal/fts/run_fts_compare.sh
@@ -6,30 +6,30 @@
 # size, runs each impl in Run A (pre-tokenized) and Run B (native
 # analyzers), and prints build/query/recall side by side.
 #
-# Usage: ./bench/run_fts_compare.sh [run_id]
+# Usage: rust/lance/benches/mem_wal/fts/run_fts_compare.sh [run_id]
 #
 # Env:
 #   SIZES        doc-count sweep (default "100000 500000 1000000")
 #   K            top-k (default 10)
 #   THREADS      query threads for the multi-thread QPS run
-#   LUCENE_DIR   Lucene checkout (default ~/oss/lucene)
 #   LUCENE_CP    pre-built Lucene classpath; if set, the Lucene build is skipped
+#   LUCENE_DIR   Lucene source checkout — built when LUCENE_CP is unset
 #   JAVA_HOME    JDK 25 home; if unset the script searches common locations
-#   CACHE_DIR    FineWeb shard cache (default /mnt/data/fineweb)
+#   CACHE_DIR    FineWeb shard download cache (default <tmpdir>/lance-fineweb-cache)
 
 set -uo pipefail
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
-REPO_ROOT="$(pwd -P)"
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+REPO_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+cd "$REPO_ROOT"
 
 RUN_ID="${1:-fts-compare-$(date -u +%Y%m%dT%H%M%SZ)}"
 SIZES="${SIZES:-100000 500000 1000000}"
 K="${K:-10}"
 THREADS="${THREADS:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 8)}"
-LUCENE_DIR="${LUCENE_DIR:-$HOME/oss/lucene}"
-CACHE_DIR="${CACHE_DIR:-/mnt/data/fineweb}"
-WORK="${WORK:-/tmp/fts_compare/$RUN_ID}"
-RESULT_DIR="$REPO_ROOT/bench/results/$RUN_ID"
-mkdir -p "$WORK" "$RESULT_DIR"
+CACHE_DIR="${CACHE_DIR:-${TMPDIR:-/tmp}/lance-fineweb-cache}"
+WORK="${WORK:-${TMPDIR:-/tmp}/fts_compare/$RUN_ID}"
+RESULT_DIR="$REPO_ROOT/target/fts-compare-results/$RUN_ID"
+mkdir -p "$WORK" "$RESULT_DIR" "$CACHE_DIR"
 
 # ---- locate JDK 25 ----
 if [ -z "${JAVA_HOME:-}" ]; then
@@ -47,6 +47,10 @@ echo "JDK: $($JAVA -version 2>&1 | head -1)"
 
 # ---- build Lucene classpath ----
 if [ -z "${LUCENE_CP:-}" ]; then
+    if [ -z "${LUCENE_DIR:-}" ]; then
+        echo "ERROR: set LUCENE_CP (prebuilt jars) or LUCENE_DIR (source checkout)" >&2
+        exit 1
+    fi
     echo "=== Building Lucene jars ($LUCENE_DIR) ==="
     ( cd "$LUCENE_DIR" && JAVA_HOME="$JAVA_HOME" ./gradlew -q \
         :lucene:core:jar :lucene:analysis:common:jar ) || {
@@ -72,7 +76,7 @@ echo "Lance bench: $LANCE_BIN"
 # ---- compile the Lucene bench ----
 echo "=== Compiling Lucene FTS bench ==="
 "$JAVAC" -cp "$LUCENE_CP" -d "$WORK" \
-    "$REPO_ROOT/bench/lucene_fts_bench/LuceneFtsBench.java" \
+    "$SCRIPT_DIR/LuceneFtsBench.java" \
     || { echo "ERROR: javac failed" >&2; exit 1; }
 
 mutual_overlap() {  # $1=topk file A  $2=topk file B  $3=k
diff --git a/bench/run_shard_writer_backpressure.sh b/rust/lance/benches/mem_wal/run_shard_writer_backpressure.sh
similarity index 86%
rename from bench/run_shard_writer_backpressure.sh
rename to rust/lance/benches/mem_wal/run_shard_writer_backpressure.sh
index 15ad60f2382..5d669b46cd7 100755
--- a/bench/run_shard_writer_backpressure.sh
+++ b/rust/lance/benches/mem_wal/run_shard_writer_backpressure.sh
@@ -6,19 +6,25 @@
 # vector and FTS numbers are directly comparable.
 #
 # Usage:
-#   INDEX_TYPE=fts    ./bench/run_shard_writer_backpressure.sh [run_id]
-#   INDEX_TYPE=vector ./bench/run_shard_writer_backpressure.sh [run_id]
+#   INDEX_TYPE=fts    rust/lance/benches/mem_wal/run_shard_writer_backpressure.sh [run_id]
+#   INDEX_TYPE=vector rust/lance/benches/mem_wal/run_shard_writer_backpressure.sh [run_id]
 #
 # Finds the max sustainable async-indexed throughput: the highest paced
 # target where puts never block (slow>=1s == 0) and the WAL flush queue
 # does not accumulate (tail queue delta ~0).
+#
+# Env:
+#   DATASET_PREFIX  scratch dataset location (default <tmpdir>/mem-wal-backpressure)
 
 set -uo pipefail
-cd "$(dirname "${BASH_SOURCE[0]}")/.."
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd -P)"
+REPO_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+cd "$REPO_ROOT"
 
 INDEX_TYPE="${INDEX_TYPE:-fts}"
 RUN_ID="${1:-bp-${INDEX_TYPE}-$(date -u +%Y%m%dT%H%M%SZ)}"
-DATASET_PREFIX="${DATASET_PREFIX:-s3://jack-devland-build/bench/mem-fts-fineweb}"
+DATASET_PREFIX="${DATASET_PREFIX:-${TMPDIR:-/tmp}/mem-wal-backpressure}"
 
 # Shared knobs — identical to the HNSW vector backpressure sweep so the
 # two index types are measured the same way.
@@ -34,7 +40,6 @@ MAX_WAL_BUFFER_SIZE="${MAX_WAL_BUFFER_SIZE:-52428800}"        # 50 MiB
 MAX_WAL_FLUSH_INTERVAL_MS="${MAX_WAL_FLUSH_INTERVAL_MS:-0}"
 SAMPLE_INTERVAL_MS="${SAMPLE_INTERVAL_MS:-1000}"
 CONFIG_TIMEOUT="${CONFIG_TIMEOUT:-2400}"
-export AWS_DEFAULT_REGION="${AWS_DEFAULT_REGION:-us-east-1}"
 
 # Paced async-indexed target sweep (rows/s). FTS indexing is heavier than
 # IVF/PQ so its sustainable ceiling is lower; the vector sweep can push
@@ -47,15 +52,15 @@ esac
 TARGETS="${TARGETS:-$TARGETS_DEFAULT}"
 
 CALLS=$(( ROWS / BATCH_ROWS ))
-LOCAL_DIR="bench/results/${RUN_ID}"
+LOCAL_DIR="$REPO_ROOT/target/mem-wal-backpressure-results/${RUN_ID}"
 mkdir -p "$LOCAL_DIR"
 
 BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 \
-    -name 'mem_wal_shard_writer_backpressure-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+    -name 'mem_wal_shard_writer_backpressure-*' ! -name '*.d' -printf '%T@ %p\n' 2>/dev/null | sort -nr | head -1 | cut -d' ' -f2-)"
 if [ -z "$BIN" ]; then
     cargo bench -p lance --bench mem_wal_shard_writer_backpressure --no-run
     BIN="$(find target/release/deps -maxdepth 1 -type f -perm -111 \
-        -name 'mem_wal_shard_writer_backpressure-*' ! -name '*.d' 2>/dev/null | sort | tail -1)"
+        -name 'mem_wal_shard_writer_backpressure-*' ! -name '*.d' -printf '%T@ %p\n' 2>/dev/null | sort -nr | head -1 | cut -d' ' -f2-)"
 fi
 echo "index_type=$INDEX_TYPE  bin=$BIN  run_id=$RUN_ID  rows=$ROWS"
 
diff --git a/rust/lance/benches/mem_wal_shard_writer_backpressure.rs b/rust/lance/benches/mem_wal_shard_writer_backpressure.rs
index 1e9be110040..d8e74def4d5 100644
--- a/rust/lance/benches/mem_wal_shard_writer_backpressure.rs
+++ b/rust/lance/benches/mem_wal_shard_writer_backpressure.rs
@@ -11,9 +11,8 @@
 //! Example:
 //!
 //! ```bash
-//! AWS_DEFAULT_REGION=us-east-1 \
 //! cargo bench -p lance --bench mem_wal_shard_writer_backpressure -- \
-//!   --uri s3://jack-devland-build/memwal-rust-native \
+//!   --uri /tmp/memwal-rust-native \
 //!   --mode async_idx \
 //!   --seed-rows 100000 \
 //!   --batch-rows 1000 \