diff --git a/src/sql/fts/posting_list.rs b/src/sql/fts/posting_list.rs index 99c0d3b..b67dbd7 100644 --- a/src/sql/fts/posting_list.rs +++ b/src/sql/fts/posting_list.rs @@ -76,6 +76,70 @@ impl PostingList { } } + /// Phase 8c — emit `(rowid, doc_len)` pairs for every indexed doc, + /// in ascending rowid order. The pager writes these into the FTS + /// index's doc-lengths sidecar cell; reload feeds them back to + /// [`Self::from_persisted_postings`]. + pub fn serialize_doc_lengths(&self) -> Vec<(i64, u32)> { + self.doc_lengths + .iter() + .map(|(id, len)| (*id, *len)) + .collect() + } + + /// Phase 8c — emit `(term, [(rowid, term_freq)])` triples in + /// lexicographic term order; per-term entries are in ascending + /// rowid order (the underlying `BTreeMap` already guarantees this). + /// One element per unique indexed term; pager writes one cell per + /// element. + pub fn serialize_postings(&self) -> Vec<(String, Vec<(i64, u32)>)> { + self.postings + .iter() + .map(|(term, postings)| { + let entries = postings.iter().map(|(id, freq)| (*id, *freq)).collect(); + (term.clone(), entries) + }) + .collect() + } + + /// Phase 8c — rebuild a `PostingList` directly from the persisted + /// doc-lengths sidecar + per-term postings. No tokenization runs; + /// the resulting index is byte-equivalent to what was saved + /// (assuming the input came from `serialize_*`). + /// + /// `doc_lengths` is the full `(rowid, doc_len)` map written into + /// the sidecar cell. `postings` is one `(term, [(rowid, tf)])` + /// element per term cell. 
+ pub fn from_persisted_postings<I, J>(doc_lengths: I, postings: J) -> Self + where + I: IntoIterator<Item = (i64, u32)>, + J: IntoIterator<Item = (String, Vec<(i64, u32)>)>, + { + let mut doc_lengths_map: BTreeMap<i64, u32> = BTreeMap::new(); + let mut total_tokens: u64 = 0; + for (rowid, len) in doc_lengths { + doc_lengths_map.insert(rowid, len); + total_tokens += len as u64; + } + + let mut postings_map: BTreeMap<String, BTreeMap<i64, u32>> = BTreeMap::new(); + for (term, entries) in postings { + let inner: BTreeMap<i64, u32> = entries.into_iter().collect(); + // An empty posting list shouldn't be persisted, but if it + // somehow was, drop it on load — `remove()` would have + // pruned the same way at runtime. + if !inner.is_empty() { + postings_map.insert(term, inner); + } + } + + Self { + postings: postings_map, + doc_lengths: doc_lengths_map, + total_tokens, + } + } + /// Tokenize `text` and add its postings under `rowid`. If `rowid` is /// already indexed, its previous postings are removed first — i.e. /// `insert` is idempotent for re-indexing the same row. @@ -413,6 +477,33 @@ mod tests { assert_eq!(res[0].0, 1); } + #[test] + fn serialize_round_trips_through_from_persisted() { + // Phase 8c — the (de)serialize pair must reproduce the exact + // in-memory state that was saved. Emptiness, multi-term, and + // re-insert idempotence all need to round-trip. + let mut pl = PostingList::new(); + pl.insert(1, "rust embedded database"); + pl.insert(2, "rust web framework"); + pl.insert(3, ""); // zero-token doc — exercises the sidecar + pl.insert(4, "rust rust rust embedded power"); + + let docs = pl.serialize_doc_lengths(); + let postings = pl.serialize_postings(); + let roundtripped = PostingList::from_persisted_postings(docs, postings); + + assert_eq!(roundtripped.len(), pl.len(), "doc count"); + assert_eq!(roundtripped.avg_doc_len(), pl.avg_doc_len(), "avg_doc_len"); + // Every query result + score must match. 
+ let q = pl.query("rust", &Bm25Params::default()); + let q2 = roundtripped.query("rust", &Bm25Params::default()); + assert_eq!(q, q2, "query results must match after round-trip"); + // Zero-token doc 3 stays in the corpus stats so total_docs is + // honest, even though it'll never match a query. + assert!(roundtripped.matches(1, "rust")); + assert!(!roundtripped.matches(3, "rust")); + } + #[test] fn synthetic_thousand_doc_corpus_top_ten_is_stable() { // 1000 deterministic docs. Most are noise; only 5 contain the diff --git a/src/sql/pager/cell.rs b/src/sql/pager/cell.rs index 8cd57aa..345a427 100644 --- a/src/sql/pager/cell.rs +++ b/src/sql/pager/cell.rs @@ -73,6 +73,30 @@ pub const KIND_INDEX: u8 = 0x04; /// the first varint after the kind tag — exactly the `node_id` here. pub const KIND_HNSW: u8 = 0x05; +/// Phase 8c: a single FTS posting-list cell. Body layout (after the +/// shared `cell_length | kind_tag` prefix): +/// +/// ```text +/// cell_id zigzag varint sequential id assigned at save time; +/// acts as the B-Tree slot key so +/// `peek_rowid` works uniformly +/// term_len varint length of the term in bytes +/// (0 → this cell is the doc-lengths +/// sidecar, value below is doc_len) +/// term term_len bytes ASCII-lowercased term (per Phase 8 Q3) +/// count varint number of (rowid, value) pairs +/// for each: +/// rowid zigzag varint the row this posting refers to +/// value varint term frequency for this (term, row), +/// or doc length when term_len == 0 +/// ``` +/// +/// One sidecar cell with `term_len == 0` holds `(rowid, doc_len)` +/// pairs so reload reproduces every indexed doc — including any with +/// zero-token text — without re-tokenizing. All remaining cells are +/// posting cells, one per term. +pub const KIND_FTS_POSTING: u8 = 0x06; + /// Value type tag stored in each non-NULL value block. 
pub mod tag { pub const INTEGER: u8 = 0; diff --git a/src/sql/pager/fts_cell.rs b/src/sql/pager/fts_cell.rs new file mode 100644 index 0000000..41d7896 --- /dev/null +++ b/src/sql/pager/fts_cell.rs @@ -0,0 +1,317 @@ +//! On-disk format for one FTS posting list (Phase 8c). +//! +//! Each cell carries either a posting list for one term — `(term, +//! [(rowid, term_freq), ...])` — or, in a single sidecar cell with +//! `term.is_empty()`, the per-doc length map `(rowid, doc_len)`. Cells +//! live on `TableLeaf` pages identical to regular table data trees, so +//! the slot directory + sibling `next_page` chain + interior-page +//! mechanics from Phase 3d work without FTS-specific page plumbing. +//! +//! Reusing the table-tree shape lets `Cell::peek_rowid` work uniformly +//! across cell kinds: it skips `cell_length | kind_tag` and reads the +//! first varint, which is `cell_id` here. `cell_id` is a sequential +//! integer assigned at save time (1, 2, 3 …), not a row identifier — +//! the B-Tree just needs an ordered key for slot directory binary +//! search; the actual data is keyed by `term`. +//! +//! ```text +//! cell_length varint bytes after this field +//! kind_tag u8 = 0x06 (KIND_FTS_POSTING) +//! cell_id zigzag varint sequential B-Tree slot key +//! term_len varint length of `term` in bytes; 0 → sidecar +//! term term_len bytes ASCII-lowercased term per Phase 8 Q3 +//! count varint number of (rowid, value) pairs +//! for each: +//! rowid zigzag varint the row this entry belongs to +//! value varint term frequency, or doc length when +//! term_len == 0 (sidecar cell) +//! ``` +//! +//! One sidecar cell suffices for the entire index: it lists every +//! indexed doc with its tokenized length, including the zero-length +//! corner case (a row whose text tokenizes to nothing — still indexed +//! so `len()` and `total_docs` round-trip). Posting cells follow. +//! +//! No null bitmap, no per-field type tag — every field has a fixed +//! type. 
The encoding is deliberately minimal because long posting +//! lists dominate disk usage on real corpora. + +use crate::error::{Result, SQLRiteError}; +use crate::sql::pager::cell::KIND_FTS_POSTING; +use crate::sql::pager::varint; + +/// One FTS posting list cell — either a per-term postings entry or the +/// single doc-lengths sidecar (when `term.is_empty()`). +#[derive(Debug, Clone, PartialEq)] +pub struct FtsPostingCell { + /// Sequential id assigned at save time. Acts as the B-Tree slot + /// directory key; never persisted as part of the index logic. + pub cell_id: i64, + /// Lowercased ASCII term. Empty on the doc-lengths sidecar. + pub term: String, + /// `(rowid, value)` pairs. `value` is term frequency for posting + /// cells, doc length for the sidecar. + pub entries: Vec<(i64, u32)>, +} + +impl FtsPostingCell { + pub fn posting(cell_id: i64, term: String, entries: Vec<(i64, u32)>) -> Self { + Self { + cell_id, + term, + entries, + } + } + + /// Constructs the doc-lengths sidecar cell (term left empty). + pub fn doc_lengths(cell_id: i64, entries: Vec<(i64, u32)>) -> Self { + Self { + cell_id, + term: String::new(), + entries, + } + } + + /// Encodes the cell into a freshly-allocated `Vec<u8>`. The result + /// starts with the shared `cell_length | kind_tag` prefix and is + /// directly usable as a slot-directory entry on a `TableLeaf`-style + /// page. + pub fn encode(&self) -> Result<Vec<u8>> { + // Body capacity guess: 1 (kind) + 10 (cell_id) + 5 (term_len) + // + term + 5 (count) + per-pair 10 (rowid) + 5 (value). 
+ let pair_bytes = self.entries.len() * 15; + let mut body = Vec::with_capacity(1 + 10 + 5 + self.term.len() + 5 + pair_bytes); + + body.push(KIND_FTS_POSTING); + varint::write_i64(&mut body, self.cell_id); + varint::write_u64(&mut body, self.term.len() as u64); + body.extend_from_slice(self.term.as_bytes()); + varint::write_u64(&mut body, self.entries.len() as u64); + for (rowid, value) in &self.entries { + varint::write_i64(&mut body, *rowid); + varint::write_u64(&mut body, *value as u64); + } + + let mut out = Vec::with_capacity(body.len() + varint::MAX_VARINT_BYTES); + varint::write_u64(&mut out, body.len() as u64); + out.extend_from_slice(&body); + Ok(out) + } + + /// Decodes one cell starting at `pos`. Returns the cell plus the + /// total bytes consumed (including the leading length varint). + pub fn decode(buf: &[u8], pos: usize) -> Result<(FtsPostingCell, usize)> { + let (body_len, len_bytes) = varint::read_u64(buf, pos)?; + let body_start = pos + len_bytes; + let body_end = body_start + .checked_add(body_len as usize) + .ok_or_else(|| SQLRiteError::Internal("FTS cell length overflow".to_string()))?; + if body_end > buf.len() { + return Err(SQLRiteError::Internal(format!( + "FTS cell extends past buffer: needs {body_start}..{body_end}, have {}", + buf.len() + ))); + } + let body = &buf[body_start..body_end]; + if body.first().copied() != Some(KIND_FTS_POSTING) { + return Err(SQLRiteError::Internal(format!( + "FtsPostingCell::decode called on non-FTS entry (kind_tag = {:#x})", + body.first().copied().unwrap_or(0) + ))); + } + + let mut cur = 1usize; + let (cell_id, n) = varint::read_i64(body, cur)?; + cur += n; + + let (term_len, n) = varint::read_u64(body, cur)?; + cur += n; + // Sanity: a single term shouldn't exceed a few KB even with + // pathological input. The whole cell body sits inside one page + // (~4 KiB), so a giant term length is almost certainly a + // corrupt cell — bail before allocating. 
+ if term_len as usize > body.len().saturating_sub(cur) { + return Err(SQLRiteError::Internal(format!( + "FTS cell {cell_id}: term_len {term_len} exceeds remaining body \ + ({}) — corrupt cell?", + body.len() - cur + ))); + } + let term_bytes = &body[cur..cur + term_len as usize]; + cur += term_len as usize; + let term = std::str::from_utf8(term_bytes) + .map_err(|e| { + SQLRiteError::Internal(format!("FTS cell {cell_id}: term not valid UTF-8: {e}")) + })? + .to_string(); + + let (count, n) = varint::read_u64(body, cur)?; + cur += n; + // Sanity: a single posting list shouldn't exceed corpus size. + // 8 GiB worth of entries (8 bytes per rowid alone) is firmly in + // "corrupt cell" territory. + if count > 1 << 28 { + return Err(SQLRiteError::Internal(format!( + "FTS cell {cell_id}: claims {count} entries (>2^28) — corrupt cell?" + ))); + } + let mut entries = Vec::with_capacity(count as usize); + for _ in 0..count { + let (rowid, n) = varint::read_i64(body, cur)?; + cur += n; + let (value_u64, n) = varint::read_u64(body, cur)?; + cur += n; + // Term frequencies and doc lengths fit in u32 (a doc with + // 4 billion tokens is implausible). Reject with a clean + // error instead of silently truncating. + if value_u64 > u32::MAX as u64 { + return Err(SQLRiteError::Internal(format!( + "FTS cell {cell_id}: value {value_u64} exceeds u32::MAX — corrupt cell?" + ))); + } + entries.push((rowid, value_u64 as u32)); + } + + if cur != body.len() { + return Err(SQLRiteError::Internal(format!( + "FTS cell {cell_id} had {} trailing bytes", + body.len() - cur + ))); + } + + Ok(( + FtsPostingCell { + cell_id, + term, + entries, + }, + len_bytes + body_len as usize, + )) + } + + /// True iff this cell is the doc-lengths sidecar. 
+ pub fn is_doc_lengths(&self) -> bool { + self.term.is_empty() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + fn round_trip(cell: &FtsPostingCell) { + let bytes = cell.encode().expect("encode"); + let (decoded, consumed) = FtsPostingCell::decode(&bytes, 0).expect("decode"); + assert_eq!( + consumed, + bytes.len(), + "decode should consume the whole cell" + ); + assert_eq!(&decoded, cell); + } + + #[test] + fn posting_cell_round_trips() { + let cell = FtsPostingCell::posting(7, "rust".to_string(), vec![(1, 2), (3, 1), (5, 7)]); + round_trip(&cell); + } + + #[test] + fn doc_lengths_sidecar_round_trips() { + let cell = FtsPostingCell::doc_lengths(1, vec![(1, 12), (2, 20), (3, 0), (4, 7)]); + assert!(cell.is_doc_lengths()); + round_trip(&cell); + } + + #[test] + fn empty_postings_round_trips() { + // Edge case: an FTS cell with zero entries shouldn't be + // emitted in practice (the term would be pruned by remove()), + // but the format must still round-trip. + let cell = FtsPostingCell::posting(2, "ghost".to_string(), vec![]); + round_trip(&cell); + } + + #[test] + fn negative_and_large_rowids_round_trip() { + // Rowids are zigzag-encoded; cover both signs. + round_trip(&FtsPostingCell::posting( + 3, + "x".to_string(), + vec![(-1, 1), (i64::MAX, 99), (i64::MIN, 1)], + )); + } + + #[test] + fn long_term_round_trips() { + // A 1024-byte term — well within page capacity. Tokenizer + // wouldn't actually emit this in practice, but encode/decode + // must still survive. + let term = "a".repeat(1024); + let cell = FtsPostingCell::posting(4, term, vec![(1, 1)]); + round_trip(&cell); + } + + #[test] + fn long_posting_list_round_trips() { + // 5000 entries — exercises the count + pair-loop paths. 
+ let entries: Vec<(i64, u32)> = (0..5000_i64).map(|i| (i, ((i * 3) as u32) + 1)).collect(); + let cell = FtsPostingCell::posting(5, "common".to_string(), entries); + round_trip(&cell); + } + + #[test] + fn decode_rejects_wrong_kind_tag() { + let mut bad = Vec::new(); + varint::write_u64(&mut bad, 1); // body_len + bad.push(0x01); // KIND_LOCAL, not KIND_FTS_POSTING + let err = FtsPostingCell::decode(&bad, 0).unwrap_err(); + assert!(format!("{err}").contains("non-FTS entry")); + } + + #[test] + fn decode_rejects_truncated_buffer() { + let cell = FtsPostingCell::posting(1, "rust".to_string(), vec![(1, 2), (5, 3)]); + let bytes = cell.encode().expect("encode"); + for chop in 1..=3 { + let truncated = &bytes[..bytes.len() - chop]; + assert!( + FtsPostingCell::decode(truncated, 0).is_err(), + "expected error chopping {chop} byte(s) from end of {} byte cell", + bytes.len() + ); + } + } + + #[test] + fn decode_rejects_invalid_utf8_term() { + // Hand-craft a cell whose term bytes aren't valid UTF-8. + let mut body = Vec::new(); + body.push(KIND_FTS_POSTING); + varint::write_i64(&mut body, 1); // cell_id + varint::write_u64(&mut body, 2); // term_len + body.extend_from_slice(&[0xFF, 0xFE]); // not valid UTF-8 + varint::write_u64(&mut body, 0); // count = 0 + let mut out = Vec::new(); + varint::write_u64(&mut out, body.len() as u64); + out.extend_from_slice(&body); + let err = FtsPostingCell::decode(&out, 0).unwrap_err(); + assert!(format!("{err}").to_lowercase().contains("utf-8")); + } + + #[test] + fn decode_rejects_implausible_count() { + // Hand-craft a cell with count = 2^29 (above the corruption sanity bound). 
+ let mut body = Vec::new(); + body.push(KIND_FTS_POSTING); + varint::write_i64(&mut body, 1); + varint::write_u64(&mut body, 4); + body.extend_from_slice(b"term"); + varint::write_u64(&mut body, 1u64 << 29); + let mut out = Vec::new(); + varint::write_u64(&mut out, body.len() as u64); + out.extend_from_slice(&body); + let err = FtsPostingCell::decode(&out, 0).unwrap_err(); + assert!(format!("{err}").to_lowercase().contains("corrupt")); + } +} diff --git a/src/sql/pager/header.rs b/src/sql/pager/header.rs index 4d2a539..d031b58 100644 --- a/src/sql/pager/header.rs +++ b/src/sql/pager/header.rs @@ -24,23 +24,38 @@ pub const MAGIC: &[u8; 16] = b"SQLRiteFormat\0\0\0"; /// the new `KIND_INDEX` format. /// - Version 4 (Phase 7): cell encoding gains the `KIND_VECTOR` value /// tag (length-prefixed dense f32 array) for the new `VECTOR(N)` -/// column type. Per the Phase 7 plan (`docs/phase-7-plan.md` Q8), -/// later Phase 7 sub-phases (JSON, HNSW indexes) will add their own -/// value/cell tags inside this same v4 envelope — no v5 mid-Phase-7. -pub const FORMAT_VERSION: u16 = 4; +/// column type, plus the `KIND_HNSW` cell tag for vector ANN +/// indexes. All Phase 7 storage additions (VECTOR cells, JSON cells, +/// HNSW index nodes) live inside the v4 envelope. +/// - Version 5 (Phase 8c): adds the `KIND_FTS_POSTING` cell tag for +/// persisted FTS posting lists. Bumped **on demand** — a database +/// without any FTS index keeps writing v4. The first save with at +/// least one FTS index attached writes v5 instead. Decoders accept +/// both v4 and v5; v5 reading a v4-shaped DB just sees zero FTS +/// indexes in `sqlrite_master`. See [Phase 8 plan Q10]. +pub const FORMAT_VERSION_V4: u16 = 4; +pub const FORMAT_VERSION_V5: u16 = 5; +/// The version a brand-new write defaults to when no FTS index forces +/// a bump. Existing databases keep their on-disk version unchanged +/// across reads + non-FTS writes; FTS-bearing saves switch to V5. 
+pub const FORMAT_VERSION_BASELINE: u16 = FORMAT_VERSION_V4; /// Parsed header. `page_count` includes page 0 itself. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub struct DbHeader { pub page_count: u32, pub schema_root_page: u32, + /// On-disk format version this header carries. Tracked explicitly + /// so save can preserve a v4 file as v4 (no FTS) or bump it to v5 + /// (FTS present), per Phase 8c's on-demand bump strategy. + pub format_version: u16, } /// Encodes the header into a `PAGE_SIZE`-sized buffer. pub fn encode_header(h: &DbHeader) -> [u8; PAGE_SIZE] { let mut buf = [0u8; PAGE_SIZE]; buf[0..16].copy_from_slice(MAGIC); - buf[16..18].copy_from_slice(&FORMAT_VERSION.to_le_bytes()); + buf[16..18].copy_from_slice(&h.format_version.to_le_bytes()); buf[18..20].copy_from_slice(&(PAGE_SIZE as u16).to_le_bytes()); buf[20..24].copy_from_slice(&h.page_count.to_le_bytes()); buf[24..28].copy_from_slice(&h.schema_root_page.to_le_bytes()); @@ -49,6 +64,8 @@ pub fn encode_header(h: &DbHeader) -> [u8; PAGE_SIZE] { /// Decodes the header from a `PAGE_SIZE`-sized buffer. Returns an error if /// magic bytes, format version, or page size don't match what we wrote. +/// Both V4 and V5 are accepted; the result's `format_version` echoes +/// what was on disk so a no-op resave preserves it. 
pub fn decode_header(buf: &[u8]) -> Result<DbHeader> { if buf.len() != PAGE_SIZE { return Err(SQLRiteError::Internal(format!( @@ -62,9 +79,10 @@ pub fn decode_header(buf: &[u8]) -> Result<DbHeader> { )); } let version = u16::from_le_bytes(buf[16..18].try_into().unwrap()); - if version != FORMAT_VERSION { + if version != FORMAT_VERSION_V4 && version != FORMAT_VERSION_V5 { return Err(SQLRiteError::General(format!( - "unsupported SQLRite format version {version}; this build understands {FORMAT_VERSION}" + "unsupported SQLRite format version {version}; this build understands \ + {FORMAT_VERSION_V4} and {FORMAT_VERSION_V5}" ))); } let page_size = u16::from_le_bytes(buf[18..20].try_into().unwrap()) as usize; @@ -78,5 +96,6 @@ pub fn decode_header(buf: &[u8]) -> Result<DbHeader> { Ok(DbHeader { page_count, schema_root_page, + format_version: version, }) } diff --git a/src/sql/pager/mod.rs b/src/sql/pager/mod.rs index 3a91e83..11e239b 100644 --- a/src/sql/pager/mod.rs +++ b/src/sql/pager/mod.rs @@ -32,6 +32,8 @@ #[allow(dead_code)] pub mod cell; pub mod file; +#[allow(dead_code)] +pub mod fts_cell; pub mod header; #[allow(dead_code)] pub mod hnsw_cell; @@ -166,7 +168,7 @@ pub fn open_database_with_mode(path: &Path, db_name: String, mode: AccessMode) - if create_index_sql_uses_hnsw(&row.sql) { rebuild_hnsw_index(&mut db, &pager, &row)?; } else if create_index_sql_uses_fts(&row.sql) { - rebuild_fts_index(&mut db, &row)?; + rebuild_fts_index(&mut db, &pager, &row)?; } else { attach_index(&mut db, &pager, row)?; } @@ -291,10 +293,15 @@ pub fn save_database(db: &mut Database, path: &Path) -> Result<()> { }); } - // 2c. Phase 8b — persist a sqlrite_master entry for each FTS index - // so it's replayed on open. The posting list itself isn't on - // disk yet (Phase 8c) — `rootpage = 0` signals replay-from-rows - // to `rebuild_fts_index`. Mirrors HNSW's pre-7d.3 shape. + // 2c. Phase 8c — persist FTS posting lists as their own + // cell-encoded page trees, with the rootpage recorded in + // sqlrite_master. 
Reopen loads the postings back from cells + // (fast, exact match) instead of re-tokenizing rows. + // + // Dirty indexes (set by DELETE / UPDATE-on-text-col) are + // rebuilt from current rows BEFORE staging by + // `rebuild_dirty_fts_indexes`, so the on-disk tree reflects + // the current row set. let mut fts_entries: Vec<(&Table, &crate::sql::db::table::FtsIndexEntry)> = Vec::new(); for table in db.tables.values() { for entry in &table.fts_indexes { @@ -303,7 +310,10 @@ pub fn save_database(db: &mut Database, path: &Path) -> Result<()> { } fts_entries .sort_by(|(ta, ea), (tb, eb)| ta.tb_name.cmp(&tb.tb_name).then(ea.name.cmp(&eb.name))); + let any_fts = !fts_entries.is_empty(); for (table, entry) in fts_entries { + let (rootpage, new_next) = stage_fts_btree(&mut pager, &entry.index, next_free_page)?; + next_free_page = new_next; master_rows.push(CatalogEntry { kind: "index".into(), name: entry.name.clone(), @@ -311,7 +321,7 @@ pub fn save_database(db: &mut Database, path: &Path) -> Result<()> { "CREATE INDEX {} ON {} USING fts ({})", entry.name, table.tb_name, entry.column_name ), - rootpage: 0, + rootpage, last_rowid: 0, }); } @@ -335,9 +345,20 @@ pub fn save_database(db: &mut Database, path: &Path) -> Result<()> { let (master_root, master_next) = stage_table_btree(&mut pager, &master, next_free_page)?; next_free_page = master_next; + // Phase 8c — on-demand v4→v5 file-format bump per Q10. If any FTS + // index attached to the database, write v5; otherwise preserve the + // pre-existing version (v4 for files born before this build, or + // a previously-promoted v5 file). Reads accept both. 
+ let format_version = if any_fts { + crate::sql::pager::header::FORMAT_VERSION_V5 + } else { + pager.header().format_version + }; + pager.commit(DbHeader { page_count: next_free_page, schema_root_page: master_root, + format_version, })?; if same_path { @@ -707,12 +728,22 @@ fn create_index_sql_uses_fts(sql: &str) -> bool { matches!(using, Some(IndexType::Custom(ident)) if ident.value.eq_ignore_ascii_case("fts")) } -/// Phase 8b — replays a `CREATE INDEX … USING fts(...)` statement on -/// database open to rebuild its in-memory `PostingList` from current -/// rows. Mirrors the `rootpage == 0` arm of [`rebuild_hnsw_index`]. -/// Persistence of the posting lists themselves is Phase 8c. -fn rebuild_fts_index(db: &mut Database, row: &IndexCatalogRow) -> Result<()> { +/// Phase 8c — loads (or rebuilds) an FTS index on database open. Two +/// paths mirror [`rebuild_hnsw_index`]: +/// +/// - **rootpage != 0** (Phase 8c default): the posting list is +/// persisted as cell-encoded pages. Read every cell directly via +/// [`load_fts_postings`] and reconstruct the index — no +/// re-tokenization, exact bit-for-bit reproduction. +/// +/// - **rootpage == 0** (compatibility): no on-disk postings, e.g. +/// for files saved by Phase 8b before persistence landed. Replay +/// the CREATE INDEX SQL through `execute_create_index`, which +/// walks the table's current rows and tokenizes them fresh. +fn rebuild_fts_index(db: &mut Database, pager: &Pager, row: &IndexCatalogRow) -> Result<()> { + use crate::sql::db::table::FtsIndexEntry; use crate::sql::executor::execute_create_index; + use crate::sql::fts::PostingList; use sqlparser::ast::Statement; let dialect = SQLiteDialect {}; @@ -723,10 +754,67 @@ fn rebuild_fts_index(db: &mut Database, row: &IndexCatalogRow) -> Result<()> { row.sql ))); }; - execute_create_index(&stmt, db)?; + + if row.rootpage == 0 { + // Compatibility path — no persisted postings; replay rows. 
+ execute_create_index(&stmt, db)?; + return Ok(()); + } + + let (doc_lengths, postings) = load_fts_postings(pager, row.rootpage)?; + let index = PostingList::from_persisted_postings(doc_lengths, postings); + let (tbl_name, col_name) = parse_fts_create_index_sql(&row.sql)?; + let table_mut = db.get_table_mut(tbl_name.clone()).map_err(|_| { + SQLRiteError::Internal(format!( + "FTS index '{}' references unknown table '{tbl_name}'", + row.name + )) + })?; + table_mut.fts_indexes.push(FtsIndexEntry { + name: row.name.clone(), + column_name: col_name, + index, + needs_rebuild: false, + }); Ok(()) } +/// Pulls (table_name, column_name) out of a `CREATE INDEX … USING fts(col)` +/// SQL string. Same shape as `parse_hnsw_create_index_sql`. +fn parse_fts_create_index_sql(sql: &str) -> Result<(String, String)> { + use sqlparser::ast::{CreateIndex, Expr, Statement}; + + let dialect = SQLiteDialect {}; + let mut ast = Parser::parse_sql(&dialect, sql).map_err(SQLRiteError::from)?; + let Some(Statement::CreateIndex(CreateIndex { + table_name, + columns, + .. + })) = ast.pop() + else { + return Err(SQLRiteError::Internal(format!( + "sqlrite_master FTS row's SQL isn't a CREATE INDEX: {sql}" + ))); + }; + if columns.len() != 1 { + return Err(SQLRiteError::NotImplemented( + "multi-column FTS indexes aren't supported yet".to_string(), + )); + } + let col = match &columns[0].column.expr { + Expr::Identifier(ident) => ident.value.clone(), + Expr::CompoundIdentifier(parts) => { + parts.last().map(|p| p.value.clone()).unwrap_or_default() + } + other => { + return Err(SQLRiteError::Internal(format!( + "FTS CREATE INDEX has unexpected column expr: {other:?}" + ))); + } + }; + Ok((table_name.to_string(), col)) +} + /// Loads (or rebuilds) an HNSW index on database open. 
Two paths: /// /// - **rootpage != 0** (Phase 7d.3 default): the graph is persisted @@ -1094,6 +1182,156 @@ fn stage_hnsw_btree( Ok((level[0].0, next_free_page)) } +/// Phase 8c — stage one FTS index as a `TableLeaf`-shaped B-Tree. +/// Mirrors `stage_hnsw_btree` (sibling-chained leaves, optional interior +/// levels). Returns `(root_page, next_free_page)`. Each leaf is filled +/// with `KIND_FTS_POSTING` cells: one sidecar cell holding the +/// doc-lengths map, then one cell per term in lexicographic order. +fn stage_fts_btree( + pager: &mut Pager, + idx: &crate::sql::fts::PostingList, + start_page: u32, +) -> Result<(u32, u32)> { + let (leaves, mut next_free_page) = stage_fts_leaves(pager, idx, start_page)?; + if leaves.len() == 1 { + return Ok((leaves[0].0, next_free_page)); + } + let mut level: Vec<(u32, i64)> = leaves; + while level.len() > 1 { + let (next_level, new_next_free) = stage_interior_level(pager, &level, next_free_page)?; + next_free_page = new_next_free; + level = next_level; + } + Ok((level[0].0, next_free_page)) } + +/// Packs FTS posting cells into a sibling-chained run of `TableLeaf` +/// pages. Cell layout: a single doc-lengths sidecar at `cell_id = 1`, +/// followed by one cell per term in lexicographic order with +/// `cell_id = 2..=N + 1`. Sequential ids keep the slot directory's +/// rowid ordering valid (the `cell_id` field is what `peek_rowid` +/// returns). +fn stage_fts_leaves( + pager: &mut Pager, + idx: &crate::sql::fts::PostingList, + start_page: u32, +) -> Result<(Vec<(u32, i64)>, u32)> { + use crate::sql::pager::fts_cell::FtsPostingCell; + + let mut leaves: Vec<(u32, i64)> = Vec::new(); + let mut current_leaf = TablePage::empty(); + let mut current_leaf_page = start_page; + let mut current_max_rowid: Option<i64> = None; + let mut next_free_page = start_page + 1; + + // Build the cell sequence: sidecar first, then per-term cells. 
The + // sidecar always exists (even on an empty index) so reload sees a + // canonical "this index was persisted" marker in slot 0. + let mut cell_id: i64 = 1; + let mut cells: Vec<FtsPostingCell> = Vec::new(); + cells.push(FtsPostingCell::doc_lengths( + cell_id, + idx.serialize_doc_lengths(), + )); + for (term, entries) in idx.serialize_postings() { + cell_id += 1; + cells.push(FtsPostingCell::posting(cell_id, term, entries)); + } + + for cell in cells { + let entry_bytes = cell.encode()?; + + if !current_leaf.would_fit(entry_bytes.len()) { + let next_leaf_page_num = next_free_page; + emit_leaf(pager, current_leaf_page, &current_leaf, next_leaf_page_num); + leaves.push((current_leaf_page, current_max_rowid.unwrap_or(i64::MIN))); + current_leaf = TablePage::empty(); + current_leaf_page = next_leaf_page_num; + next_free_page += 1; + + if !current_leaf.would_fit(entry_bytes.len()) { + // A single posting cell exceeds page capacity. Phase + // 8c MVP doesn't chain via overflow cells (the plan + // notes this as a stretch goal); surface a clear + // error so users know which term tripped it. + return Err(SQLRiteError::Internal(format!( + "FTS posting cell {} of {} bytes exceeds empty-page capacity {} \ + (term too long or too many postings; overflow chaining is Phase 8.1)", + cell.cell_id, + entry_bytes.len(), + current_leaf.free_space() + ))); + } + } + current_leaf.insert_entry(cell.cell_id, &entry_bytes)?; + current_max_rowid = Some(cell.cell_id); + } + + emit_leaf(pager, current_leaf_page, &current_leaf, 0); + leaves.push((current_leaf_page, current_max_rowid.unwrap_or(i64::MIN))); + Ok((leaves, next_free_page)) +} + +/// (rowid, value) pairs as decoded from a single FTS cell — value is +/// either term frequency (posting cell) or doc length (sidecar cell). +type FtsEntries = Vec<(i64, u32)>; +/// (term, posting list) pairs as decoded from non-sidecar FTS cells. 
+type FtsPostings = Vec<(String, FtsEntries)>; + +/// Phase 8c — read every cell of an FTS index from `root_page` back +/// into the `(doc_lengths, postings)` shape `PostingList::from_persisted_postings` +/// expects. Mirrors `load_hnsw_nodes`: leftmost-leaf descent, walk the +/// sibling chain, decode each slot. +fn load_fts_postings(pager: &Pager, root_page: u32) -> Result<(FtsEntries, FtsPostings)> { + use crate::sql::pager::fts_cell::FtsPostingCell; + + let mut doc_lengths: Vec<(i64, u32)> = Vec::new(); + let mut postings: Vec<(String, Vec<(i64, u32)>)> = Vec::new(); + let mut saw_sidecar = false; + + let first_leaf = find_leftmost_leaf(pager, root_page)?; + let mut current = first_leaf; + while current != 0 { + let page_buf = pager + .read_page(current) + .ok_or_else(|| SQLRiteError::Internal(format!("missing FTS leaf page {current}")))?; + if page_buf[0] != PageType::TableLeaf as u8 { + return Err(SQLRiteError::Internal(format!( + "page {current} tagged {} but expected TableLeaf (FTS)", + page_buf[0] + ))); + } + let next_leaf = u32::from_le_bytes(page_buf[1..5].try_into().unwrap()); + let payload: &[u8; PAYLOAD_PER_PAGE] = (&page_buf[PAGE_HEADER_SIZE..]) + .try_into() + .map_err(|_| SQLRiteError::Internal("FTS leaf payload size".to_string()))?; + let leaf = TablePage::from_bytes(payload); + for slot in 0..leaf.slot_count() { + let offset = leaf.slot_offset_raw(slot)?; + let (cell, _) = FtsPostingCell::decode(leaf.as_bytes(), offset)?; + if cell.is_doc_lengths() { + if saw_sidecar { + return Err(SQLRiteError::Internal( + "FTS index has more than one doc-lengths sidecar cell".to_string(), + )); + } + saw_sidecar = true; + doc_lengths = cell.entries; + } else { + postings.push((cell.term, cell.entries)); + } + } + current = next_leaf; + } + + if !saw_sidecar { + return Err(SQLRiteError::Internal( + "FTS index missing doc-lengths sidecar cell — corrupt or truncated tree".to_string(), + )); + } + Ok((doc_lengths, postings)) +} + /// Packs HNSW nodes into a 
sibling-chained run of `TableLeaf` pages. /// `serialize_nodes` already returns nodes in ascending node_id order, /// so the slot directory's rowid ordering stays valid. @@ -1633,10 +1871,10 @@ mod tests { #[test] fn round_trip_rebuilds_fts_index_from_create_sql() { - // Phase 8b — FTS indexes don't yet persist their posting lists - // (Phase 8c does that). On open, sqlrite_master records the - // CREATE INDEX SQL and `rebuild_fts_index` replays it through - // `execute_create_index`, walking current rows. + // Phase 8c: FTS indexes now persist their posting lists as + // cell-encoded pages. After save+reopen the index entry + // reattaches with the same column + same posting count, loaded + // directly from disk (no re-tokenization). let path = tmp_path("fts_roundtrip"); { @@ -1735,6 +1973,198 @@ mod tests { cleanup(&path); } + #[test] + fn fts_roundtrip_uses_persistence_path_not_replay() { + // Phase 8c — assert the reload didn't go through the + // rootpage=0 replay shortcut. We do this by reading the + // sqlrite_master row for the FTS index and confirming its + // rootpage field is non-zero. + let path = tmp_path("fts_persistence_path"); + + { + let mut db = Database::new("test".to_string()); + process_command( + "CREATE TABLE docs (id INTEGER PRIMARY KEY, body TEXT);", + &mut db, + ) + .unwrap(); + process_command( + "INSERT INTO docs (body) VALUES ('rust embedded database');", + &mut db, + ) + .unwrap(); + process_command("CREATE INDEX ix_body ON docs USING fts (body);", &mut db).unwrap(); + save_database(&mut db, &path).expect("save"); + } + + // Read raw sqlrite_master to find the FTS index row. 
+ let pager = Pager::open(&path).expect("open pager"); + let mut master = build_empty_master_table(); + load_table_rows(&pager, &mut master, pager.header().schema_root_page).unwrap(); + let mut found_rootpage: Option<u32> = None; + for rowid in master.rowids() { + let name = take_text(&master, "name", rowid).unwrap(); + if name == "ix_body" { + let rp = take_integer(&master, "rootpage", rowid).unwrap(); + found_rootpage = Some(rp as u32); + } + } + let rootpage = found_rootpage.expect("ix_body row in sqlrite_master"); + assert!( + rootpage != 0, + "Phase 8c FTS save should set rootpage != 0; got {rootpage}" + ); + + cleanup(&path); + } + + #[test] + fn save_without_fts_keeps_format_v4() { + // Phase 8c on-demand bump — a database with zero FTS indexes + // continues writing the v4 header. Existing v4 users must not + // see their files silently promoted to v5 by an upgrade. + use crate::sql::pager::header::FORMAT_VERSION_V4; + + let path = tmp_path("fts_no_bump"); + let mut db = Database::new("test".to_string()); + process_command( + "CREATE TABLE t (id INTEGER PRIMARY KEY, n INTEGER);", + &mut db, + ) + .unwrap(); + process_command("INSERT INTO t (n) VALUES (1);", &mut db).unwrap(); + save_database(&mut db, &path).unwrap(); + drop(db); + + let pager = Pager::open(&path).expect("open"); + assert_eq!( + pager.header().format_version, + FORMAT_VERSION_V4, + "no-FTS save should keep v4" + ); + cleanup(&path); + } + + #[test] + fn save_with_fts_bumps_to_v5() { + // Phase 8c on-demand bump — first FTS-bearing save promotes + // the file to v5. v5 readers handle both v4 and v5; v4 + // readers correctly refuse a v5 file. 
+ use crate::sql::pager::header::FORMAT_VERSION_V5; + + let path = tmp_path("fts_bump_v5"); + let mut db = Database::new("test".to_string()); + process_command( + "CREATE TABLE docs (id INTEGER PRIMARY KEY, body TEXT);", + &mut db, + ) + .unwrap(); + process_command("INSERT INTO docs (body) VALUES ('hello');", &mut db).unwrap(); + process_command("CREATE INDEX ix_body ON docs USING fts (body);", &mut db).unwrap(); + save_database(&mut db, &path).unwrap(); + drop(db); + + let pager = Pager::open(&path).expect("open"); + assert_eq!( + pager.header().format_version, + FORMAT_VERSION_V5, + "FTS save should promote to v5" + ); + cleanup(&path); + } + + #[test] + fn fts_persistence_handles_empty_and_zero_token_docs() { + // Phase 8c — sidecar cell carries doc-lengths for every doc + // including any with zero tokens (so total_docs is honest + // post-reopen). Empty index also round-trips: a CREATE INDEX + // on an empty table emits a single empty leaf with just the + // (empty) sidecar. + let path = tmp_path("fts_edges"); + + { + let mut db = Database::new("test".to_string()); + process_command( + "CREATE TABLE docs (id INTEGER PRIMARY KEY, body TEXT);", + &mut db, + ) + .unwrap(); + process_command("CREATE INDEX ix_body ON docs USING fts (body);", &mut db).unwrap(); + // Mix: real text, then a row that tokenizes to zero tokens + // (only punctuation), then real again. + process_command("INSERT INTO docs (body) VALUES ('rust embedded');", &mut db).unwrap(); + process_command("INSERT INTO docs (body) VALUES ('!!!---???');", &mut db).unwrap(); + process_command("INSERT INTO docs (body) VALUES ('go embedded');", &mut db).unwrap(); + save_database(&mut db, &path).unwrap(); + } + + let loaded = open_database(&path, "test".to_string()).expect("open"); + let table = loaded.get_table("docs".to_string()).unwrap(); + let entry = &table.fts_indexes[0]; + // All three rows present — including the zero-token row, + // which is critical for total_docs honesty in BM25. 
+ assert_eq!(entry.index.len(), 3); + // 'embedded' appears in 2 rows after reload. + let res = entry + .index + .query("embedded", &crate::sql::fts::Bm25Params::default()); + assert_eq!(res.len(), 2); + + cleanup(&path); + } + + #[test] + fn fts_persistence_round_trips_large_corpus() { + // Phase 8c — exercise multi-leaf staging. ~500 docs with + // single-token bodies generates enough cells to overflow a + // single 4 KiB leaf (each posting cell averages ~8 bytes). + let path = tmp_path("fts_large_corpus"); + + let mut expected_terms: std::collections::BTreeSet<String> = + std::collections::BTreeSet::new(); + { + let mut db = Database::new("test".to_string()); + process_command( + "CREATE TABLE docs (id INTEGER PRIMARY KEY, body TEXT);", + &mut db, + ) + .unwrap(); + process_command("CREATE INDEX ix_body ON docs USING fts (body);", &mut db).unwrap(); + // 500 docs, each one a unique term — drives unique-term + // count up so multiple leaves are required. + for i in 0..500 { + let term = format!("term{i:04}"); + process_command( + &format!("INSERT INTO docs (body) VALUES ('{term}');"), + &mut db, + ) + .unwrap(); + expected_terms.insert(term); + } + save_database(&mut db, &path).unwrap(); + } + + let loaded = open_database(&path, "test".to_string()).expect("open"); + let table = loaded.get_table("docs".to_string()).unwrap(); + let entry = &table.fts_indexes[0]; + assert_eq!(entry.index.len(), 500); + + // Spot-check a handful of terms come back with their original + // single-row posting list. + for &i in &[0_i64, 137, 248, 391, 499] { + let term = format!("term{i:04}"); + let res = entry + .index + .query(&term, &crate::sql::fts::Bm25Params::default()); + assert_eq!(res.len(), 1, "term {term} should match exactly 1 row"); + // PrimaryKey rowids start at 1; doc i was inserted at + // rowid i+1. 
+ assert_eq!(res[0].0, i + 1); + } + + cleanup(&path); + } + #[test] fn delete_then_save_then_reopen_excludes_deleted_node_from_hnsw() { // Phase 7d.3 — DELETE marks HNSW dirty; save rebuilds it from diff --git a/src/sql/pager/overflow.rs b/src/sql/pager/overflow.rs index f146cf1..76ee87c 100644 --- a/src/sql/pager/overflow.rs +++ b/src/sql/pager/overflow.rs @@ -308,6 +308,7 @@ mod tests { .commit(crate::sql::pager::header::DbHeader { page_count: next_free, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); @@ -330,6 +331,7 @@ mod tests { .commit(crate::sql::pager::header::DbHeader { page_count: next, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); diff --git a/src/sql/pager/pager.rs b/src/sql/pager/pager.rs index 4ee0fc1..9451edf 100644 --- a/src/sql/pager/pager.rs +++ b/src/sql/pager/pager.rs @@ -305,6 +305,7 @@ impl Pager { let header = DbHeader { page_count: 2, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }; // Write the file synchronously so the initial create is durable and @@ -662,6 +663,7 @@ mod tests { .commit(DbHeader { page_count: 5, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); // 3 dirty data pages (pages 2, 3, 4). 
The page-0 commit frame is @@ -676,6 +678,7 @@ mod tests { .commit(DbHeader { page_count: 5, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); assert_eq!(writes, 1, "only the changed page should have been written"); @@ -735,6 +738,7 @@ mod tests { p.commit(DbHeader { page_count: 5, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); @@ -755,6 +759,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); @@ -786,6 +791,7 @@ mod tests { p.commit(DbHeader { page_count: 4, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); } @@ -810,6 +816,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); } @@ -845,6 +852,7 @@ mod tests { .commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); assert_eq!(first, 1); @@ -855,6 +863,7 @@ mod tests { .commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); assert_eq!(second, 0, "no data frames should be re-appended"); @@ -877,6 +886,7 @@ mod tests { p.commit(DbHeader { page_count: 4, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); @@ -917,6 +927,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); @@ -941,6 +952,7 @@ mod tests { p.commit(DbHeader { page_count: 5, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); p.checkpoint().unwrap(); @@ -953,6 +965,7 @@ mod tests { p.commit(DbHeader { page_count: 3, 
schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); p.checkpoint().unwrap(); @@ -986,6 +999,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); } @@ -1019,6 +1033,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); } @@ -1075,6 +1090,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); } @@ -1084,6 +1100,7 @@ mod tests { .commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap_err(); assert!( @@ -1115,6 +1132,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); // Force the WAL into the main file before we nuke it. @@ -1147,6 +1165,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); // Manually write the committed page 2 into the main file at @@ -1190,6 +1209,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); } @@ -1205,6 +1225,7 @@ mod tests { p.commit(DbHeader { page_count: 3, schema_root_page: 1, + format_version: crate::sql::pager::header::FORMAT_VERSION_BASELINE, }) .unwrap(); let post = std::fs::metadata(wal_path_for(&path)).unwrap().len();