diff --git a/crates/hot/src/conformance/history.rs b/crates/hot/src/conformance/history.rs index 3f7a67a..fa785e9 100644 --- a/crates/hot/src/conformance/history.rs +++ b/crates/hot/src/conformance/history.rs @@ -460,19 +460,21 @@ pub fn test_delete_and_rewrite_dual(hot_kv: &T) { /// /// Why: AccountsHistory is DUPSORT, so each stored dup value /// (`key2 || encoded BlockNumberList`) is capped at MDBX's DUPSORT value -/// limit (~1980 bytes on 4 KB pages). A roaring serialisation of 2000 -/// indices spread across many 16-bit containers exceeds 20 KB. The history -/// pipeline must split shards by encoded size, not by index count, so any -/// pattern of `update_history_indices_inconsistent` survives. ENG-2287. +/// limit (~1980 bytes on 4 KB pages). A roaring serialisation of a few +/// thousand sparse indices vastly exceeds that. The history pipeline must +/// split shards by encoded size, not by index count, so any pattern of +/// `update_history_indices_inconsistent` survives. ENG-2287. pub fn test_history_shard_fits_in_dupsort_limit(hot_kv: &T) { let addr = address!("0xcccccccccccccccccccccccccccccccccccccccc"); - // Sparse pattern: 2000 block-numbers spread far apart so each lives in - // its own roaring 16-bit container — the worst-case for serialised size - // (per-container header dominates). Write a change-set per block, then - // ask the history pipeline to index the full range in one call. - let blocks: Vec = - (0..ShardedKey::SHARD_COUNT as u64).map(|i| i.saturating_mul(100_000) + 1).collect(); + // Sparse pattern: block numbers spread far apart so each lives in its + // own roaring 16-bit container — the worst case for serialised size + // (per-container header dominates) within a single bitmap. The count + // forces several full-sized shards plus one open shard, so the + // splitter has to emit, restart, and emit again multiple times. + const SHARDS_TO_FORCE: usize = 30; + let n = (BlockNumberList::SAFE_INDICES_PER_SHARD * SHARDS_TO_FORCE) as u64; + let blocks: Vec = (0..n).map(|i| i.saturating_mul(100_000) + 1).collect(); { let writer = hot_kv.writer().unwrap(); diff --git a/crates/hot/src/db/inconsistent.rs b/crates/hot/src/db/inconsistent.rs index 5978e72..0dee458 100644 --- a/crates/hot/src/db/inconsistent.rs +++ b/crates/hot/src/db/inconsistent.rs @@ -537,7 +537,8 @@ impl UnsafeHistoryWrite for T where T: UnsafeDbWrite + HotKvWrite {} /// stored DUPSORT in MDBX, which caps each dup value at ~1980 bytes on /// 4 KB pages. A count-based threshold cannot guarantee fitting under that /// cap because roaring's serialised size depends on the value distribution, -/// not just the count. See [`ShardedKey::MAX_SHARD_BYTES`] (ENG-2287). +/// not just the count. See [`BlockNumberList::MAX_ENCODED_BYTES`] and +/// [`BlockNumberList::SAFE_INDICES_PER_SHARD`] (ENG-2287). fn append_to_sharded_history( existing: Option<(K, BlockNumberList)>, indices: impl IntoIterator, @@ -549,9 +550,8 @@ where D: FnMut(K) -> Result<(), E>, W: FnMut(u64, &BlockNumberList) -> Result<(), E>, { - let (old_key, last_shard) = + let (old_key, mut last_shard) = existing.map_or_else(|| (None, BlockNumberList::default()), |(k, list)| (Some(k), list)); - let mut last_shard = last_shard; last_shard.append(indices).map_err(HistoryError::IntList)?; @@ -560,62 +560,67 @@ where delete_old(key).map_err(HistoryError::Db)?; } - // Fast path: encoded list fits under the per-shard byte budget. - if last_shard.serialized_size() <= ShardedKey::MAX_SHARD_BYTES { + // Fast path: small lists provably fit under the byte budget regardless + // of distribution, so skip the size check entirely. + if last_shard.len() <= BlockNumberList::SAFE_INDICES_PER_SHARD as u64 + || last_shard.serialized_size() <= BlockNumberList::MAX_ENCODED_BYTES + { return write_shard(u64::MAX, &last_shard).map_err(HistoryError::Db); } - // Slow path: re-chunk into multiple shards whose encoded sizes each - // fit under the budget. - let all: Vec = last_shard.iter().collect(); - let mut start = 0; - while start < all.len() { - let take = max_prefix_fitting(&all[start..]); - // A single value is always far smaller than MAX_SHARD_BYTES, so we - // can always make forward progress. - debug_assert!(take > 0, "no prefix fits in MAX_SHARD_BYTES"); - let end = start + take; - let highest = if end == all.len() { u64::MAX } else { all[end - 1] }; - let shard = BlockNumberList::new_pre_sorted(all[start..end].iter().copied()); - write_shard(highest, &shard).map_err(HistoryError::Db)?; - start = end; - } - Ok(()) + // Slow path: walk indices through a working BlockNumberList, emitting + // whenever the next push would overflow the byte budget. Exact, + // single-pass, no probing. + chunk_by_encoded_size(last_shard.iter(), |highest, shard| write_shard(highest, shard)) + .map_err(HistoryError::Db) } -/// Binary-search the largest prefix length `n` of `indices` whose encoded -/// `BlockNumberList` fits in [`ShardedKey::MAX_SHARD_BYTES`]. +/// Split a pre-sorted iterator of indices into shards whose encoded +/// `BlockNumberList`s each fit in [`BlockNumberList::MAX_ENCODED_BYTES`]. /// -/// Returns 0 only if `indices` is empty. -fn max_prefix_fitting(indices: &[u64]) -> usize { - if indices.is_empty() { - return 0; - } - // Optimistic check: the entire slice may fit. Skips the binary search - // in the common case where the caller passes a small remainder. - if encoded_size_of(indices) <= ShardedKey::MAX_SHARD_BYTES { - return indices.len(); - } - let mut lo = 1usize; - let mut hi = indices.len(); - let mut best = 1usize; - while lo <= hi { - let mid = (lo + hi) / 2; - if encoded_size_of(&indices[..mid]) <= ShardedKey::MAX_SHARD_BYTES { - best = mid; - lo = mid + 1; - } else { - if mid == 1 { - // Even a single index doesn't fit — shouldn't happen but - // bail out to avoid an infinite loop. - break; +/// The last shard is emitted with `highest = u64::MAX` to mark it as the +/// open shard; earlier shards are emitted with the last index they +/// contain. +fn chunk_by_encoded_size( + indices: impl IntoIterator, + mut write_shard: W, +) -> Result<(), E> +where + W: FnMut(u64, &BlockNumberList) -> Result<(), E>, +{ + let mut shard = BlockNumberList::default(); + let mut prev_in_shard: Option = None; + + for idx in indices { + shard.push(idx).expect("indices are pre-sorted and strictly ascending"); + + if shard.serialized_size() > BlockNumberList::MAX_ENCODED_BYTES { + match prev_in_shard { + Some(highest) => { + // Roll back the overflowing index, emit the prefix, + // then restart the shard with just `idx`. + shard.remove(idx); + write_shard(highest, &shard)?; + shard.clear(); + shard.push(idx).expect("indices are pre-sorted and strictly ascending"); + prev_in_shard = Some(idx); + } + None => { + // A single index already overflows the budget. The + // smallest possible shard is one element, so emit it + // as-is. (Unreachable for u64: 30 encoded bytes.) + write_shard(idx, &shard)?; + shard.clear(); + prev_in_shard = None; + } } - hi = mid - 1; + } else { + prev_in_shard = Some(idx); } } - best -} -fn encoded_size_of(indices: &[u64]) -> usize { - BlockNumberList::new_pre_sorted(indices.iter().copied()).serialized_size() + if !shard.is_empty() { + write_shard(u64::MAX, &shard)?; + } + Ok(()) } diff --git a/crates/types/src/int_list.rs b/crates/types/src/int_list.rs index dde46bc..10db65d 100644 --- a/crates/types/src/int_list.rs +++ b/crates/types/src/int_list.rs @@ -43,6 +43,54 @@ impl fmt::Debug for IntegerList { } impl IntegerList { + /// Maximum encoded byte size targeted when splitting a list into + /// shards for storage in size-constrained backends. + /// + /// Currently sized for MDBX's `DUPSORT` value limit on 4 KB pages + /// (~1980 bytes). The budget reserves headroom for a paired secondary + /// key (up to 40 bytes), per-node metadata, and the rare case of a + /// trailing index that lands in a fresh roaring container. See + /// `append_to_sharded_history` in `signet-hot` (ENG-2287). + pub const MAX_ENCODED_BYTES: usize = 1500; + + /// Maximum number of indices that always serialise within + /// [`Self::MAX_ENCODED_BYTES`], regardless of how the values are + /// distributed across roaring containers. + /// + /// # Encoding overhead (roaring 0.11) + /// + /// A [`RoaringTreemap`] serialises as an 8-byte count of inner + /// [`roaring::RoaringBitmap`] entries, followed by `(u32 hi32, encoded + /// RoaringBitmap)` pairs. Each `RoaringBitmap` partitions its 32-bit + /// values into 16-bit array, bitmap, or run *containers* (4096-entry + /// crossover from array to bitmap). The portable serialised format of + /// a `RoaringBitmap` carries: + /// + /// - 8 bytes of cookie / container count / option flags + /// - 4 bytes per container (key + cardinality descriptor) + /// - 4 bytes per container (offset table entry) + /// - the container payload itself (2 bytes per element for array, 8 KB + /// for bitmap, variable for run) + /// + /// The worst case is therefore one index per `RoaringBitmap` with each + /// landing in its own array container: ~22 bytes per index plus the + /// 8-byte treemap header. Empirically: + /// + /// ```text + /// N= 1 (1 bitmap, 1 cont, 1 elem) -> 30 bytes + /// N= 67 (67 bitmaps, ...) -> 1482 bytes + /// N= 68 (68 bitmaps, ...) -> 1504 bytes (over) + /// ``` + /// + /// `BlockNumberList`s built from u64 block numbers that stay below + /// 2 ^ 32 (i.e. real-world Ethereum block numbers) share a single + /// `RoaringBitmap` and therefore encode at ~10 bytes per index in the + /// pathological "one element per 16-bit container" pattern, fitting + /// ~148 indices in [`Self::MAX_ENCODED_BYTES`]. This constant uses the + /// stricter, distribution-agnostic bound so callers can rely on it for + /// any u64 input. + pub const SAFE_INDICES_PER_SHARD: usize = 67; + /// Creates a new empty [`IntegerList`]. pub fn empty() -> Self { Self(RoaringTreemap::new()) @@ -84,6 +132,12 @@ impl IntegerList { self.0.try_push(value).map_err(|_| IntegerListError::IntegerTooSmall) } + /// Removes a single value from the list. Returns `true` if it was + /// present. + pub fn remove(&mut self, value: u64) -> bool { + self.0.remove(value) + } + /// Clears the list. pub fn clear(&mut self) { self.0.clear(); @@ -166,3 +220,83 @@ impl IntegerList { self.0.serialize_into(writer) } } + +#[cfg(test)] +mod tests { + use super::*; + + fn size(blocks: impl IntoIterator) -> usize { + IntegerList::new_pre_sorted(blocks).serialized_size() + } + + /// A dense run of indices in a single 16-bit container encodes as a + /// short header plus ~2 bytes per element (array container payload). + #[test] + fn dense_run_encodes_compactly() { + assert_eq!(size(0..1), 30); + assert_eq!(size(0..10), 48); + assert!(size(0..1_000) < 2_100); + } + + /// Each new 16-bit container costs 8 bytes of metadata (4 for the + /// descriptor entry, 4 for the offset entry) plus 2 bytes for the + /// element itself — so ~10 bytes per index when every index lands in + /// its own container within a single [`RoaringBitmap`]. + #[test] + fn sparse_within_single_bitmap_costs_ten_bytes_per_index() { + // (i << 16) lands index i in container i of bitmap 0. + let by_container = |n: u64| -> Vec { (0..n).map(|i| i << 16).collect() }; + assert_eq!(size(by_container(1)), 30); + // ~10 bytes per additional index. + assert_eq!(size(by_container(100)), 1020); + assert_eq!(size(by_container(147)), 1490); + // N=148 lands exactly on the budget; N=149 exceeds it. + assert_eq!(size(by_container(148)), IntegerList::MAX_ENCODED_BYTES); + assert!(size(by_container(149)) > IntegerList::MAX_ENCODED_BYTES); + } + + /// Each new [`RoaringBitmap`] (distinct upper-32-bit key) adds ~22 + /// bytes — this is the worst case across any u64 distribution. + #[test] + fn sparse_across_bitmaps_costs_twenty_two_bytes_per_index() { + // (i << 32) places each index in its own bitmap. + let by_bitmap = |n: u64| -> Vec { (0..n).map(|i| i << 32).collect() }; + assert_eq!(size(by_bitmap(1)), 30); + assert_eq!(size(by_bitmap(10)), 228); + assert_eq!(size(by_bitmap(50)), 1108); + // The boundary determines SAFE_INDICES_PER_SHARD. + assert!(size(by_bitmap(67)) <= IntegerList::MAX_ENCODED_BYTES); + assert!(size(by_bitmap(68)) > IntegerList::MAX_ENCODED_BYTES); + } + + /// The published [`IntegerList::SAFE_INDICES_PER_SHARD`] must fit in + /// [`IntegerList::MAX_ENCODED_BYTES`] under the worst-case + /// distribution (one index per [`RoaringBitmap`] — the most + /// metadata-heavy u64 layout). + #[test] + fn safe_indices_per_shard_fits_worst_case() { + let worst_case: Vec = + (0..IntegerList::SAFE_INDICES_PER_SHARD as u64).map(|i| i << 32).collect(); + let bytes = size(worst_case); + assert!( + bytes <= IntegerList::MAX_ENCODED_BYTES, + "SAFE_INDICES_PER_SHARD={} encodes to {} bytes, over MAX_ENCODED_BYTES={}", + IntegerList::SAFE_INDICES_PER_SHARD, + bytes, + IntegerList::MAX_ENCODED_BYTES, + ); + } + + /// The next index past [`IntegerList::SAFE_INDICES_PER_SHARD`] must + /// be *able* to overflow under the worst case — otherwise the bound + /// is needlessly conservative and should be raised. + #[test] + fn safe_indices_per_shard_is_tight() { + let just_over: Vec = + (0..(IntegerList::SAFE_INDICES_PER_SHARD as u64 + 1)).map(|i| i << 32).collect(); + assert!( + size(just_over) > IntegerList::MAX_ENCODED_BYTES, + "SAFE_INDICES_PER_SHARD is loose — N+1 worst-case indices still fit; raise the bound", + ); + } +} diff --git a/crates/types/src/sharded.rs b/crates/types/src/sharded.rs index 4eba074..1805b5d 100644 --- a/crates/types/src/sharded.rs +++ b/crates/types/src/sharded.rs @@ -12,26 +12,6 @@ pub struct ShardedKey { pub highest_block_number: u64, } -impl ShardedKey<()> { - /// Soft cap on the number of indices in one shard. - /// - /// This is a sanity ceiling used alongside [`Self::MAX_SHARD_BYTES`]; - /// shard splitting is driven by encoded size, not by this count. - pub const SHARD_COUNT: usize = 2000; - - /// Maximum encoded byte size of a single shard's [`BlockNumberList`]. - /// - /// [`BlockNumberList`]: crate::BlockNumberList - /// - /// The MDBX `DUPSORT` value limit is ~1980 bytes on 4 KB pages - /// (Linux production). Each stored dup value is `key2 || encoded list`, - /// so this budget reserves headroom for `ShardedKey` (40 bytes), - /// the 2-byte length prefix on `BlockNumberList`, and per-node overhead - /// inside MDBX. Exceeding this triggers `MDBX_BAD_VALSIZE` at write time - /// (ENG-2287). - pub const MAX_SHARD_BYTES: usize = 1500; -} - impl ShardedKey { /// Creates a new `ShardedKey`. pub const fn new(key: T, highest_block_number: u64) -> Self {