Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 12 additions & 10 deletions crates/hot/src/conformance/history.rs
Original file line number Diff line number Diff line change
Expand Up @@ -460,19 +460,21 @@ pub fn test_delete_and_rewrite_dual<T: HotKv>(hot_kv: &T) {
///
/// Why: AccountsHistory is DUPSORT, so each stored dup value
/// (`key2 || encoded BlockNumberList`) is capped at MDBX's DUPSORT value
/// limit (~1980 bytes on 4 KB pages). A roaring serialisation of 2000
/// indices spread across many 16-bit containers exceeds 20 KB. The history
/// pipeline must split shards by encoded size, not by index count, so any
/// pattern of `update_history_indices_inconsistent` survives. ENG-2287.
/// limit (~1980 bytes on 4 KB pages). A roaring serialisation of a few
/// thousand sparse indices vastly exceeds that. The history pipeline must
/// split shards by encoded size, not by index count, so any pattern of
/// `update_history_indices_inconsistent` survives. ENG-2287.
pub fn test_history_shard_fits_in_dupsort_limit<T: HotKv>(hot_kv: &T) {
let addr = address!("0xcccccccccccccccccccccccccccccccccccccccc");

// Sparse pattern: 2000 block-numbers spread far apart so each lives in
// its own roaring 16-bit container — the worst-case for serialised size
// (per-container header dominates). Write a change-set per block, then
// ask the history pipeline to index the full range in one call.
let blocks: Vec<u64> =
(0..ShardedKey::SHARD_COUNT as u64).map(|i| i.saturating_mul(100_000) + 1).collect();
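// Sparse pattern: block numbers spaced 100_000 apart (wider than one
// 65_536-value container) so each lives in its own roaring 16-bit
// container — the worst case for serialised size (per-container header
// dominates) within a single bitmap. `SAFE_INDICES_PER_SHARD` is the
// stricter cross-bitmap bound, so at ~10 bytes per index this pattern
// packs more indices than that into each shard and produces fewer than
// `SHARDS_TO_FORCE` shards. The count still forces several full-sized
// shards plus one open shard, so the splitter has to emit, restart, and
// emit again multiple times.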
const SHARDS_TO_FORCE: usize = 30;
let n = (BlockNumberList::SAFE_INDICES_PER_SHARD * SHARDS_TO_FORCE) as u64;
let blocks: Vec<u64> = (0..n).map(|i| i.saturating_mul(100_000) + 1).collect();

{
let writer = hot_kv.writer().unwrap();
Expand Down
107 changes: 56 additions & 51 deletions crates/hot/src/db/inconsistent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,8 @@ impl<T> UnsafeHistoryWrite for T where T: UnsafeDbWrite + HotKvWrite {}
/// stored DUPSORT in MDBX, which caps each dup value at ~1980 bytes on
/// 4 KB pages. A count-based threshold cannot guarantee fitting under that
/// cap because roaring's serialised size depends on the value distribution,
/// not just the count. See [`ShardedKey::MAX_SHARD_BYTES`] (ENG-2287).
/// not just the count. See [`BlockNumberList::MAX_ENCODED_BYTES`] and
/// [`BlockNumberList::SAFE_INDICES_PER_SHARD`] (ENG-2287).
fn append_to_sharded_history<K, E, D, W>(
existing: Option<(K, BlockNumberList)>,
indices: impl IntoIterator<Item = u64>,
Expand All @@ -549,9 +550,8 @@ where
D: FnMut(K) -> Result<(), E>,
W: FnMut(u64, &BlockNumberList) -> Result<(), E>,
{
let (old_key, last_shard) =
let (old_key, mut last_shard) =
existing.map_or_else(|| (None, BlockNumberList::default()), |(k, list)| (Some(k), list));
let mut last_shard = last_shard;

last_shard.append(indices).map_err(HistoryError::IntList)?;

Expand All @@ -560,62 +560,67 @@ where
delete_old(key).map_err(HistoryError::Db)?;
}

// Fast path: encoded list fits under the per-shard byte budget.
if last_shard.serialized_size() <= ShardedKey::MAX_SHARD_BYTES {
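// Fast path: a list of at most `SAFE_INDICES_PER_SHARD` indices provably
// fits under the byte budget regardless of distribution, so `||`
// short-circuits and the size check is skipped entirely. Longer lists
// are measured and still take this path when their encoding fits.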
if last_shard.len() <= BlockNumberList::SAFE_INDICES_PER_SHARD as u64
|| last_shard.serialized_size() <= BlockNumberList::MAX_ENCODED_BYTES
{
return write_shard(u64::MAX, &last_shard).map_err(HistoryError::Db);
}

// Slow path: re-chunk into multiple shards whose encoded sizes each
// fit under the budget.
let all: Vec<u64> = last_shard.iter().collect();
let mut start = 0;
while start < all.len() {
let take = max_prefix_fitting(&all[start..]);
// A single value is always far smaller than MAX_SHARD_BYTES, so we
// can always make forward progress.
debug_assert!(take > 0, "no prefix fits in MAX_SHARD_BYTES");
let end = start + take;
let highest = if end == all.len() { u64::MAX } else { all[end - 1] };
let shard = BlockNumberList::new_pre_sorted(all[start..end].iter().copied());
write_shard(highest, &shard).map_err(HistoryError::Db)?;
start = end;
}
Ok(())
// Slow path: walk indices through a working BlockNumberList, emitting
// whenever the next push would overflow the byte budget. Exact,
// single-pass, no probing.
chunk_by_encoded_size(last_shard.iter(), |highest, shard| write_shard(highest, shard))
.map_err(HistoryError::Db)
}

/// Binary-search the largest prefix length `n` of `indices` whose encoded
/// `BlockNumberList` fits in [`ShardedKey::MAX_SHARD_BYTES`].
/// Split a pre-sorted iterator of indices into shards whose encoded
/// `BlockNumberList`s each fit in [`BlockNumberList::MAX_ENCODED_BYTES`].
///
/// Returns 0 only if `indices` is empty.
fn max_prefix_fitting(indices: &[u64]) -> usize {
if indices.is_empty() {
return 0;
}
// Optimistic check: the entire slice may fit. Skips the binary search
// in the common case where the caller passes a small remainder.
if encoded_size_of(indices) <= ShardedKey::MAX_SHARD_BYTES {
return indices.len();
}
let mut lo = 1usize;
let mut hi = indices.len();
let mut best = 1usize;
while lo <= hi {
let mid = (lo + hi) / 2;
if encoded_size_of(&indices[..mid]) <= ShardedKey::MAX_SHARD_BYTES {
best = mid;
lo = mid + 1;
} else {
if mid == 1 {
// Even a single index doesn't fit — shouldn't happen but
// bail out to avoid an infinite loop.
break;
/// The last shard is emitted with `highest = u64::MAX` to mark it as the
/// open shard; earlier shards are emitted with the last index they
/// contain.
fn chunk_by_encoded_size<E, W>(
indices: impl IntoIterator<Item = u64>,
mut write_shard: W,
) -> Result<(), E>
where
W: FnMut(u64, &BlockNumberList) -> Result<(), E>,
{
let mut shard = BlockNumberList::default();
let mut prev_in_shard: Option<u64> = None;

for idx in indices {
shard.push(idx).expect("indices are pre-sorted and strictly ascending");

if shard.serialized_size() > BlockNumberList::MAX_ENCODED_BYTES {
match prev_in_shard {
Some(highest) => {
// Roll back the overflowing index, emit the prefix,
// then restart the shard with just `idx`.
shard.remove(idx);
write_shard(highest, &shard)?;
shard.clear();
shard.push(idx).expect("indices are pre-sorted and strictly ascending");
prev_in_shard = Some(idx);
}
None => {
// A single index already overflows the budget. The
// smallest possible shard is one element, so emit it
// as-is. (Unreachable for u64: 30 encoded bytes.)
write_shard(idx, &shard)?;
shard.clear();
prev_in_shard = None;
}
}
hi = mid - 1;
} else {
prev_in_shard = Some(idx);
}
}
best
}

fn encoded_size_of(indices: &[u64]) -> usize {
BlockNumberList::new_pre_sorted(indices.iter().copied()).serialized_size()
if !shard.is_empty() {
write_shard(u64::MAX, &shard)?;
}
Ok(())
}
134 changes: 134 additions & 0 deletions crates/types/src/int_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,54 @@ impl fmt::Debug for IntegerList {
}

impl IntegerList {
/// Maximum encoded byte size targeted when splitting a list into
/// shards for storage in size-constrained backends.
///
/// Currently sized for MDBX's `DUPSORT` value limit on 4 KB pages
/// (~1980 bytes), which leaves roughly 480 bytes of slack. That
/// headroom covers a paired secondary key (up to 40 bytes), per-node
/// metadata, and the rare case of a trailing index that lands in a
/// fresh roaring container.
///
/// The budget is inclusive: the shard splitter compares with `<=`, so a
/// list whose serialised size is exactly this value still counts as
/// fitting.
///
/// See `append_to_sharded_history` in `signet-hot` (ENG-2287).
pub const MAX_ENCODED_BYTES: usize = 1500;

/// Maximum number of indices that always serialise within
/// [`Self::MAX_ENCODED_BYTES`], regardless of how the values are
/// distributed across roaring containers.
///
/// # Encoding overhead (roaring 0.11)
///
/// A [`RoaringTreemap`] serialises as an 8-byte count of inner
/// [`roaring::RoaringBitmap`] entries, followed by `(u32 hi32, encoded
/// RoaringBitmap)` pairs. Each `RoaringBitmap` partitions its 32-bit
/// values into 16-bit array, bitmap, or run *containers* (4096-entry
/// crossover from array to bitmap). The portable serialised format of
/// a `RoaringBitmap` carries:
///
/// - 8 bytes of cookie / container count / option flags
/// - 4 bytes per container (key + cardinality descriptor)
/// - 4 bytes per container (offset table entry)
/// - the container payload itself (2 bytes per element for array, 8 KB
///   for bitmap, variable for run)
///
/// The worst case is therefore one index per `RoaringBitmap` with each
/// landing in its own array container. Every index then pays 22 bytes
/// (4 for the `hi32` key, 8 for the bitmap header, 4 for the descriptor,
/// 4 for the offset entry, 2 for the element) on top of the 8-byte
/// treemap header, i.e. `8 + 22 * N` bytes in total. Empirically:
///
/// ```text
/// N =  1  (1 bitmap, 1 cont, 1 elem)  ->   30 bytes
/// N = 67  (67 bitmaps, ...)           -> 1482 bytes
/// N = 68  (68 bitmaps, ...)           -> 1504 bytes (over)
/// ```
///
/// `BlockNumberList`s built from u64 block numbers that stay below
/// `2^32` (i.e. real-world Ethereum block numbers) share a single
/// `RoaringBitmap`. In the pathological "one element per 16-bit
/// container" pattern they pay only 10 bytes per index (descriptor,
/// offset entry, element) after a fixed 20 bytes of headers, i.e.
/// `20 + 10 * N` bytes, so 148 indices land exactly on
/// [`Self::MAX_ENCODED_BYTES`]. This constant uses the stricter,
/// distribution-agnostic bound so callers can rely on it for any u64
/// input.
pub const SAFE_INDICES_PER_SHARD: usize = 67;

/// Creates a new empty [`IntegerList`].
pub fn empty() -> Self {
Self(RoaringTreemap::new())
Expand Down Expand Up @@ -84,6 +132,12 @@ impl IntegerList {
self.0.try_push(value).map_err(|_| IntegerListError::IntegerTooSmall)
}

/// Removes a single value from the list.
///
/// Delegates to the underlying [`RoaringTreemap`] and returns its
/// result: `true` if `value` was present, `false` otherwise.
///
/// The shard splitter in `signet-hot` calls this to roll back an index
/// it has just pushed when that push took the encoded size over
/// [`Self::MAX_ENCODED_BYTES`]; it ignores the return value there.
pub fn remove(&mut self, value: u64) -> bool {
self.0.remove(value)
}

/// Clears the list.
pub fn clear(&mut self) {
self.0.clear();
Expand Down Expand Up @@ -166,3 +220,83 @@ impl IntegerList {
self.0.serialize_into(writer)
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Shorthand for the per-shard byte budget under test.
    const BUDGET: usize = IntegerList::MAX_ENCODED_BYTES;

    /// Serialised byte length of the list built from `blocks`, which must
    /// already be in ascending order.
    fn size(blocks: impl IntoIterator<Item = u64>) -> usize {
        let list = IntegerList::new_pre_sorted(blocks);
        list.serialized_size()
    }

    /// Yields `n` indices where index `i` sits alone in 16-bit container
    /// `i` of bitmap 0 (`i << 16`).
    fn one_per_container(n: u64) -> impl Iterator<Item = u64> {
        (0..n).map(|i| i << 16)
    }

    /// Yields `n` indices where index `i` sits alone in its own
    /// `RoaringBitmap` (`i << 32`, i.e. a distinct upper-32-bit key each).
    fn one_per_bitmap(n: u64) -> impl Iterator<Item = u64> {
        (0..n).map(|i| i << 32)
    }

    /// Consecutive indices share one 16-bit array container, so the
    /// encoding is a short fixed header plus ~2 bytes per element.
    #[test]
    fn dense_run_encodes_compactly() {
        for (len, expected) in [(1u64, 30usize), (10, 48)] {
            assert_eq!(size(0..len), expected);
        }
        assert!(size(0..1_000) < 2_100);
    }

    /// Opening a new 16-bit container inside the same `RoaringBitmap`
    /// costs 8 bytes of metadata (4-byte descriptor entry, 4-byte offset
    /// entry) plus 2 bytes for the element — ~10 bytes per index when no
    /// two indices share a container.
    #[test]
    fn sparse_within_single_bitmap_costs_ten_bytes_per_index() {
        // Exact sizes: 30 bytes for the first index, ~10 for each extra.
        for (n, expected) in [(1u64, 30usize), (100, 1020), (147, 1490)] {
            assert_eq!(size(one_per_container(n)), expected);
        }
        // 148 indices hit the budget exactly; one more goes over.
        assert_eq!(size(one_per_container(148)), BUDGET);
        assert!(size(one_per_container(149)) > BUDGET);
    }

    /// Opening a new `RoaringBitmap` (distinct upper-32-bit key) costs
    /// ~22 bytes — the most expensive layout any u64 distribution can
    /// produce.
    #[test]
    fn sparse_across_bitmaps_costs_twenty_two_bytes_per_index() {
        for (n, expected) in [(1u64, 30usize), (10, 228), (50, 1108)] {
            assert_eq!(size(one_per_bitmap(n)), expected);
        }
        // This boundary is what fixes SAFE_INDICES_PER_SHARD.
        assert!(size(one_per_bitmap(67)) <= BUDGET);
        assert!(size(one_per_bitmap(68)) > BUDGET);
    }

    /// `IntegerList::SAFE_INDICES_PER_SHARD` indices must stay within
    /// `IntegerList::MAX_ENCODED_BYTES` even in the most metadata-heavy
    /// layout: one index per `RoaringBitmap`.
    #[test]
    fn safe_indices_per_shard_fits_worst_case() {
        let count = IntegerList::SAFE_INDICES_PER_SHARD as u64;
        let bytes = size(one_per_bitmap(count));
        assert!(
            bytes <= IntegerList::MAX_ENCODED_BYTES,
            "SAFE_INDICES_PER_SHARD={} encodes to {} bytes, over MAX_ENCODED_BYTES={}",
            IntegerList::SAFE_INDICES_PER_SHARD,
            bytes,
            IntegerList::MAX_ENCODED_BYTES,
        );
    }

    /// One index beyond `IntegerList::SAFE_INDICES_PER_SHARD` must be
    /// *able* to overflow in the worst case; if it cannot, the bound is
    /// needlessly conservative and should be raised.
    #[test]
    fn safe_indices_per_shard_is_tight() {
        let one_past = IntegerList::SAFE_INDICES_PER_SHARD as u64 + 1;
        let bytes = size(one_per_bitmap(one_past));
        assert!(
            bytes > BUDGET,
            "SAFE_INDICES_PER_SHARD is loose — N+1 worst-case indices still fit; raise the bound",
        );
    }
}
20 changes: 0 additions & 20 deletions crates/types/src/sharded.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,26 +12,6 @@ pub struct ShardedKey<T> {
pub highest_block_number: u64,
}

impl ShardedKey<()> {
/// Soft cap on the number of indices in one shard.
///
/// This is a sanity ceiling used alongside [`Self::MAX_SHARD_BYTES`];
/// shard splitting is driven by encoded size, not by this count.
pub const SHARD_COUNT: usize = 2000;

/// Maximum encoded byte size of a single shard's [`BlockNumberList`].
///
/// [`BlockNumberList`]: crate::BlockNumberList
///
/// The MDBX `DUPSORT` value limit is ~1980 bytes on 4 KB pages
/// (Linux production). Each stored dup value is `key2 || encoded list`,
/// so this budget reserves headroom for `ShardedKey<U256>` (40 bytes),
/// the 2-byte length prefix on `BlockNumberList`, and per-node overhead
/// inside MDBX. Exceeding this triggers `MDBX_BAD_VALSIZE` at write time
/// (ENG-2287).
pub const MAX_SHARD_BYTES: usize = 1500;
}

impl<T> ShardedKey<T> {
/// Creates a new `ShardedKey<T>`.
pub const fn new(key: T, highest_block_number: u64) -> Self {
Expand Down