Skip to content

Commit f5e2299

Browse files
committed
fix rare zkalloc/rayon interaction bug
1 parent 0b06b0e commit f5e2299

6 files changed

Lines changed: 69 additions & 0 deletions

File tree

Cargo.lock

Lines changed: 2 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

crates/backend/system-info/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ edition.workspace = true
55

66
[dependencies]
77
libc = "0.2"
8+
rayon.workspace = true
89

910
[lints]
1011
workspace = true

crates/backend/system-info/src/lib.rs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,3 +9,36 @@ pub fn peak_rss_bytes() -> u64 {
99
// ru_maxrss unit: bytes on macOS, KiB on Linux.
1010
if cfg!(target_os = "macos") { max } else { max * 1024 }
1111
}
12+
13+
/// Number of jobs [`flush_rayon`] pushes. Must exceed
14+
/// `crossbeam_deque::deque::BLOCK_CAP` (currently 63 —
15+
/// `crossbeam-deque-0.8.6/src/deque.rs:1191`).
16+
const RAYON_FLUSH_JOBS: usize = 256;
17+
18+
/// Drain rayon's internal queues so they release any storage allocated during the
19+
/// previous phase.
20+
///
21+
/// Rayon's global pool owns a `crossbeam_deque::Injector`, internally a linked list
22+
/// of fixed-size blocks (`Block` and `Injector::push` —
23+
/// `crossbeam-deque-0.8.6/src/deque.rs:1219` and `:1371`). A block is freed only
24+
/// once its last slot has been consumed.
25+
///
26+
/// `rayon::join` from a non-worker thread reaches that injector via
27+
/// `join` (`rayon-core-1.13.0/src/join/mod.rs:132`) ->
28+
/// `registry::in_worker` (`registry.rs:946`) ->
29+
/// `Registry::in_worker_cold` (`:517`) ->
30+
/// `Registry::inject` (`:428`) -> `Injector::push`.
31+
///
32+
/// Under an arena allocator that recycles memory between phases (e.g. `zk-alloc`),
33+
/// a block allocated *during* a phase points into a slab the next `begin_phase()`
34+
/// will reuse. The next push then writes a `JobRef` straight through whatever the
35+
/// application has placed on top, silently corrupting it.
36+
///
37+
/// Pushing more than `BLOCK_CAP` jobs while the arena is off forces the Injector
38+
/// to allocate a fresh tail block (which lands in System), and forces workers to
39+
/// steal the last slot of every preceding block (which destroys them).
40+
pub fn flush_rayon() {
41+
for _ in 0..RAYON_FLUSH_JOBS {
42+
rayon::join(|| {}, || {});
43+
}
44+
}

crates/backend/zk-alloc/Cargo.toml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ description = "Bump+reset arena allocator for ZK proving workloads"
77
[dependencies]
88
system-info.workspace = true
99

10+
[dev-dependencies]
11+
rayon.workspace = true
12+
1013
[target.'cfg(not(all(target_os = "linux", target_arch = "x86_64")))'.dependencies]
1114
libc = "0.2"
1215

crates/backend/zk-alloc/src/lib.rs

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,12 @@ pub fn begin_phase() {
106106

107107
/// Deactivates the arena. New allocations go to the system allocator; existing arena
108108
/// pointers stay valid until the next `begin_phase()` resets the slabs.
109+
///
110+
/// Also calls [`system_info::flush_rayon`] to release any rayon/crossbeam storage
111+
/// still referencing this phase's arena memory.
109112
pub fn end_phase() {
110113
// Deactivate the arena *before* flushing: the flush relies on the arena being
// off so that anything allocated while it runs comes from the system allocator.
ARENA_ACTIVE.store(false, Ordering::Release);
114+
system_info::flush_rayon();
111115
}
112116

113117
#[cold]
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
//! Regression test for the bug prevented by `system_info::flush_rayon`.
2+
3+
use rayon::prelude::*;
4+
5+
#[global_allocator]
6+
static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator;
7+
8+
#[test]
fn rayon_does_not_corrupt_zkalloc() {
    zk_alloc::init();
    // Run one parallel computation before any phase begins; the result is discarded.
    let _: u64 = (0..1_000_000_u64).into_par_iter().sum();

    // First phase: issue 200 no-op joins while the arena is active.
    zk_alloc::begin_phase();
    (0..200).for_each(|_| {
        rayon::join(|| {}, || {});
    });
    zk_alloc::end_phase();

    // Second phase: allocate a canary buffer with a known fill byte, then issue
    // one more join. If that join wrote into the canary's memory, the fill
    // pattern would be disturbed.
    zk_alloc::begin_phase();
    let canary = vec![0xAB_u8; 8192];
    rayon::join(|| {}, || {});
    zk_alloc::end_phase();

    // Every byte must still hold the fill value; report the first offset that does not.
    if let Some(offset) = canary.iter().position(|&byte| byte != 0xAB) {
        panic!("canary corrupted at offset {}", offset);
    }
}

0 commit comments

Comments
 (0)