diff --git a/examples/cranelift-panic-fallback.ilo b/examples/cranelift-panic-fallback.ilo new file mode 100644 index 00000000..586d5a47 --- /dev/null +++ b/examples/cranelift-panic-fallback.ilo @@ -0,0 +1,26 @@ +-- Cross-engine regression marker for the Cranelift JIT panic-fallback. +-- +-- On AArch64 (macOS arm64) the cranelift-jit 0.116 near-call relocation +-- assertion at `compiled_blob.rs:90` could fire non-deterministically on +-- the first run after a fresh `cargo build`, crashing the process. The +-- JIT entry point is now wrapped in `std::panic::catch_unwind`, so any +-- such panic becomes a stderr breadcrumb plus an engine fallback: +-- * default engine → tree interpreter +-- * `--run-cranelift` → bytecode VM +-- +-- This example pins that a numeric pipeline of the shape that triggered +-- the original repro keeps producing the same result on every engine. +-- The actual panic-capture is exercised in `tests/regression_cranelift_panic_fallback.rs` +-- via the debug-build `ILO_FORCE_JIT_PANIC=1` env-var hook. + +double x:n>n;*x 2 +inc x:n>n;+x 1 + +main>n;inc (double 20) + +-- run: main +-- out: 41 +-- run: double 21 +-- out: 42 +-- run: inc 99 +-- out: 100 diff --git a/src/main.rs b/src/main.rs index 734df5ac..d80dad9c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2879,6 +2879,35 @@ fn run_cranelift_engine( eprintln!("Cranelift JIT: compilation failed"); 1 } + Err(vm::jit_cranelift::JitCallError::Panic { msg }) => { + // Upstream cranelift-jit 0.116 has an AArch64 near-call + // relocation assertion (`compiled_blob.rs:90`) that fires + // non-deterministically when the JIT code-cache and runtime + // memory get laid out >±64 MB apart. Converting the process + // crash to a single-line stderr breadcrumb + VM fallback + // preserves the working pipeline while keeping the upstream + // issue visible. The user asked for Cranelift explicitly, so + // we note the engine swap; we fall back to the bytecode VM + // (the closest non-JIT performance tier) rather than the + // tree interpreter. + eprintln!( + "ilo: Cranelift JIT panicked ({}); falling back to bytecode VM", + msg + ); + match vm::run(&compiled, func_name, run_args) { + Ok(val) => { + print_value(&val, explicit_json, suppress); + program_exit_code(&val) + } + Err(e) => { + report_diagnostic( + &Diagnostic::from(&e).with_source(source.to_string()), + mode, + ); + 1 + } + } + } } } #[cfg(not(feature = "cranelift"))] @@ -3244,6 +3273,20 @@ fn run_default( // JIT couldn't dispatch this function — fall through // to the tree interpreter as before. } + Err(vm::jit_cranelift::JitCallError::Panic { msg }) => { + // Upstream cranelift-jit 0.116 AArch64 near-call + // relocation assertion (`compiled_blob.rs:90`) fires + // non-deterministically. The JIT never produced + // runnable code, so falling through to the tree + // interpreter is sound and preserves the user's + // pipeline. We emit a one-line breadcrumb so the + // upstream issue stays measurable rather than + // degrading silently into the slower engine. + eprintln!( + "ilo: Cranelift JIT panicked ({}); falling back to interpreter", + msg + ); + } } } } @@ -7977,6 +8020,48 @@ mod tests { assert!(code == 0 || code == 1); } + /// `run_cranelift_engine` must catch an upstream cranelift panic and + /// fall back to the bytecode VM so the user's program still completes. + /// Exercised via the test-only `FORCE_PANIC_FOR_TEST` flag (gated on + /// `cfg(debug_assertions)` to keep the hook out of release binaries). + #[test] + #[cfg(all(feature = "cranelift", debug_assertions))] + fn run_cranelift_engine_panic_falls_back_to_vm() { + let program = make_program("f x:n>n;*x 2"); + vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|c| c.set(true)); + let code = run_cranelift_engine( + &program, + &["f".to_string(), "5".to_string()], + "", + OutputMode::Text, + false, + ); + // VM fallback ran the program → exit 0. A crash would have aborted + // the test process; an unhandled error would have returned 1. + assert_eq!(code, 0); + assert!(!vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|c| c.get())); + } + + /// `run_default` must catch an upstream cranelift panic and fall through + /// to the tree interpreter (the same path used for `NotEligible`). + #[test] + #[cfg(all(feature = "cranelift", debug_assertions))] + fn run_default_cranelift_panic_falls_back_to_interpreter() { + let program = make_program("f x:n>n;*x 2"); + vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|c| c.set(true)); + let code = run_default( + &program, + Some("f"), + vec![interpreter::Value::Number(5.0)], + "", + OutputMode::Text, + false, + ); + // Tree interpreter fallback ran the program → exit 0. + assert_eq!(code, 0); + assert!(!vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|c| c.get())); + } + #[test] fn run_cranelift_engine_fn_not_found_returns_one() { let program = make_program("f x:n>n;*x 2"); diff --git a/src/vm/jit_cranelift.rs b/src/vm/jit_cranelift.rs index d772440a..ba8aa47e 100644 --- a/src/vm/jit_cranelift.rs +++ b/src/vm/jit_cranelift.rs @@ -4667,10 +4667,53 @@ fn call_raw(func: &JitFunction, args: &[u64]) -> Option { /// Callers should NOT fall back on `Runtime` — the program executed but /// hit a defined error condition, which is the same shape tree and VM /// surface for the same input. +/// +/// `Panic` means the Cranelift JIT itself panicked during compilation or +/// finalisation — most notably the AArch64 near-call relocation assertion +/// (`compiled_blob.rs:90` — `(diff >> 26 == -1) || (diff >> 26 == 0)`) in +/// cranelift-jit 0.116 when the JIT code-cache and runtime memory end up +/// more than ±64 MB apart. The compilation never produced runnable code, +/// so it is safe — and required — for the caller to fall back to a +/// non-JIT engine. Callers should surface a stderr breadcrumb so the +/// upstream issue stays visible rather than degrading silently. #[derive(Debug)] pub enum JitCallError { NotEligible, Runtime(VmRuntimeError), + Panic { msg: String }, +} + +// Debug-build-only test hook: when set, `compile_and_call` raises a +// synthetic panic from inside the catch_unwind region. Exercises the +// panic-fallback path without depending on the AArch64-specific upstream +// bug. Gated on `cfg(debug_assertions)` so release binaries don't carry +// the per-thread bool or the eligibility check; tests run debug builds. +#[cfg(debug_assertions)] +thread_local! { + #[doc(hidden)] + pub static FORCE_PANIC_FOR_TEST: std::cell::Cell = const { std::cell::Cell::new(false) }; +} + +/// Debug-build-only env-var hook: when `ILO_FORCE_JIT_PANIC=1`, +/// `compile_and_call` raises a synthetic panic on its first call so +/// integration tests can exercise the binary's stderr breadcrumb and +/// fallback dispatch without depending on the AArch64-specific upstream +/// bug. Disabled in release builds: the assertion that controls the hook +/// is `cfg(debug_assertions)`-gated and trips out entirely when optimised. +#[cfg(debug_assertions)] +fn check_force_panic_env() { + if std::env::var("ILO_FORCE_JIT_PANIC").as_deref() == Ok("1") { + // Single-shot: clear the env var so a subsequent dispatch (e.g. + // a recursive call from within the same process) runs normally. + // Tests run a fresh subprocess each time so this is just defence. + // SAFETY: `remove_var` is called before any threads are spawned in + // the JIT dispatch path; cranelift's compile/finalize all runs on + // the calling thread, so there's no concurrent env-var access. + unsafe { + std::env::remove_var("ILO_FORCE_JIT_PANIC"); + } + panic!("synthetic cranelift panic (ILO_FORCE_JIT_PANIC)"); + } } /// Call a compiled NanVal JIT function with u64 args. @@ -4707,17 +4750,125 @@ pub fn call(func: &JitFunction, args: &[u64]) -> Result { Ok(result) } +thread_local! { + /// Set while a JIT dispatch is in progress on the current thread. The + /// process-global panic hook installed by `install_jit_panic_suppressor` + /// reads this and elides the default stderr backtrace only when set, + /// so panics on other threads (and panics outside the JIT entry) keep + /// their normal rendering. + static IN_JIT_DISPATCH: std::cell::Cell = const { std::cell::Cell::new(false) }; +} + +/// One-shot installer: chains a wrapper hook onto the previous one that +/// suppresses output when `IN_JIT_DISPATCH` is true. Using `Once` makes +/// this safe to call from any thread and avoids racing global `set_hook` +/// calls across concurrent JIT dispatches. +fn install_jit_panic_suppressor() { + use std::sync::Once; + static INSTALL: Once = Once::new(); + INSTALL.call_once(|| { + let prev = std::panic::take_hook(); + std::panic::set_hook(Box::new(move |info| { + if IN_JIT_DISPATCH.with(|c| c.get()) { + // Inside JIT dispatch: caller emits a single-line breadcrumb + // with the panic payload, so we drop the default backtrace. + return; + } + prev(info); + })); + }); +} + /// Compile and call in one shot (convenience wrapper). +/// +/// The whole dispatch is wrapped in `std::panic::catch_unwind` so an upstream +/// cranelift panic (e.g. the AArch64 near-call relocation assertion in +/// cranelift-jit 0.116 — `compiled_blob.rs:90`) becomes a recoverable +/// `JitCallError::Panic` instead of crashing the user's program. The release +/// profile uses default unwind (`panic = "unwind"`); if a downstream consumer +/// ever switches to `panic = "abort"`, the catch is a no-op and the original +/// crash returns — that's a deliberate trade since `abort` semantics are +/// chosen for explicit reasons (binary size / no-std) and the operator opts +/// into them knowingly. +/// +/// We chain a panic hook onto the process-global chain on first call (via +/// `Once`) that suppresses the noisy default stderr backtrace *only* when +/// the current thread is inside this function, scoped by a thread-local +/// `IN_JIT_DISPATCH` flag. The caller emits a single-line breadcrumb +/// instead. This is concurrency-safe — concurrent JIT dispatches on other +/// threads do not race on `set_hook` / `take_hook`. pub fn compile_and_call( chunk: &Chunk, nan_consts: &[NanVal], args: &[u64], program: &CompiledProgram, ) -> Result { - with_active_registry(program, || { - let func = compile(chunk, nan_consts, program).ok_or(JitCallError::NotEligible)?; - call(&func, args) - }) + // Install a process-global panic hook *once* that suppresses stderr + // output only when the calling thread is inside a JIT dispatch. The + // thread-local `IN_JIT_DISPATCH` flag is set for the duration of the + // catch_unwind region and cleared by `InJitGuard::drop`, so any other + // thread's panic during this time still gets the chain's default + // rendering. This avoids the global `set_hook` race that would + // otherwise occur if two threads called `compile_and_call` + // concurrently — a real concern when ilo is embedded as a library + // and a sound concern for our own multi-threaded `cargo test`. + install_jit_panic_suppressor(); + // Save-restore (not set-then-clear) so a nested `compile_and_call` + // doesn't blank the outer's flag on return. Currently there is no + // nested call site, but the JIT helpers can run arbitrary Rust which + // is permitted to re-enter the JIT entry; keeping this nest-safe + // avoids a footgun for future call sites. + struct InJitGuard(bool); + impl Drop for InJitGuard { + fn drop(&mut self) { + IN_JIT_DISPATCH.with(|c| c.set(self.0)); + } + } + let prev_in_jit = IN_JIT_DISPATCH.with(|c| c.replace(true)); + let _in_jit = InJitGuard(prev_in_jit); + + // `AssertUnwindSafe` is sound here: `with_active_registry` clears its TLS + // on return (its own Drop guard handles this), `JitRuntimeErrorGuard` + // inside `call` clears the runtime-error cell on drop, and the chunk + + // program references are immutable. No shared mutable state survives an + // unwind in a corrupted state. + let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| { + with_active_registry(program, || { + #[cfg(debug_assertions)] + { + if FORCE_PANIC_FOR_TEST.with(|c| c.get()) { + FORCE_PANIC_FOR_TEST.with(|c| c.set(false)); + panic!("synthetic cranelift panic for test"); + } + check_force_panic_env(); + } + let func = compile(chunk, nan_consts, program).ok_or(JitCallError::NotEligible)?; + call(&func, args) + }) + })); + + match result { + Ok(inner) => inner, + Err(payload) => { + // Defensively reset the JIT bump arena and clear the runtime- + // error TLS cell: if the panic fired mid-`call()` after some + // helper allocations but before the normal tail reset, those + // allocations would otherwise leak into the next invocation. + // The runtime-error cell already has a Drop-based guard inside + // `call`, but if the panic fired in `compile()` no guard ever + // installed — drain defensively here. + super::jit_arena_reset(); + let _ = jit_take_runtime_error(); + let msg = if let Some(s) = payload.downcast_ref::<&'static str>() { + (*s).to_string() + } else if let Some(s) = payload.downcast_ref::() { + s.clone() + } else { + "cranelift JIT panicked (non-string payload)".to_string() + }; + Err(JitCallError::Panic { msg }) + } + } } #[cfg(test)] @@ -6805,4 +6956,52 @@ mod tests { other => panic!("expected a Record, got {:?}", other), } } + + // ── panic catch-and-return (upstream cranelift-jit AArch64 reloc bug) ──── + + /// `compile_and_call` must catch a panic raised from inside the JIT + /// dispatch and surface it as `JitCallError::Panic` rather than + /// unwinding into the caller. Exercised here via the test-only + /// `FORCE_PANIC_FOR_TEST` flag so we don't depend on the AArch64 + /// near-call relocation bug actually firing. The hook is gated on + /// `cfg(debug_assertions)` (so release binaries never carry it), and + /// this test follows the same gate. + #[cfg(debug_assertions)] + #[test] + fn cranelift_compile_and_call_catches_panic() { + let tokens: Vec = lexer::lex("f x:n>n;*x 2") + .unwrap() + .into_iter() + .map(|(t, _)| t) + .collect(); + let prog = parser::parse_tokens(tokens).unwrap(); + let compiled = crate::vm::compile(&prog).unwrap(); + let idx = compiled.func_names.iter().position(|n| n == "f").unwrap(); + let chunk = &compiled.chunks[idx]; + let nan_consts = &compiled.nan_constants[idx]; + let nan_args: Vec = [Value::Number(5.0)] + .iter() + .map(|v| NanVal::from_value(v).0) + .collect(); + + FORCE_PANIC_FOR_TEST.with(|c| c.set(true)); + let result = compile_and_call(chunk, nan_consts, &nan_args, &compiled); + + match result { + Err(JitCallError::Panic { msg }) => { + assert!( + msg.contains("synthetic cranelift panic"), + "panic msg should include payload, got {:?}", + msg + ); + } + other => panic!("expected JitCallError::Panic, got {:?}", other), + } + + // Flag must be cleared so the next call dispatches normally. + assert!(!FORCE_PANIC_FOR_TEST.with(|c| c.get())); + let result2 = compile_and_call(chunk, nan_consts, &nan_args, &compiled); + assert!(result2.is_ok(), "post-panic JIT call should succeed"); + assert_eq!(NanVal(result2.unwrap()).to_value(), Value::Number(10.0)); + } } diff --git a/tests/regression_cranelift_panic_fallback.rs b/tests/regression_cranelift_panic_fallback.rs new file mode 100644 index 00000000..0b7eefbd --- /dev/null +++ b/tests/regression_cranelift_panic_fallback.rs @@ -0,0 +1,112 @@ +//! Regression: a panic inside the Cranelift JIT (most notably the AArch64 +//! near-call relocation assertion in cranelift-jit 0.116, +//! `compiled_blob.rs:90` — `(diff >> 26 == -1) || (diff >> 26 == 0)`) must +//! be caught and surfaced as a stderr breadcrumb + engine fallback, not a +//! process crash. +//! +//! Hard repro of the AArch64 bug is non-deterministic and platform-specific +//! (depends on JIT code-cache vs runtime memory layout), so these tests use +//! the debug-build env-var hook `ILO_FORCE_JIT_PANIC=1` which raises a +//! synthetic panic at the same call site. The release binary does not have +//! the hook — the `cfg(debug_assertions)` guard trips it out — so this +//! cannot affect production users. +//! +//! Cross-engine coverage: +//! - default engine dispatch (`ilo file.ilo`) falls through to the tree +//! interpreter, same path as `JitCallError::NotEligible`. +//! - explicit `--run-cranelift` falls back to the bytecode VM, since the +//! user opted into a JIT engine and VM is the closest non-JIT tier. +//! +//! Gated on `cfg(debug_assertions)`: the env-var hook in +//! `vm::jit_cranelift::check_force_panic_env` is only compiled in debug +//! builds. In a release-mode test the hook is absent, so the synthetic +//! panic never fires and the assertions would misfire. + +#![cfg(debug_assertions)] + +use std::process::Command; + +fn ilo() -> Command { + Command::new(env!("CARGO_BIN_EXE_ilo")) +} + +#[test] +fn cranelift_panic_default_falls_back_to_interpreter() { + let out = ilo() + .args(["f x:n>n;*x 2", "f", "5"]) + .env("ILO_FORCE_JIT_PANIC", "1") + .output() + .expect("failed to run ilo"); + + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + + assert!( + out.status.success(), + "default engine should fall through after JIT panic. \ + stdout={stdout:?} stderr={stderr:?} status={:?}", + out.status.code() + ); + assert!( + stdout.trim() == "10", + "tree interpreter fallback should produce f(5)=10, got stdout={stdout:?}" + ); + assert!( + stderr.contains("Cranelift JIT panicked"), + "stderr breadcrumb missing, got {stderr:?}" + ); + assert!( + stderr.contains("falling back to interpreter"), + "default-engine breadcrumb should mention interpreter fallback, got {stderr:?}" + ); +} + +#[test] +fn cranelift_panic_explicit_engine_falls_back_to_vm() { + let out = ilo() + .args(["--run-cranelift", "f x:n>n;*x 2", "f", "5"]) + .env("ILO_FORCE_JIT_PANIC", "1") + .output() + .expect("failed to run ilo"); + + let stdout = String::from_utf8_lossy(&out.stdout); + let stderr = String::from_utf8_lossy(&out.stderr); + + assert!( + out.status.success(), + "--run-cranelift should fall back to VM after JIT panic. \ + stdout={stdout:?} stderr={stderr:?} status={:?}", + out.status.code() + ); + assert!( + stdout.trim() == "10", + "VM fallback should produce f(5)=10, got stdout={stdout:?}" + ); + assert!( + stderr.contains("Cranelift JIT panicked"), + "stderr breadcrumb missing, got {stderr:?}" + ); + assert!( + stderr.contains("falling back to bytecode VM"), + "explicit-cranelift breadcrumb should mention VM fallback, got {stderr:?}" + ); +} + +/// The breadcrumb must include the panic payload so the upstream issue +/// (AArch64 relocation assertion, etc.) is searchable in production logs +/// rather than being collapsed into a generic message. +#[test] +fn cranelift_panic_breadcrumb_includes_payload() { + let out = ilo() + .args(["f x:n>n;*x 2", "f", "5"]) + .env("ILO_FORCE_JIT_PANIC", "1") + .output() + .expect("failed to run ilo"); + + let stderr = String::from_utf8_lossy(&out.stderr); + assert!(out.status.success(), "fallback should succeed"); + assert!( + stderr.contains("ILO_FORCE_JIT_PANIC") || stderr.contains("synthetic cranelift panic"), + "breadcrumb should include the panic payload, got {stderr:?}" + ); +}