Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions examples/cranelift-panic-fallback.ilo
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
-- Cross-engine regression marker for the Cranelift JIT panic-fallback.
--
-- On AArch64 (macOS arm64) the cranelift-jit 0.116 near-call relocation
-- assertion at `compiled_blob.rs:90` could fire non-deterministically on
-- the first run after a fresh `cargo build`, crashing the process. The
-- JIT entry point is now wrapped in `std::panic::catch_unwind`, so any
-- such panic becomes a stderr breadcrumb plus an engine fallback:
-- * default engine → tree interpreter
-- * `--run-cranelift` → bytecode VM
--
-- This example pins down the behaviour of a numeric pipeline shaped like the
-- one that triggered the original repro: it must keep producing the same
-- result on every engine.
-- The actual panic-capture is exercised in `tests/regression_cranelift_panic_fallback.rs`
-- via the debug-build `ILO_FORCE_JIT_PANIC=1` env-var hook.

-- double: one argument in, one result out (`x:n>n`, presumably number -> number);
-- the prefix expression `*x 2` doubles it (`double 21` is expected to print 42).
double x:n>n;*x 2
-- inc: same signature shape; the prefix expression `+x 1` adds one
-- (`inc 99` is expected to print 100).
inc x:n>n;+x 1

-- main: takes no arguments; composes the two helpers on the literal 20,
-- which is expected to print 41.
main>n;inc (double 20)

-- run: main
-- out: 41
-- run: double 21
-- out: 42
-- run: inc 99
-- out: 100
85 changes: 85 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2879,6 +2879,35 @@ fn run_cranelift_engine(
eprintln!("Cranelift JIT: compilation failed");
1
}
Err(vm::jit_cranelift::JitCallError::Panic { msg }) => {
// Upstream cranelift-jit 0.116 has an AArch64 near-call
// relocation assertion (`compiled_blob.rs:90`) that fires
// non-deterministically when the JIT code-cache and runtime
// memory get laid out >±64 MB apart. Converting the process
// crash to a single-line stderr breadcrumb + VM fallback
// preserves the working pipeline while keeping the upstream
// issue visible. The user asked for Cranelift explicitly, so
// we note the engine swap; we fall back to the bytecode VM
// (the closest non-JIT performance tier) rather than the
// tree interpreter.
eprintln!(
"ilo: Cranelift JIT panicked ({}); falling back to bytecode VM",
msg
);
match vm::run(&compiled, func_name, run_args) {
Ok(val) => {
print_value(&val, explicit_json, suppress);
program_exit_code(&val)
}
Err(e) => {
report_diagnostic(
&Diagnostic::from(&e).with_source(source.to_string()),
mode,
);
1
}
}
}
}
}
#[cfg(not(feature = "cranelift"))]
Expand Down Expand Up @@ -3244,6 +3273,20 @@ fn run_default(
// JIT couldn't dispatch this function — fall through
// to the tree interpreter as before.
}
Err(vm::jit_cranelift::JitCallError::Panic { msg }) => {
// Upstream cranelift-jit 0.116 AArch64 near-call
// relocation assertion (`compiled_blob.rs:90`) fires
// non-deterministically. The JIT never produced
// runnable code, so falling through to the tree
// interpreter is sound and preserves the user's
// pipeline. We emit a one-line breadcrumb so the
// upstream issue stays measurable rather than
// degrading silently into the slower engine.
eprintln!(
"ilo: Cranelift JIT panicked ({}); falling back to interpreter",
msg
);
}
}
}
}
Expand Down Expand Up @@ -7977,6 +8020,48 @@ mod tests {
assert!(code == 0 || code == 1);
}

/// A panic raised inside the Cranelift JIT must not escape
/// `run_cranelift_engine`: the engine is expected to recover by running the
/// program on the bytecode VM instead, so the caller still sees a normal exit.
///
/// The panic is injected through the thread-local `FORCE_PANIC_FOR_TEST`
/// hook, which only exists in `debug_assertions` builds — hence the matching
/// `cfg` gate on this test.
#[test]
#[cfg(all(feature = "cranelift", debug_assertions))]
fn run_cranelift_engine_panic_falls_back_to_vm() {
    let prog = make_program("f x:n>n;*x 2");
    let cli_args = ["f".to_string(), "5".to_string()];

    // Arm the synthetic panic for the next JIT dispatch on this thread.
    vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|flag| flag.set(true));

    let exit_code = run_cranelift_engine(&prog, &cli_args, "", OutputMode::Text, false);

    // Exit 0 means the VM fallback completed the program. An uncaught panic
    // would have torn down the test; an unhandled error would have yielded 1.
    assert_eq!(exit_code, 0);

    // The hook is single-shot: the JIT entry must have consumed it.
    let still_armed = vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|flag| flag.get());
    assert!(!still_armed);
}

/// A panic raised inside the Cranelift JIT must not escape `run_default`:
/// the default engine is expected to continue on the tree interpreter, the
/// same route it takes when the JIT reports `NotEligible`.
#[test]
#[cfg(all(feature = "cranelift", debug_assertions))]
fn run_default_cranelift_panic_falls_back_to_interpreter() {
    let prog = make_program("f x:n>n;*x 2");
    let call_args = vec![interpreter::Value::Number(5.0)];

    // Arm the synthetic panic for the next JIT dispatch on this thread.
    vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|flag| flag.set(true));

    let exit_code = run_default(&prog, Some("f"), call_args, "", OutputMode::Text, false);

    // Exit 0 means the tree-interpreter fallback completed the program.
    assert_eq!(exit_code, 0);

    // The hook is single-shot: the JIT entry must have consumed it.
    let still_armed = vm::jit_cranelift::FORCE_PANIC_FOR_TEST.with(|flag| flag.get());
    assert!(!still_armed);
}

#[test]
fn run_cranelift_engine_fn_not_found_returns_one() {
let program = make_program("f x:n>n;*x 2");
Expand Down
207 changes: 203 additions & 4 deletions src/vm/jit_cranelift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4667,10 +4667,53 @@ fn call_raw(func: &JitFunction, args: &[u64]) -> Option<u64> {
/// Callers should NOT fall back on `Runtime` — the program executed but
/// hit a defined error condition, which is the same shape tree and VM
/// surface for the same input.
///
/// `Panic` means the Cranelift JIT itself panicked during compilation or
/// finalisation — most notably the AArch64 near-call relocation assertion
/// (`compiled_blob.rs:90` — `(diff >> 26 == -1) || (diff >> 26 == 0)`) in
/// cranelift-jit 0.116 when the JIT code-cache and runtime memory end up
/// more than ±64 MB apart. The compilation never produced runnable code,
/// so it is safe — and required — for the caller to fall back to a
/// non-JIT engine. Callers should surface a stderr breadcrumb so the
/// upstream issue stays visible rather than degrading silently.
#[derive(Debug)]
pub enum JitCallError {
/// The JIT could not produce a callable function for this chunk
/// (`compile_and_call` returns this when `compile` yields `None`).
/// Callers fall back to another engine.
NotEligible,
/// The program ran under the JIT and hit a defined runtime error.
/// Callers should report it rather than fall back.
Runtime(VmRuntimeError),
/// A panic was caught while compiling or running under the JIT.
/// `msg` is the panic payload when it was a `&str` or `String`,
/// otherwise a fixed placeholder. Callers should fall back to a
/// non-JIT engine and emit a stderr breadcrumb.
Panic { msg: String },
}

// Debug-build-only test hook. When the flag is `true`, the next
// `compile_and_call` on this thread clears it and then raises a synthetic
// panic from inside its `catch_unwind` region, so the hook is single-shot.
// This exercises the panic-fallback path without depending on the
// AArch64-specific upstream bug.
//
// The flag is thread-local, so arming it in one test does not affect JIT
// dispatches running on other test threads. It is gated on
// `cfg(debug_assertions)` so release binaries carry neither the per-thread
// bool nor the check that reads it; tests run debug builds.
#[cfg(debug_assertions)]
thread_local! {
#[doc(hidden)]
pub static FORCE_PANIC_FOR_TEST: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
}

/// Debug-build-only hook driven by the `ILO_FORCE_JIT_PANIC` environment
/// variable.
///
/// If the variable is set to exactly `1`, it is removed from the process
/// environment and a synthetic panic is raised. Because the variable is
/// removed first, only the first call in a process panics; later calls
/// return normally. Any other value, or an unset variable, is a no-op.
///
/// This lets integration tests exercise the binary's stderr breadcrumb and
/// fallback dispatch without relying on the AArch64-specific upstream bug.
/// The function is compiled only under `cfg(debug_assertions)`, so it does
/// not exist in optimised release builds.
///
/// # Panics
///
/// Panics with `synthetic cranelift panic (ILO_FORCE_JIT_PANIC)` when the
/// hook is armed.
#[cfg(debug_assertions)]
fn check_force_panic_env() {
    const HOOK_VAR: &str = "ILO_FORCE_JIT_PANIC";

    let armed = matches!(std::env::var(HOOK_VAR).as_deref(), Ok("1"));
    if !armed {
        return;
    }

    // Disarm before panicking so a later dispatch in the same process
    // (e.g. a recursive call) runs normally. Tests spawn a fresh subprocess
    // per run, so this is purely defensive.
    //
    // SAFETY: `remove_var` runs before any threads are spawned in the JIT
    // dispatch path; cranelift's compile/finalize all happen on the calling
    // thread, so nothing else accesses the environment concurrently.
    unsafe {
        std::env::remove_var(HOOK_VAR);
    }
    panic!("synthetic cranelift panic (ILO_FORCE_JIT_PANIC)");
}

/// Call a compiled NanVal JIT function with u64 args.
Expand Down Expand Up @@ -4707,17 +4750,125 @@ pub fn call(func: &JitFunction, args: &[u64]) -> Result<u64, JitCallError> {
Ok(result)
}

thread_local! {
/// `true` while a JIT dispatch is in progress on the current thread.
///
/// `compile_and_call` sets it for the duration of its `catch_unwind`
/// region and restores the previous value afterwards. The panic hook
/// installed by `install_jit_panic_suppressor` reads it and skips the
/// previously installed hook (and therefore its stderr output) only
/// when it is set, so panics on other threads and panics outside the
/// JIT entry keep their normal rendering.
static IN_JIT_DISPATCH: std::cell::Cell<bool> = const { std::cell::Cell::new(false) };
}

/// Installs, at most once per process, a panic hook that wraps whatever hook
/// was registered before it.
///
/// The wrapper stays silent while the panicking thread has `IN_JIT_DISPATCH`
/// set, because the JIT caller prints its own single-line breadcrumb with
/// the panic payload. In every other situation it forwards to the previous
/// hook unchanged. Guarding the installation with a `Once` makes the function
/// callable from any thread without concurrent dispatches racing on the
/// global `take_hook` / `set_hook` pair.
fn install_jit_panic_suppressor() {
    static HOOK_INSTALLED: std::sync::Once = std::sync::Once::new();

    HOOK_INSTALLED.call_once(|| {
        let previous_hook = std::panic::take_hook();
        std::panic::set_hook(Box::new(move |panic_info| {
            let inside_jit = IN_JIT_DISPATCH.with(|flag| flag.get());
            // Outside a JIT dispatch the original rendering is preserved;
            // inside one the default message/backtrace is dropped.
            if !inside_jit {
                previous_hook(panic_info);
            }
        }));
    });
}

/// Compile and call in one shot (convenience wrapper).
///
/// The whole dispatch is wrapped in `std::panic::catch_unwind` so an upstream
/// cranelift panic (e.g. the AArch64 near-call relocation assertion in
/// cranelift-jit 0.116 — `compiled_blob.rs:90`) becomes a recoverable
/// `JitCallError::Panic` instead of crashing the user's program. The release
/// profile uses default unwind (`panic = "unwind"`); if a downstream consumer
/// ever switches to `panic = "abort"`, the catch is a no-op and the original
/// crash returns — that's a deliberate trade since `abort` semantics are
/// chosen for explicit reasons (binary size / no-std) and the operator opts
/// into them knowingly.
///
/// On first call a panic hook is chained onto the process-global hook (via
/// `Once`) that suppresses the noisy default stderr output *only* while the
/// current thread is inside this function, scoped by the thread-local
/// `IN_JIT_DISPATCH` flag. The caller emits a single-line breadcrumb
/// instead. Concurrent JIT dispatches on other threads therefore do not race
/// on `set_hook` / `take_hook`.
///
/// # Errors
///
/// * `JitCallError::NotEligible` — `compile` returned `None` for this chunk.
/// * `JitCallError::Panic { msg }` — a panic was caught inside the dispatch;
///   `msg` carries the payload when it is a `&str` or `String`.
/// * Any error returned by `call` is passed through unchanged.
pub fn compile_and_call(
    chunk: &Chunk,
    nan_consts: &[NanVal],
    args: &[u64],
    program: &CompiledProgram,
) -> Result<u64, JitCallError> {
    // Install a process-global panic hook *once* that suppresses stderr
    // output only when the calling thread is inside a JIT dispatch. Any
    // other thread's panic during this time still gets the previous hook's
    // rendering.
    install_jit_panic_suppressor();

    // Save-restore (not set-then-clear) so a nested `compile_and_call`
    // doesn't blank the outer call's flag on return.
    struct InJitGuard(bool);
    impl Drop for InJitGuard {
        fn drop(&mut self) {
            IN_JIT_DISPATCH.with(|c| c.set(self.0));
        }
    }
    let prev_in_jit = IN_JIT_DISPATCH.with(|c| c.replace(true));
    let _in_jit = InJitGuard(prev_in_jit);

    // `AssertUnwindSafe` is sound here: `with_active_registry` clears its TLS
    // on return (its own Drop guard handles this), `JitRuntimeErrorGuard`
    // inside `call` clears the runtime-error cell on drop, and the chunk +
    // program references are immutable. No shared mutable state survives an
    // unwind in a corrupted state.
    //
    // This is the ONLY dispatch in the function: compiling or calling
    // outside the `catch_unwind` region would let an upstream panic escape.
    let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
        with_active_registry(program, || {
            #[cfg(debug_assertions)]
            {
                // Test hooks: both are single-shot and raise the synthetic
                // panic from inside the catch region.
                if FORCE_PANIC_FOR_TEST.with(|c| c.get()) {
                    FORCE_PANIC_FOR_TEST.with(|c| c.set(false));
                    panic!("synthetic cranelift panic for test");
                }
                check_force_panic_env();
            }
            let func = compile(chunk, nan_consts, program).ok_or(JitCallError::NotEligible)?;
            call(&func, args)
        })
    }));

    match result {
        Ok(inner) => inner,
        Err(payload) => {
            // Defensively reset the JIT bump arena and clear the runtime-
            // error TLS cell: if the panic fired mid-`call()` after some
            // helper allocations but before the normal tail reset, those
            // allocations would otherwise leak into the next invocation.
            // The runtime-error cell already has a Drop-based guard inside
            // `call`, but if the panic fired in `compile()` no guard was
            // ever installed — drain defensively here.
            super::jit_arena_reset();
            let _ = jit_take_runtime_error();

            // `panic!` payloads are `&'static str` for literal messages and
            // `String` for formatted ones; anything else gets a placeholder.
            let msg = if let Some(s) = payload.downcast_ref::<&'static str>() {
                (*s).to_string()
            } else if let Some(s) = payload.downcast_ref::<String>() {
                s.clone()
            } else {
                "cranelift JIT panicked (non-string payload)".to_string()
            };
            Err(JitCallError::Panic { msg })
        }
    }
}

#[cfg(test)]
Expand Down Expand Up @@ -6805,4 +6956,52 @@ mod tests {
other => panic!("expected a Record, got {:?}", other),
}
}

// ── panic catch-and-return (upstream cranelift-jit AArch64 reloc bug) ────

/// A panic raised inside the JIT dispatch must come back from
/// `compile_and_call` as `JitCallError::Panic` instead of unwinding into
/// the caller, and the very next dispatch must work normally.
///
/// The panic is injected with the `FORCE_PANIC_FOR_TEST` hook so the test
/// does not rely on the AArch64 near-call relocation bug actually firing.
/// That hook only exists under `cfg(debug_assertions)`, so the test carries
/// the same gate.
#[cfg(debug_assertions)]
#[test]
fn cranelift_compile_and_call_catches_panic() {
    // Lex → parse → compile the one-function program `f`.
    let lexed = lexer::lex("f x:n>n;*x 2").unwrap();
    let tokens: Vec<crate::lexer::Token> =
        lexed.into_iter().map(|(token, _span)| token).collect();
    let prog = parser::parse_tokens(tokens).unwrap();
    let compiled = crate::vm::compile(&prog).unwrap();

    let idx = compiled
        .func_names
        .iter()
        .position(|name| name == "f")
        .unwrap();
    let chunk = &compiled.chunks[idx];
    let nan_consts = &compiled.nan_constants[idx];

    let five = Value::Number(5.0);
    let nan_args: Vec<u64> = vec![NanVal::from_value(&five).0];

    // First dispatch: the armed hook panics inside the catch region.
    FORCE_PANIC_FOR_TEST.with(|flag| flag.set(true));
    let first = compile_and_call(chunk, nan_consts, &nan_args, &compiled);
    let msg = match first {
        Err(JitCallError::Panic { msg }) => msg,
        other => panic!("expected JitCallError::Panic, got {:?}", other),
    };
    assert!(
        msg.contains("synthetic cranelift panic"),
        "panic msg should include payload, got {:?}",
        msg
    );

    // Flag must be cleared so the next call dispatches normally.
    let still_armed = FORCE_PANIC_FOR_TEST.with(|flag| flag.get());
    assert!(!still_armed);

    // Second dispatch: no panic, and f(5) == 10.
    let second = compile_and_call(chunk, nan_consts, &nan_args, &compiled);
    assert!(second.is_ok(), "post-panic JIT call should succeed");
    let value = NanVal(second.unwrap()).to_value();
    assert_eq!(value, Value::Number(10.0));
}
}
Loading
Loading