Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@

- `ls dir` renamed to `lsd dir`. Six rerun10 personas tripped ILO-P011 on `ls=rdl! p` because `ls` was reserved; rename frees `ls` for user code. `walk`, `glob` unchanged.

### Performance

- `mset` accumulator via helper fn no longer hits a ~1000x perf cliff. The canonical DRY refactor `addto m k v > mset m k v` followed by `m = addto m k v` in a loop now runs at the same speed as the inline form. New OP_MOVE_OWN / OP_CALL_OWN1 opcodes thread the first arg into the helper at the caller's RC, and a tail-position rewrite lets the helper's `mset m k v` fire the existing in-place fast path. 40k rows: down from 25.8s to 0.01s on VM; JIT and AOT scale linearly at 1M rows.

## 0.12.0 - 2026-05-19

### Breaking
Expand Down
31 changes: 31 additions & 0 deletions examples/mset-helper-perf.ilo
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
-- Helper-fn `mset` accumulator now stays O(N) instead of O(N·K)
-- (N = rows processed, K = entries in the map that was cloned per row).
--
-- Pre-0.12.1: factoring the per-row map update into a helper fn caused
-- a silent ~1000x slowdown on high-cardinality rollups. The map crossed
-- an OP_CALL boundary, RC bumped to >=2, and OP_MSET's RC=1 in-place
-- fast path declined — every row cloned the whole HashMap. The agent's
-- obvious DRY refactor (extract per-row update into a helper) made the
-- program 1000x slower with no error or warning.
--
-- 0.12.1 closes the cliff with OP_CALL_OWN1 + OP_MOVE_OWN (move-not-
-- clone first arg when the compiler sees `name = fn(name, ...)`) plus
-- a tail-position rewrite that lets `mset m k v` inside the helper
-- fire the existing in-place fast path. The "obvious thing is the
-- right thing" promise holds again: factor or don't, the perf is the
-- same.

-- Canonical accumulator pattern: helper takes the map, mutates and
-- returns. Caller rebinds the map to the result. With the fix this
-- stays at one RC=1 in-place insert per row, even at scale.
--
-- addto: the whole body is a single tail-position `mset m k v`, which
-- is the shape the tail-position rewrite described in the header targets.
addto m:M t n k:t v:n>M t n;mset m k v
-- build: starts from `mmap`, rebinds `m=addto m k i` once per loop step
-- (the `name = fn(name, ...)` form from the header), then returns the
-- number of keys. `build 100` is expected to print 99 (see the `out:`
-- line below), so `1..n` appears to exclude its upper bound.
build n:n>n;m=mmap;@i 1..n{k=str i;m=addto m k i};len (mkeys m)

-- Per-key bump helper: read-modify-write through the same shape.
-- Exercises the same move-semantics path with an extra arg.
--
-- bump: reads the current value stored under `k`, then ends with a
-- tail-position `mset` that stores that value plus `inc`.
-- NOTE(review): `??(mget m k) 0` presumably falls back to 0 when the
-- key is absent — confirm against the language reference.
bump m:M t n k:t inc:n>M t n;c=??(mget m k) 0;mset m k (+c inc)
-- total: rebinds `m=bump m "x" 2` on every loop step and returns the
-- value stored under "x". The expected output below is 10, i.e. the
-- `0..5` loop applies the +2 bump five times.
total>n;m=mmap;@i 0..5{m=bump m "x" 2};??(mget m "x") 0

-- run: build 100
-- out: 99
-- run: total
-- out: 10
46 changes: 42 additions & 4 deletions src/vm/compile_cranelift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1104,7 +1104,11 @@ fn compile_function_body(
non_num_write[a] = true;
}
// MOVE: skip here, handled by fixpoint below.
OP_MOVE => {}
// OP_MOVE_OWN shares OP_MOVE's encoding and propagates
// type info the same way (the move-vs-clone-rc distinction
// is a runtime-only concern that doesn't change the
// numeric/boolean classification of the destination).
OP_MOVE | OP_MOVE_OWN => {}
// Ops that write a non-numeric or unknown type to R[A].
OP_ADD | OP_SUB | OP_MUL | OP_DIV | OP_ADD_SS | OP_NEG | OP_WRAPOK | OP_WRAPERR
| OP_UNWRAP | OP_RECFLD | OP_RECFLD_NAME | OP_RECFLD_SAFE | OP_RECFLD_NAME_SAFE
Expand All @@ -1125,7 +1129,10 @@ fn compile_function_body(
non_bool_write[a] = true;
}
// OP_CALL: if callee is known all-numeric, result is numeric.
OP_CALL => {
// OP_CALL_OWN1 has the identical encoding and result-write
// shape; the move-not-clone first-arg semantics are an
// RC bookkeeping detail that doesn't change classification.
OP_CALL | OP_CALL_OWN1 => {
if let Some(prog) = program {
let bx = (inst & 0xFFFF) as usize;
let func_idx = bx >> 8;
Expand Down Expand Up @@ -1160,7 +1167,7 @@ fn compile_function_body(
continue;
}
i += 1;
if op != OP_MOVE {
if op != OP_MOVE && op != OP_MOVE_OWN {
continue;
}
let a = ((inst >> 16) & 0xFF) as usize;
Expand Down Expand Up @@ -1773,6 +1780,29 @@ fn compile_function_body(
}
}
}
OP_MOVE_OWN => {
// Move-not-clone variant of OP_MOVE used by the
// `name = fn(name, ...)` peephole. In Cranelift, SSA
// Variable assignment doesn't bump RC of heap values
// (that's done explicitly by jit_move for OP_MOVE).
// For OP_MOVE_OWN we deliberately skip jit_move: the
// source Variable is not used again on the SSA path
// emitted by the peephole (in / out pair brackets the
// call), so the RC stays at the caller's pre-move count
// exactly as the VM intends. Emitting a clone here
// would inflate RC by one per loop iteration, defeating
// the in-place OP_MSET fast path inside the helper and
// leaking memory.
if a_idx != b_idx {
let bv = builder.use_var(vars[b_idx]);
builder.def_var(vars[a_idx], bv);
let src_always_num = b_idx < reg_always_num.len() && reg_always_num[b_idx];
if src_always_num && a_idx < reg_always_num.len() && reg_always_num[a_idx] {
let bf = builder.use_var(f64_vars[b_idx]);
builder.def_var(f64_vars[a_idx], bf);
}
}
}
OP_NOT => {
let bv = builder.use_var(vars[b_idx]);
let fref = get_func_ref(&mut builder, module, helpers.not);
Expand Down Expand Up @@ -3575,7 +3605,15 @@ fn compile_function_body(
}
}
// ── Function call with inlining + F64 shadow support ──
OP_CALL => {
OP_CALL | OP_CALL_OWN1 => {
// OP_CALL_OWN1: move-not-clone first-arg variant of OP_CALL,
// emitted by the let-stmt peephole for `name = fn(name, ...)`.
// Under Cranelift's SSA Variable model, args are passed as
// values (no per-push clone_rc on a stack), so the lowering
// is identical to OP_CALL. The perf win on Cranelift comes
// from the compiler's tail-position rewrite of `mset m k v`
// inside the helper, which fires the existing in-place
// fast path in the OP_MSET handler.
let a = ((inst >> 16) & 0xFF) as u8;
let bx = (inst & 0xFFFF) as usize;
let func_idx = bx >> 8;
Expand Down
56 changes: 52 additions & 4 deletions src/vm/jit_cranelift.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1190,7 +1190,9 @@ fn compile_function_body(
non_num_write[a] = true;
}
// MOVE: skip here, handled by fixpoint below.
OP_MOVE => {}
// OP_MOVE_OWN behaves identically to OP_MOVE for type
// propagation purposes (same A=dest, B=src layout).
OP_MOVE | OP_MOVE_OWN => {}
// Ops that write a non-numeric or unknown type to R[A].
OP_ADD | OP_SUB | OP_MUL | OP_DIV // may be string concat etc.
| OP_ADD_SS // string concat — always a string
Expand All @@ -1217,7 +1219,10 @@ fn compile_function_body(
non_bool_write[a] = true;
}
// OP_CALL: if callee is known all-numeric, result is numeric.
OP_CALL => {
// OP_CALL_OWN1 has the identical encoding and result-write
// shape (move-not-clone on first arg only changes RC
// bookkeeping, not the result Variable type).
OP_CALL | OP_CALL_OWN1 => {
let bx = (inst & 0xFFFF) as usize;
let func_idx = bx >> 8;
if func_idx < program.chunks.len()
Expand Down Expand Up @@ -1252,7 +1257,7 @@ fn compile_function_body(
continue;
}
i += 1;
if op != OP_MOVE {
if op != OP_MOVE && op != OP_MOVE_OWN {
continue;
}
let a = ((inst >> 16) & 0xFF) as usize;
Expand Down Expand Up @@ -1945,6 +1950,38 @@ fn compile_function_body(
}
}
}
OP_MOVE_OWN => {
// Move-not-clone variant of OP_MOVE used by the
// `name = fn(name, ...)` peephole. The VM-side semantics
// are: transfer the NanVal bit pattern from R[B] to R[A]
// without bumping any RC, and clear R[B] to Nil so a
// later drop_rc on the source slot is a no-op.
//
// In Cranelift, registers are SSA Variables — assigning
// a Variable doesn't bump RC of the underlying heap
// value (that's done explicitly by jit_move for the
// ordinary OP_MOVE). For OP_MOVE_OWN we deliberately
// skip jit_move: the source Variable is not used again
// on the SSA path emitted by the peephole (in / out
// pair brackets the call), so the RC stays at the
// caller's pre-move count exactly as the VM intends.
//
// Emitting a clone here would inflate the RC by one
// per loop iteration, defeating the in-place OP_MSET
// fast path inside the helper and leaking memory.
if a_idx != b_idx {
let bv = builder.use_var(vars[b_idx]);
builder.def_var(vars[a_idx], bv);
// Propagate f64 shadow for the numeric fast path so
// downstream arithmetic ops skip the bitcast, same
// as OP_MOVE does.
let src_always_num = b_idx < reg_always_num.len() && reg_always_num[b_idx];
if src_always_num && a_idx < reg_always_num.len() && reg_always_num[a_idx] {
let bf = builder.use_var(f64_vars[b_idx]);
builder.def_var(f64_vars[a_idx], bf);
}
}
}
OP_NOT => {
let bv = builder.use_var(vars[b_idx]);
let fref = get_func_ref(&mut builder, module, helpers.not);
Expand Down Expand Up @@ -4307,7 +4344,18 @@ fn compile_function_body(
}
// else: blocks not found → JIT bails (should not happen in practice)
}
OP_CALL => {
OP_CALL | OP_CALL_OWN1 => {
// OP_CALL_OWN1 is the move-not-clone first-arg variant of
// OP_CALL used by the let-stmt peephole. In the Cranelift
// JIT/AOT model args are passed as SSA Variable values
// (no per-push clone_rc on the stack like the VM has),
// so the move-vs-clone distinction collapses here — both
// opcodes lower to the same call sequence. The compiler
// peephole still wins on Cranelift because it rewrites
// the tail mset to use a == b (in-place fast path), and
// the source-register clear on the caller side is a
// no-op under SSA: the moved-out Variable just isn't
// referenced again.
let a = ((inst >> 16) & 0xFF) as u8;
let bx = (inst & 0xFFFF) as usize;
let func_idx = bx >> 8;
Expand Down
Loading
Loading