diff --git a/examples/multiline-bodies.ilo b/examples/multiline-bodies.ilo new file mode 100644 index 00000000..806c625a --- /dev/null +++ b/examples/multiline-bodies.ilo @@ -0,0 +1,55 @@ +-- Multi-line function bodies — readable indented form. +-- +-- ilo's spec says newlines are non-semantic: an entire program can be one +-- line, but a multi-line file with indented continuations must work too. +-- These shapes all used to fail with ILO-P009 ("expected expression, got +-- Semi" / "got PipeOp") because `normalize_newlines` injected a `;` inside +-- brackets or before a continuation pipe. Now they all run. + +-- Multi-line list literal. +nums>L n + xs=[ + 1, + 2, + 3 + ] + xs + +-- Multi-line list literal with leading commas (common when copy-pasting +-- columns of values). +items>L n + xs=[1 + ,2 + ,3] + xs + +-- Multi-line paren-grouped expression. +gp x:n>n + y=(+x + 1) + y + +-- Pipe chain across multiple lines — `>>` on a continuation line is never +-- a statement start, so the `;` injection is suppressed. +pipe x:n>n + x + >>str + >>len + +-- Indented multi-statement body (already worked, kept as a regression +-- baseline). +tot p:n q:n>n + s=*p q + t=*s q + +s t + +-- run: nums +-- out: [1, 2, 3] +-- run: items +-- out: [1, 2, 3] +-- run: gp 5 +-- out: 6 +-- run: pipe 42 +-- out: 2 +-- run: tot 3 4 +-- out: 60 diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index db948bac..ea18ad2a 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -176,6 +176,14 @@ pub enum Token { /// - `\n` followed by whitespace (indented continuation) → `;` /// - `\n` at column 0 (new declaration) → kept as `\n` /// - `;` immediately after `{` or before `}` → removed +/// - Inside `(...)` or `[...]` (list literal, paren-group, fn-call arg list), +/// `\n` is treated as whitespace: no `;` is emitted, so multi-line list and +/// paren expressions parse correctly. String literals are walked through so +/// `(`/`[` inside text don't affect depth. 
+/// - Continuation lines starting with `>>` (pipe operator) suppress the `;` +/// so `xs\n >>map{...}` chains correctly. `>>` is never a valid statement +/// start, so this is unambiguous. Other operators (`+`, `-`, `*`, ...) are +/// valid prefix-call statement heads and are NOT special-cased. pub fn normalize_newlines(source: &str) -> String { if !source.contains('\n') { return source.to_string(); @@ -185,13 +193,17 @@ pub fn normalize_newlines(source: &str) -> String { let mut chars = source.chars().peekable(); // Track the last non-whitespace char pushed to `out` to avoid O(n) trim_end scans. let mut last_significant: Option<char> = None; + // Depth of open `(` and `[` we're currently inside. `{` is tracked + // separately by `last_significant` (existing precedent). + let mut bracket_depth: u32 = 0; while let Some(c) = chars.next() { if c == '"' { // Pass through string literal content verbatim so `--` inside a - // string isn't mistaken for a comment, and so `\n` (if ever present - // inside a string) isn't rewritten to `;`. Mirrors logos's string - // regex: closing quote terminates unless escaped. + // string isn't mistaken for a comment, `\n` (if ever present + // inside a string) isn't rewritten to `;`, and `(`/`[` inside + // text don't bump bracket depth. Mirrors logos's string regex: + // closing quote terminates unless escaped. out.push(c); last_significant = Some(c); while let Some(sc) = chars.next() { @@ -223,14 +235,41 @@ pub fn normalize_newlines(source: &str) -> String { // surrounding `\n` handling on the next loop iteration emits the // appropriate `;` or newline based on the line that follows. } else if c == '\n' { + // Inside `(...)` or `[...]`, treat newlines as whitespace — + // don't emit `;` or `\n`, but emit a single space so tokens on + // adjacent lines don't get glued together (e.g. `(+x\n 1)` + // must not become `(+x1)`). Then skip indent on the next line. 
+ if bracket_depth > 0 { + out.push(' '); + while matches!(chars.peek(), Some(' ') | Some('\t')) { + chars.next(); + } + continue; + } // Check if next line is indented (starts with space or tab) if matches!(chars.peek(), Some(' ') | Some('\t')) { + // Peek past indent at the first real char on the next line + // so we can decide whether to emit a `;` before it. + let mut lookahead = chars.clone(); + while matches!(lookahead.peek(), Some(' ') | Some('\t')) { + lookahead.next(); + } + // `>>` (pipe operator) at the start of a continuation line is + // never a statement start — it must be chaining the previous + // line's expression. Suppress the `;` so the chain parses. + // Other operators (`+`/`-`/`*`) are valid prefix-call + // statement starts and must NOT trigger this. + let next_is_pipe = { + let mut probe = lookahead.clone(); + probe.next() == Some('>') && probe.next() == Some('>') + }; // Indented continuation → emit `;` and skip the whitespace // But first check if the last non-whitespace char was `{` — if so, skip the `;` // Also skip if `out` already ends in `;` (e.g. previous line - // was a comment that produced no significant output). - if last_significant == Some('{') || out.ends_with(';') { - // Don't emit `;` after `{` or an existing `;`, just skip whitespace + // was a comment that produced no significant output), or if + // the continuation begins with `>>` (pipe chain). 
+ if last_significant == Some('{') || out.ends_with(';') || next_is_pipe { + // Don't emit `;` } else { out.push(';'); } @@ -254,6 +293,13 @@ pub fn normalize_newlines(source: &str) -> String { if !c.is_ascii_whitespace() { last_significant = Some(c); } + match c { + '(' | '[' => bracket_depth += 1, + ')' | ']' => { + bracket_depth = bracket_depth.saturating_sub(1); + } + _ => {} + } } } diff --git a/tests/regression_multiline_fn_body.rs b/tests/regression_multiline_fn_body.rs index 02f06654..7c142deb 100644 --- a/tests/regression_multiline_fn_body.rs +++ b/tests/regression_multiline_fn_body.rs @@ -33,10 +33,16 @@ fn run_file(engine: &str, src: &str, entry: &str) -> String { seq )); std::fs::write(&path, src).unwrap(); - let out = ilo() - .args([path.to_str().unwrap(), engine, entry]) - .output() - .expect("failed to run ilo"); + // `entry` may be a bare function name (`f`) or a function name plus + // whitespace-separated CLI args (`gp 5`). Split on whitespace so the + // CLI receives each token as its own argv slot — matches how the + // `examples_engines` harness invokes things. + let mut cmd = ilo(); + cmd.arg(path.to_str().unwrap()).arg(engine); + for arg in entry.split_whitespace() { + cmd.arg(arg); + } + let out = cmd.output().expect("failed to run ilo"); assert!( out.status.success(), "ilo {engine} failed for `{src}`: stderr={}", @@ -60,6 +66,30 @@ const SL_SIMPLE: &str = "f>n;5\n"; // Single-line baseline, multi-token return type. const SL_RESULT: &str = "f>R t t;~\"hello\"\n"; +// Multi-line list literal — items spread across lines. Previously +// `normalize_newlines` injected a `;` after `[` and between items, producing +// ILO-P009 "expected expression, got Semi". `[`/`]` now suppress newlines. +const ML_LIST_LITERAL: &str = "nums>L n\n xs=[\n 1,\n 2,\n 3\n ]\n xs\n"; +// Multi-line list literal with leading commas (common when paginating +// long literal columns). 
+const ML_LIST_LEADING_COMMA: &str = "nums>L n\n xs=[1\n ,2\n ,3]\n xs\n"; +// Multi-line paren-grouped expression. `(`/`)` now suppress newlines and +// emit a space so adjacent-line tokens don't glue together (`(+x\n 1)` +// must not normalise to `(+x1)`). +const ML_PAREN: &str = "gp x:n>n\n y=(+x\n 1)\n y\n"; +// Pipe chain across continuation lines. `>>` is never a statement start, +// so the `;` is suppressed when a continuation line begins with `>>`. +const ML_PIPE: &str = "pipe x:n>n\n x\n >>str\n >>len\n"; +// Nested `(...)` inside `[...]` drives the single shared bracket-depth +// counter to depth 2 using both bracket kinds. The leading-comma layout is +// common when columns of expressions are spread across lines. +const ML_NESTED_BRACKETS: &str = "nest>L n\n xs=[(+1 2)\n ,(+3 4)\n ,(+5 6)]\n xs\n"; +// Multi-line `>>` pipe chain inside a `{...}` loop body. The pipe +// suppression must coexist with the `last_significant == '{'` rule for +// the line right after `{`. +const ML_PIPE_IN_BLOCK: &str = + "agg xs:L n>n\n s=0\n @x xs{\n v=x\n >>str\n >>len\n s=+s v\n }\n s\n"; + fn check_all(engine: &str) { assert_eq!( run_file(engine, ML_RESULT, "f"), @@ -91,6 +121,36 @@ fn check_all(engine: &str) { "~hello", "single-line R t t engine={engine}" ); + assert_eq!( + run_file(engine, ML_LIST_LITERAL, "nums"), + "[1, 2, 3]", + "multi-line list literal engine={engine}" + ); + assert_eq!( + run_file(engine, ML_LIST_LEADING_COMMA, "nums"), + "[1, 2, 3]", + "multi-line list literal leading-comma engine={engine}" + ); + assert_eq!( + run_file(engine, ML_PAREN, "gp 5"), + "6", + "multi-line paren expression engine={engine}" + ); + assert_eq!( + run_file(engine, ML_PIPE, "pipe 42"), + "2", + "multi-line pipe chain engine={engine}" + ); + assert_eq!( + run_file(engine, ML_NESTED_BRACKETS, "nest"), + "[3, 7, 11]", + "nested ( inside [ engine={engine}" + ); + assert_eq!( + run_file(engine, ML_PIPE_IN_BLOCK, "agg [1,22,333]"), + "6", + "multi-line pipe inside loop body engine={engine}" + ); } #[test]