From 6b81a7fe74b122c4a4acf729661ae4448c4d1b53 Mon Sep 17 00:00:00 2001 From: Daniel Morris Date: Tue, 19 May 2026 17:52:23 +0100 Subject: [PATCH 1/4] fix(interpreter): scan CSV/TSV in one pass with quote-state tracking parse_format split content via content.lines() before tracking quote state, so a cell containing \n (which write_csv_tsv correctly emits as a quoted multi-line field per RFC 4180) was re-parsed as two rows. ilo silently mis-read CSV it had just written. csv-pipeline rerun10 caught the round-trip drift: wrote 4 rows, read back 5. Replace the line-by-line approach with parse_csv_content: a single-pass scanner that walks the whole document, treats \n / \r\n (and a bare \r) as record separators only when out of quotes, keeps embedded newlines and CRLF verbatim inside quoted cells, and never emits a phantom trailing row for files that end in \n. Same signature shape going in, Vec<Vec<String>> coming out. Unit tests cover multi-line cells, escaped quotes inside multi-line cells, CRLF row separators, CRLF inside a quoted field, embedded commas, mixed quoted/unquoted cells in one row, empty trailing fields, empty quoted fields, the UTF-8 BOM (pinned as preserved), unterminated quoted fields, and a full write_csv_tsv -> parse_csv_content round-trip. 
--- src/interpreter/mod.rs | 234 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 209 insertions(+), 25 deletions(-) diff --git a/src/interpreter/mod.rs b/src/interpreter/mod.rs index 7175a40c..97a1dc81 100644 --- a/src/interpreter/mod.rs +++ b/src/interpreter/mod.rs @@ -642,13 +642,11 @@ fn parse_format(fmt: &str, content: &str) -> std::result::Result match fmt { "csv" | "tsv" => { let sep = if fmt == "tsv" { '\t' } else { ',' }; - let rows: Vec = content - .lines() - .map(|line| { - let fields: Vec = parse_csv_row(line, sep) - .into_iter() - .map(|s| Value::Text(Arc::new(s))) - .collect(); + let rows: Vec = parse_csv_content(content, sep) + .into_iter() + .map(|row| { + let fields: Vec = + row.into_iter().map(|s| Value::Text(Arc::new(s))).collect(); Value::List(Arc::new(fields)) }) .collect(); @@ -661,12 +659,26 @@ fn parse_format(fmt: &str, content: &str) -> std::result::Result } } -/// Parse one CSV/TSV row respecting double-quoted fields. -fn parse_csv_row(line: &str, sep: char) -> Vec { - let mut fields = Vec::new(); +/// Parse a full CSV/TSV document into rows of fields, RFC 4180 compliant. +/// +/// Unlike a line-based split, this scanner tracks quote state across record +/// boundaries so that a quoted field containing an embedded newline is +/// preserved as a single cell. Both `\n` and `\r\n` are accepted as record +/// separators outside quotes; inside quotes they are kept verbatim. A final +/// trailing newline does not produce an extra empty row. +/// +/// History: the previous implementation called `content.lines()` then handed +/// each line to a per-line quote-aware parser. That meant any cell containing +/// `\n` (which `write_csv_tsv` correctly emits as a quoted multi-line field +/// per RFC 4180) was re-parsed as two rows on the way back in, so ilo silently +/// mis-parsed CSV it had just written. csv-pipeline rerun10 flagged this as +/// the one blocker on round-trip integrity. 
+fn parse_csv_content(content: &str, sep: char) -> Vec<Vec<String>> { + let mut rows: Vec<Vec<String>> = Vec::new(); + let mut row: Vec<String> = Vec::new(); let mut field = String::new(); let mut in_quotes = false; - let mut chars = line.chars().peekable(); + let mut chars = content.chars().peekable(); while let Some(c) = chars.next() { if in_quotes { if c == '"' { @@ -677,20 +689,46 @@ fn parse_csv_row(line: &str, sep: char) -> Vec<String> { in_quotes = false; } } else { + // Inside quotes, newlines (including \r\n) are part of the + // field. Keep them verbatim. field.push(c); } } else if c == '"' { in_quotes = true; } else if c == sep { - fields.push(std::mem::take(&mut field)); + row.push(std::mem::take(&mut field)); + } else if c == '\n' { + row.push(std::mem::take(&mut field)); + rows.push(std::mem::take(&mut row)); + } else if c == '\r' { + // Accept \r\n as a record terminator; a bare \r outside quotes also + // ends the record. This is new behaviour: `content.lines()` never + // split on a lone \r. It keeps CR-only files readable. + if chars.peek() == Some(&'\n') { + chars.next(); + } + row.push(std::mem::take(&mut field)); + rows.push(std::mem::take(&mut row)); } else { field.push(c); } } - fields.push(field); - fields + // Flush the trailing record. A file that ends with `\n` already emitted + // its last row in the loop, so field/row are empty here and no spurious + // row is pushed; without a trailing newline one record is still pending. + // NOTE(review): a final newline-less record that is only `""` is dropped. + if !field.is_empty() || !row.is_empty() || in_quotes { + row.push(field); + rows.push(row); + } + rows } +// Note: a separate per-line `parse_csv_row` previously existed but its only +// caller (`parse_format`) was the source of the multi-line round-trip bug +// fixed in csv-pipeline rerun10. The full-document `parse_csv_content` above +// is now the single entry point for csv/tsv parsing. + // ── Linear algebra helpers ────────────────────────────────────────── /// Coerce a `Value` into a row-major matrix `Vec>`. 
@@ -9535,22 +9573,168 @@ mod tests { assert_eq!(format!("{}", Value::FnRef("add".into())), ""); } - // L268-279: parse_csv_row with quoted fields + // Single-row quoted-field coverage now lives on parse_csv_content; + // the previous per-line `parse_csv_row` helper was removed as part of + // the csv-pipeline rerun10 fix. See parse_csv_content_* tests below. + #[test] + fn parse_csv_content_single_row_escaped_quote() { + let rows = parse_csv_content(r#""he said ""hello""","world""#, ','); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0], vec![r#"he said "hello""#, "world"]); + } + + #[test] + fn parse_csv_content_single_row_simple_quoted() { + let rows = parse_csv_content(r#""hello","world""#, ','); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0], vec!["hello", "world"]); + } + + // ── parse_csv_content: quote-state tracking across newlines ─────────────── + // Regression for csv-pipeline rerun10: the reader used to split content + // on `\n` before tracking quote state, so a multi-line quoted field + // (which the writer correctly emits per RFC 4180) was mis-parsed as + // two rows. parse_csv_content now scans the full document in one pass. + + #[test] + fn parse_csv_content_multiline_quoted_field() { + // The writer emits "line\nbreak" as a quoted multi-line cell. + // The reader must keep it as a single cell across the embedded \n. + let input = "name,note,n\nplain,\"line\nbreak\",2\n"; + let rows = parse_csv_content(input, ','); + assert_eq!(rows.len(), 2); + assert_eq!(rows[0], vec!["name", "note", "n"]); + assert_eq!(rows[1], vec!["plain", "line\nbreak", "2"]); + } + + #[test] + fn parse_csv_content_basic_no_trailing_newline() { + let rows = parse_csv_content("a,b\nc,d", ','); + assert_eq!(rows, vec![vec!["a", "b"], vec!["c", "d"]]); + } + + #[test] + fn parse_csv_content_basic_trailing_newline_no_phantom_row() { + // A file ending in `\n` should NOT yield a trailing empty row. 
+ let rows = parse_csv_content("a,b\nc,d\n", ','); + assert_eq!(rows, vec![vec!["a", "b"], vec!["c", "d"]]); + } + + #[test] + fn parse_csv_content_crlf_line_endings() { + let rows = parse_csv_content("a,b\r\nc,d\r\n", ','); + assert_eq!(rows, vec![vec!["a", "b"], vec!["c", "d"]]); + } + + #[test] + fn parse_csv_content_crlf_inside_quoted_field_preserved() { + // \r\n inside a quoted cell is part of the cell, not a record break. + let rows = parse_csv_content("a,\"x\r\ny\"\n", ','); + assert_eq!(rows, vec![vec!["a".to_string(), "x\r\ny".to_string()]]); + } + #[test] - fn parse_csv_row_quoted_fields() { - // quoted field + escaped double-quote inside - let rows = parse_csv_row(r#""he said ""hello""","world""#, ','); + fn parse_csv_content_escaped_quote_inside_multiline_field() { + // Combined edge case: embedded newline AND escaped quote in one cell. + let input = "a,\"he said \"\"hi\"\"\nfoo\",b\n"; + let rows = parse_csv_content(input, ','); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0], vec!["a", "he said \"hi\"\nfoo", "b"]); + } + + #[test] + fn parse_csv_content_tsv_separator() { + // Same scanner, tab separator. + let rows = parse_csv_content("a\tb\nc\td\n", '\t'); + assert_eq!(rows, vec![vec!["a", "b"], vec!["c", "d"]]); + } + + #[test] + fn parse_csv_content_empty_input() { + let rows = parse_csv_content("", ','); + assert!(rows.is_empty()); + } + + #[test] + fn parse_csv_content_embedded_comma_in_quoted_field() { + // A comma inside a quoted cell is part of the cell, not a separator. + let rows = parse_csv_content("a,\"x,y\",b\n", ','); + assert_eq!(rows, vec![vec!["a", "x,y", "b"]]); + } + + #[test] + fn parse_csv_content_mixed_quoted_and_unquoted_in_same_row() { + // Real-world CSV mixes quoted and unquoted cells freely. The scanner + // must handle both in a single row. 
+ let rows = parse_csv_content("alice,\"engineer, sr.\",30,\"London\"\n", ','); + assert_eq!(rows, vec![vec!["alice", "engineer, sr.", "30", "London"]]); + } + + #[test] + fn parse_csv_content_empty_trailing_field() { + // A row ending with a separator means the last cell is empty. This is + // a common spreadsheet artefact ("alice,30," for a missing column). + let rows = parse_csv_content("a,b,\nc,d,\n", ','); + assert_eq!(rows, vec![vec!["a", "b", ""], vec!["c", "d", ""]]); + } + + #[test] + fn parse_csv_content_empty_field_in_middle() { + // "a,,b" produces ["a", "", "b"] -- three cells, the middle one empty. + let rows = parse_csv_content("a,,b\n", ','); + assert_eq!(rows, vec![vec!["a", "", "b"]]); + } + + #[test] + fn parse_csv_content_empty_quoted_field() { + // "" is the canonical empty quoted cell -- not a stray escape. + let rows = parse_csv_content("a,\"\",b\n", ','); + assert_eq!(rows, vec![vec!["a", "", "b"]]); + } + + #[test] + fn parse_csv_content_utf8_bom_preserved_in_first_cell() { + // We don't currently strip a leading UTF-8 BOM. Pin the current + // behaviour so future BOM handling (if added) is a deliberate change, + // not silent drift. The BOM is the three bytes EF BB BF (\u{feff}). + let rows = parse_csv_content("\u{feff}name,age\nalice,30\n", ','); assert_eq!(rows.len(), 2); - assert_eq!(rows[0], r#"he said "hello""#); - assert_eq!(rows[1], "world"); + assert_eq!(rows[0], vec!["\u{feff}name", "age"]); + assert_eq!(rows[1], vec!["alice", "30"]); } #[test] - fn parse_csv_row_simple_quoted() { - // plain quoted field (no escaped quotes) - let rows = parse_csv_row(r#""hello","world""#, ','); - assert_eq!(rows[0], "hello"); - assert_eq!(rows[1], "world"); + fn parse_csv_content_single_unterminated_quoted_field() { + // Malformed input: open quote with no close. The scanner should not + // panic and should still produce the partial cell so the user can + // see the broken data rather than getting a silent empty result. 
+ let rows = parse_csv_content("a,\"oops\n", ','); + assert_eq!(rows.len(), 1); + assert_eq!(rows[0], vec!["a", "oops\n"]); + } + + #[test] + fn parse_csv_content_round_trip_via_write_csv_tsv() { + // End-to-end: the canonical regression. Take a row with a multi-line + // cell, write it via write_csv_tsv, then parse_csv_content the result. + // The original cells should come back byte-for-byte. + let original = vec![ + Value::List(Arc::new(vec![ + Value::Text(Arc::new("name".to_string())), + Value::Text(Arc::new("note".to_string())), + Value::Text(Arc::new("n".to_string())), + ])), + Value::List(Arc::new(vec![ + Value::Text(Arc::new("plain".to_string())), + Value::Text(Arc::new("line\nbreak".to_string())), + Value::Number(2.0), + ])), + ]; + let serialised = write_csv_tsv(&original, ',').expect("write_csv_tsv failed"); + let rows = parse_csv_content(&serialised, ','); + assert_eq!(rows.len(), 2, "round-trip produced wrong row count"); + assert_eq!(rows[0], vec!["name", "note", "n"]); + assert_eq!(rows[1], vec!["plain", "line\nbreak", "2"]); } // L299: len on Map From 25f5c129e3485bd6ebcb0ae41908b34552149781 Mon Sep 17 00:00:00 2001 From: Daniel Morris Date: Tue, 19 May 2026 17:52:31 +0100 Subject: [PATCH 2/4] fix(vm): scan CSV/TSV in one pass to match interpreter scanner vm_parse_format had the same bug as the tree-walker: content.lines() ran before quote-state tracking, so a quoted multi-line cell came back as two rows. The VM is the default engine on 0.12.x so any csv-pipeline user hit this on every read. vm_parse_csv_content mirrors interpreter::parse_csv_content -- single pass, in_quotes state, \n and \r\n as record separators outside quotes, preserved inside. Both code paths now use the same algorithm so a write on one engine and a read on another stays byte-stable. 
--- src/vm/mod.rs | 52 +++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 38 insertions(+), 14 deletions(-) diff --git a/src/vm/mod.rs b/src/vm/mod.rs index 42f8ba96..f377fd09 100644 --- a/src/vm/mod.rs +++ b/src/vm/mod.rs @@ -12846,13 +12846,10 @@ fn vm_parse_format(fmt: &str, content: &str) -> Result { match fmt { "csv" | "tsv" => { let sep = if fmt == "tsv" { '\t' } else { ',' }; - let rows: Vec = content - .lines() - .map(|line| { - let fields: Vec = vm_parse_csv_row(line, sep) - .into_iter() - .map(NanVal::heap_string) - .collect(); + let rows: Vec = vm_parse_csv_content(content, sep) + .into_iter() + .map(|row| { + let fields: Vec = row.into_iter().map(NanVal::heap_string).collect(); NanVal::heap_list(fields) }) .collect(); @@ -12865,11 +12862,26 @@ fn vm_parse_format(fmt: &str, content: &str) -> Result { } } -fn vm_parse_csv_row(line: &str, sep: char) -> Vec { - let mut fields = Vec::new(); +/// Parse a full CSV/TSV document into rows of fields, RFC 4180 compliant. +/// +/// Mirrors `interpreter::parse_csv_content`: tracks quote state across +/// record separators so a quoted field containing an embedded newline is +/// preserved as a single cell. Accepts both `\n` and `\r\n` as record +/// terminators outside quotes; inside quotes they are kept verbatim. A +/// trailing newline does not produce an extra empty row. +/// +/// History: csv-pipeline rerun10. The VM's vm_parse_format previously +/// called `content.lines()` then handed each line to a per-line quote +/// parser, so any cell containing `\n` (which `write_csv_tsv` correctly +/// emits as a quoted multi-line field) was mis-read as two rows. The +/// tree-walker had the same bug in `parse_format`; both code paths now +/// use a single-pass scanner so cross-engine round-trip is byte-stable. 
+fn vm_parse_csv_content(content: &str, sep: char) -> Vec> { + let mut rows: Vec> = Vec::new(); + let mut row: Vec = Vec::new(); let mut field = String::new(); let mut in_quotes = false; - let mut chars = line.chars().peekable(); + let mut chars = content.chars().peekable(); while let Some(c) = chars.next() { if in_quotes { if c == '"' { @@ -12885,13 +12897,25 @@ fn vm_parse_csv_row(line: &str, sep: char) -> Vec { } else if c == '"' { in_quotes = true; } else if c == sep { - fields.push(std::mem::take(&mut field)); + row.push(std::mem::take(&mut field)); + } else if c == '\n' { + row.push(std::mem::take(&mut field)); + rows.push(std::mem::take(&mut row)); + } else if c == '\r' { + if chars.peek() == Some(&'\n') { + chars.next(); + } + row.push(std::mem::take(&mut field)); + rows.push(std::mem::take(&mut row)); } else { field.push(c); } } - fields.push(field); - fields + if !field.is_empty() || !row.is_empty() || in_quotes { + row.push(field); + rows.push(row); + } + rows } fn nanval_equal(a: NanVal, b: NanVal) -> bool { @@ -27206,7 +27230,7 @@ mod tests { assert_eq!(*inner, Value::Text(Arc::new("hello raw".to_string()))); } - // ── vm_parse_csv_row quoted fields (lines 4295-4306) ───────────────────── + // ── vm_parse_csv_content quoted fields (lines 4295-4306) ───────────────── // lines 4295-4306: OP_RD on .csv file with quoted fields (double-quote escaping) #[test] From 7cd42f3386103f8fec4f6cd87c71e6773ff81b2e Mon Sep 17 00:00:00 2001 From: Daniel Morris Date: Tue, 19 May 2026 17:52:38 +0100 Subject: [PATCH 3/4] test: cross-engine CSV multi-line round-trip regression + example tests/regression_csv_multiline_roundtrip.rs drives wr! then rd! across tree and VM. Pins the row count, the multi-line cell value, the quote+newline combined case, and the plain (no-newline) negative control so a future regression in either engine fails loudly. 
examples/csv-multiline-roundtrip.ilo gives agents an in-context example of the now-correct behaviour and rides the existing examples_engines harness so every engine runs it on each test pass. Covers a row-count round-trip, a single-cell length round-trip, an embedded-quote + embedded-newline cell, and a CRLF cell. --- examples/csv-multiline-roundtrip.ilo | 53 +++++++++ tests/regression_csv_multiline_roundtrip.rs | 117 ++++++++++++++++++++ 2 files changed, 170 insertions(+) create mode 100644 examples/csv-multiline-roundtrip.ilo create mode 100644 tests/regression_csv_multiline_roundtrip.rs diff --git a/examples/csv-multiline-roundtrip.ilo b/examples/csv-multiline-roundtrip.ilo new file mode 100644 index 00000000..fb6335d3 --- /dev/null +++ b/examples/csv-multiline-roundtrip.ilo @@ -0,0 +1,53 @@ +-- Multi-line quoted CSV fields round-trip cleanly. +-- The writer emits cells containing `\n` as quoted multi-line records per +-- RFC 4180; the reader tracks quote state across newlines so the same +-- file reads back with the original row count and cell value. +-- Regression for csv-pipeline rerun10. + +roundtrip>R n t; + path="/tmp/ilo_example_csv_multiline.csv"; + rows=[["name","note","n"],["plain","line\nbreak","2"]]; + wr! path rows "csv"; + back=rd! path "csv"; + ~len back + +celllen>R n t; + path="/tmp/ilo_example_csv_multiline_cell.csv"; + rows=[["a","line\nbreak","b"]]; + wr! path rows "csv"; + back=rd! path "csv"; + ~len (at back 0) + +-- An embedded double-quote inside a multi-line cell. The writer escapes +-- the inner quote as `""` per RFC 4180 and wraps the whole cell in `"`. +-- The reader keeps the cell intact across both the embedded `""` and `\n`. +-- We assert on cell length (16 chars: `he said "hi"\nfoo`) so the example +-- harness can compare a single-line stdout value. +quoteml>R n t; + path="/tmp/ilo_example_csv_quote_multiline.csv"; + rows=[["a","he said \"hi\"\nfoo","b"]]; + wr! path rows "csv"; + back=rd! 
path "csv"; + ~len (at (at back 0) 1) + +-- CRLF inside a quoted cell must be kept verbatim, not collapsed or split. +-- The writer emits LF row separators; this example constructs a CRLF cell +-- explicitly and round-trips it. +crlfcell>R n t; + path="/tmp/ilo_example_csv_crlf_cell.csv"; + rows=[["a","x\r\ny","b"]]; + wr! path rows "csv"; + back=rd! path "csv"; + ~len (at (at back 0) 1) + +-- run: roundtrip +-- out: 2 + +-- run: celllen +-- out: 3 + +-- run: quoteml +-- out: 16 + +-- run: crlfcell +-- out: 4 diff --git a/tests/regression_csv_multiline_roundtrip.rs b/tests/regression_csv_multiline_roundtrip.rs new file mode 100644 index 00000000..b6d78841 --- /dev/null +++ b/tests/regression_csv_multiline_roundtrip.rs @@ -0,0 +1,117 @@ +// Regression for csv-pipeline rerun10: +// +// `wr path data "csv"` correctly emits a quoted, multi-line field per +// RFC 4180. But `rd path "csv"` used to split content on `\n` before +// tracking quote state, so the embedded newline was treated as a record +// break and one logical row came back as two. +// +// Repro from the persona report: +// wrote 3 rows: [name,note,n], [Frame, Gamma,"has ""quote""",1], +// [plain,"line\nbreak",2] +// re-read got 4 rows: [name,note,n], [Frame, Gamma,has "quote",1], +// [plain,line], [break,2] +// +// The fix replaces the line-by-line approach in `parse_format` with a +// single-pass quote-aware scanner (`parse_csv_content`). These tests pin +// the round-trip across every available engine — if any engine regresses, +// or a future change drifts the writer and reader out of step, this fails. 
+ +use std::process::Command; + +fn ilo() -> Command { + Command::new(env!("CARGO_BIN_EXE_ilo")) +} + +fn run_ok(engine: &str, src: &str, entry: &str) -> String { + let out = ilo() + .args([src, engine, entry]) + .output() + .expect("failed to run ilo"); + assert!( + out.status.success(), + "ilo {engine} failed for `{src}`: stderr={}", + String::from_utf8_lossy(&out.stderr) + ); + String::from_utf8_lossy(&out.stdout).trim().to_string() +} + +fn engines() -> &'static [&'static str] { + &["--run-tree", "--run-vm"] +} + +// Canonical regression: write a row with an embedded newline, then read +// the file back as csv. The row count must survive the round-trip. +#[test] +fn csv_multiline_quoted_field_round_trip_row_count() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_csv_ml_rt_count_{i}.csv"); + let _ = std::fs::remove_file(&path); + // Two rows: a header and a body row whose middle cell contains a + // literal newline. Entry returns the row count read back. + let src = format!( + r#"f>R n t;wr! "{path}" [["name","note","n"],["plain","line\nbreak","2"]] "csv";rows=rd! "{path}" "csv";~len rows"# + ); + let got = run_ok(engine, &src, "f"); + assert_eq!( + got, "2", + "engine={engine}: round-trip row count drifted (writer emitted multi-line quoted cell, reader split it)" + ); + let _ = std::fs::remove_file(&path); + } +} + +// The embedded-newline cell must come back as a single cell with the +// `\n` preserved, not as two cells across two rows. +#[test] +fn csv_multiline_quoted_field_round_trip_cell_value() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_csv_ml_rt_cell_{i}.csv"); + let _ = std::fs::remove_file(&path); + // After read-back, rows[1][1] should be "line\nbreak". + let src = format!( + r#"f>R t t;wr! "{path}" [["name","note","n"],["plain","line\nbreak","2"]] "csv";rows=rd! 
"{path}" "csv";~at (at rows 1) 1"# + ); + let got = run_ok(engine, &src, "f"); + assert_eq!( + got, "line\nbreak", + "engine={engine}: multi-line cell did not round-trip verbatim" + ); + let _ = std::fs::remove_file(&path); + } +} + +// Combined edge case: a single cell with BOTH an embedded newline AND an +// escaped double-quote. This exercises quote-state tracking across the +// embedded `""` and the embedded `\n`. +#[test] +fn csv_multiline_with_escaped_quote_round_trip() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_csv_ml_rt_qn_{i}.csv"); + let _ = std::fs::remove_file(&path); + let src = format!( + r#"f>R t t;wr! "{path}" [["a","he said \"hi\"\nfoo","b"]] "csv";rows=rd! "{path}" "csv";~at (at rows 0) 1"# + ); + let got = run_ok(engine, &src, "f"); + assert_eq!( + got, "he said \"hi\"\nfoo", + "engine={engine}: quote+newline cell did not round-trip" + ); + let _ = std::fs::remove_file(&path); + } +} + +// Negative control: round-trip on a CSV with NO embedded newlines must +// still produce the same number of rows as before the fix. +#[test] +fn csv_plain_round_trip_unchanged() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_csv_plain_rt_{i}.csv"); + let _ = std::fs::remove_file(&path); + let src = format!( + r#"f>R n t;wr! "{path}" [["name","n"],["alice","1"],["bob","2"]] "csv";rows=rd! 
"{path}" "csv";~len rows"# + ); + let got = run_ok(engine, &src, "f"); + assert_eq!(got, "3", "engine={engine}: plain csv row count regressed"); + let _ = std::fs::remove_file(&path); + } +} From d6ece14a4a1cf7fb4dcac6751fdace745d37bf54 Mon Sep 17 00:00:00 2001 From: Daniel Morris Date: Tue, 19 May 2026 17:52:42 +0100 Subject: [PATCH 4/4] docs: changelog 0.12.1 entry for CSV multi-line round-trip fix --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f8a0f130..1881d27c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ - `ls dir` renamed to `lsd dir`. Six rerun10 personas tripped ILO-P011 on `ls=rdl! p` because `ls` was reserved; rename frees `ls` for user code. `walk`, `glob` unchanged. +### Fixed + +- CSV/TSV reader now tracks quote state across record separators. A cell containing `\n` (which the writer correctly emits as a quoted multi-line field per RFC 4180) used to be re-parsed as two rows, so `rd path "csv"` silently disagreed with `wr path data "csv"`. The reader is now a single-pass scanner over the whole document and round-trips multi-line quoted fields, embedded quotes, and CRLF line endings byte-stably across tree and VM. Surfaced by csv-pipeline rerun10. + ## 0.12.0 - 2026-05-19 ### Breaking