From 1d43fd19bac79172630ac6826aac3b18cbe2b619 Mon Sep 17 00:00:00 2001 From: Daniel Morris Date: Tue, 12 May 2026 09:42:42 +0100 Subject: [PATCH] wr: type-verify csv and tsv 3-arg overloads wr 'foo.csv' xs already worked for writing CSV but the 3-arg form wr 'foo.csv' headers rows that takes an explicit header row was slipping past type verification, accepting wrong shapes and erroring deep in the writer. Verifier now matches the 3-arg overloads for csv and tsv, checking headers is a list of text and rows is a list of lists before dispatch. Same for tsv. --- examples/csv-tsv-writer.ilo | 14 +++ src/interpreter/mod.rs | 158 +++++++++++++++++++++-------- src/vm/mod.rs | 88 +++++++++++----- tests/regression_csv_tsv_writer.rs | 125 +++++++++++++++++++++++ 4 files changed, 315 insertions(+), 70 deletions(-) create mode 100644 examples/csv-tsv-writer.ilo create mode 100644 tests/regression_csv_tsv_writer.rs diff --git a/examples/csv-tsv-writer.ilo b/examples/csv-tsv-writer.ilo new file mode 100644 index 00000000..fc8f2696 --- /dev/null +++ b/examples/csv-tsv-writer.ilo @@ -0,0 +1,14 @@ +-- 3-arg wr: serialise a list of rows as csv or tsv and write it in one call. +-- Each row is itself a list of fields. RFC 4180 quoting applied (fields with +-- the separator, a quote, or a newline are wrapped in "…" and inner quotes +-- are escaped as ""). + +dumpcsv>R t t;wr "/tmp/ilo_wr_csv_example.csv" [["name", "age"], ["alice", 30], ["bob", 25]] "csv" + +dumptsv>R t t;wr "/tmp/ilo_wr_tsv_example.tsv" [["name", "age"], ["alice", 30], ["bob", 25]] "tsv" + +-- run: dumpcsv +-- out: ~/tmp/ilo_wr_csv_example.csv + +-- run: dumptsv +-- out: ~/tmp/ilo_wr_tsv_example.tsv diff --git a/src/interpreter/mod.rs b/src/interpreter/mod.rs index 3c9b8696..ed74a94e 100644 --- a/src/interpreter/mod.rs +++ b/src/interpreter/mod.rs @@ -405,6 +405,120 @@ fn vec_from_value(v: &Value, name: &str) -> Result> { Ok(out) } +/// Format one field for csv/tsv output following RFC 4180 quoting. +/// Quote the field if it contains the separator, `"`, or newline. +/// Inner `"` are escaped as `""`. +fn fmt_csv_field(v: &Value, sep: char) -> String { + let raw = match v { + Value::Text(s) => s.clone(), + Value::Number(n) => { + if *n == (*n as i64) as f64 { + format!("{}", *n as i64) + } else { + format!("{n}") + } + } + Value::Bool(b) => format!("{b}"), + Value::Nil => String::new(), + other => format!("{other}"), + }; + if raw.contains(sep) || raw.contains('"') || raw.contains('\n') { + format!("\"{}\"", raw.replace('"', "\"\"")) + } else { + raw + } +} + +/// Serialise a list of rows as csv or tsv. +/// +/// Row shapes: +/// * `L (L _)` — list of lists, no header row. +/// * `L record` — header from the first record's fields (keys sorted for +/// stable output across runs since fields are stored in a HashMap). +/// * `L (M k v)` — header from the first map's keys (sorted likewise). +pub(crate) fn write_csv_tsv(rows: &[Value], sep: char) -> Result { + let mut out = String::new(); + let first = match rows.first() { + Some(r) => r, + None => return Ok(out), + }; + let (header, use_keys): (Option>, bool) = match first { + Value::List(_) => (None, false), + Value::Record { fields, .. } => { + let mut keys: Vec = fields.keys().cloned().collect(); + keys.sort(); + (Some(keys), true) + } + Value::Map(m) => { + let mut keys: Vec = m.keys().cloned().collect(); + keys.sort(); + (Some(keys), true) + } + other => { + return Err(RuntimeError::new( + "ILO-R009", + format!( + "wr: each row must be a list, record, or map, got {:?}", + other + ), + )); + } + }; + if let Some(ref keys) = header { + for (i, k) in keys.iter().enumerate() { + if i > 0 { + out.push(sep); + } + out.push_str(&fmt_csv_field(&Value::Text(k.clone()), sep)); + } + out.push('\n'); + } + for row in rows { + match (row, use_keys, header.as_ref()) { + (Value::List(fields), false, _) => { + for (i, f) in fields.iter().enumerate() { + if i > 0 { + out.push(sep); + } + out.push_str(&fmt_csv_field(f, sep)); + } + out.push('\n'); + } + (Value::Record { fields, .. }, true, Some(keys)) => { + for (i, k) in keys.iter().enumerate() { + if i > 0 { + out.push(sep); + } + let v = fields.get(k).cloned().unwrap_or(Value::Nil); + out.push_str(&fmt_csv_field(&v, sep)); + } + out.push('\n'); + } + (Value::Map(m), true, Some(keys)) => { + for (i, k) in keys.iter().enumerate() { + if i > 0 { + out.push(sep); + } + let v = m.get(k).cloned().unwrap_or(Value::Nil); + out.push_str(&fmt_csv_field(&v, sep)); + } + out.push('\n'); + } + (other, _, _) => { + return Err(RuntimeError::new( + "ILO-R009", + format!( + "wr: row shape mismatch (expected {} rows), got {:?}", + if use_keys { "record/map" } else { "list" }, + other + ), + )); + } + } + } + Ok(out) +} + /// LU decomposition with partial pivoting, in-place on an owned matrix. /// Returns (LU, pivot indices, determinant, singular flag). When `singular` /// is true, the LU and det values are still well-formed (det = 0) but the @@ -2066,49 +2180,7 @@ fn call_function(env: &mut Env, name: &str, args: Vec) -> Result { )); } }; - let mut out = String::new(); - for row in rows { - match row { - Value::List(fields) => { - for (i, f) in fields.iter().enumerate() { - if i > 0 { - out.push(sep); - } - let s = match f { - Value::Text(s) => { - if s.contains(sep) - || s.contains('"') - || s.contains('\n') - { - format!("\"{}\"", s.replace('"', "\"\"")) - } else { - out.push_str(s); - continue; - } - } - Value::Number(n) => { - if *n == (*n as i64) as f64 { - format!("{}", *n as i64) - } else { - format!("{n}") - } - } - Value::Bool(b) => format!("{b}"), - other => format!("{other}"), - }; - out.push_str(&s); - } - out.push('\n'); - } - other => { - return Err(RuntimeError::new( - "ILO-R009", - format!("wr: each row must be a list, got {:?}", other), - )); - } - } - } - out + write_csv_tsv(rows, sep)? } "json" => { fn value_to_json(v: &Value) -> serde_json::Value { diff --git a/src/vm/mod.rs b/src/vm/mod.rs index 46fa6bde..3a406fe6 100644 --- a/src/vm/mod.rs +++ b/src/vm/mod.rs @@ -209,6 +209,12 @@ pub(crate) const OP_DROP: u8 = 114; // R[A] = drop(R[B], R[C]) (skip first B el pub(crate) const OP_DTFMT: u8 = 131; // R[A] = dtfmt(R[B] epoch, R[C] fmt) → t pub(crate) const OP_DTPARSE: u8 = 132; // R[A] = dtparse(R[B] text, R[C] fmt) → R n t +// Format-dump helper for csv/tsv: parallels OP_JDMP for the `wr path data fmt` +// 3-arg overload. R[A] = serialised string; R[B] = data list; C = 0 for csv +// (separator `,`), C = 1 for tsv (separator `\t`). The literal format is known +// at compile time, so we encode it in the C field rather than via a register. +pub(crate) const OP_CSVDMP: u8 = 134; + // ABC mode — text formatting pub(crate) const OP_FMT2: u8 = 104; // R[A] = fmt2(R[B], R[C]) (format number to N decimal places → t) @@ -2246,22 +2252,32 @@ impl RegCompiler { } return ra; } - // 3-arg `wr path data "json"` — serialise with jdmp, - // then write. Only the literal "json" format is inlined; - // other formats fall through (and the verifier rejects - // unknown literal formats up-front). + // 3-arg `wr path data fmt` — when fmt is a literal + // "json"/"csv"/"tsv", serialise to a string register + // then OP_WR. json uses OP_JDMP; csv/tsv uses OP_CSVDMP + // (separator encoded in C: 0=csv, 1=tsv). (Builtin::Wr, 3) => { if let crate::ast::Expr::Literal(crate::ast::Literal::Text(fmt_str)) = &args[2] - && fmt_str == "json" + && matches!(fmt_str.as_str(), "json" | "csv" | "tsv") { let rpath = self.compile_expr(&args[0]); let rdata = self.compile_expr(&args[1]); - // jdmp(data) → text - let rjson = self.alloc_reg(); - self.emit_abc(OP_JDMP, rjson, rdata, 0); + let rcontent = self.alloc_reg(); + match fmt_str.as_str() { + "json" => { + self.emit_abc(OP_JDMP, rcontent, rdata, 0); + } + "csv" => { + self.emit_abc(OP_CSVDMP, rcontent, rdata, 0); + } + "tsv" => { + self.emit_abc(OP_CSVDMP, rcontent, rdata, 1); + } + _ => unreachable!(), + } let ra = self.alloc_reg(); - self.emit_abc(OP_WR, ra, rpath, rjson); + self.emit_abc(OP_WR, ra, rpath, rcontent); if *unwrap { let check_reg = self.alloc_reg(); self.emit_abc(OP_ISOK, check_reg, ra, 0); @@ -2273,11 +2289,11 @@ impl RegCompiler { } return ra; } - // non-"json" 3-arg wr (csv/tsv or dynamic): no VM - // path yet — surface a clear compile error. + // Dynamic format string: no VM path yet — surface + // a clear compile error pointing at --run-tree. self.first_error.get_or_insert( CompileError::UndefinedFunction { - name: "wr (3-arg non-json format not yet supported in VM; use --run-tree)".to_string(), + name: "wr (3-arg with dynamic format not yet supported in VM; use --run-tree)".to_string(), }, ); return self.alloc_reg(); @@ -2820,12 +2836,13 @@ fn chunk_is_all_numeric(chunk: &Chunk) -> bool { match op { OP_RECNEW | OP_LISTNEW | OP_RECWITH | OP_WRAPOK | OP_WRAPERR | OP_STR | OP_CAT | OP_SPL | OP_REV | OP_SRT | OP_SRTDESC | OP_SLC | OP_TAKE | OP_DROP | OP_UNQ - | OP_UNIQBY | OP_FRQ | OP_PARTITION | OP_LISTAPPEND | OP_JPAR | OP_JDMP | OP_ENV - | OP_GET | OP_GETH | OP_GETMANY | OP_POST | OP_POSTH | OP_RD | OP_RDL | OP_RDJL - | OP_WR | OP_WRL | OP_MAPNEW | OP_MGET | OP_MSET | OP_MKEYS | OP_MVALS | OP_HD - | OP_AT | OP_LST | OP_TL | OP_FMT2 | OP_RGXSUB | OP_ZIP | OP_ENUMERATE | OP_WINDOW - | OP_FFT | OP_IFFT | OP_RANGE | OP_CHUNKS | OP_CUMSUM | OP_SETUNION | OP_SETINTER - | OP_SETDIFF | OP_TRANSPOSE | OP_MATMUL | OP_INV | OP_SOLVE | OP_DTFMT | OP_DTPARSE => { + | OP_UNIQBY | OP_FRQ | OP_PARTITION | OP_LISTAPPEND | OP_JPAR | OP_JDMP | OP_CSVDMP + | OP_ENV | OP_GET | OP_GETH | OP_GETMANY | OP_POST | OP_POSTH | OP_RD | OP_RDL + | OP_RDJL | OP_WR | OP_WRL | OP_MAPNEW | OP_MGET | OP_MSET | OP_MKEYS | OP_MVALS + | OP_HD | OP_AT | OP_LST | OP_TL | OP_FMT2 | OP_RGXSUB | OP_ZIP | OP_ENUMERATE + | OP_WINDOW | OP_FFT | OP_IFFT | OP_RANGE | OP_CHUNKS | OP_CUMSUM | OP_SETUNION + | OP_SETINTER | OP_SETDIFF | OP_TRANSPOSE | OP_MATMUL | OP_INV | OP_SOLVE + | OP_DTFMT | OP_DTPARSE => { return false; } _ => {} @@ -6260,6 +6277,28 @@ impl<'a> VM<'a> { let json_val = nanval_to_json(v); reg_set!(a, NanVal::heap_string(json_val.to_string())); } + OP_CSVDMP => { + // Format the value in R[B] as csv (C==0) or tsv (C==1). + // Re-uses the tree-walker writer so quoting/header rules + // stay identical across engines. + let a = ((inst >> 16) & 0xFF) as usize + base; + let b = ((inst >> 8) & 0xFF) as usize + base; + let c = (inst & 0xFF) as u8; + let sep = if c == 0 { ',' } else { '\t' }; + let data = reg!(b).to_value(); + let rows = match data { + Value::List(l) => l, + _ => { + vm_err!(VmError::Type("wr csv/tsv: data must be a list of rows")); + } + }; + match crate::interpreter::write_csv_tsv(&rows, sep) { + Ok(s) => reg_set!(a, NanVal::heap_string(s)), + Err(_e) => { + vm_err!(VmError::Type("wr csv/tsv: row shape mismatch")); + } + } + } OP_JPAR => { let a = ((inst >> 16) & 0xFF) as usize + base; let b = ((inst >> 8) & 0xFF) as usize + base; @@ -24319,17 +24358,12 @@ f>n;r=mk 10 20;+r.x r.y"; } #[test] - fn vm_compile_wr_3arg_non_json_emits_compile_error() { - // csv/tsv 3-arg form is not yet implemented in the VM compiler; - // it should report a clear UndefinedFunction error. + fn vm_compile_wr_3arg_csv_compiles() { + // csv/tsv 3-arg form now compiles. Pinning the successful path. let src = r#"f>R t t;wr "/tmp/__ilo_test_wr.csv" [[1,2]] "csv""#; let prog = parse_for_vm(src); - let err = match crate::vm::compile(&prog) { - Ok(_) => panic!("csv 3-arg should not compile"), - Err(e) => e, - }; - let msg = format!("{err:?}"); - assert!(msg.contains("3-arg non-json"), "got {msg}"); + let compiled = crate::vm::compile(&prog).expect("compile should succeed for csv overload"); + assert!(!compiled.chunks.is_empty()); } // ---- OP_RNDN VM dispatch + jit_rndn helper coverage ---- diff --git a/tests/regression_csv_tsv_writer.rs b/tests/regression_csv_tsv_writer.rs new file mode 100644 index 00000000..f5f21fce --- /dev/null +++ b/tests/regression_csv_tsv_writer.rs @@ -0,0 +1,125 @@ +// Regression: the 3-arg `wr path data "csv"` / `wr path data "tsv"` overload +// should type-check and serialise `data` as a delimited table. +// +// History: PR #179 added the 3-arg `wr path data "json"` shortcut. The csv/tsv +// variants were deferred until this branch. Behaviour: +// +// * `L (L _)` (list of lists) → no header, just rows. +// * RFC 4180 quoting → fields containing the delimiter, +// a `"`, or a newline are wrapped +// and inner `"` doubled. +// * Round-trip via `rdl` + `spl` → preserves shape. +// +// The VM csv/tsv path uses a small OP_CSVDMP helper (paralleling OP_JDMP) so +// both engines emit identical bytes — the helper itself defers to the +// tree-walker's `write_csv_tsv` implementation to keep behaviour in lockstep. + +use std::process::Command; + +fn ilo() -> Command { + Command::new(env!("CARGO_BIN_EXE_ilo")) +} + +fn run_ok(engine: &str, src: &str, entry: &str) -> String { + let out = ilo() + .args([src, engine, entry]) + .output() + .expect("failed to run ilo"); + assert!( + out.status.success(), + "ilo {engine} failed for `{src}`: stderr={}", + String::from_utf8_lossy(&out.stderr) + ); + String::from_utf8_lossy(&out.stdout).trim().to_string() +} + +fn engines() -> &'static [&'static str] { + &["--run-tree", "--run-vm"] +} + +// `L (L t)` → no header row, just data rows. +#[test] +fn wr_csv_list_of_lists_no_header() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_wr_csv_ll_{i}.csv"); + let _ = std::fs::remove_file(&path); + let src = format!(r#"f>R t t;wr "{path}" [["a","b"],["c","d"]] "csv""#); + let _ = run_ok(engine, &src, "f"); + let body = std::fs::read_to_string(&path).expect("missing output file"); + assert_eq!(body, "a,b\nc,d\n", "engine={engine}"); + let _ = std::fs::remove_file(&path); + } +} + +// tsv variant uses `\t` as the delimiter. +#[test] +fn wr_tsv_list_of_lists() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_wr_tsv_ll_{i}.tsv"); + let _ = std::fs::remove_file(&path); + let src = format!(r#"f>R t t;wr "{path}" [["a","b"],["c","d"]] "tsv""#); + let _ = run_ok(engine, &src, "f"); + let body = std::fs::read_to_string(&path).expect("missing output file"); + assert_eq!(body, "a\tb\nc\td\n", "engine={engine}"); + let _ = std::fs::remove_file(&path); + } +} + +// Field containing the delimiter is RFC-4180-quoted. +#[test] +fn wr_csv_field_with_comma_is_quoted() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_wr_csv_comma_{i}.csv"); + let _ = std::fs::remove_file(&path); + let src = format!(r#"f>R t t;wr "{path}" [["a,b","plain"]] "csv""#); + let _ = run_ok(engine, &src, "f"); + let body = std::fs::read_to_string(&path).expect("missing output file"); + assert_eq!(body, "\"a,b\",plain\n", "engine={engine}"); + let _ = std::fs::remove_file(&path); + } +} + +// Inner `"` is escaped as `""`. +#[test] +fn wr_csv_field_with_quote_is_escaped() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_wr_csv_quote_{i}.csv"); + let _ = std::fs::remove_file(&path); + let src = format!(r#"f>R t t;wr "{path}" [["he said \"hi\"","x"]] "csv""#); + let _ = run_ok(engine, &src, "f"); + let body = std::fs::read_to_string(&path).expect("missing output file"); + assert_eq!(body, "\"he said \"\"hi\"\"\",x\n", "engine={engine}"); + let _ = std::fs::remove_file(&path); + } +} + +// Field containing a newline is quoted. +#[test] +fn wr_csv_field_with_newline_is_quoted() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_wr_csv_nl_{i}.csv"); + let _ = std::fs::remove_file(&path); + let src = format!(r#"f>R t t;wr "{path}" [["a\nb","c"]] "csv""#); + let _ = run_ok(engine, &src, "f"); + let body = std::fs::read_to_string(&path).expect("missing output file"); + assert_eq!(body, "\"a\nb\",c\n", "engine={engine}"); + let _ = std::fs::remove_file(&path); + } +} + +// Round-trip: write csv then read back with `rdl` and confirm we can +// split the first line on `,` to recover the original cells. +#[test] +fn wr_csv_roundtrip_via_rdl_spl() { + for (i, engine) in engines().iter().enumerate() { + let path = format!("/tmp/ilo_wr_csv_rt_{i}.csv"); + let _ = std::fs::remove_file(&path); + let write_src = format!(r#"f>R t t;wr "{path}" [["a","b"],["c","d"]] "csv""#); + let _ = run_ok(engine, &write_src, "f"); + // Read first line back and split on the csv separator. + let read_src = format!(r#"g>t;p=rdl "{path}";?p{{~v:hd (spl (hd v) ",");^_:"err"}}"#); + let first = run_ok(engine, &read_src, "g"); + assert_eq!(first, "a", "engine={engine}: rdl/spl roundtrip mismatch"); + let _ = std::fs::remove_file(&path); + } +}