From e48584e946f3f6ea1cf909b8a5ec3ebedb5d8856 Mon Sep 17 00:00:00 2001
From: "Jonathan D.A. Jewell" <6759885+hyperpolymath@users.noreply.github.com>
Date: Sat, 30 May 2026 15:07:25 +0100
Subject: [PATCH] fix(codegen-js): emit \uXXXX/\u{X} for non-ASCII (closes #460)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OCaml's `String.escaped` emits non-ASCII bytes as `\NNN` *decimal*
sequences. JavaScript parses `\NNN` as *octal* escapes which strict-mode
ESM rejects outright (`SyntaxError: Octal escape sequences are not
allowed in strict mode`), and which would decode to wrong characters
even outside strict mode.

Adds `Js_codegen.js_string_lit` that walks the UTF-8 byte sequence,
decodes code points, and emits `\uXXXX` (BMP) or `\u{XXXXX}` (non-BMP)
Unicode escapes. ASCII printable bytes pass through unchanged; `\\` `\"`
`\n` `\r` `\t` use conventional escapes; ASCII control bytes use `\xHH`.

Wired into both `js_codegen.ml` (Node target) and `codegen_deno.ml`
(Deno-ESM target) LitString/LitChar emit sites.

Regression fixture `tests/codegen-deno/non_ascii.affine` + harness
exercise BMP emoji (❌ ✓), CJK (你好), Latin accents (café résumé),
non-BMP code points (😭 = U+1F62D), mixed strings, and the
existing-escape regression path (\\ and \"). Pre-fix: harness `import`
itself fails with SyntaxError. Post-fix: 8/8 assertions pass.

Verified: full `tools/run_codegen_deno_tests.sh` (13/13 harnesses
green); full `dune test` suite (352/352 green).

Closes #460
Refs hyperpolymath/standards#284 (the seam-analyst PR that surfaced this)

Co-Authored-By: Claude Opus 4.7 (1M context)
---
Review note (this section is ignored by `git am`): the js_string_lit
hunk below was revised to validate UTF-8 continuation bytes and to cap
decoded values at U+10FFFF. Hunk line counts and the diffstat are
updated to match; the post-image blob id on the lib/js_codegen.ml
`index` line is stale and should be regenerated. Leading whitespace on
context lines was reconstructed, so apply with `--ignore-whitespace` if
the context does not match.

 lib/codegen_deno.ml                      |  4 +-
 lib/js_codegen.ml                        | 77 +++++++++++++++++++++++-
 tests/codegen-deno/non_ascii.affine      | 17 +++++++
 tests/codegen-deno/non_ascii.harness.mjs | 28 +++++++++++
 4 files changed, 122 insertions(+), 4 deletions(-)
 create mode 100644 tests/codegen-deno/non_ascii.affine
 create mode 100644 tests/codegen-deno/non_ascii.harness.mjs

diff --git a/lib/codegen_deno.ml b/lib/codegen_deno.ml
index 6bf77afe..1ad5dbaa 100644
--- a/lib/codegen_deno.ml
+++ b/lib/codegen_deno.ml
@@ -721,8 +721,8 @@ and gen_literal (lit : literal) : string =
     if String.length s > 0 && s.[String.length s - 1] = '.' then s ^ "0" else s
   | LitBool (true, _) -> "true"
   | LitBool (false, _) -> "false"
-  | LitString (s, _) -> "\"" ^ String.escaped s ^ "\""
-  | LitChar (c, _) -> "\"" ^ Char.escaped c ^ "\""
+  | LitString (s, _) -> Js_codegen.js_string_lit s
+  | LitChar (c, _) -> Js_codegen.js_string_lit (String.make 1 c)
   | LitUnit _ -> "Unit"
 
 and gen_pattern ctx (pat : pattern) : string =
diff --git a/lib/js_codegen.ml b/lib/js_codegen.ml
index eea6e3ff..606b3ea3 100644
--- a/lib/js_codegen.ml
+++ b/lib/js_codegen.ml
@@ -101,6 +101,79 @@ let mangle (name : string) : string =
   if List.mem name js_reserved then name ^ "_"
   else name
 
+(** [js_string_lit s] lowers the UTF-8 byte string [s] to a JS
+    double-quoted literal that is safe under strict-mode ESM.
+
+    OCaml's [String.escaped] emits non-ASCII bytes as [\NNN] *decimal*
+    sequences; JavaScript parses [\NNN] as *octal* escapes which strict
+    mode rejects ([SyntaxError: Octal escape sequences are not allowed
+    in strict mode]) and which would decode to wrong characters even
+    outside strict mode. This helper instead decodes the UTF-8 byte
+    sequence to code points and emits [\uXXXX] (BMP) or [\u{XXXXX}]
+    (non-BMP) Unicode escapes — accepted everywhere, no parser-mode
+    surprises, and preserves the original character. Closes #460.
+
+    Malformed input (a stray continuation byte, a lead byte in
+    [0xF8]-[0xFF], a truncated sequence, a trailing byte that is not a
+    continuation byte, or a decoded value above U+10FFFF) is emitted one
+    byte at a time as [\u00XX]: no following byte is swallowed, and no
+    [\u{...}] escape exceeds U+10FFFF, which JavaScript rejects as a
+    syntax error. Overlong forms and surrogate code points are not
+    rejected; they still lower to escapes JavaScript accepts. *)
+let js_string_lit (s : string) : string =
+  let n = String.length s in
+  let buf = Buffer.create (n + 8) in
+  (* [cont k] is the 6-bit payload of the byte at index [k] when that
+     byte is a continuation byte (10xxxxxx), and [-1] when it is not or
+     when [k] is past the end of [s]. *)
+  let cont k =
+    if k >= n then -1
+    else
+      let b = Char.code s.[k] in
+      if b land 0xC0 = 0x80 then b land 0x3F else -1
+  in
+  (* [decode i b0] decodes the sequence led by the non-ASCII byte [b0]
+     at index [i] and returns [(code_point, bytes_consumed)]. Anything
+     malformed degrades to the lone lead byte. *)
+  let decode i b0 =
+    let c1 = cont (i + 1) and c2 = cont (i + 2) and c3 = cont (i + 3) in
+    let malformed = (b0, 1) in
+    if b0 < 0xC0 || b0 >= 0xF8 || c1 < 0 then malformed
+    else if b0 < 0xE0 then (((b0 land 0x1F) lsl 6) lor c1, 2)
+    else if c2 < 0 then malformed
+    else if b0 < 0xF0 then
+      (((b0 land 0x0F) lsl 12) lor (c1 lsl 6) lor c2, 3)
+    else if c3 < 0 then malformed
+    else
+      let cp =
+        ((b0 land 0x07) lsl 18) lor (c1 lsl 12) lor (c2 lsl 6) lor c3
+      in
+      if cp <= 0x10FFFF then (cp, 4) else malformed
+  in
+  Buffer.add_char buf '"';
+  let i = ref 0 in
+  while !i < n do
+    let b0 = Char.code s.[!i] in
+    if b0 < 0x80 then begin
+      (match s.[!i] with
+       | '\\' -> Buffer.add_string buf "\\\\"
+       | '"' -> Buffer.add_string buf "\\\""
+       | '\n' -> Buffer.add_string buf "\\n"
+       | '\r' -> Buffer.add_string buf "\\r"
+       | '\t' -> Buffer.add_string buf "\\t"
+       | ' ' .. '~' as c -> Buffer.add_char buf c
+       | _ -> Printf.bprintf buf "\\x%02X" b0);
+      incr i
+    end else begin
+      let cp, len = decode !i b0 in
+      if cp <= 0xFFFF then Printf.bprintf buf "\\u%04X" cp
+      else Printf.bprintf buf "\\u{%X}" cp;
+      i := !i + len
+    end
+  done;
+  Buffer.add_char buf '"';
+  Buffer.contents buf
+
 (* ============================================================================
    Expression Code Generation
    ============================================================================ *)
@@ -230,8 +303,8 @@ and gen_literal (lit : literal) : string =
     if String.length s > 0 && s.[String.length s - 1] = '.' then s ^ "0" else s
   | LitBool (true, _) -> "true"
   | LitBool (false, _) -> "false"
-  | LitString (s, _) -> "\"" ^ String.escaped s ^ "\""
-  | LitChar (c, _) -> "\"" ^ Char.escaped c ^ "\""
+  | LitString (s, _) -> js_string_lit s
+  | LitChar (c, _) -> js_string_lit (String.make 1 c)
   | LitUnit _ -> "Unit"
 
 and gen_pattern ctx (pat : pattern) : string =
diff --git a/tests/codegen-deno/non_ascii.affine b/tests/codegen-deno/non_ascii.affine
new file mode 100644
index 00000000..a341cb48
--- /dev/null
+++ b/tests/codegen-deno/non_ascii.affine
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: MPL-2.0
+// issue #460 — non-ASCII string literals must round-trip under
+// strict-mode ESM. Pre-fix, the JS codegen used OCaml `String.escaped`
+// which emitted `\NNN` decimal sequences; the JS parser reads `\NNN`
+// as OCTAL escapes, which strict-mode ESM rejects with
+// `SyntaxError: Octal escape sequences are not allowed in strict mode`.
+// Post-fix, non-ASCII bytes lower to `\uXXXX` / `\u{XXXXX}` Unicode
+// escapes which all JS parser modes accept.
+
+pub fn emoji_cross() -> String { return "❌"; }
+pub fn emoji_check() -> String { return "✓"; }
+pub fn cjk_hello() -> String { return "你好"; }
+pub fn latin_accent() -> String { return "café résumé"; }
+pub fn non_bmp_sob() -> String { return "😭"; }
+pub fn mixed() -> String { return "[OK] café 你好 ❌"; }
+pub fn ascii_only() -> String { return "plain ASCII"; }
+pub fn quotes_and_backslash() -> String { return "\"escaped\" and \\back"; }
diff --git a/tests/codegen-deno/non_ascii.harness.mjs b/tests/codegen-deno/non_ascii.harness.mjs
new file mode 100644
index 00000000..a36c8b40
--- /dev/null
+++ b/tests/codegen-deno/non_ascii.harness.mjs
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MPL-2.0
+// issue #460 — round-trip non-ASCII string literals through the
+// Deno-ESM backend under strict-mode ESM. The `import` itself is the
+// strictest test: if the emitted `.deno.js` contains octal escapes,
+// the module fails to parse and the import throws SyntaxError before
+// any assertion can run.
+import assert from "node:assert/strict";
+import {
+  emoji_cross,
+  emoji_check,
+  cjk_hello,
+  latin_accent,
+  non_bmp_sob,
+  mixed,
+  ascii_only,
+  quotes_and_backslash,
+} from "./non_ascii.deno.js";
+
+assert.equal(emoji_cross(), "❌", "BMP emoji ❌ round-trips");
+assert.equal(emoji_check(), "✓", "BMP check mark ✓ round-trips");
+assert.equal(cjk_hello(), "你好", "CJK 'nihao' round-trips");
+assert.equal(latin_accent(), "café résumé", "Latin accented round-trips");
+assert.equal(non_bmp_sob(), "\u{1F62D}", "non-BMP code point round-trips");
+assert.equal(mixed(), "[OK] café 你好 ❌", "mixed ASCII+non-ASCII round-trips");
+assert.equal(ascii_only(), "plain ASCII", "ASCII-only unchanged");
+assert.equal(quotes_and_backslash(), "\"escaped\" and \\back", "quote+backslash escapes preserved");
+
+console.log("non_ascii.harness.mjs OK");