From e48584e946f3f6ea1cf909b8a5ec3ebedb5d8856 Mon Sep 17 00:00:00 2001
From: "Jonathan D.A. Jewell" <6759885+hyperpolymath@users.noreply.github.com>
Date: Sat, 30 May 2026 15:07:25 +0100
Subject: [PATCH] fix(codegen-js): emit \uXXXX/\u{X} for non-ASCII (closes
 #460)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OCaml's `String.escaped` emits non-ASCII bytes as `\NNN` *decimal*
sequences. JavaScript parses `\NNN` as *octal* escapes which strict-mode
ESM rejects outright (`SyntaxError: Octal escape sequences are not
allowed in strict mode`), and which would decode to wrong characters
even outside strict mode.

Adds `Js_codegen.js_string_lit` that walks the UTF-8 byte sequence,
decodes code points, and emits `\uXXXX` (BMP) or `\u{XXXXX}` (non-BMP)
Unicode escapes. `\\` `\"` `\n` `\r` `\t` use conventional escapes; all
other printable ASCII bytes (0x20-0x7E) pass through unchanged; the
remaining ASCII control bytes, including DEL (0x7F), use `\xHH`. Wired
into both `js_codegen.ml` (Node target) and `codegen_deno.ml` (Deno-ESM
target) LitString/LitChar emit sites.

Regression fixture `tests/codegen-deno/non_ascii.affine` + harness
exercise BMP emoji (❌ ✓), CJK (你好), Latin accents (café résumé),
non-BMP code points (😭 = U+1F62D), mixed strings, and the
existing-escape regression path (\\ and \"). Pre-fix: harness
`import` itself fails with SyntaxError. Post-fix: 8/8 assertions pass.

Verified: full `tools/run_codegen_deno_tests.sh` (13/13 harnesses
green); full `dune test` suite (352/352 green).

Closes #460
Refs hyperpolymath/standards#284 (the seam-analyst PR that surfaced
this)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 lib/codegen_deno.ml                      |  4 +-
 lib/js_codegen.ml                        | 60 +++++++++++++++++++++++-
 tests/codegen-deno/non_ascii.affine      | 17 +++++++
 tests/codegen-deno/non_ascii.harness.mjs | 28 +++++++++++
 4 files changed, 105 insertions(+), 4 deletions(-)
 create mode 100644 tests/codegen-deno/non_ascii.affine
 create mode 100644 tests/codegen-deno/non_ascii.harness.mjs

diff --git a/lib/codegen_deno.ml b/lib/codegen_deno.ml
index 6bf77afe..1ad5dbaa 100644
--- a/lib/codegen_deno.ml
+++ b/lib/codegen_deno.ml
@@ -721,8 +721,8 @@ and gen_literal (lit : literal) : string =
       if String.length s > 0 && s.[String.length s - 1] = '.' then s ^ "0" else s
   | LitBool (true, _)  -> "true"
   | LitBool (false, _) -> "false"
-  | LitString (s, _)   -> "\"" ^ String.escaped s ^ "\""
-  | LitChar (c, _)     -> "\"" ^ Char.escaped c ^ "\""
+  | LitString (s, _)   -> Js_codegen.js_string_lit s
+  | LitChar (c, _)     -> Js_codegen.js_string_lit (String.make 1 c)
   | LitUnit _          -> "Unit"
 
 and gen_pattern ctx (pat : pattern) : string =
diff --git a/lib/js_codegen.ml b/lib/js_codegen.ml
index eea6e3ff..606b3ea3 100644
--- a/lib/js_codegen.ml
+++ b/lib/js_codegen.ml
@@ -101,6 +101,62 @@ let mangle (name : string) : string =
   if List.mem name js_reserved then name ^ "_"
   else name
 
+(** [js_string_lit s] lowers the UTF-8 byte string [s] to a JS
+    double-quoted literal that parses under strict-mode ESM.
+
+    OCaml's [String.escaped] writes non-ASCII bytes as [\NNN] *decimal*
+    escapes, which JavaScript reads as *octal*: a SyntaxError in strict
+    mode, the wrong character elsewhere. This helper decodes UTF-8 and
+    emits [\uXXXX] (BMP) or [\u{XXXXX}] (non-BMP) instead. A stray
+    continuation byte, or a lead byte whose sequence is truncated,
+    overlong or above U+10FFFF, is emitted alone as [\u00HH] so the
+    literal always parses and following bytes are kept. Closes #460. *)
+let js_string_lit (s : string) : string =
+  let n = String.length s in
+  let buf = Buffer.create (n + 8) in
+  (* Payload bits of the continuation byte at [k], if there is one. *)
+  let cont k =
+    if k < n && Char.code s.[k] land 0xC0 = 0x80
+    then Some (Char.code s.[k] land 0x3F) else None in
+  (* [(code_point, length)] led by [b0] at [i]; malformed -> [(b0, 1)]. *)
+  let decode i b0 =
+    let extra, bits, min_cp =
+      if b0 land 0xE0 = 0xC0 then (1, b0 land 0x1F, 0x80)
+      else if b0 land 0xF0 = 0xE0 then (2, b0 land 0x0F, 0x800)
+      else if b0 land 0xF8 = 0xF0 then (3, b0 land 0x07, 0x10000)
+      else (0, 0, 0) in
+    let rec go k cp =
+      if k <= extra then
+        (match cont (i + k) with
+         | Some c -> go (k + 1) ((cp lsl 6) lor c)
+         | None -> (b0, 1))
+      else if extra > 0 && cp >= min_cp && cp <= 0x10FFFF then (cp, extra + 1)
+      else (b0, 1) in
+    go 1 bits in
+  Buffer.add_char buf '"';
+  let i = ref 0 in
+  while !i < n do
+    let b0 = Char.code s.[!i] in
+    if b0 < 0x80 then begin
+      (match s.[!i] with
+       | '\\' -> Buffer.add_string buf "\\\\"
+       | '"'  -> Buffer.add_string buf "\\\""
+       | '\n' -> Buffer.add_string buf "\\n"
+       | '\r' -> Buffer.add_string buf "\\r"
+       | '\t' -> Buffer.add_string buf "\\t"
+       | ' ' .. '~' as c -> Buffer.add_char buf c
+       | _ -> Printf.bprintf buf "\\x%02X" b0);
+      incr i
+    end else begin
+      let cp, len = decode !i b0 in
+      if cp <= 0xFFFF then Printf.bprintf buf "\\u%04X" cp
+      else Printf.bprintf buf "\\u{%X}" cp;
+      i := !i + len
+    end
+  done;
+  Buffer.add_char buf '"';
+  Buffer.contents buf
+
 (* ============================================================================
    Expression Code Generation
    ============================================================================ *)
@@ -230,8 +286,8 @@ and gen_literal (lit : literal) : string =
       if String.length s > 0 && s.[String.length s - 1] = '.' then s ^ "0" else s
   | LitBool (true, _)    -> "true"
   | LitBool (false, _)   -> "false"
-  | LitString (s, _)     -> "\"" ^ String.escaped s ^ "\""
-  | LitChar (c, _)       -> "\"" ^ Char.escaped c ^ "\""
+  | LitString (s, _)     -> js_string_lit s
+  | LitChar (c, _)       -> js_string_lit (String.make 1 c)
   | LitUnit _            -> "Unit"
 
 and gen_pattern ctx (pat : pattern) : string =
diff --git a/tests/codegen-deno/non_ascii.affine b/tests/codegen-deno/non_ascii.affine
new file mode 100644
index 00000000..a341cb48
--- /dev/null
+++ b/tests/codegen-deno/non_ascii.affine
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: MPL-2.0
+// Fixture for issue #460: non-ASCII string literals must round-trip
+// under strict-mode ESM. Pre-fix, the JS codegen used OCaml
+// `String.escaped`, which emitted `\NNN` *decimal* sequences; the JS
+// parser reads `\NNN` as *octal* escapes, which strict-mode ESM rejects
+// (`SyntaxError: Octal escape sequences are not allowed in strict mode`).
+// Post-fix, non-ASCII bytes lower to `\uXXXX` / `\u{XXXXX}` Unicode
+// escapes. Each export below is asserted by `non_ascii.harness.mjs`.
+
+pub fn emoji_cross() -> String { return "❌"; }
+pub fn emoji_check() -> String { return "✓"; }
+pub fn cjk_hello() -> String { return "你好"; }
+pub fn latin_accent() -> String { return "café résumé"; }
+pub fn non_bmp_sob() -> String { return "😭"; }
+pub fn mixed() -> String { return "[OK] café 你好 ❌"; }
+pub fn ascii_only() -> String { return "plain ASCII"; }
+pub fn quotes_and_backslash() -> String { return "\"escaped\" and \\back"; }
diff --git a/tests/codegen-deno/non_ascii.harness.mjs b/tests/codegen-deno/non_ascii.harness.mjs
new file mode 100644
index 00000000..a36c8b40
--- /dev/null
+++ b/tests/codegen-deno/non_ascii.harness.mjs
@@ -0,0 +1,28 @@
+// SPDX-License-Identifier: MPL-2.0
+// Regression harness for issue #460: non-ASCII string literals emitted
+// by the Deno-ESM backend must round-trip under strict-mode ESM. The
+// `import` below is itself the strictest check: an octal escape in the
+// emitted `.deno.js` is a SyntaxError before any assertion can run.
+import assert from "node:assert/strict";
+import {
+  emoji_cross, emoji_check, cjk_hello, latin_accent,
+  non_bmp_sob, mixed, ascii_only, quotes_and_backslash,
+} from "./non_ascii.deno.js";
+
+// Each row: [exported function, expected return value, failure message].
+const cases = [
+  [emoji_cross, "❌", "BMP emoji ❌ round-trips"],
+  [emoji_check, "✓", "BMP check mark ✓ round-trips"],
+  [cjk_hello, "你好", "CJK 'nihao' round-trips"],
+  [latin_accent, "café résumé", "Latin accented round-trips"],
+  [non_bmp_sob, "\u{1F62D}", "non-BMP code point round-trips"],
+  [mixed, "[OK] café 你好 ❌", "mixed ASCII+non-ASCII round-trips"],
+  [ascii_only, "plain ASCII", "ASCII-only unchanged"],
+  [quotes_and_backslash, "\"escaped\" and \\back", "quote+backslash escapes preserved"],
+];
+
+for (const [fn, expected, message] of cases) {
+  assert.equal(fn(), expected, message);
+}
+
+console.log("non_ascii.harness.mjs OK");