diff --git a/lib/codegen_deno.ml b/lib/codegen_deno.ml index 6bf77af..1ad5dba 100644 --- a/lib/codegen_deno.ml +++ b/lib/codegen_deno.ml @@ -721,8 +721,8 @@ and gen_literal (lit : literal) : string = if String.length s > 0 && s.[String.length s - 1] = '.' then s ^ "0" else s | LitBool (true, _) -> "true" | LitBool (false, _) -> "false" - | LitString (s, _) -> "\"" ^ String.escaped s ^ "\"" - | LitChar (c, _) -> "\"" ^ Char.escaped c ^ "\"" + | LitString (s, _) -> Js_codegen.js_string_lit s + | LitChar (c, _) -> Js_codegen.js_string_lit (String.make 1 c) | LitUnit _ -> "Unit" and gen_pattern ctx (pat : pattern) : string = diff --git a/lib/js_codegen.ml b/lib/js_codegen.ml index eea6e3f..606b3ea 100644 --- a/lib/js_codegen.ml +++ b/lib/js_codegen.ml @@ -101,6 +101,62 @@ let mangle (name : string) : string = if List.mem name js_reserved then name ^ "_" else name +(** Lower a UTF-8 byte string to a JS double-quoted literal that is + safe under strict-mode ESM. + + OCaml's [String.escaped] emits non-ASCII bytes as [\NNN] *decimal* + sequences; JavaScript parses [\NNN] as *octal* escapes which strict + mode rejects ([SyntaxError: Octal escape sequences are not allowed + in strict mode]) and which would decode to wrong characters even + outside strict mode. This helper instead decodes the UTF-8 byte + sequence to code points and emits [\uXXXX] (BMP) or [\u{XXXXX}] + (non-BMP) Unicode escapes — accepted everywhere, no parser-mode + surprises, and preserves the original character. Closes #460. 
(** [js_string_lit s] lowers the UTF-8 byte string [s] to a double-quoted
    JavaScript string literal that parses under strict-mode ESM.

    - Backslash, the double-quote character, newline, carriage return and
      tab use their short escapes; other printable ASCII (0x20-0x7E) is
      copied verbatim; the remaining ASCII bytes (controls and DEL) become
      [\xNN].
    - Every structurally well-formed multi-byte sequence is decoded and
      emitted as [\uXXXX] (code point <= 0xFFFF) or [\u{X...}] (above),
      so the output is pure ASCII and never contains the [\NNN] decimal
      escapes produced by [String.escaped], which JavaScript would read as
      octal escapes.
    - A byte that does not start a well-formed sequence (stray
      continuation byte, lead byte with missing or non-continuation
      followers, lead byte >= 0xF8, or a result above U+10FFFF) is emitted
      on its own as [\u00NN], and decoding resumes at the very next byte.
      Following bytes are therefore never swallowed, and no out-of-range
      [\u{...}] escape (a JavaScript syntax error) can be produced.

    Overlong forms and UTF-8-encoded surrogates are still decoded
    leniently rather than rejected.

    @param s bytes of the source literal, expected to be UTF-8.
    @return the literal text, including the surrounding double quotes. *)
let js_string_lit (s : string) : string =
  let n = String.length s in
  let buf = Buffer.create (n + 8) in
  (* Payload bits of the byte at [i] when it exists and is a continuation
     byte (10xxxxxx); [None] otherwise. *)
  let continuation i =
    if i >= n then None
    else
      let b = Char.code s.[i] in
      if b land 0xC0 = 0x80 then Some (b land 0x3F) else None
  in
  (* Decode the sequence whose lead byte [b0] (>= 0x80) sits at index [i].
     Returns the code point and the number of bytes consumed; anything
     malformed yields [(b0, 1)] so that only the lead byte is consumed. *)
  let decode i b0 =
    let invalid = (b0, 1) in
    if b0 < 0xC0 || b0 >= 0xF8 then invalid
    else if b0 < 0xE0 then
      (match continuation (i + 1) with
       | Some c1 -> (((b0 land 0x1F) lsl 6) lor c1, 2)
       | None -> invalid)
    else if b0 < 0xF0 then
      (match continuation (i + 1), continuation (i + 2) with
       | Some c1, Some c2 ->
         (((b0 land 0x0F) lsl 12) lor (c1 lsl 6) lor c2, 3)
       | _, _ -> invalid)
    else
      (match
         continuation (i + 1), continuation (i + 2), continuation (i + 3)
       with
       | Some c1, Some c2, Some c3 ->
         let cp =
           ((b0 land 0x07) lsl 18) lor (c1 lsl 12) lor (c2 lsl 6) lor c3
         in
         (* JavaScript rejects \u{...} escapes above U+10FFFF. *)
         if cp <= 0x10FFFF then (cp, 4) else invalid
       | _, _, _ -> invalid)
  in
  let add_code_point cp =
    if cp <= 0xFFFF then Printf.bprintf buf "\\u%04X" cp
    else Printf.bprintf buf "\\u{%X}" cp
  in
  let rec emit i =
    if i < n then begin
      let b0 = Char.code s.[i] in
      if b0 < 0x80 then begin
        (match s.[i] with
         | '\\' -> Buffer.add_string buf "\\\\"
         | '"' -> Buffer.add_string buf "\\\""
         | '\n' -> Buffer.add_string buf "\\n"
         | '\r' -> Buffer.add_string buf "\\r"
         | '\t' -> Buffer.add_string buf "\\t"
         | ' ' .. '~' as c -> Buffer.add_char buf c
         | _ -> Printf.bprintf buf "\\x%02X" b0);
        emit (i + 1)
      end else begin
        let cp, len = decode i b0 in
        add_code_point cp;
        emit (i + len)
      end
    end
  in
  Buffer.add_char buf '"';
  emit 0;
  Buffer.add_char buf '"';
  Buffer.contents buf
then s ^ "0" else s | LitBool (true, _) -> "true" | LitBool (false, _) -> "false" - | LitString (s, _) -> "\"" ^ String.escaped s ^ "\"" - | LitChar (c, _) -> "\"" ^ Char.escaped c ^ "\"" + | LitString (s, _) -> js_string_lit s + | LitChar (c, _) -> js_string_lit (String.make 1 c) | LitUnit _ -> "Unit" and gen_pattern ctx (pat : pattern) : string = diff --git a/tests/codegen-deno/non_ascii.affine b/tests/codegen-deno/non_ascii.affine new file mode 100644 index 0000000..a341cb4 --- /dev/null +++ b/tests/codegen-deno/non_ascii.affine @@ -0,0 +1,17 @@ +// SPDX-License-Identifier: MPL-2.0 +// issue #460 — non-ASCII string literals must round-trip under +// strict-mode ESM. Pre-fix, the JS codegen used OCaml `String.escaped` +// which emitted `\NNN` decimal sequences; the JS parser reads `\NNN` +// as OCTAL escapes, which strict-mode ESM rejects with +// `SyntaxError: Octal escape sequences are not allowed in strict mode`. +// Post-fix, non-ASCII bytes lower to `\uXXXX` / `\u{XXXXX}` Unicode +// escapes which all JS parser modes accept. + +pub fn emoji_cross() -> String { return "❌"; } +pub fn emoji_check() -> String { return "✓"; } +pub fn cjk_hello() -> String { return "你好"; } +pub fn latin_accent() -> String { return "café résumé"; } +pub fn non_bmp_sob() -> String { return "😭"; } +pub fn mixed() -> String { return "[OK] café 你好 ❌"; } +pub fn ascii_only() -> String { return "plain ASCII"; } +pub fn quotes_and_backslash() -> String { return "\"escaped\" and \\back"; } diff --git a/tests/codegen-deno/non_ascii.harness.mjs b/tests/codegen-deno/non_ascii.harness.mjs new file mode 100644 index 0000000..a36c8b4 --- /dev/null +++ b/tests/codegen-deno/non_ascii.harness.mjs @@ -0,0 +1,28 @@ +// SPDX-License-Identifier: MPL-2.0 +// issue #460 — round-trip non-ASCII string literals through the +// Deno-ESM backend under strict-mode ESM. 
// Importing the generated module is itself the strictest check: if the
// emitted `.deno.js` contains octal escapes, the module fails to parse and
// the import throws SyntaxError before any assertion below can run.
import assert from "node:assert/strict";
import {
  emoji_cross,
  emoji_check,
  cjk_hello,
  latin_accent,
  non_bmp_sob,
  mixed,
  ascii_only,
  quotes_and_backslash,
} from "./non_ascii.deno.js";

// One row per exported function: [producer, expected string, failure message].
// Rows are checked in order and the first mismatch aborts the run.
const roundTripCases = [
  [emoji_cross, "❌", "BMP emoji ❌ round-trips"],
  [emoji_check, "✓", "BMP check mark ✓ round-trips"],
  [cjk_hello, "你好", "CJK 'nihao' round-trips"],
  [latin_accent, "café résumé", "Latin accented round-trips"],
  [non_bmp_sob, "\u{1F62D}", "non-BMP code point round-trips"],
  [mixed, "[OK] café 你好 ❌", "mixed ASCII+non-ASCII round-trips"],
  [ascii_only, "plain ASCII", "ASCII-only unchanged"],
  [quotes_and_backslash, "\"escaped\" and \\back", "quote+backslash escapes preserved"],
];

for (const [produce, expected, message] of roundTripCases) {
  assert.equal(produce(), expected, message);
}

console.log("non_ascii.harness.mjs OK");