diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8293527..79fc48f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -31,6 +31,18 @@ jobs: - name: Install dependencies run: opam install . --deps-only --with-test --with-doc + - name: Install tree-sitter CLI (for res-to-affine walker tests) + # Same rationale as the migration-assistant job (see below): + # npm distribution is the fast CI install (~5 s). The walker + # end-to-end tests in tools/res-to-affine/test/test_walker.ml + # auto-skip if the CLI / generated grammar aren't present, so + # this step is only required to *exercise* the walker — the + # build itself does not depend on it. + run: npm install -g tree-sitter-cli@^0.25.0 + + - name: Build pinned tree-sitter-rescript grammar + run: ./editors/tree-sitter-rescript/scripts/install.sh + - name: Build run: opam exec -- dune build @@ -119,3 +131,68 @@ jobs: # Headless display required because @vscode/test-electron launches # the real Electron-based VS Code binary. run: xvfb-run -a npm test + + migration-assistant: + # Build pinned tree-sitter-rescript grammar consumed by the + # `.res → .affine` migration assistant (#57 Phase 2). The grammar + # is manifest-vendored (`editors/tree-sitter-rescript/package.json`) + # so this job exists to (a) verify the install script and pinned + # commit still build cleanly and (b) gate `tools/res-to-affine/` + # walker work that depends on the generated parser. + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2 + + - name: Set up Node.js + uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v4 + with: + node-version: "20" + + - name: Install tree-sitter CLI + # npm install of tree-sitter-cli is the fast CI path (~5 s vs. + # ~5 min for `cargo install tree-sitter-cli`). The repo's + # preferred local path is cargo (see editors/tree-sitter-rescript/ + # README.md) — both produce the same `tree-sitter` binary that + # the install script invokes via `command -v`. The version + # tracks `tree-sitter-rescript`'s package.json devDependency + # range. + run: npm install -g tree-sitter-cli@^0.25.0 + + - name: Build pinned tree-sitter-rescript grammar + # Direct script invocation rather than `just install-grammar` — + # GitHub Actions runners do not ship `just` preinstalled, and + # there is no other recipe used in this workflow that justifies + # adding a setup step for it. The justfile recipe still exists + # for local developer ergonomics; both call the same script. + run: ./editors/tree-sitter-rescript/scripts/install.sh + + - name: Verify generated parser + # `tree-sitter generate` is supposed to drop src/parser.c into + # the cloned grammar. If it didn't, the install path is broken + # and Phase-2 walker work cannot proceed; fail loudly here + # rather than at the OCaml link step in a downstream PR. + run: | + test -f tools/vendor/tree-sitter-rescript/src/parser.c \ + || { echo "error: parser.c not produced by tree-sitter generate" >&2; exit 1; } + echo "parser.c size: $(wc -c < tools/vendor/tree-sitter-rescript/src/parser.c) bytes" + + - name: Smoke-parse a sample .res file + # Sanity-check that the grammar actually parses a non-trivial + # ReScript source. Picks the existing res-to-affine test fixture + # so any drift in the pinned commit's syntactic surface area + # surfaces here rather than at walker-rule writing time. + run: | + shopt -s nullglob + fixtures=(tools/res-to-affine/test/fixtures/*.res) + if [ ${#fixtures[@]} -eq 0 ]; then + echo "no .res fixtures to smoke-parse; skipping" + exit 0 + fi + tree-sitter parse \ + --quiet \ + "${fixtures[0]}" \ + --paths tools/vendor/tree-sitter-rescript \ + > /dev/null + echo "smoke-parsed: ${fixtures[0]}" diff --git a/editors/tree-sitter-rescript/README.md b/editors/tree-sitter-rescript/README.md index f5148b0..69b554e 100644 --- a/editors/tree-sitter-rescript/README.md +++ b/editors/tree-sitter-rescript/README.md @@ -25,13 +25,38 @@ snapshots, since AST shapes may shift. ## Install +From the repo root: + ```sh -./scripts/install.sh +just install-grammar # justfile recipe +# or directly: +./editors/tree-sitter-rescript/scripts/install.sh ``` This writes a `tree-sitter-rescript` directory under `tools/vendor/` (gitignored — same convention as the WASI adapter pinning), containing -the generated parser. Requires `git` and `tree-sitter` CLI on PATH. +the generated parser. Requires `git` and the `tree-sitter` CLI on PATH. + +The `tree-sitter` CLI can be installed either way: + +```sh +cargo install tree-sitter-cli # Rust-native, repo-preferred +npm install -g tree-sitter-cli # Node-based, also fine +``` + +CI installs via `npm` for speed (`tree-sitter-cli` from npm is a pre-built +binary, ~5 s install). The `cargo` path builds from source (~5 min on a +cold cache) and is the recommended local install because it keeps the +contributor toolchain centred on Rust rather than Node. The +`package.json` in this directory pins the version range; bump it in +sync when the upstream grammar pin moves. + +## Continuous integration + +The `migration-assistant` job in `.github/workflows/ci.yml` runs `just +install-grammar` on every PR, then smoke-parses +`tools/res-to-affine/test/fixtures/sample.res`. If the pinned commit +stops building cleanly, this job is the first signal. ## Why manifest, not copy diff --git a/editors/tree-sitter-rescript/scripts/install.sh b/editors/tree-sitter-rescript/scripts/install.sh index e4fa37d..9b171f5 100755 --- a/editors/tree-sitter-rescript/scripts/install.sh +++ b/editors/tree-sitter-rescript/scripts/install.sh @@ -15,7 +15,9 @@ BUILD_DIR="${REPO_ROOT}/tools/vendor/tree-sitter-rescript" if ! command -v tree-sitter >/dev/null 2>&1; then echo "error: tree-sitter CLI not found on PATH" >&2 - echo " install via: npm install -g tree-sitter-cli" >&2 + echo " install via either:" >&2 + echo " cargo install tree-sitter-cli (Rust-native, repo-preferred)" >&2 + echo " npm install -g tree-sitter-cli (Node-based, also fine)" >&2 exit 2 fi diff --git a/justfile b/justfile index 527902a..ac0d768 100644 --- a/justfile +++ b/justfile @@ -123,6 +123,16 @@ regen-idaptik-wasm: dune exec affinescript -- tea-bridge -o ../../idaptik/public/assets/wasm/titlescreen.wasm @echo "[AffineTEA] titlescreen.wasm regenerated" +# ── Tooling (manifest-driven dev deps) ──────────────────────────────────────── + +# Fetch + build the pinned tree-sitter-rescript grammar used by the +# `.res → .affine` migration assistant (#57 Phase 2). Output is written +# to `tools/vendor/tree-sitter-rescript/` (gitignored). Requires the +# `tree-sitter` CLI on PATH — install via `cargo install tree-sitter-cli` +# (Rust-native, repo-preferred) or `npm install -g tree-sitter-cli`. +install-grammar: + ./editors/tree-sitter-rescript/scripts/install.sh + # ── Validation ──────────────────────────────────────────────────────────────── # Verify golden path end-to-end diff --git a/tools/res-to-affine/README.md b/tools/res-to-affine/README.md index c8d96f2..c4a72e6 100644 --- a/tools/res-to-affine/README.md +++ b/tools/res-to-affine/README.md @@ -16,11 +16,14 @@ and the broader `idaptik` migration. ## Usage ```sh -# print skeleton to stdout +# print skeleton to stdout (default: regex scanner) dune exec tools/res-to-affine/main.exe -- path/to/Foo.res # or write to a file dune exec tools/res-to-affine/main.exe -- path/to/Foo.res -o Foo.affine + +# opt into the Phase-2 tree-sitter AST walker +dune exec tools/res-to-affine/main.exe -- --engine=walker path/to/Foo.res ``` The output is **not compilable**. It is a starting point for the human: @@ -29,6 +32,24 @@ migration-considerations block; the middle is a `module` stub with `TODO`s. The human picks the decomposition; the tool surfaces what needs re-decomposing. +### Detection engines + +| `--engine` | Implementation | When to use | +|---|---|---| +| `scanner` (default) | Line-anchored regex over the raw source (`scanner.ml`). | Default — no prerequisites, ships with the binary. | +| `walker` | Shells out to the vendored `tree-sitter` CLI, walks the AST (`walker.ml`). | When the regex's false-positive surface matters — eliminates the `let _ = chained.call()` class of misfire that the column-0 anchor in #319 had to band-aid. | + +The walker requires the vendored `tree-sitter-rescript` grammar to be +built first: + +```sh +just install-grammar +# or: ./editors/tree-sitter-rescript/scripts/install.sh +``` + +If the grammar isn't built or the `tree-sitter` CLI isn't on PATH, the +walker auto-falls-back to the scanner and prints the reason to stderr. + ## What gets flagged (Phase 1) The six anti-patterns surfaced in the @@ -48,6 +69,24 @@ Deferred to Phase 2 (need real AST): inside one record literal (collapse to a row-polymorphic record). - **oversized function** — function body > ~50 LOC (decompose). +### Walker coverage (Phase 2) + +| Anti-pattern | Scanner (regex) | Walker (AST) | +|---|---|---| +| `side-effect-import` | ✓ | ✓ — Phase 2b | +| `raw-js` | ✓ | — Phase 2c | +| `untyped-exception` | ✓ | — Phase 2c | +| `mutable-global` | ✓ | — Phase 2c | +| inline lambda callback record | — | — Phase 2c | +| oversized function | — | — Phase 2c | + +The walker improves on the regex by being structural: it only reports +`side-effect-import` when `let _ = Mod.value` sits at module top level +(direct child of `source_file` or a `module_declaration` body), not when +the same shape appears inside a function body — where it is normally a +ReScript "discard the return value of a chained call" idiom, not a +module-load side effect. + ## Why a skeleton and not a transliteration The Frontier Programming Guides' standing rule is **re-decompose, not diff --git a/tools/res-to-affine/dune b/tools/res-to-affine/dune index abc26f7..8b4fc75 100644 --- a/tools/res-to-affine/dune +++ b/tools/res-to-affine/dune @@ -9,5 +9,5 @@ (library (name res_to_affine) - (modules scanner emitter) - (libraries str)) + (modules scanner emitter walker) + (libraries str unix)) diff --git a/tools/res-to-affine/main.ml b/tools/res-to-affine/main.ml index d35313e..8912840 100644 --- a/tools/res-to-affine/main.ml +++ b/tools/res-to-affine/main.ml @@ -27,13 +27,30 @@ let write_file path contents = output_string oc contents; close_out oc -let run input output_opt = +type engine = Scanner_engine | Walker_engine + +let engine_label = function + | Scanner_engine -> "scanner" + | Walker_engine -> "walker" + +let run engine grammar_dir input output_opt = if not (Sys.file_exists input) then begin Format.eprintf "res-to-affine: input not found: %s@." input; exit 2 end; let source = read_file input in - let findings = Scanner.scan source in + let findings = + match engine with + | Scanner_engine -> Scanner.scan source + | Walker_engine -> + (try Walker.scan ~grammar_dir ~path:input ~source with + | Failure msg -> + Format.eprintf "res-to-affine: %s@." msg; + Format.eprintf + "res-to-affine: falling back to scanner engine for %s@." + input; + Scanner.scan source) + in let module_name = Emitter.module_name_of_path input in let out = Emitter.emit @@ -48,9 +65,10 @@ let run input output_opt = | Some path -> write_file path out; Format.printf - "res-to-affine: %d finding%s → %s@." + "res-to-affine: %d finding%s [%s] → %s@." (List.length findings) (if List.length findings = 1 then "" else "s") + (engine_label engine) path (* ---- cmdliner wiring ---- *) @@ -67,11 +85,36 @@ let output_arg = value & opt (some string) None & info ["o"; "output"] ~docv:"FILE" ~doc) +let engine_arg = + let doc = + "Detection engine: 'scanner' (default, line-regex, Phase 1) or \ + 'walker' (tree-sitter AST, Phase 2). The walker requires the \ + vendored grammar to be built — see `just install-grammar`. \ + Falls back to 'scanner' if the grammar is missing or \ + tree-sitter parse fails." + in + Cmdliner.Arg.( + value & opt + (enum ["scanner", Scanner_engine; "walker", Walker_engine]) + Scanner_engine & + info ["engine"] ~docv:"ENGINE" ~doc) + +let grammar_dir_arg = + let doc = + "Path to the generated tree-sitter-rescript grammar directory \ + (the output of `just install-grammar`). Only consulted when \ + `--engine=walker`. Defaults to `tools/vendor/tree-sitter-rescript`." + in + Cmdliner.Arg.( + value & opt string Walker.default_grammar_dir & + info ["grammar-dir"] ~docv:"DIR" ~doc) + let cmd = let doc = "Emit an AffineScript skeleton from a ReScript source file." in let info = Cmdliner.Cmd.info "res-to-affine" ~version:"0.1.0" ~doc in let term = - Cmdliner.Term.(const run $ input_arg $ output_arg) + Cmdliner.Term.( + const run $ engine_arg $ grammar_dir_arg $ input_arg $ output_arg) in Cmdliner.Cmd.v info term diff --git a/tools/res-to-affine/test/dune b/tools/res-to-affine/test/dune index a8f96b1..696291c 100644 --- a/tools/res-to-affine/test/dune +++ b/tools/res-to-affine/test/dune @@ -1,7 +1,7 @@ ; SPDX-License-Identifier: MPL-2.0 -(test - (name test_emit) +(tests + (names test_emit test_walker) (libraries res_to_affine alcotest) (deps (glob_files fixtures/*.res) diff --git a/tools/res-to-affine/test/test_walker.ml b/tools/res-to-affine/test/test_walker.ml new file mode 100644 index 0000000..a83d21c --- /dev/null +++ b/tools/res-to-affine/test/test_walker.ml @@ -0,0 +1,127 @@ +(* SPDX-License-Identifier: MPL-2.0 *) +(* SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell *) + +(** End-to-end tests for the tree-sitter walker (#57 Phase 2b). + + These tests shell out to the [tree-sitter] CLI; they are + automatically skipped when the CLI is not on PATH so a fresh + clone can still run [dune runtest] without bootstrapping the + grammar. CI installs tree-sitter and runs them as a gate. + + To run locally: install tree-sitter (`cargo install + tree-sitter-cli`), then `just install-grammar`, then `dune + runtest tools/res-to-affine/`. *) + +open Res_to_affine + +let read_file path = + let ic = open_in_bin path in + let n = in_channel_length ic in + let s = really_input_string ic n in + close_in ic; + s + +let tree_sitter_available () = + Sys.command "command -v tree-sitter > /dev/null 2>&1" = 0 + +(* Find the repo root by walking up from the test's runtime cwd looking + for `dune-project`. Dune sandboxes tests under + `_build/default/.../test/`, so the natural "up three levels" arithmetic + gives a build-tree path where the source-tree + `tools/vendor/tree-sitter-rescript` does not exist. *) +let rec find_repo_root dir = + if Sys.file_exists (Filename.concat dir "dune-project") then Some dir + else + let parent = Filename.dirname dir in + if parent = dir then None (* hit filesystem root *) + else find_repo_root parent + +let repo_root () = + match Sys.getenv_opt "DUNE_SOURCEROOT" with + | Some s when s <> "" -> s + | _ -> + (match find_repo_root (Sys.getcwd ()) with + | Some s -> s + | None -> Sys.getcwd ()) + +let grammar_dir () = + Filename.concat (repo_root ()) "tools/vendor/tree-sitter-rescript" + +let grammar_built () = + Sys.file_exists (Filename.concat (grammar_dir ()) "src/parser.c") + +let skip_unless_ready () = + if not (tree_sitter_available ()) then begin + Printf.printf + " [skip] tree-sitter CLI not on PATH; install via `cargo install \ + tree-sitter-cli`@\n"; + Alcotest.skip () + end; + if not (grammar_built ()) then begin + Printf.printf + " [skip] grammar not built; run `just install-grammar`@\n"; + Alcotest.skip () + end + +let fixture = "fixtures/sample.res" + +let test_walker_finds_side_effect_import () = + skip_unless_ready (); + let source = read_file fixture in + let path = Filename.concat (Sys.getcwd ()) fixture in + let findings = + Walker.scan ~grammar_dir:(grammar_dir ()) ~path ~source + in + let has_kind k = + List.exists (fun (f : Scanner.finding) -> f.kind = k) findings + in + Alcotest.(check bool) + "walker reports side-effect-import on sample.res" + true (has_kind Scanner.Side_effect_import) + +let test_walker_only_module_toplevel () = + (* The walker's promised improvement over the regex scanner: it + should NOT flag `let _ = chained.call()` inside a function body + as a side-effect-import. We can't synthesise that case without + running tree-sitter, so this test is gated and uses the existing + fixture, which has the regex-scanner-matching shape at module + top level — the walker should match it. The negative case lives + in Phase 2c's expanded corpus. *) + skip_unless_ready (); + let source = read_file fixture in + let path = Filename.concat (Sys.getcwd ()) fixture in + let findings = + Walker.scan ~grammar_dir:(grammar_dir ()) ~path ~source + in + let side_effect_lines = + List.filter_map + (fun (f : Scanner.finding) -> + if f.kind = Scanner.Side_effect_import then Some f.line else None) + findings + in + (* sample.res line 8: `let _ = Pixi.Sound.register` — the only + top-level side-effect import. *) + Alcotest.(check (list int)) + "walker reports the line-8 import and only that one" + [8] side_effect_lines + +(* ---- s-exp parser sanity (NOT gated; pure OCaml) --------------------------- + + The walker subprocess is shelled out in the gated tests above. The + s-exp parser itself is pure-OCaml and can be exercised directly, + but it lives behind walker.ml's module boundary. We do not unit- + test it here to avoid widening the .mli for test-only access; the + gated end-to-end tests above exercise it through real + tree-sitter output. *) + +let () = + Alcotest.run "res-to-affine-walker" + [ + ( "walker", + [ + Alcotest.test_case "side-effect-import found on sample.res" + `Quick test_walker_finds_side_effect_import; + Alcotest.test_case "module-toplevel-only, correct line" + `Quick test_walker_only_module_toplevel; + ] ); + ] diff --git a/tools/res-to-affine/walker.ml b/tools/res-to-affine/walker.ml new file mode 100644 index 0000000..7845d06 --- /dev/null +++ b/tools/res-to-affine/walker.ml @@ -0,0 +1,381 @@ +(* SPDX-License-Identifier: MPL-2.0 *) +(* SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell *) + +(* ---- AST model ------------------------------------------------------------- + + Parses the tree-sitter parse default output. The format is paren- + delimited with embedded byte ranges; each node is + + "(" NTYPE [row, col] "-" [row, col] CHILD-list ")" + + where each CHILD in the list is optionally prefixed by a "FIELD:" tag, + for example: + + pattern: (value_identifier [0, 4] - [0, 5]) + + tree-sitter uses 0-based rows and columns. We keep that internally + and translate to 1-based line numbers in findings. *) + +type pos = { row : int; col : int } + +type node = { + ntype : string; + field : string option; + start : pos; + stop : pos; + children : node list; +} + +(* ---- s-expression tokenizer + parser --------------------------------------- *) + +(* The s-exp grammar we recognise is tiny enough that a hand-rolled + recursive-descent parser is clearer than a tokenizer + driver. We + consume the input by index into a [string]. *) + +exception Parse_error of string + +let buf_take buf = + let s = Buffer.contents buf in + Buffer.clear buf; + s + +let is_ident_char c = + match c with + | 'a'..'z' | 'A'..'Z' | '0'..'9' | '_' -> true + | _ -> false + +(* Skip whitespace (spaces, tabs, newlines). *) +let skip_ws src i = + let n = String.length src in + let i = ref i in + while !i < n && + (let c = src.[!i] in c = ' ' || c = '\t' || c = '\n' || c = '\r') do + incr i + done; + !i + +let expect src i ch = + if i >= String.length src || src.[i] <> ch then + raise (Parse_error + (Printf.sprintf "expected %C at offset %d (got %s)" ch i + (if i >= String.length src then "EOF" + else Printf.sprintf "%C" src.[i]))) + else i + 1 + +(* Read [a-zA-Z_][a-zA-Z0-9_]* starting at [i]. *) +let read_ident src i = + let n = String.length src in + if i >= n then raise (Parse_error "expected identifier at EOF"); + let buf = Buffer.create 16 in + let i = ref i in + while !i < n && is_ident_char src.[!i] do + Buffer.add_char buf src.[!i]; + incr i + done; + if Buffer.length buf = 0 then + raise (Parse_error + (Printf.sprintf "expected identifier at offset %d" !i)); + buf_take buf, !i + +(* Read a non-negative integer. *) +let read_int src i = + let n = String.length src in + let buf = Buffer.create 8 in + let i = ref i in + while !i < n && + (let c = src.[!i] in c >= '0' && c <= '9') do + Buffer.add_char buf src.[!i]; + incr i + done; + if Buffer.length buf = 0 then + raise (Parse_error + (Printf.sprintf "expected integer at offset %d" !i)); + int_of_string (buf_take buf), !i + +(* Read [ row , col ]. *) +let read_pos src i = + let i = skip_ws src i in + let i = expect src i '[' in + let i = skip_ws src i in + let row, i = read_int src i in + let i = skip_ws src i in + let i = expect src i ',' in + let i = skip_ws src i in + let col, i = read_int src i in + let i = skip_ws src i in + let i = expect src i ']' in + { row; col }, i + +(* Try to peek a field tag of the form [FIELD:]. Returns [Some name, i'] if + one starts at [i] AND is followed by whitespace + '('; otherwise [None, + i] without advancing. Field names are identifiers per tree-sitter's + grammar.js conventions. *) +let peek_field src i = + let n = String.length src in + if i >= n then None, i + else if not (let c = src.[i] in + (c >= 'a' && c <= 'z') || + (c >= 'A' && c <= 'Z') || + c = '_') then + None, i + else + let saved = i in + try + let name, j = read_ident src i in + let j = skip_ws src j in + if j < n && src.[j] = ':' then + let j = j + 1 in + let j = skip_ws src j in + if j < n && src.[j] = '(' then + Some name, j + else + None, saved + else None, saved + with Parse_error _ -> None, saved + +(* Parse one node. Position points at the opening paren. *) +let rec parse_node ?field src i = + let i = skip_ws src i in + let i = expect src i '(' in + let i = skip_ws src i in + let ntype, i = read_ident src i in + let i = skip_ws src i in + let start, i = read_pos src i in + let i = skip_ws src i in + let i = expect src i '-' in + let stop, i = read_pos src i in + let i = skip_ws src i in + let children, i = parse_children src i [] in + let i = skip_ws src i in + let i = expect src i ')' in + { ntype; field; start; stop; children = List.rev children }, i + +and parse_children src i acc = + let i = skip_ws src i in + let n = String.length src in + if i >= n || src.[i] = ')' then acc, i + else + let field, i = peek_field src i in + let child, i = parse_node ?field src i in + parse_children src i (child :: acc) + +let parse_sexp (s : string) : node = + let i = skip_ws s 0 in + let root, i = parse_node s i in + let i = skip_ws s i in + if i <> String.length s then + raise (Parse_error + (Printf.sprintf "trailing garbage at offset %d" i)); + root + +(* ---- tree-sitter subprocess ----------------------------------------------- *) + +let default_grammar_dir = "tools/vendor/tree-sitter-rescript" + +(* Run [tree-sitter parse ] from inside [grammar_dir]. Tree-sitter + resolves the parser by looking at the working-directory grammar manifest, + so [cd grammar_dir] is the simplest way to point it at the vendored + build without relying on [--paths] flag stability across CLI versions. + + Captures stdout into a single [string]. Returns [Ok s] or [Error msg]. *) +let run_tree_sitter ~grammar_dir ~path : (string, string) result = + if not (Sys.file_exists grammar_dir) then + Error (Printf.sprintf + "grammar directory not found: %s — run `just install-grammar`" + grammar_dir) + else begin + let abs_path = + if Filename.is_relative path then + Filename.concat (Sys.getcwd ()) path + else + path + in + (* stderr is redirected to /dev/null because tree-sitter 0.26+ emits + a benign "You have not configured any parser directories!" warning + on every run (we resolve the parser via the cd-into-grammar-dir + trick, not via its config file). Merging stderr into stdout would + break our s-exp parser. Errors that matter still surface via the + process exit code. *) + let cmd = + Printf.sprintf "cd %s && tree-sitter parse %s 2>/dev/null" + (Filename.quote grammar_dir) + (Filename.quote abs_path) + in + let ic = Unix.open_process_in cmd in + let buf = Buffer.create 4096 in + (try + while true do + Buffer.add_channel buf ic 4096 + done + with End_of_file -> ()); + match Unix.close_process_in ic with + | Unix.WEXITED 0 -> Ok (Buffer.contents buf) + | Unix.WEXITED n -> + Error (Printf.sprintf + "tree-sitter parse exited with code %d (stderr suppressed; \ + re-run `tree-sitter parse %s` from %s to see details)" + n (Filename.quote abs_path) (Filename.quote grammar_dir)) + | Unix.WSIGNALED n -> + Error (Printf.sprintf "tree-sitter parse killed by signal %d" n) + | Unix.WSTOPPED _ -> + Error "tree-sitter parse stopped" + end + +(* ---- detection: side-effect-import ---------------------------------------- + + AST shape we look for: + + (let_declaration ... + (let_binding ... + pattern: (value_identifier ...) <- text is exactly "_" + body: (...))) <- subtree contains a module- + qualified access (value_- + identifier_path or + member_expression with + uppercase head) + + The let_declaration must be at *module top level*: parent is + source_file, or parent is a block that is the body of a + module_declaration. let _ = X.f() inside a function body is a + normal "discard return value" idiom, not the module-load side- + effect anti-pattern; the regex band-aid in #319 was specifically + to suppress these — the walker eliminates them structurally. *) + +(* Slice source text out by [pos] range. *) +let slice ~source ~start ~stop = + let lines = String.split_on_char '\n' source in + let lines = Array.of_list lines in + if start.row = stop.row then begin + if start.row >= Array.length lines then "" + else + let line = lines.(start.row) in + let n = String.length line in + let a = min start.col n in + let b = min stop.col n in + String.sub line a (max 0 (b - a)) + end else if start.row < Array.length lines then + (* Multi-line excerpt: return the first line from start.col onwards; + findings only need a short excerpt and the line number is on the + first row anyway. *) + let line = lines.(start.row) in + let n = String.length line in + let a = min start.col n in + String.sub line a (n - a) + else "" + +let truncate s = + if String.length s <= 80 then s + else String.sub s 0 77 ^ "..." + +(* Is this node's source text exactly the underscore [_]? *) +let is_underscore_pattern ~source n = + if n.ntype <> "value_identifier" then false + else + let text = slice ~source ~start:n.start ~stop:n.stop in + String.trim text = "_" + +(* Recursively search the body subtree for a module-qualified access. + A [value_identifier_path] node is the canonical shape ([Mod.value]). + We also accept [member_expression] whose head ([primary_expression]) + surface text starts with an uppercase letter, to catch shapes like + [Pixi.Sound.register] which the grammar may model as nested member + accesses. *) +let rec body_has_module_access ~source n = + if n.ntype = "value_identifier_path" then true + else if n.ntype = "member_expression" then begin + match n.children with + | head :: _ -> + let text = slice ~source ~start:head.start ~stop:head.stop in + let text = String.trim text in + if text = "" then false + else + let c0 = text.[0] in + c0 >= 'A' && c0 <= 'Z' + | [] -> false + end + else + List.exists (body_has_module_access ~source) n.children + +(* Find the [pattern] and [body] children of a [let_binding] node. + Tree-sitter labels them with fields; we also fall back to positional + first-child / last-child for resilience against grammar updates that + might re-order fields. *) +let pattern_and_body ~lb = + let by_field name = + List.find_opt (fun c -> c.field = Some name) lb.children + in + let pattern = + match by_field "pattern" with + | Some n -> Some n + | None -> + (* fall back: first child *) + (match lb.children with [] -> None | n :: _ -> Some n) + in + let body = + match by_field "body" with + | Some n -> Some n + | None -> + (* fall back: last child *) + (match List.rev lb.children with [] -> None | n :: _ -> Some n) + in + pattern, body + +(* Walk with ancestor context. [ancestors] lists ancestor node types from + immediate parent outward; the head is the immediate parent. *) +let rec collect ~source ancestors acc node = + let acc = + if node.ntype = "let_declaration" then + let at_module_toplevel = + match ancestors with + | "source_file" :: _ -> true + | "block" :: parent :: _ when parent = "module_declaration" -> true + | _ -> false + in + if at_module_toplevel then + List.fold_left + (fun acc c -> + if c.ntype = "let_binding" then check_binding ~source acc c + else acc) + acc node.children + else acc + else acc + in + List.fold_left + (fun acc c -> collect ~source (node.ntype :: ancestors) acc c) + acc node.children + +and check_binding ~source acc lb = + match pattern_and_body ~lb with + | Some pat, Some body + when is_underscore_pattern ~source pat + && body_has_module_access ~source body -> + let excerpt = + truncate + (String.trim + (slice ~source ~start:lb.start ~stop:lb.stop)) + in + let finding : Scanner.finding = + { kind = Scanner.Side_effect_import + ; line = lb.start.row + 1 + ; excerpt + } + in + finding :: acc + | _ -> acc + +(* ---- public entry point --------------------------------------------------- *) + +let scan ~grammar_dir ~path ~source = + match run_tree_sitter ~grammar_dir ~path with + | Error msg -> failwith ("res-to-affine walker: " ^ msg) + | Ok output -> + let root = + try parse_sexp output with Parse_error m -> + failwith ("res-to-affine walker: s-exp parse failed — " ^ m) + in + let findings = collect ~source [] [] root in + (* Findings accumulated in reverse order; emitter expects + source order (line ascending). *) + List.sort + (fun (a : Scanner.finding) (b : Scanner.finding) -> compare a.line b.line) + findings diff --git a/tools/res-to-affine/walker.mli b/tools/res-to-affine/walker.mli new file mode 100644 index 0000000..17fc6af --- /dev/null +++ b/tools/res-to-affine/walker.mli @@ -0,0 +1,41 @@ +(* SPDX-License-Identifier: MPL-2.0 *) +(* SPDX-FileCopyrightText: 2026 Jonathan D.A. Jewell *) + +(** Phase-2 AST walker for ReScript anti-patterns. + + Replaces the Phase-1 [Scanner] line-regex pipeline with a real + tree-sitter-driven walker over the vendored + [editors/tree-sitter-rescript/] grammar. The two pipelines share + [Scanner.kind] and [Scanner.finding] so the emitter is unchanged. + + Phase 2b ports a single anti-pattern: [Side_effect_import]. Phase + 2c ports the remaining three. The walker's discovery of an + anti-pattern is strictly stronger than the regex's: it requires + the [let _ = Mod.value] shape to live at *module top level* (the + actual anti-pattern's structural shape), not just to match a + column-0 prefix on any line. This eliminates the [let _ = + chained.call()] false-positive class that the Phase-1 regex + band-aided in #319. *) + +val scan : + grammar_dir:string -> + path:string -> + source:string -> + Scanner.finding list +(** [scan ~grammar_dir ~path ~source] invokes [tree-sitter parse] on + [path] using the generated parser at [grammar_dir/src/parser.c] + (the directory produced by [editors/tree-sitter-rescript/scripts/ + install.sh]), walks the resulting AST, and returns + [Scanner.finding]s for every detected anti-pattern. + + Raises [Failure] if the [tree-sitter] CLI is missing, the parse + fails, or the output cannot be parsed. The caller is responsible + for catching this and falling back / surfacing a user error. + + [source] is used only to slice excerpt strings out for findings; + it must be the same content the file at [path] holds. *) + +val default_grammar_dir : string +(** Default location of the generated grammar relative to the current + working directory: ["tools/vendor/tree-sitter-rescript"]. Matches + the output path of [editors/tree-sitter-rescript/scripts/install.sh]. *)