From a64082aaaf9054d527f7539d2e8172d9517fad04 Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 21:19:05 +0800
Subject: [PATCH 1/6] Generative testing: grammar-derived inputs +
 by-construction consistency (#25)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Walk the shared combinator IR to emit guaranteed-legal inputs for any Monogram
grammar, replacing corpus sampling with systematic, bounded coverage — the lever
a normal highlighter lacks (the source IS a grammar). Two by-construction judges,
no external oracle:

- round-trip: every generated derivation parses as the rule it was rooted at
  (parser self-consistency); the structured strategies are ~88% legal, fuzz is
  exploratory (random choices wander outside the IR's context constraints).
- scope ≡ role: the flat highlighter's scope at each parsed token must agree with
  the token's by-construction role (the scope the grammar declares). Where they
  disagree is the #23/#24 class — a value-leading `---` the parser keeps a plain
  scalar but a flat grammar mis-scopes as a marker; an inner sequence `-` the
  parser knows is an indicator but a flat grammar folds into a string. Floor-blind
  (compares the punctuation class directly), so a `-` painted string is caught.

The check independently re-surfaces both: a directed-nesting derivation produces
`- - x\n  - x` (#24); the anchored-marker scan catches a value-leading marker
misfire (#23). Verified by reverting each fix — the gate fires — and depth-site
coverage is asserted so generation can't silently stop exercising them.

Test-suite cleanup alongside:
- delete 9 dev-only scratch / superseded probes (each confirmed not a CI gate).
- fold the per-language scope-gap + src-coverage adapters into two data-driven
  drivers (scope-gap-run.ts / src-coverage-run.ts) + a config table, the
  per-language entry preserved as a <lang> parameter. Output byte-identical to
  the old adapters; coverage-table.ts and package.json rewired. The thicker html
  / yaml / vue adapters keep their files and are delegated to.

Adds: grammar-gen.ts (the walker), generative.ts (the judges), curated-corpora.ts.
CI runs node test/generative.ts.
---
 .github/workflows/ci.yml                      |   1 +
 package.json                                  |  27 +-
 test/bench-vs-ts-agg.ts                       |  19 -
 test/bench-vs-ts.ts                           |  38 -
 test/classify-ts.ts                           |  93 ---
 test/coverage-table.ts                        |  38 +-
 test/{scope-gap-jsx.ts => curated-corpora.ts} |  51 +-
 test/diag.ts                                  |  13 -
 test/generative.ts                            | 324 ++++++++
 test/grammar-gen.ts                           | 694 ++++++++++++++++++
 test/parser-gap.ts                            | 254 -------
 test/prof.ts                                  |  10 -
 test/scope-gap-html.ts                        |  48 --
 test/scope-gap-js.ts                          |  30 -
 test/scope-gap-run.ts                         | 130 ++++
 test/scope-gap-ts.ts                          |  39 -
 test/scope-gap-tsx.ts                         |  28 -
 test/scope-gap-yaml.ts                        |  68 --
 test/src-coverage-js.ts                       |  27 -
 test/src-coverage-jsx.ts                      |  60 --
 test/src-coverage-run.ts                      |  52 ++
 test/src-coverage-ts.ts                       |  27 -
 test/src-coverage-tsx.ts                      |  20 -
 test/ts-ast.ts                                |   9 -
 test/yaml-diag.ts                             |  40 -
 test/yaml-poc.ts                              |  33 -
 26 files changed, 1260 insertions(+), 913 deletions(-)
 delete mode 100644 test/bench-vs-ts-agg.ts
 delete mode 100644 test/bench-vs-ts.ts
 delete mode 100644 test/classify-ts.ts
 rename test/{scope-gap-jsx.ts => curated-corpora.ts} (55%)
 delete mode 100644 test/diag.ts
 create mode 100644 test/generative.ts
 create mode 100644 test/grammar-gen.ts
 delete mode 100644 test/parser-gap.ts
 delete mode 100644 test/prof.ts
 delete mode 100644 test/scope-gap-html.ts
 delete mode 100644 test/scope-gap-js.ts
 create mode 100644 test/scope-gap-run.ts
 delete mode 100644 test/scope-gap-ts.ts
 delete mode 100644 test/scope-gap-tsx.ts
 delete mode 100644 test/scope-gap-yaml.ts
 delete mode 100644 test/src-coverage-js.ts
 delete mode 100644 test/src-coverage-jsx.ts
 create mode 100644 test/src-coverage-run.ts
 delete mode 100644 test/src-coverage-ts.ts
 delete mode 100644 test/src-coverage-tsx.ts
 delete mode 100644 test/ts-ast.ts
 delete mode 100644 test/yaml-diag.ts
 delete mode 100644 test/yaml-poc.ts
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4eaafd0..4962098 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -57,6 +57,7 @@ jobs:
           node test/vue-interp-expr.ts
           node test/yaml-issue12-regressions.ts
           node test/yaml-depth-witnesses.ts
+          node test/generative.ts
 
   # The derived tree-sitter highlighter is the strongest thesis proof (a real GLR
   # parser from the same grammar, beating the official hand-written one). Build its
diff --git a/package.json b/package.json
index 3491dd9..b3937f7 100644
--- a/package.json
+++ b/package.json
@@ -5,6 +5,7 @@
   "scripts": {
     "gen": "node src/cli.ts typescript.ts && node src/cli.ts javascript.ts && node src/cli.ts typescriptreact.ts && node src/cli.ts javascriptreact.ts && node src/cli.ts html.ts && node src/cli.ts vue.ts && node src/cli.ts yaml.ts",
     "test": "node test/sanity-check.ts",
+    "generative": "node test/generative.ts",
     "conformance": "node test/run-conformance.ts",
     "conformance:js": "node test/js-conformance.ts",
     "conformance:tsx": "node test/tsx-conformance.ts",
@@ -29,19 +30,19 @@
     "bench:perf": "node test/perf-bench.ts",
     "coverage": "node test/scope-coverage.ts",
     "compat": "node test/repo-compat.ts",
-    "src-coverage:ts": "node test/src-coverage-ts.ts",
-    "src-coverage:js": "node test/src-coverage-js.ts",
-    "src-coverage:jsx": "node test/src-coverage-jsx.ts",
-    "src-coverage:tsx": "node test/src-coverage-tsx.ts",
-    "src-coverage:html": "node test/src-coverage-html.ts",
-    "src-coverage:yaml": "node test/src-coverage-yaml.ts",
-    "scope-gap:ts": "node test/scope-gap-ts.ts",
-    "scope-gap:js": "node test/scope-gap-js.ts",
-    "scope-gap:jsx": "node test/scope-gap-jsx.ts",
-    "scope-gap:tsx": "node test/scope-gap-tsx.ts",
-    "scope-gap:html": "node test/scope-gap-html.ts",
-    "scope-gap:yaml": "node test/scope-gap-yaml.ts",
-    "scope-gap:vue": "node test/scope-gap-vue.ts",
+    "src-coverage:ts": "node test/src-coverage-run.ts ts",
+    "src-coverage:js": "node test/src-coverage-run.ts js",
+    "src-coverage:jsx": "node test/src-coverage-run.ts jsx",
+    "src-coverage:tsx": "node test/src-coverage-run.ts tsx",
+    "src-coverage:html": "node test/src-coverage-run.ts html",
+    "src-coverage:yaml": "node test/src-coverage-run.ts yaml",
+    "scope-gap:ts": "node test/scope-gap-run.ts ts",
+    "scope-gap:js": "node test/scope-gap-run.ts js",
+    "scope-gap:jsx": "node test/scope-gap-run.ts jsx",
+    "scope-gap:tsx": "node test/scope-gap-run.ts tsx",
+    "scope-gap:html": "node test/scope-gap-run.ts html",
+    "scope-gap:yaml": "node test/scope-gap-run.ts yaml",
+    "scope-gap:vue": "node test/scope-gap-run.ts vue",
     "coverage:table": "node test/coverage-table.ts --write"
   },
   "devDependencies": {
diff --git a/test/bench-vs-ts-agg.ts b/test/bench-vs-ts-agg.ts
deleted file mode 100644
index a9a42ad..0000000
--- a/test/bench-vs-ts-agg.ts
+++ /dev/null
@@ -1,19 +0,0 @@
-import { createParser } from '../src/gen-parser.ts';
-import { readdir } from 'fs/promises';
-import { readFileSync } from 'fs';
-import { join } from 'path';
-import ts from 'typescript';
-const grammar = (await import('../typescript.ts')).default;
-const { parse } = createParser(grammar);
-const base = '/tmp/ts-repo/tests/cases/conformance';
-async function all(d: string): Promise<string[]> { const o:string[]=[]; for(const e of await readdir(d,{withFileTypes:true})){const f=join(d,e.name); if(e.isDirectory())o.push(...await all(f)); else if(e.name.endsWith('.ts')&&!e.name.endsWith('.d.ts'))o.push(f);} return o; }
-const files = (await all(base)).map(f => readFileSync(f,'utf-8'));
-const totalKB = files.reduce((s,c)=>s+c.length,0)/1024;
-// warm up
-for(const c of files.slice(0,200)){ try{parse(c);}catch{} ts.createSourceFile('t.ts',c,ts.ScriptTarget.Latest,false,ts.ScriptKind.TS); }
-let t0=process.hrtime.bigint(); for(const c of files){ try{parse(c);}catch{} } const ours=Number(process.hrtime.bigint()-t0)/1e6;
-t0=process.hrtime.bigint(); for(const c of files){ ts.createSourceFile('t.ts',c,ts.ScriptTarget.Latest,false,ts.ScriptKind.TS); } const tsms=Number(process.hrtime.bigint()-t0)/1e6;
-console.log(`${files.length} files, ${totalKB.toFixed(0)} KB total`);
-console.log(`  ours: ${ours.toFixed(0)} ms  (${(totalKB/1024/(ours/1000)).toFixed(1)} MB/s)`);
-console.log(`  ts:   ${tsms.toFixed(0)} ms  (${(totalKB/1024/(tsms/1000)).toFixed(1)} MB/s)`);
-console.log(`  ours/ts: ×${(ours/tsms).toFixed(1)}`);
diff --git a/test/bench-vs-ts.ts b/test/bench-vs-ts.ts
deleted file mode 100644
index c15d4ea..0000000
--- a/test/bench-vs-ts.ts
+++ /dev/null
@@ -1,38 +0,0 @@
-// Compare our grammar-driven parser against TypeScript's own parser (ts.createSourceFile)
-// on the same inputs. Both do a full from-scratch parse (no incremental reuse).
-import { createParser } from '../src/gen-parser.ts';
-import { readFileSync } from 'fs';
-import ts from 'typescript';
-
-const grammar = (await import('../typescript.ts')).default;
-const { parse } = createParser(grammar);
-
-const tsParse = (code: string) =>
-  ts.createSourceFile('t.ts', code, ts.ScriptTarget.Latest, /*setParentNodes*/ false, ts.ScriptKind.TS);
-
-function timeIt(fn: () => void, iters: number): number {
-  for (let i = 0; i < 3; i++) fn();                  // warm up
-  const start = process.hrtime.bigint();
-  for (let i = 0; i < iters; i++) fn();
-  return Number(process.hrtime.bigint() - start) / 1e6 / iters;  // ms/parse
-}
-
-const files = [
-  ['parserharness.ts',        '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts'],
-  ['fixSignatureCaching.ts',  '/tmp/ts-repo/tests/cases/conformance/fixSignatureCaching.ts'],
-  ['parserRealSource7.ts',    '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/parserRealSource7.ts'],
-  ['parserindenter.ts',       '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserindenter.ts'],
-];
-
-console.log('file                         KB    ours(ms)   ts(ms)   ours/ts');
-for (const [name, path] of files) {
-  const code = readFileSync(path, 'utf-8');
-  const kb = (code.length / 1024).toFixed(0);
-  const ours = timeIt(() => { try { parse(code); } catch {} }, 30);
-  const tsms = timeIt(() => { tsParse(code); }, 30);
-  console.log(
-    name.padEnd(28) + kb.padStart(4) +
-    ours.toFixed(1).padStart(11) + tsms.toFixed(2).padStart(9) +
-    ('×' + (ours / tsms).toFixed(1)).padStart(10),
-  );
-}
diff --git a/test/classify-ts.ts b/test/classify-ts.ts
deleted file mode 100644
index 348a8de..0000000
--- a/test/classify-ts.ts
+++ /dev/null
@@ -1,93 +0,0 @@
-import { createParser } from '../src/gen-parser.ts';
-import { readdir, writeFile } from 'fs/promises';
-import { readFileSync } from 'fs';
-import { join } from 'path';
-import ts from 'typescript';
-
-const grammar = (await import('../typescript.ts')).default;
-const { parse } = createParser(grammar);
-const baseDir = '/tmp/ts-repo/tests/cases/conformance';
-
-async function getAllTsFiles(dir: string): Promise<string[]> {
-  const files: string[] = [];
-  for (const entry of await readdir(dir, { withFileTypes: true })) {
-    const full = join(dir, entry.name);
-    if (entry.isDirectory()) files.push(...await getAllTsFiles(full));
-    else if (entry.name.endsWith('.ts') && !entry.name.endsWith('.d.ts')) files.push(full);
-  }
-  return files;
-}
-
-// Count syntactic parse diagnostics for a chunk of TS source.
-function syntaxErrors(text: string, name = 't.ts'): number {
-  const sf = ts.createSourceFile(name, text, ts.ScriptTarget.Latest, true, ts.ScriptKind.TS);
-  return (sf as any).parseDiagnostics?.length ?? 0;
-}
-
-// Split TS conformance file by `// @filename:` directives.
-function splitMultiFile(text: string): string[] {
-  if (!/^\s*\/\/\s*@filename:/im.test(text)) return [text];
-  const parts: string[] = [];
-  const re = /^\s*\/\/\s*@filename:.*$/gim;
-  let last = 0, m: RegExpExecArray | null, started = false;
-  const idxs: number[] = [];
-  while ((m = re.exec(text))) idxs.push(m.index);
-  if (idxs.length === 0) return [text];
-  // preamble before first @filename (global directives) — ignore as its own chunk
-  for (let i = 0; i < idxs.length; i++) {
-    const start = idxs[i];
-    const end = i + 1 < idxs.length ? idxs[i + 1] : text.length;
-    parts.push(text.slice(start, end));
-  }
-  return parts;
-}
-
-const files = await getAllTsFiles(baseDir);
-files.sort();
-
-interface Row { file: string; ourMsg: string; tsWhole: number; tsParts: number; multi: boolean; }
-const rows: Row[] = [];
-
-for (const file of files) {
-  const code = readFileSync(file, 'utf-8');
-  let ourFail = false, ourMsg = '';
-  try { parse(code); } catch (e: any) { ourFail = true; ourMsg = e.message.replace(/\s*\[farthest.*/, ''); }
-  if (!ourFail) continue;
-
-  const path = file.replace(baseDir + '/', '');
-  const tsWhole = syntaxErrors(code);
-  const parts = splitMultiFile(code);
-  const multi = parts.length > 1;
-  const tsParts = multi ? parts.reduce((a, p) => a + syntaxErrors(p), 0) : tsWhole;
-  rows.push({ file: path, ourMsg, tsWhole, tsParts, multi });
-}
-
-// Categories:
-//  REAL: TS reports 0 syntax errors (on parts if multi, else whole) -> we should parse
-//  MULTI: multi-file, parts clean but whole dirty (concatenation issue, structural)
-//  ERRORTEST: TS reports syntax errors -> intentional
-const real = rows.filter(r => !r.multi && r.tsWhole === 0);
-const multiClean = rows.filter(r => r.multi && r.tsParts === 0);
-const multiDirty = rows.filter(r => r.multi && r.tsParts > 0);
-const errorTest = rows.filter(r => !r.multi && r.tsWhole > 0);
-
-const out: string[] = [];
-out.push(`Total our failures: ${rows.length}`);
-out.push(`REAL (TS clean, single-file)         : ${real.length}`);
-out.push(`MULTI-CLEAN (parts clean, concat fails): ${multiClean.length}`);
-out.push(`MULTI-DIRTY (multi-file w/ syntax err) : ${multiDirty.length}`);
-out.push(`ERROR-TEST (TS reports syntax error)   : ${errorTest.length}`);
-out.push('');
-out.push('===== REAL (should fix) =====');
-for (const r of real) out.push(`  ${r.file}\n      ${r.ourMsg}`);
-out.push('');
-out.push('===== MULTI-CLEAN (structural, @filename concat) =====');
-for (const r of multiClean) out.push(`  ${r.file}\n      ${r.ourMsg}`);
-out.push('');
-out.push('===== MULTI-DIRTY (has intentional errors in some part) =====');
-for (const r of multiDirty) out.push(`  ${r.file} (tsParts=${r.tsParts})`);
-
-const text = out.join('\n');
-await writeFile('/tmp/classify.txt', text);
-console.log(text.split('\n').slice(0, 6).join('\n'));
-console.log('\nFull report: /tmp/classify.txt');
diff --git a/test/coverage-table.ts b/test/coverage-table.ts
index 5484881..eb61dbb 100644
--- a/test/coverage-table.ts
+++ b/test/coverage-table.ts
@@ -18,27 +18,29 @@ function runAdapter(script: string, args: string[], marker: string, env?: NodeJS
   } catch { return null; }
 }
 
-// TS/JS use deterministic stride subsets for speed; the rest run their full corpus.
+// Both metrics now run through ONE data-driven driver each, parameterised by the `<lang>` code
+// (test/scope-gap-run.ts, test/src-coverage-run.ts). TS/JS use deterministic stride subsets for
+// speed; the rest run their full corpus.
 const COV = [
-  { lang: 'TypeScript', script: 'test/src-coverage-ts.ts', args: ['1500'] },
-  { lang: 'JavaScript', script: 'test/src-coverage-js.ts', args: ['800'] },
-  { lang: 'JSX', script: 'test/src-coverage-jsx.ts', args: [] },
-  { lang: 'TSX', script: 'test/src-coverage-tsx.ts', args: [] },
-  { lang: 'HTML', script: 'test/src-coverage-html.ts', args: [] },
-  { lang: 'YAML', script: 'test/src-coverage-yaml.ts', args: [] },
+  { lang: 'TypeScript', script: 'test/src-coverage-run.ts', args: ['ts', '1500'] },
+  { lang: 'JavaScript', script: 'test/src-coverage-run.ts', args: ['js', '800'] },
+  { lang: 'JSX', script: 'test/src-coverage-run.ts', args: ['jsx'] },
+  { lang: 'TSX', script: 'test/src-coverage-run.ts', args: ['tsx'] },
+  { lang: 'HTML', script: 'test/src-coverage-run.ts', args: ['html'] },
+  { lang: 'YAML', script: 'test/src-coverage-run.ts', args: ['yaml'] },
 ];
-// The 4 TS-family scope-gap adapters all read ONE shared env var (MONOGRAM_OFFICIAL_TM) for
-// the official grammar, so each needs its OWN grammar mapped in (CI sets MONOGRAM_OFFICIAL_TS/
-// TSX/JS/JSX). html/yaml read their own var (MONOGRAM_OFFICIAL_HTML/_YAML), inherited as-is;
-// vue is vendored. Absent (local, no env) → each adapter's VS Code-install fallback path.
+// The 4 TS-family scope-gap entries all read ONE shared env var (MONOGRAM_OFFICIAL_TM) for the
+// official grammar, so each needs its OWN grammar mapped in (CI sets MONOGRAM_OFFICIAL_TS/TSX/JS/JSX).
+// html/yaml read their own var (MONOGRAM_OFFICIAL_HTML/_YAML), inherited as-is; vue is vendored.
+// Absent (local, no env) → the driver's VS Code-install fallback path.
 const GAP = [
-  { lang: 'TypeScript', script: 'test/scope-gap-ts.ts', args: ['800'], officialEnv: 'MONOGRAM_OFFICIAL_TS' },
-  { lang: 'JavaScript', script: 'test/scope-gap-js.ts', args: ['800'], officialEnv: 'MONOGRAM_OFFICIAL_JS' },
-  { lang: 'JSX', script: 'test/scope-gap-jsx.ts', args: [], officialEnv: 'MONOGRAM_OFFICIAL_JSX' },
-  { lang: 'TSX', script: 'test/scope-gap-tsx.ts', args: [], officialEnv: 'MONOGRAM_OFFICIAL_TSX' },
-  { lang: 'HTML', script: 'test/scope-gap-html.ts', args: [] },
-  { lang: 'YAML', script: 'test/scope-gap-yaml.ts', args: [] },
-  { lang: 'Vue', script: 'test/scope-gap-vue.ts', args: [] },
+  { lang: 'TypeScript', script: 'test/scope-gap-run.ts', args: ['ts', '800'], officialEnv: 'MONOGRAM_OFFICIAL_TS' },
+  { lang: 'JavaScript', script: 'test/scope-gap-run.ts', args: ['js', '800'], officialEnv: 'MONOGRAM_OFFICIAL_JS' },
+  { lang: 'JSX', script: 'test/scope-gap-run.ts', args: ['jsx'], officialEnv: 'MONOGRAM_OFFICIAL_JSX' },
+  { lang: 'TSX', script: 'test/scope-gap-run.ts', args: ['tsx'], officialEnv: 'MONOGRAM_OFFICIAL_TSX' },
+  { lang: 'HTML', script: 'test/scope-gap-run.ts', args: ['html'] },
+  { lang: 'YAML', script: 'test/scope-gap-run.ts', args: ['yaml'] },
+  { lang: 'Vue', script: 'test/scope-gap-run.ts', args: ['vue'] },
 ] as { lang: string; script: string; args: string[]; officialEnv?: string }[];
 
 const pct = (v: number | null | undefined) => (v == null ? '—' : v.toFixed(1) + '%');
diff --git a/test/scope-gap-jsx.ts b/test/curated-corpora.ts
similarity index 55%
rename from test/scope-gap-jsx.ts
rename to test/curated-corpora.ts
index 9a0a8bf..136c3e8 100644
--- a/test/scope-gap-jsx.ts
+++ b/test/curated-corpora.ts
@@ -1,18 +1,7 @@
-// scope-gap-jsx.ts — JSX (.jsx) adapter for the unified scope-gap harness. Grades VS Code's
-// OFFICIAL JavaScriptReact.tmLanguage.json AND Monogram's javascriptreact.tmLanguage.json against
-// the parser oracle (oracle.ts with ScriptKind.JSX). Both grammars declare scopeName `source.js.jsx`.
-// Neither the TS suite nor Test262 ships a .jsx corpus, so this uses a CURATED set exercising both
-// halves (plain JS + JSX), copied verbatim from src-coverage-jsx.ts. It is small, so token counts
-// are low; a real .jsx corpus is a follow-up. Run (bare node): node test/scope-gap-jsx.ts
-import ts from 'typescript';
-import { run } from './scope-gap.ts';
-import { oracle } from './oracle.ts';
+// curated-corpora.ts — small hand-written corpora shared by the folded scope-gap / src-coverage drivers.
+// JSX (plain-JS + JSX halves) and realistic HTML — the languages with no public single-file corpus.
 
-const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM
-  ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/javascript/syntaxes/JavaScriptReact.tmLanguage.json';
-
-// No TS types — these are .jsx (JavaScript + JSX) only. Copied verbatim from src-coverage-jsx.ts.
-const JSX_CASES: string[] = [
+export const JSX_CASES: string[] = [
   // --- plain JS half ---
   'const x = 1, y = 2;',
   'function f(a, b = 1, ...rest) { return a + b + rest.length; }',
@@ -52,15 +41,25 @@ const JSX_CASES: string[] = [
   'const boolAttr = <button autofocus formNoValidate>ok</button>;',
 ];
 
-await run({
-  name: 'JavaScriptReact (.jsx)',
-  scopeName: 'source.js.jsx',
-  officialPath: OFFICIAL,
-  monogramPath: 'javascriptreact.tmLanguage.json',
-  loadCorpus: () => JSX_CASES.map((code, i) => ({ name: `<curated #${i}>`, text: code })),
-  roleOracle: (text) => oracle(text, ts.ScriptKind.JSX),
-  isGradable: (text) => {
-    const sf = ts.createSourceFile('c.jsx', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.JSX);
-    return (((sf as any).parseDiagnostics?.length ?? 0) === 0);
-  },
-});
+export const HTML_GENERAL: string[] = [
+  '<div class="container" id="main"><p>Hello <a href="/x">world</a>.</p></div>',
+  '<ul><li>one</li><li>two</li><li>three</li></ul>',
+  '<img src="a.png" alt="a picture" width="100" height="80">',
+  '<input type="text" name="q" placeholder="Search" disabled>',
+  '<button type="submit" class="btn btn-primary" data-id="42">Go</button>',
+  '<section><h1>Title</h1><p>Body with <strong>bold</strong> and <em>italic</em>.</p></section>',
+  '<nav><a href="/">Home</a> | <a href="/about">About</a></nav>',
+  '<form action="/submit" method="post"><label for="n">Name</label><input id="n"></form>',
+  '<table><thead><tr><th>A</th><th>B</th></tr></thead><tbody><tr><td>1</td><td>2</td></tr></tbody></table>',
+  '<!-- a comment --><div><!-- inline --><span>x</span></div>',
+  '<meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1">',
+  '<br><hr>',
+  '<select><option value="1">One</option><option value="2" selected>Two</option></select>',
+  '<video controls width="320"><source src="m.mp4" type="video/mp4"></video>',
+  '<article data-index=3 hidden><header>H</header><footer>F</footer></article>',
+  '<span class="a b c" title="x y z">text</span>',
+  '<div\n  class="multi-line"\n  id="tag"\n  data-x="1">body</div>',
+  '<a href="https://example.com?a=1&b=2" target="_blank" rel="noopener">link</a>',
+  '<label>Email <input type="email" required></label>',
+  '<figure><img src="p.jpg" alt="photo"><figcaption>cap</figcaption></figure>',
+];
diff --git a/test/diag.ts b/test/diag.ts
deleted file mode 100644
index 9fcd116..0000000
--- a/test/diag.ts
+++ /dev/null
@@ -1,13 +0,0 @@
-import { createParser } from '../src/gen-parser.ts';
-import { readFileSync } from 'fs';
-const grammar = (await import('../typescript.ts')).default;
-const { parse } = createParser(grammar);
-for (const f of process.argv.slice(2)) {
-  const code = readFileSync(f, 'utf-8');
-  try { parse(code); console.log(f.split('/').pop(), 'OK'); }
-  catch (e: any) {
-    console.log(f.split('/').pop(), '\n  ', e.message);
-    const m = e.message.match(/farthest: offset (\d+)/);
-    if (m) { const o = +m[1]; console.log('   CTX:', JSON.stringify(code.slice(Math.max(0, o - 70), o + 30))); }
-  }
-}
diff --git a/test/generative.ts b/test/generative.ts
new file mode 100644
index 0000000..02d964a
--- /dev/null
+++ b/test/generative.ts
@@ -0,0 +1,324 @@
+// ─────────────────────────────────────────────────────────────────────────────
+//  generative.ts — monogram#25 parts (b): the JUDGING harness over grammar-DERIVED
+//  inputs (test/grammar-gen.ts). Two by-construction consistency checks, no external
+//  oracle, for every Monogram grammar:
+//
+//   (2) ROUND-TRIP — every generated derivation parses (as the rule it was rooted at).
+//       Validates parser self-consistency: what the grammar's IR generates, the parser
+//       accepts. Reported per strategy; the structured strategies are the gate.
+//
+//   (3) SCOPE ≡ ROLE — the flat highlighter's scope at every parsed token must agree
+//       with the token's BY-CONSTRUCTION role (the scope the grammar DECLARES for it).
+//       The parser resolves context with its full stack (indent / column / markup
+//       depth); the flat TextMate grammar can only approximate it. Where they disagree
+//       is exactly the monogram#23/#24 class — a value-leading `---` the parser lexes
+//       as a plain scalar (string) but a flat grammar mis-scopes as a document marker;
+//       an inner sequence `-` the parser knows is an indicator but a flat grammar folds
+//       into a string. The check is FLOOR-BLIND (it compares the visual bucket directly,
+//       incl. punctuation) so a `-` mis-painted as string is caught — the exact blind
+//       spot that hid #24 from the role-graded scope-gap metric.
+//
+//  Coverage is grammar×bound, not a fixed corpus — so it surfaces the depth-bug CLASS
+//  without anyone naming the shape (the motivation for #25). The named regressions
+//  (yaml-depth-witnesses.ts, *-issue-cases.ts) stay — generation replaces their
+//  DISCOVERY function, not their value as documented gates.
+//
+//  Run (bare node):  node test/generative.ts            # all languages
+//                    node test/generative.ts yaml       # one language
+// ─────────────────────────────────────────────────────────────────────────────
+import { readFileSync, existsSync } from 'node:fs';
+import { createRequire } from 'node:module';
+import vsctm from 'vscode-textmate';
+import onig from 'vscode-oniguruma';
+import { createParser, type CstNode, type CstChild } from '../src/gen-parser.ts';
+import type { CstGrammar, TokenPattern } from '../src/types.ts';
+import { normScope } from './scope-roles.ts';
+import { generateInputs, type GenInput } from './grammar-gen.ts';
+
+// ── language registry: every per-language fact (grammar module, scope, flat grammar file,
+//    any multi-file sub-grammars) is DATA — the harness body is language-agnostic. ──
+interface LangCfg {
+  name: string;
+  module: string;          // grammar module to import (default export = CstGrammar)
+  scopeName: string;       // TextMate scope, e.g. source.yaml
+  tmPath: string;          // the derived flat .tmLanguage.json
+  tmExtra?: Record<string, string>;  // extra scopeName → file for multi-file grammars
+  gen?: Parameters<typeof generateInputs>[1];   // generation knobs override
+  // Depth-site CLASSES the generated legal corpus MUST contain — the shapes whose correct scope
+  // depends on cross-line parser state, so the scope≡role gate provably covers monogram#23/#24. The
+  // gate FAILS if generation stops producing them (a coverage regression). Asserted per shape.
+  mustCover?: { name: string; re: RegExp }[];
+}
+const LANGS: LangCfg[] = [
+  { name: 'yaml', module: '../yaml.ts', scopeName: 'source.yaml', tmPath: 'yaml.tmLanguage.json',
+    mustCover: [
+      // #24: a nested compact block sequence with an inner sibling (`- - x\n  - x`) — the inner `-`'s
+      // role (indicator vs plain-fold) depends on the indent stack a flat grammar lacks.
+      { name: '#24 nested-compact-sequence', re: /- - \S.*\n\s+- /m },
+      // #23: a value-leading document-marker (`k: --- x`, `- --- x`) — string content, NOT a marker,
+      // a position the flat grammar's `^`-retried marker pattern can mis-fire on.
+      { name: '#23 value-leading-marker', re: /(?::|-) +(?:---|\.\.\.)(?:\s|$)/ },
+    ] },
+  { name: 'typescript', module: '../typescript.ts', scopeName: 'source.ts', tmPath: 'typescript.tmLanguage.json' },
+  { name: 'javascript', module: '../javascript.ts', scopeName: 'source.js', tmPath: 'javascript.tmLanguage.json' },
+  { name: 'typescriptreact', module: '../typescriptreact.ts', scopeName: 'source.tsx', tmPath: 'typescriptreact.tmLanguage.json' },
+  { name: 'javascriptreact', module: '../javascriptreact.ts', scopeName: 'source.js.jsx', tmPath: 'javascriptreact.tmLanguage.json' },
+  // HTML/Vue embed source.js/ts/tsx (script blocks, on* handlers); provide them so embedded regions
+  // tokenize instead of erroring. The consistency check reads the host markup tokens regardless.
+  { name: 'html', module: '../html.ts', scopeName: 'text.html.basic', tmPath: 'html.tmLanguage.json',
+    tmExtra: { 'source.js': 'javascript.tmLanguage.json', 'source.css': 'html.tmLanguage.json' } },
+  { name: 'vue', module: '../vue.ts', scopeName: 'text.html.vue', tmPath: 'vue.tmLanguage.json',
+    tmExtra: { 'text.html.basic': 'html.tmLanguage.json', 'source.js': 'javascript.tmLanguage.json', 'source.ts': 'typescript.tmLanguage.json', 'source.tsx': 'typescriptreact.tmLanguage.json' } },
+];
+
+// ── vscode-textmate tokenizer (one shared WASM load) ─────────────────────────────────────────────
+const { INITIAL, Registry, parseRawGrammar } = vsctm;
+const { loadWASM, OnigScanner, OnigString } = onig;
+const require = createRequire(import.meta.url);
+const bin = readFileSync(require.resolve('vscode-oniguruma/release/onig.wasm'));
+await loadWASM(bin.buffer.slice(bin.byteOffset, bin.byteOffset + bin.byteLength));
+
+async function loadTm(scopeName: string, files: Record<string, string>) {
+  const cache: Record<string, string> = {};
+  const reg = new Registry({
+    onigLib: Promise.resolve({ createOnigScanner: (p: string[]) => new OnigScanner(p), createOnigString: (s: string) => new OnigString(s) }),
+    loadGrammar: async (sn: string) => { const p = files[sn]; if (!p) return null; const c = cache[sn] ?? (cache[sn] = readFileSync(p, 'utf8')); return parseRawGrammar(c, sn + '.json'); },
+  });
+  return reg.loadGrammar(scopeName);
+}
+interface TmTok { start: number; end: number; scopes: string[] }
+function tmTokenize(grammar: vsctm.IGrammar, text: string): TmTok[] {
+  const toks: TmTok[] = []; let rs = INITIAL, off = 0;
+  for (const line of text.split('\n')) { const r = grammar.tokenizeLine(line, rs); for (const t of r.tokens) toks.push({ start: off + t.startIndex, end: off + t.endIndex, scopes: t.scopes }); rs = r.ruleStack; off += line.length + 1; }
+  return toks;
+}
+function scopeAt(toks: TmTok[], pos: number): string[] {
+  let lo = 0, hi = toks.length - 1, ans = -1;
+  while (lo <= hi) { const mid = (lo + hi) >> 1; if (toks[mid].start <= pos) { ans = mid; lo = mid + 1; } else hi = mid - 1; }
+  return ans >= 0 && toks[ans].end > pos ? toks[ans].scopes : [];
+}
+const innerOf = (s: string[]): string => (s.length ? s[s.length - 1] : '(none)');
+
+// ── visual bucket of a scope chain — the level at which a highlight difference is actually visible.
+//    Same partition the scope-gap differential pass uses; the consistency check compares buckets so a
+//    `-` painted as string (punct≠string) is caught even though punctuation is a lexical-floor role. ──
+type Bucket = 'invalid' | 'comment' | 'string' | 'number' | 'keyword' | 'name' | 'punct' | 'none';
+const DISTINCT = new Set<Bucket>(['invalid', 'comment', 'string', 'number', 'keyword']);
+function scopeBucket(chain: string[]): Bucket {
+  for (let i = chain.length - 1; i >= 0; i--) {
+    const s = normScope(chain[i]);
+    if (/^invalid/.test(s)) return 'invalid';
+    if (/^comment/.test(s)) return 'comment';
+    if (/^constant\.numeric/.test(s)) return 'number';
+    if (/^(string|constant\.character|constant\.other\.symbol)/.test(s)) return 'string';
+    if (/^(keyword|storage|constant\.language|support\.constant|variable\.language)/.test(s)) return 'keyword';
+    if (/^(entity|variable|support|constant)/.test(s)) return 'name';
+    if (/^punctuation/.test(s)) return 'punct';
+  }
+  return 'none';
+}
+// every visual bucket a scope CHAIN spans (a YAML number is `string.unquoted constant.numeric` →
+// {string, number} — both are legitimate, since the same token folds to a multi-line string).
+function chainBuckets(scope: string): Set<Bucket> {
+  const out = new Set<Bucket>();
+  for (const seg of scope.split(/\s+/)) if (seg) out.add(scopeBucket([seg]));
+  return out;
+}
+const CONTENT = new Set<Bucket>(['string', 'comment', 'number']);   // a STRUCTURAL literal is never one of these
+
+// ── by-construction expected role of a parsed leaf, from the grammar ALONE ──────────────────────
+// A leaf's token TYPE → the bucket SET the grammar DECLARES for it: a named token → its `scope`
+// chain's buckets; a `$punct`/`$keyword` literal → any `scopes` override, else punctuation / keyword.
+// `lit` marks a STRUCTURAL literal (`$punct`/`$keyword`) — one the parser placed as grammar structure,
+// so the highlighter painting it as CONTENT (string/comment/number) is always wrong (monogram#24).
+interface LeafRole { start: number; end: number; text: string; tokenType: string; expected: Set<Bucket>; lit: boolean }
+function buildRoleMap(grammar: CstGrammar): (leaf: { tokenType: string; text: string }) => { buckets: Set<Bucket>; lit: boolean } | null {
+  const tokScope = new Map<string, string | undefined>();
+  for (const t of grammar.tokens) tokScope.set(t.name, t.scope);
+  const skip = new Set<string>();
+  if (grammar.indent) { skip.add(grammar.indent.indentToken); skip.add(grammar.indent.dedentToken); skip.add(grammar.indent.newlineToken); }
+  if (grammar.newline) skip.add(grammar.newline.token);
+  const over = grammar.scopeOverrides;
+  return (leaf) => {
+    const ty = leaf.tokenType;
+    if (skip.has(ty)) return null;
+    if (ty === '$punct') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set<Bucket>(['punct']), lit: true }; }
+    if (ty === '$keyword') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set<Bucket>(['keyword']), lit: true }; }
+    if (ty.startsWith('$template')) return { buckets: new Set<Bucket>(['string']), lit: false };
+    if (tokScope.has(ty)) { const sc = tokScope.get(ty); return sc ? { buckets: chainBuckets(sc), lit: false } : null; }
+    return null;   // unscoped / contextual token (a bare identifier) → not checkable by-construction
+  };
+}
+function leafRoles(grammar: CstGrammar, cst: CstNode, roleOf: (l: { tokenType: string; text: string }) => { buckets: Set<Bucket>; lit: boolean } | null): LeafRole[] {
+  const out: LeafRole[] = [];
+  const walk = (n: CstChild) => {
+    if (n.kind === 'leaf') {
+      if (n.end <= n.offset) return;
+      const r = roleOf(n);
+      if (r) out.push({ start: n.offset, end: n.end, text: n.text, tokenType: n.tokenType, expected: r.buckets, lit: r.lit });
+    } else for (const c of n.children) walk(c);
+  };
+  walk(cst);
+  return out;
+}
+
+// Scopes that belong to a POSITION-ANCHORED token — one whose pattern contains a `start()` anchor
+// (e.g. YAML's DocStart/DocEnd `^---`/`^...`). Such a scope is the parser's signal "a marker AT a
+// line/stream position"; the flat highlighter, retrying the pattern at every token boundary, may
+// paint it on a token the parser placed elsewhere (a value-leading `---`, monogram#23). Map each
+// such scope → the set of token names allowed to carry it, so a mismatch is detectable generically.
+function anchoredScopes(grammar: CstGrammar): Map<string, Set<string>> {
+  const hasStart = (p: TokenPattern): boolean => {
+    if (typeof p === 'string') return false;
+    switch (p.type) {
+      case 'anchor': return p.kind === 'start';
+      case 'seq': case 'alt': return p.items.some(hasStart);
+      case 'repeat': case 'lookahead': case 'lookbehind': return hasStart(p.body);
+      default: return false;
+    }
+  };
+  const m = new Map<string, Set<string>>();
+  for (const t of grammar.tokens) if (t.scope && hasStart(t.pattern)) { const s = m.get(t.scope) ?? new Set(); s.add(t.name); m.set(t.scope, s); }
+  return m;
+}
+
+// ── the run ──────────────────────────────────────────────────────────────────────────────────────
+interface Violation { input: string; strategy: string; pos: number; text: string; tokenType: string; expected: string; got: Bucket; gotScope: string; kind: string }
+
+async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; violations: number; reason: string }> {
+  if (!existsSync(cfg.tmPath)) { console.log(`  [skip ${cfg.name}: ${cfg.tmPath} not found — run npm run gen]`); return { name: cfg.name, ok: true, violations: 0 }; }
+  const grammar = (await import(cfg.module)).default as CstGrammar;
+  const { parse } = createParser(grammar);
+  const tm = await loadTm(cfg.scopeName, { [cfg.scopeName]: cfg.tmPath, ...(cfg.tmExtra ?? {}) });
+  if (!tm) throw new Error(`failed to load ${cfg.tmPath}`);
+  const roleOf = buildRoleMap(grammar);
+  const anchored = anchoredScopes(grammar);
+
+  const inputs = generateInputs(grammar, cfg.gen ?? { depth: 5, nestDepth: 5, cap: 7, fuzzRounds: 250, maxInputs: 1500, seed: 5 });
+
+  // ── (2) round-trip: parse each input AS THE RULE it was rooted at ──
+  const byStrat = new Map<string, { ok: number; n: number }>();
+  const entryLegal: GenInput[] = [];
+  for (const inp of inputs) {
+    const k = inp.strategy.split(/[:@]/)[0];
+    const s = byStrat.get(k) ?? { ok: 0, n: 0 }; s.n++;
+    let rootOk = false;
+    try { parse(inp.text, inp.rule); rootOk = true; } catch { /* illegal derivation (IR over-permits vs the parser) */ }
+    if (rootOk) s.ok++;
+    byStrat.set(k, s);
+    // the consistency check needs FULL documents (highlighter tokenizes the whole text as the entry
+    // scope), so keep inputs that parse at the ENTRY rule.
+    try { parse(inp.text); entryLegal.push(inp); } catch { /* not a full document — skip for scope≡role */ }
+  }
+
+  // ── (3) scope ≡ role on the entry-legal inputs ──────────────────────────────────────────────────
+  // Two BY-CONSTRUCTION gates (each a flat-vs-stack disagreement that is unambiguously the
+  // highlighter's error), plus a lenient report-only differential for context refinements:
+  //   • gate-1 STRUCTURAL-LITERAL contradiction — a `$punct`/`$keyword` the parser placed as grammar
+  //     structure, painted as CONTENT (string/comment/number). A `-` indicator is never a string
+  //     (monogram#24). Floor-blind: it compares the punctuation class directly.
+  //   • gate-2 ANCHORED-MARKER misfire — a leaf painted with a position-anchored token's scope when
+  //     the parser did NOT place that token here (a value-leading `---` scoped document-marker,
+  //     monogram#23). The flat grammar retried the `^`-anchored pattern off-position.
+  // Leniency: a token is CONSISTENT when the highlighter paints ANY part of its span with a scope in
+  // the token's declared-chain bucket SET — so a quote-delimiter sub-scope (`"…"` opens punctuation)
+  // and a context fold (a number folded into a multi-line string) are NOT false-positives.
+  const violations: Violation[] = [];
+  let checkedTokens = 0;
+  const spanBuckets = (toks: TmTok[], text: string, start: number, end: number): Set<Bucket> => {
+    const s = new Set<Bucket>();
+    for (let p = start; p < end; p++) { const c = text.charCodeAt(p); if (c === 32 || c === 9) continue; s.add(scopeBucket(scopeAt(toks, p))); }
+    return s.size ? s : new Set<Bucket>(['none']);
+  };
+  for (const inp of entryLegal) {
+    let cst: CstNode, toks: TmTok[];
+    try { cst = parse(inp.text); toks = tmTokenize(tm, inp.text); } catch { continue; }
+    const leaves = leafRoles(grammar, cst, roleOf);
+    const leafCover = (pos: number) => leaves.find((l) => pos >= l.start && pos < l.end);
+    for (const lr of leaves) {
+      checkedTokens++;
+      const got = spanBuckets(toks, inp.text, lr.start, lr.end);
+      const overlap = [...lr.expected].some((b) => got.has(b));
+      if (overlap) continue;                                                  // highlighter painted the declared scope somewhere → consistent
+      // gate-1: a structural literal painted entirely as a content class
+      const contentGot = [...got].find((b) => CONTENT.has(b));
+      if (lr.lit && contentGot && violations.length < 200) {
+        violations.push({ input: inp.text, strategy: inp.strategy, pos: lr.start, text: lr.text, tokenType: lr.tokenType, expected: [...lr.expected].join('|') as any, got: contentGot, gotScope: innerOf(scopeAt(toks, lr.start)), kind: '#24 structural-literal→content' });
+      }
+    }
+    // gate-2: scan the highlighter's tokens for an anchored-marker scope on a leaf that is NOT that token
+    if (anchored.size) for (const t of toks) {
+      if (t.end <= t.start) continue;
+      const inner = innerOf(t.scopes);
+      const owners = anchored.get(inner.replace(/\.[a-z0-9]+$/, '')) ?? anchored.get(inner);
+      if (!owners) continue;
+      const leaf = leafCover(t.start);
+      if (leaf && !owners.has(leaf.tokenType) && violations.length < 200) {
+        violations.push({ input: inp.text, strategy: inp.strategy, pos: t.start, text: inp.text.slice(t.start, t.end), tokenType: leaf.tokenType, expected: [...owners].join('|') as any, got: 'name', gotScope: inner, kind: '#23 anchored-marker misfire' });
+      }
+    }
+  }
+
+  // ── report ──
+  const totalLegal = [...byStrat.values()].reduce((a, s) => a + s.ok, 0);
+  const totalN = [...byStrat.values()].reduce((a, s) => a + s.n, 0);
+  const structuredLegal = [...byStrat.entries()].filter(([k]) => k !== 'fuzz').reduce((a, [, s]) => a + s.ok, 0);
+  const structuredN = [...byStrat.entries()].filter(([k]) => k !== 'fuzz').reduce((a, [, s]) => a + s.n, 0);
+  const fuzzLegal = totalLegal - structuredLegal, fuzzN = totalN - structuredN;
+  const rate = (a: number, b: number) => b ? (100 * a / b).toFixed(0) + '%' : 'n/a';
+  console.log(`\n── ${cfg.name} ──  ${inputs.length} generated · ${entryLegal.length} full-document`);
+  // STRUCTURED is the by-construction round-trip guarantee (every derivation parses as its rule);
+  // FUZZ is exploratory (random choices wander outside the IR's context constraints → many illegal,
+  // which is expected and filtered) and is what surfaces divergences beyond the structured shapes.
+  console.log(`  round-trip (rule-rooted):  structured ${structuredLegal}/${structuredN} (${rate(structuredLegal, structuredN)} — the by-construction gate) · fuzz ${fuzzLegal}/${fuzzN} (exploratory)` + ['', ...[...byStrat.entries()].filter(([k]) => k !== 'fuzz').map(([k, s]) => `${k} ${s.ok}/${s.n}`)].join('  '));
+  // What GATES vs what is a report-only DISCOVERY:
+  //  • an ANCHORED-MARKER misfire (#23) ALWAYS gates — a position-anchored marker scope on a token the
+  //    parser placed elsewhere is unambiguously the flat grammar mis-firing the pattern off-position;
+  //    there is no legitimate "frontier limit" version of it.
+  //  • a STRUCTURAL-LITERAL→content divergence (#24) gates on the STRUCTURED strategies (canonical,
+  //    clean nested shapes — the by-construction guarantee: the dirnest `- - x\n  - x` reproduces #24),
+  //    but is report-only on gnarly FUZZ inputs, which legitimately reach STANDING flat-TM frontier
+  //    limits (a block plain scalar containing an unclosed flow indicator `[`/`{` — block-vs-flow
+  //    disambiguation that needs the indent/flow stack a flat grammar lacks). Those are not
+  //    regressions of a known-fixed shape, and #25 is the testing harness, not a fix for every limit.
+  const isGated = (v: Violation) => v.kind.startsWith('#23') || !v.strategy.startsWith('fuzz');
+  const gated = violations.filter(isGated);
+  const discovered = violations.filter((v) => !isGated(v));
+  console.log(`  scope≡role: ${checkedTokens} declared-scope tokens checked · ${gated.length} gated inconsistenc${gated.length === 1 ? 'y' : 'ies'} · ${discovered.length} discovered (fuzz frontier-limit, report-only)`);
+  const show = (vs: Violation[], tag: string) => {
+    const grouped = new Map<string, { v: Violation; n: number }>();
+    for (const v of vs) { const key = `${v.kind} ${v.tokenType}`; const e = grouped.get(key); if (e) e.n++; else grouped.set(key, { v, n: 1 }); }
+    for (const [key, { v, n }] of [...grouped.entries()].slice(0, 8)) console.log(`    ${tag} ${key} ×${n}  «${v.text.slice(0, 14).replace(/\n/g, '\\n')}» got «${v.gotScope}»  in ${JSON.stringify(v.input.slice(0, 40))}`);
+  };
+  if (gated.length) show(gated, '✗');
+  if (discovered.length) show(discovered, '·');
+
+  // depth-site COVERAGE: the generated legal corpus must contain each declared depth-bug class, so the
+  // scope≡role gate provably exercises monogram#23/#24 (not just happens to be clean on a fixed corpus).
+  const legalTexts = entryLegal.map((i) => i.text);
+  const missing = (cfg.mustCover ?? []).filter((m) => !legalTexts.some((t) => m.re.test(t)));
+  if (cfg.mustCover?.length) {
+    const covered = cfg.mustCover.length - missing.length;
+    console.log(`  depth-site coverage: ${covered}/${cfg.mustCover.length} classes present in the legal corpus` + (missing.length ? `  — MISSING: ${missing.map((m) => m.name).join(', ')}` : `  (${cfg.mustCover.map((m) => m.name).join(', ')})`));
+  }
+  // GATE: (a) the generator produced a real LEGAL corpus (a coverage floor — proves round-trip works:
+  // the grammar's IR generates inputs the parser accepts), and (b) ZERO scope≡role gated inconsistencies.
+  // The structured legal RATE is reported for visibility but not gated on a percentage — the generator
+  // legitimately over-produces (the IR over-permits vs the parser; markup materialisation is rough), and
+  // the validated corpus is the inputs that DO parse.
+  const enoughLegal = entryLegal.length >= 15;
+  const reason = gated.length ? `${gated.length} scope≡role` : !enoughLegal ? `only ${entryLegal.length} legal docs` : missing.length ? `missing ${missing.map((m) => m.name).join('/')}` : '';
+  return { name: cfg.name, ok: gated.length === 0 && enoughLegal && missing.length === 0, violations: gated.length, reason };
+}
+
+const only = process.argv[2];
+const targets = only ? LANGS.filter((l) => l.name === only || (only === 'tsfamily' && /script/.test(l.name))) : LANGS;
+if (!targets.length) { console.error(`unknown language: ${only}`); process.exit(1); }
+console.log('Generative consistency — grammar-derived inputs, by-construction round-trip + scope≡role');
+const results = [];
+for (const cfg of targets) results.push(await runLang(cfg));
+const bad = results.filter((r) => !r.ok);
+console.log(`\n${'='.repeat(70)}`);
+console.log(`  ${results.length - bad.length}/${results.length} languages consistent` + (bad.length ? `  — FAILED: ${bad.map((b) => `${b.name} (${b.reason})`).join(', ')}` : ''));
+if (bad.length) { console.error('\nGENERATIVE GATE FAILED — a scope≡role inconsistency (flat highlighter ≠ parser) or too small a legal corpus.'); process.exit(1); }
+console.log('\nDone.');
diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts
new file mode 100644
index 0000000..31eef55
--- /dev/null
+++ b/test/grammar-gen.ts
@@ -0,0 +1,694 @@
+// ─────────────────────────────────────────────────────────────────────────────
+//  grammar-gen.ts — a GENERIC, grammar-derived input GENERATOR (monogram#25 part 1).
+//
+//  The premise of the whole project is that the source IS a grammar: the same
+//  combinator object (`yaml.ts`, `typescript.ts`, …) the parser / highlighter /
+//  tree-sitter derive from is ALSO a generator. Walk its rule IR — `alt`=branch,
+//  `seq`=concat, `*`/`+`/`?`=repeat, `ref`=descend, token=sample — and it emits
+//  guaranteed-legal inputs. That replaces "hope the corpus contains the shape" (the
+//  blind spot that hid monogram#23/#24 from a corpus-bound metric) with systematic,
+//  bounded coverage derived from the grammar itself.
+//
+//  This file is the ENGINE; the judging (round-trip + scope≡role) lives in the
+//  drivers that import it (test/generative.ts). It is language-AGNOSTIC: every
+//  per-language fact (indent tokens, flow brackets, markup delimiters, compact
+//  indicators) is read from the grammar's own config (`grammar.indent` / `.markup`),
+//  never hardcoded — the same discipline the engines follow.
+//
+//  Three production strategies, all over the SAME walker:
+//   • bounded-exhaustive — every derivation to a small depth N (provably complete at
+//     small scope; this is what makes coverage `grammar × bound` instead of imagination).
+//   • self-recursive nesting — for each rule that can contain itself, the nested shape
+//     at depth 1..N. Deep self-embedding is exactly where a flat highlighter loses to
+//     the stack-keeping parser (monogram#24 is `BlockSequence` inside `BlockSequence`).
+//   • fuzzing — random production choices, for deeper / wider structures.
+// ─────────────────────────────────────────────────────────────────────────────
+import type { CstGrammar, RuleExpr, RuleDecl, TokenDecl, TokenPattern, TokenCharClassItem } from '../src/types.ts';
+
+// Max emissions in one derivation. A deep tree of 2-rep quantifiers grows the list multiplicatively;
+// copying huge lists (not the call count) is what makes a naive enumerator hang — cap it.
+const MAX_EMS = 220;
+
+// ── An EMISSION: the atomic unit the walker produces; the materializer renders it. ──
+export type Emission =
+  | { t: 'tok'; name: string; text: string }                 // a real lexer token (text sampled from its pattern)
+  | { t: 'lit'; value: string }                              // a grammar literal (keyword or punctuation)
+  | { t: 'struct'; kind: 'indent' | 'dedent' | 'newline' }   // indentation control (YAML indent mode)
+  | { t: 'compact' };                                        // marks an indent that the lexer would emit INLINE (YAML compact `- - a`)
+
+// A finished input: rendered text + the real tokens it should lex back to (round-trip witnesses).
+export interface GenInput {
+  text: string;
+  tokens: { start: number; end: number; name: string; text: string }[];
+  strategy: string;
+  rule: string;        // the top rule the derivation started from (entry, or a self-recursive rule)
+}
+
+// ── deterministic PRNG (Date.now/Math.random are unavailable in workflow scripts and make
+//    a generator unreproducible anyway — seed it). xorshift32. ──
+function rng(seed: number): () => number {
+  let s = seed | 0 || 1;
+  return () => { s ^= s << 13; s ^= s >>> 17; s ^= s << 5; return ((s >>> 0) % 1_000_000) / 1_000_000; };
+}
+
+// ─── TOKEN SAMPLING ──────────────────────────────────────────────────────────────
+// Produce a string that MATCHES a TokenPattern. Conservative by default (a short,
+// unambiguous lexeme) so the generated input round-trips; `interesting` injects
+// grammar-derived boundary literals (e.g. `---`, `#`, `-`) into free-form tokens so a
+// plain scalar can be sampled as `--- x` — legal as that token, but a shape that
+// stresses the flat highlighter's context guessing (monogram#23). Returns null when
+// the pattern can't be sampled (a `never()` placeholder — a structural token).
+interface SampleCtx { rand: () => number; interesting: string[]; variant: number }
+
+function pickNonExcluded(items: TokenCharClassItem[]): string | null {
+  // a char NOT in the negated class — try common, readable candidates in order
+  const cands = ['a', 'b', 'c', 'x', 'y', 'z', 'A', 'M', '1', '5', '_', '.', '@', '~'];
+  const inClass = (ch: string) => items.some((it) =>
+    it.type === 'char' ? it.value === ch : ch >= it.from && ch <= it.to);
+  for (const ch of cands) if (!inClass(ch)) return ch;
+  return null;
+}
+function firstOfClass(items: TokenCharClassItem[]): string | null {
+  for (const it of items) {
+    if (it.type === 'char') { if (it.value !== '\n' && it.value !== '\r') return it.value; }
+    else { const c = it.from; if (c !== '\n' && c !== '\r') return c; }
+  }
+  return null;
+}
+
+function sample(pat: TokenPattern, ctx: SampleCtx): string | null {
+  if (typeof pat === 'string') return pat;
+  switch (pat.type) {
+    case 'never': return null;                        // structural-token placeholder
+    case 'anyChar': return 'x';
+    case 'anchor': return '';
+    case 'lookahead': case 'lookbehind': return '';   // zero-width; context handled by the materializer's separators
+    case 'charClass': {
+      const ch = pat.negate ? pickNonExcluded(pat.items) : firstOfClass(pat.items);
+      return ch ?? 'x';
+    }
+    case 'seq': {
+      let out = '';
+      for (const it of pat.items) { const s = sample(it, ctx); if (s === null) return null; out += s; }
+      return out;
+    }
+    case 'alt': {
+      // bias toward branch 0 (usually the simplest); `variant` rotates for variety
+      const idx = pat.items.length ? ctx.variant % pat.items.length : 0;
+      for (let k = 0; k < pat.items.length; k++) {
+        const s = sample(pat.items[(idx + k) % pat.items.length], ctx);
+        if (s !== null) return s;
+      }
+      return null;
+    }
+    case 'repeat': {
+      const reps = pat.min === 0 ? (ctx.variant % 2 === 0 ? 1 : 0) : pat.min;   // 0/1 for *, min for +
+      let out = '';
+      for (let i = 0; i < Math.max(reps, pat.min); i++) { const s = sample(pat.body, ctx); if (s === null) return null; out += s; }
+      return out;
+    }
+  }
+}
+
+// Sample several distinct, legal texts for a token (variants + interesting-literal embeds).
+function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[] }, n: number): string[] {
+  const out = new Set<string>();
+  for (let v = 0; v < n + 2 && out.size < n; v++) {
+    const s = sample(decl.pattern, { ...ctx, variant: v });
+    if (s !== null && s.length > 0) out.add(s);
+  }
+  // a base sample to seed interesting-literal embeds
+  const base = sample(decl.pattern, { ...ctx, variant: 0 }) ?? '';
+  // Embed grammar-derived boundary literals into free-form (multi-char-capable) tokens, where
+  // the result is still a single legal instance of the token — this is what produces the
+  // monogram#23 shape (a plain scalar whose text is `--- x`). Verified per-token by re-lexing
+  // in the driver; an embed that doesn't re-lex to this token is simply dropped there.
+  if (base.length >= 1) {
+    for (const lit of ctx.interesting) {
+      if (lit.length === 0 || /[\n\r]/.test(lit)) continue;
+      out.add(lit + base);            // glued leading boundary (`---` + `x` → `---x`)
+      // a SPACE-separated form (`--- x`): a boundary literal that is line-structural only with a
+      // trailing space (a doc marker `---␣`, a comment `#␣`) re-fires its structural meaning here, so
+      // this is the form that exercises monogram#23 (a value-leading `--- x` the parser keeps a plain
+      // scalar but a flat grammar may mis-scope as a marker). Legal where the token body admits a space.
+      out.add(lit + ' ' + base);
+      if (out.size > n + ctx.interesting.length * 2) break;
+    }
+  }
+  return [...out];
+}
+
+// ─── THE WALKER ──────────────────────────────────────────────────────────────────
+export interface GenOptions {
+  depth?: number;       // bounded-exhaustive derivation depth (rule-ref recursion)
+  cap?: number;         // max alternatives kept at each combinator node (anti-explosion)
+  maxInputs?: number;   // global cap on emitted inputs per rule
+  fuzzRounds?: number;  // random derivations
+  seed?: number;
+  nestDepth?: number;   // self-recursive nesting depth
+  timeBudgetMs?: number; // wall-clock cap for the depth strategies (large token-stream grammars)
+}
+
+class Walker {
+  tokenByName = new Map<string, TokenDecl>();
+  ruleByName = new Map<string, RuleDecl>();
+  interesting: string[];
+  structKind = new Map<string, 'indent' | 'dedent' | 'newline'>();
+  compactLits: Set<string>;
+  reachMap = new Map<string, Set<string>>();   // rule → every rule it can transitively reach
+  ruleMin = new Map<string, Emission[] | null>();
+  rand: () => number;
+  cap: number;
+  grammar: CstGrammar;
+  budgetCalls = 0;          // anti-explosion: enum() is a tree walk; cap the work PER top-level call
+  maxCalls = 60_000;
+  enumTop(e: RuleExpr, budget: number): Emission[][] { this.budgetCalls = 0; return this.enum(e, budget); }
+
+  constructor(grammar: CstGrammar, seed: number, cap: number) {
+    this.grammar = grammar;
+    this.rand = rng(seed);
+    this.cap = cap;
+    for (const t of grammar.tokens) this.tokenByName.set(t.name, t);
+    for (const r of grammar.rules) this.ruleByName.set(r.name, r);
+    const ind = grammar.indent;
+    if (ind) {
+      this.structKind.set(ind.indentToken, 'indent');
+      this.structKind.set(ind.dedentToken, 'dedent');
+      this.structKind.set(ind.newlineToken, 'newline');
+    }
+    this.compactLits = new Set(grammar.indent?.compactIndicators ?? []);
+    this.interesting = this.collectInteresting();
+    this.computeReach();
+    this.computeMins();
+  }
+
+  computeReach(): void {
+    const refs = (e: RuleExpr, acc: Set<string>) => {
+      switch (e.type) {
+        case 'ref': if (this.ruleByName.has(e.name)) acc.add(e.name); break;
+        case 'seq': case 'alt': e.items.forEach((i) => refs(i, acc)); break;
+        case 'quantifier': case 'group': case 'not': refs(e.body, acc); break;
+        case 'sep': refs(e.element, acc); break;
+      }
+    };
+    for (const r of this.grammar.rules) { const s = new Set<string>(); refs(r.body, s); this.reachMap.set(r.name, s); }
+    for (let i = 0; i < this.grammar.rules.length; i++)
+      for (const r of this.grammar.rules) { const s = this.reachMap.get(r.name)!; for (const n of [...s]) for (const m of this.reachMap.get(n) ?? []) s.add(m); }
+  }
+  // does an expression (transitively) reference `target` — i.e. descending into it can reach target?
+  // memoised (per expr-object × target) — `nestChain` queries it on every item, so the cache matters.
+  reachesCache = new WeakMap<object, Map<string, boolean>>();
+  exprReaches(e: RuleExpr, target: string): boolean {
+    if (typeof e === 'object') {
+      let m = this.reachesCache.get(e); if (!m) { m = new Map(); this.reachesCache.set(e, m); }
+      const c = m.get(target); if (c !== undefined) return c;
+      const v = this.exprReachesRaw(e, target); m.set(target, v); return v;
+    }
+    return this.exprReachesRaw(e, target);
+  }
+  exprReachesRaw(e: RuleExpr, target: string): boolean {
+    switch (e.type) {
+      case 'ref': return e.name === target || (this.reachMap.get(e.name)?.has(target) ?? false);
+      case 'seq': case 'alt': return e.items.some((i) => this.exprReaches(i, target));
+      case 'quantifier': case 'group': case 'not': return this.exprReaches(e.body, target);
+      case 'sep': return this.exprReaches(e.element, target);
+      default: return false;
+    }
+  }
+
+  // shortest rule-ref distance FROM each rule TO `target` (BFS on the reversed ref graph), memoised.
+  // `nestChain` uses it to descend the DIRECT path to target each level — picking merely "a branch
+  // that reaches target" loops forever through a long cycle that technically reaches it but never
+  // arrives (Node→[Indent,Node]→Node…), producing an empty indent chain instead of nested content.
+  distCache = new Map<string, Map<string, number>>();
+  distTo(target: string): Map<string, number> {
+    let m = this.distCache.get(target); if (m) return m;
+    m = new Map([[target, 0]]);
+    // reversed adjacency: who DIRECTLY refs each rule
+    const back = new Map<string, string[]>();
+    for (const r of this.grammar.rules) for (const ref of this.directRuleRefs(r.body)) { (back.get(ref) ?? back.set(ref, []).get(ref)!).push(r.name); }
+    const queue = [target];
+    while (queue.length) { const cur = queue.shift()!; const d = m.get(cur)!; for (const pre of back.get(cur) ?? []) if (!m.has(pre)) { m.set(pre, d + 1); queue.push(pre); } }
+    this.distCache.set(target, m); return m;
+  }
+  directRuleRefs(e: RuleExpr): string[] {
+    const out: string[] = [];
+    const go = (x: RuleExpr) => { switch (x.type) {
+      case 'ref': if (this.ruleByName.has(x.name)) out.push(x.name); break;
+      case 'seq': case 'alt': x.items.forEach(go); break;
+      case 'quantifier': case 'group': case 'not': go(x.body); break;
+      case 'sep': go(x.element); break;
+    } };
+    go(e); return out;
+  }
+  // min distance an expression sits from re-entering `target` (Infinity if it can't reach it)
+  distExprCache = new WeakMap<object, Map<string, number>>();
+  exprDist(e: RuleExpr, target: string): number {
+    if (typeof e === 'object') { let m = this.distExprCache.get(e); if (!m) { m = new Map(); this.distExprCache.set(e, m); } const c = m.get(target); if (c !== undefined) return c; const v = this.exprDistRaw(e, target); m.set(target, v); return v; }
+    return this.exprDistRaw(e, target);
+  }
+  exprDistRaw(e: RuleExpr, target: string): number {
+    const dm = this.distTo(target);
+    switch (e.type) {
+      case 'ref': return e.name === target ? 0 : (dm.has(e.name) ? dm.get(e.name)! : Infinity);
+      case 'seq': case 'alt': return Math.min(Infinity, ...e.items.map((i) => this.exprDist(i, target)));
+      case 'quantifier': case 'group': case 'not': return this.exprDist(e.body, target);
+      case 'sep': return this.exprDist(e.element, target);
+      default: return Infinity;
+    }
+  }
+
+  // grammar-derived boundary literals: every literal in the rules + structural sigils that
+  // a free-form token could legally contain but that ALSO start another token (the collision
+  // shapes a flat highlighter mis-scopes). Short, non-alphabetic ones are the interesting ones.
+  collectInteresting(): string[] {
+    const lits = new Set<string>();
+    const walk = (e: RuleExpr) => {
+      switch (e.type) {
+        case 'literal': lits.add(e.value); break;
+        case 'seq': case 'alt': e.items.forEach(walk); break;
+        case 'quantifier': case 'group': case 'not': walk(e.body); break;
+        case 'sep': walk(e.element); break;
+      }
+    };
+    for (const r of this.grammar.rules) walk(r.body);
+    // doc markers / block-scalar introducers live in indent config, not the rules
+    const ind = this.grammar.indent;
+    for (const m of ind?.blockScalar?.documentMarkers ?? []) lits.add(m);
+    return [...lits].filter((l) => l.length > 0 && l.length <= 3 && !/^[A-Za-z]+$/.test(l));
+  }
+
+  isToken(name: string): boolean { return this.tokenByName.has(name); }
+  isStruct(name: string): boolean { return this.structKind.has(name); }
+
+  // ── minimal terminating expansion (fixpoint), so any budget cut-off still produces legal text ──
+  computeMins(): void {
+    for (const r of this.grammar.rules) this.ruleMin.set(r.name, null);
+    for (let iter = 0; iter < this.grammar.rules.length + 2; iter++) {
+      let changed = false;
+      for (const r of this.grammar.rules) {
+        if (this.ruleMin.get(r.name)) continue;
+        const m = this.minExpand(r.body);
+        if (m) { this.ruleMin.set(r.name, m); changed = true; }
+      }
+      if (!changed) break;
+    }
+  }
+  minExpand(e: RuleExpr): Emission[] | null {
+    switch (e.type) {
+      case 'literal': return [{ t: 'lit', value: e.value }];
+      case 'ref': {
+        if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }];
+        if (this.isToken(e.name)) {
+          const txt = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 });
+          return txt === null ? null : [{ t: 'tok', name: e.name, text: txt || 'x' }];
+        }
+        return this.ruleMin.get(e.name) ?? null;
+      }
+      case 'seq': {
+        const out: Emission[] = [];
+        for (const it of e.items) { const m = this.minExpand(it); if (!m) return null; out.push(...m); }
+        return out;
+      }
+      case 'alt': {
+        let best: Emission[] | null = null;
+        for (const it of e.items) { const m = this.minExpand(it); if (m && (!best || m.length < best.length)) best = m; }
+        return best;
+      }
+      case 'quantifier': return e.kind === '+' ? this.minExpand(e.body) : [];
+      case 'group': return this.minExpand(e.body);
+      case 'sep': return this.minExpand(e.element);
+      case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore':
+      case 'op': case 'prefix': case 'postfix': return [];
+    }
+  }
+
+  // Minimal-but-CONTENT-BEARING expansion: like minExpand, but `opt`/`*` fire ONE rep when their body
+  // can yield a token, and `alt` prefers a branch that produces a token — so a `- opt(Value)` becomes
+  // `- <scalar>` instead of a bare `-`. Bounded by `fuel`; falls back to minExpand at the floor.
+  fillBudget = 0;   // global anti-explosion for fillContent's all-branches alt search (huge TS alts)
+  fillContent(e: RuleExpr, fuel: number): Emission[] {
+    if (--fuel <= 0 || --this.fillBudget <= 0) return this.minExpand(e) ?? [];
+    const hasTok = (xs: Emission[]) => xs.some((em) => em.t === 'tok');
+    switch (e.type) {
+      case 'literal': return [{ t: 'lit', value: e.value }];
+      case 'ref': {
+        if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }];
+        if (this.isToken(e.name)) { const v = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 }); return [{ t: 'tok', name: e.name, text: v || 'x' }]; }
+        return this.fillContent(this.ruleByName.get(e.name)!.body, fuel);
+      }
+      case 'seq': { const out: Emission[] = []; for (const it of e.items) for (const x of this.fillContent(it, fuel)) out.push(x); return out; }
+      case 'alt': {
+        // prefer a SHORT branch that yields PLAIN-STRING content — a clean scalar value (`- a`), not a
+        // sigil-led node (alias `*a`, flow `[…]`) or a multi-line fold. A plain string is what a
+        // sibling `-` line can (wrongly) fold into, which is the monogram#24 trigger.
+        let best: Emission[] | null = null, bestScore = -Infinity;
+        for (const it of e.items) {
+          const r = this.fillContent(it, fuel);
+          if (!hasTok(r)) continue;
+          const stringy = r.some((em) => em.t === 'tok' && /^string\.unquoted/.test(this.tokenByName.get(em.name)?.scope ?? '') && !/[&*!|>[\]{}#%'"]/.test(em.text[0] ?? ''));
+          const score = (stringy ? 100 : 0) - r.length;
+          if (score > bestScore) { bestScore = score; best = r; }
+        }
+        return best ?? this.fillContent(e.items[0], fuel);
+      }
+      case 'quantifier': { const r = this.fillContent(e.body, fuel); if ((e.kind === '?' || e.kind === '*') && !hasTok(r)) return []; return r; }
+      case 'group': return this.fillContent(e.body, fuel);
+      case 'sep': return this.fillContent(e.element, fuel);
+      default: return [];
+    }
+  }
+
+  // ── bounded-exhaustive enumeration: a capped set of emission-sequences for `e` ──
+  enum(e: RuleExpr, budget: number): Emission[][] {
+    const cap = this.cap;
+    // global work cap: the walk is a tree whose SIZE (not just output) grows with depth×cap×rules;
+    // once exceeded, collapse to the minimal expansion so a run always terminates in bounded time.
+    if (++this.budgetCalls > this.maxCalls) { const m = this.minExpand(e); return m ? [m] : [[]]; }
+    switch (e.type) {
+      case 'literal': return [[{ t: 'lit', value: e.value }]];
+      case 'ref': {
+        if (this.isStruct(e.name)) return [[{ t: 'struct', kind: this.structKind.get(e.name)! }]];
+        if (this.isToken(e.name)) {
+          const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 3);
+          return (vs.length ? vs : ['x']).slice(0, cap).map((t) => [{ t: 'tok', name: e.name, text: t }]);
+        }
+        if (budget <= 0) { const m = this.ruleMin.get(e.name); return m ? [m] : [[]]; }
+        return this.enum(this.ruleByName.get(e.name)!.body, budget - 1);
+      }
+      case 'seq': {
+        let acc: Emission[][] = [[]];
+        for (const it of e.items) {
+          const parts = this.enum(it, budget);
+          const next: Emission[][] = [];
+          // skip combos whose emission list would blow past MAX_EMS — a deep tree of 2-rep quantifiers
+          // grows the list multiplicatively, and copying huge lists (not the call count) is the cost.
+          for (const a of acc) for (const p of parts) { if (a.length + p.length <= MAX_EMS) next.push([...a, ...p]); if (next.length >= cap) break; }
+          acc = next.length ? next : acc;
+          if (acc.length >= cap) acc = acc.slice(0, cap);
+        }
+        return acc;
+      }
+      case 'alt': {
+        // round-robin across branches so a deep/recursive branch (usually LAST) is not starved by an
+        // earlier scalar branch filling the cap — the difference between ever generating `- - a` or not.
+        const perBranch = e.items.map((it) => this.enum(it, budget));
+        const out: Emission[][] = [];
+        for (let i = 0; out.length < cap; i++) {
+          let any = false;
+          for (const b of perBranch) { if (i < b.length) { out.push(b[i]); any = true; if (out.length >= cap) break; } }
+          if (!any) break;
+        }
+        return out;
+      }
+      case 'quantifier': {
+        const body = this.enum(e.body, budget);
+        const out: Emission[][] = [];
+        if (e.kind !== '+') out.push([]);                       // 0 reps for ? and *
+        for (const b of body) { out.push(b); if (out.length >= cap) return out; }
+        if (e.kind !== '?') for (const b of body) { if (b.length * 2 <= MAX_EMS) { out.push([...b, ...b]); if (out.length >= cap) return out; } }  // 2 reps for * and +
+        return out;
+      }
+      case 'group': return this.enum(e.body, budget);
+      case 'sep': {
+        const el = this.enum(e.element, budget);
+        const out: Emission[][] = [];
+        for (const b of el) { out.push(b); if (out.length >= cap) return out; }
+        for (const b of el) { if (b.length * 2 + 1 <= MAX_EMS) { out.push([...b, { t: 'lit', value: e.delimiter }, ...b]); if (out.length >= cap) return out; } }
+        return out;
+      }
+      case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore':
+      case 'op': case 'prefix': case 'postfix': return [[]];
+    }
+  }
+
+  // ── random derivation (fuzzing): one emission sequence, forced to terminate at budget 0 ──
+  fuzz(e: RuleExpr, budget: number): Emission[] {
+    const pick = <T,>(xs: T[]): T => xs[Math.floor(this.rand() * xs.length)];
+    // bounded `for`-push (NOT spread on a possibly-huge array → stack overflow + size blowup)
+    const fappend = (out: Emission[], add: Emission[]) => { if (out.length < MAX_EMS) for (const x of add) out.push(x); };
+    switch (e.type) {
+      case 'literal': return [{ t: 'lit', value: e.value }];
+      case 'ref': {
+        if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }];
+        if (this.isToken(e.name)) {
+          const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 4);
+          return [{ t: 'tok', name: e.name, text: vs.length ? pick(vs) : 'x' }];
+        }
+        if (budget <= 0) return this.ruleMin.get(e.name) ?? [];
+        return this.fuzz(this.ruleByName.get(e.name)!.body, budget - 1);
+      }
+      case 'seq': { const out: Emission[] = []; for (const it of e.items) fappend(out, this.fuzz(it, budget)); return out; }
+      case 'alt': {
+        if (budget <= 0) { const m = this.minExpand(e); if (m) return m; }
+        return this.fuzz(pick(e.items), budget);
+      }
+      case 'quantifier': {
+        const reps = budget <= 0 ? (e.kind === '+' ? 1 : 0) : (e.kind === '?' ? Math.floor(this.rand() * 2) : Math.floor(this.rand() * 3) + (e.kind === '+' ? 1 : 0));
+        const out: Emission[] = []; for (let i = 0; i < reps; i++) fappend(out, this.fuzz(e.body, budget - 1)); return out;
+      }
+      case 'group': return this.fuzz(e.body, budget);
+      case 'sep': {
+        const reps = budget <= 0 ? 1 : Math.floor(this.rand() * 3) + 1; const out: Emission[] = [];
+        for (let i = 0; i < reps; i++) { if (i) out.push({ t: 'lit', value: e.delimiter }); fappend(out, this.fuzz(e.element, budget - 1)); }
+        return out;
+      }
+      case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore':
+      case 'op': case 'prefix': case 'postfix': return [];
+    }
+  }
+
+  // Rules that can (transitively) contain themselves — the self-recursive nesting targets.
+  selfRecursive(): string[] {
+    return this.grammar.rules.filter((r) => this.reachMap.get(r.name)!.has(r.name)).map((r) => r.name);
+  }
+
+  // ── DIRECTED nesting: a random derivation BIASED to descend back toward `target` until `depth`
+  //    runs out, then to terminate — so a self-recursive rule is forced to NEST, and its repetitions
+  //    fire to add SIBLINGS. This deterministically reaches the deep self-embedding shapes (a
+  //    BlockSequence inside a BlockSequence with an inner sibling — monogram#24) that an un-biased,
+  //    capped enumeration starves. Agnostic: `target` is any self-recursive rule, found generically.
+  siblingLeft = 0;
+  // Build a CLEAN, SHORT nested chain of `target` (a collection inside a collection, `nest` levels
+  // deep) with ONE inner sibling — the monogram#24 class. Fast and deterministic: at each rule, take
+  // the SINGLE first sub-path that re-enters `target` and minimal-fill everything else, so the output
+  // is the bare nested skeleton (no kitchen-sink filler). `target` is any self-recursive rule, found
+  // generically. The sibling (`- a`/`- b`) is added at the target's own repetition, innermost first.
+  nestChain(body: RuleExpr, target: string, nest: number): Emission[] {
+    this.siblingLeft = nest + 1;   // one inner sibling per nesting level (the `- a`/`- b` pairs)
+    this.fillBudget = 200_000;     // a high backstop (nestChain only runs on small indent/markup grammars now)
+    return this.nestRec(body, target, nest, 300, false);
+  }
+  nestRec(e: RuleExpr, target: string, nest: number, fuel: number, atTarget: boolean): Emission[] {
+    if (--fuel <= 0 || nest < 0) { return this.fillContent(e, 30) ?? []; }
+    // at the INNERMOST level (nest 0) fill with CONTENT (a scalar value) so a collection item is
+    // `- a`, not a bare `-` — monogram#24 needs a plain scalar for the sibling `-` to (wrongly) fold
+    // into. Off the recursive path at deeper levels → the minimal terminating filler (short chain).
+    if (nest === 0) { return this.fillContent(e, 30); }
+    if (!this.exprReaches(e, target)) { const m = this.minExpand(e); if (m) return m; }
+    switch (e.type) {
+      case 'literal': return [{ t: 'lit', value: e.value }];
+      case 'ref': {
+        if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }];
+        if (this.isToken(e.name)) { const v = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 }); return [{ t: 'tok', name: e.name, text: v || 'x' }]; }
+        const re = e.name === target;
+        return this.nestRec(this.ruleByName.get(e.name)!.body, target, re ? nest - 1 : nest, fuel, re);
+      }
+      case 'seq': {
+        // descend the item with the SHORTEST distance to re-entering target (the direct path), and —
+        // when at the target rule's own body — fire ONE shallow sibling from its repetition (the
+        // `- a`/`- b` inner pair, monogram#24). Minimal-fill everything else → a clean nested chain.
+        let idx = -1, best = Infinity;
+        e.items.forEach((it, i) => { const d = this.exprDist(it, target); if (d < best) { best = d; idx = i; } });
+        const out: Emission[] = [];
+        e.items.forEach((it, i) => {
+          let part: Emission[];
+          if (i === idx) part = this.nestRec(it, target, nest, fuel, atTarget);                                // deepen the chain
+          else if (atTarget && this.siblingLeft > 0 && it.type === 'quantifier' && this.exprReaches(it, target)) {
+            this.siblingLeft--; part = this.nestRec(it.body, target, 0, fuel, false);                          // one shallow SIBLING
+          } else part = this.minExpand(it) ?? [];
+          for (const x of part) out.push(x);
+        });
+        return out;
+      }
+      case 'alt': {
+        // the branch that re-enters target SOONEST (min distance) — so the chain actually descends
+        let pickEl = e.items[0], best = Infinity;
+        for (const it of e.items) { const d = this.exprDist(it, target); if (d < best) { best = d; pickEl = it; } }
+        return this.nestRec(pickEl, target, nest, fuel, atTarget);
+      }
+      case 'quantifier': { const out: Emission[] = []; for (const x of this.nestRec(e.body, target, nest, fuel, atTarget)) out.push(x); return out; }
+      case 'group': return this.nestRec(e.body, target, nest, fuel, atTarget);
+      case 'sep': { const out: Emission[] = []; for (const x of this.nestRec(e.element, target, nest, fuel, atTarget)) out.push(x); return out; }
+      case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore':
+      case 'op': case 'prefix': case 'postfix': return [];
+    }
+  }
+}
+
+// ─── MATERIALIZE: emissions → text + token spans ──────────────────────────────────
+// The per-language structural-token materialization hook. Token-stream grammars join with a
+// space (whitespace-insensitive); indentation grammars (YAML) render struct emissions through an
+// indent STACK that mirrors the lexer (newline = same-column sibling, indent = deeper block,
+// compact = an inline indent for `- - a`); markup grammars keep tag punctuation adjacent.
+interface MatOptions { mode: 'token-stream' | 'indent' | 'markup'; indentStep: number }
+
+function materialize(grammar: CstGrammar, ems: Emission[], opts: MatOptions): { text: string; tokens: GenInput['tokens'] } {
+  let text = '';
+  const tokens: GenInput['tokens'] = [];
+  // hard length cap: a pathological derivation (deep indent, many reps) must never grow text without
+  // bound — past the cap, appends are dropped (the input is over-long and discarded by the caller).
+  const emit = (s: string) => { if (text.length < 16_000) text += s; };
+  const emitTok = (name: string, s: string) => { tokens.push({ start: text.length, end: text.length + s.length, name, text: s }); text += s; };
+
+  if (opts.mode === 'indent') {
+    const stack: number[] = [0];            // indentation columns; top = current block column
+    let atLineStart = true;
+    let pendingCompact = false;             // the previous struct was a compact indicator's inline indent
+    const sp = (n: number) => ' '.repeat(n);
+    for (let i = 0; i < ems.length; i++) {
+      const e = ems[i];
+      if (e.t === 'struct') {
+        if (e.kind === 'indent') {
+          const col = stack[stack.length - 1] + opts.indentStep;
+          stack.push(col); emit('\n' + sp(col)); atLineStart = true;
+        } else if (e.kind === 'dedent') {
+          if (stack.length > 1) stack.pop();
+        } else { // newline — sibling at the current column
+          emit('\n' + sp(stack[stack.length - 1])); atLineStart = true;
+        }
+        continue;
+      }
+      if (e.t === 'compact') {
+        // an inline indent: the next content sits on the SAME line; defer the column PUSH until that
+        // content is emitted, so the pushed column is exactly where the inner indicator lands (the
+        // lexer's compactIndicators geometry — `- - a` pushes column 2, where the second `-` sits).
+        pendingCompact = true; continue;
+      }
+      const s = e.t === 'lit' ? e.value : e.text;
+      if (s.length === 0) continue;
+      if (pendingCompact) { emit(' '); stack.push(text.length - (text.lastIndexOf('\n') + 1)); pendingCompact = false; }   // inner COLUMN (in-line), not absolute offset
+      else if (!atLineStart) emit(' ');                   // ordinary inline separator (`- a`, `key: v`)
+      if (e.t === 'tok') emitTok(e.name, s); else emit(s);
+      atLineStart = false;
+    }
+    return { text, tokens };
+  }
+
+  if (opts.mode === 'markup') {
+    const noSpaceBefore = new Set([grammar.markup?.tagClose, grammar.markup?.closeMarker].filter(Boolean) as string[]);
+    let prev = '';
+    for (const e of ems) {
+      if (e.t === 'struct' || e.t === 'compact') continue;
+      const s = e.t === 'lit' ? e.value : e.text;
+      if (s.length === 0) continue;
+      const adjacent = prev === grammar.markup?.tagOpen || prev === grammar.markup?.closeMarker || noSpaceBefore.has(s) || prev === '';
+      if (!adjacent) emit(' ');
+      if (e.t === 'tok') emitTok(e.name, s); else emit(s);
+      prev = s;
+    }
+    return { text, tokens };
+  }
+
+  // token-stream: join with a single space (always legal in a whitespace-insensitive language)
+  let first = true;
+  for (const e of ems) {
+    if (e.t === 'struct' || e.t === 'compact') continue;
+    const s = e.t === 'lit' ? e.value : e.text;
+    if (s.length === 0) continue;
+    if (!first) emit(' ');
+    if (e.t === 'tok') emitTok(e.name, s); else emit(s);
+    first = false;
+  }
+  return { text, tokens };
+}
+
+// Rewrite a YAML compact indicator's following `[Indent, …, Dedent]` so the indent renders INLINE
+// (`- - a`) rather than next-line (`-\n  - a`). Both are legal and parse identically; the compact
+// form is what reproduces monogram#24's column geometry. Applied to a copy of the emission list.
+function compactify(ems: Emission[], compactLits: Set<string>): Emission[] {
+  const out: Emission[] = [];
+  for (let i = 0; i < ems.length; i++) {
+    const e = ems[i];
+    out.push(e);
+    // a compact indicator literal (`-`/`?`) immediately followed by a struct indent → inline it
+    if (e.t === 'lit' && compactLits.has(e.value)) {
+      const nxt = ems[i + 1];
+      if (nxt && nxt.t === 'struct' && nxt.kind === 'indent') { out.push({ t: 'compact' }); i++; }
+    }
+  }
+  return out;
+}
+
+// ─── TOP LEVEL ────────────────────────────────────────────────────────────────────
+export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenInput[] {
+  const depth = opts.depth ?? 5;
+  const cap = opts.cap ?? 6;
+  const maxInputs = opts.maxInputs ?? 400;
+  const fuzzRounds = opts.fuzzRounds ?? 300;
+  const nestDepth = opts.nestDepth ?? 5;
+  const seed = opts.seed ?? 12345;
+  const w = new Walker(grammar, seed, cap);
+
+  const mode: MatOptions['mode'] = grammar.indent ? 'indent' : grammar.markup ? 'markup' : 'token-stream';
+  const matOpts: MatOptions = { mode, indentStep: 2 };
+  const entry = grammar.rules[grammar.rules.length - 1];
+
+  // wall-clock budget: the depth strategies (nest / dirnest) over a LARGE token-stream grammar (the
+  // TS family — 50+ self-recursive rules, huge Pratt-expression alts) are heavy and add little, since
+  // those grammars have no indent/markup depth bugs for the scope≡role check to find. Cap total time
+  // so one driver stays tractable across all 7 languages; each per-rule loop checks it.
+  const t0 = Date.now();
+  const timeBudgetMs = opts.timeBudgetMs ?? 9000;
+  const timeUp = () => Date.now() - t0 > timeBudgetMs;
+
+  const seen = new Set<string>();
+  const out: GenInput[] = [];
+  const push = (ems: Emission[], strategy: string, rule: string) => {
+    if (out.length >= maxInputs * 4) return;
+    for (const variant of mode === 'indent' ? [ems, compactify(ems, w.compactLits)] : [ems]) {
+      const { text, tokens } = materialize(grammar, variant, matOpts);
+      if (!text.trim() || text.length > 2000 || seen.has(text)) continue;   // skip blank / over-long / duplicate
+      seen.add(text);
+      out.push({ text, tokens, strategy, rule });
+    }
+  };
+
+  // 1) bounded-exhaustive from the entry rule: the canonical small shapes (every derivation to depth N)
+  for (const ems of w.enumTop(entry.body, depth)) push(ems, 'exhaustive', entry.name);
+
+  // The depth strategies (2,3) only matter for INDENTATION / MARKUP grammars — those are where a flat
+  // highlighter loses to the stack-keeping parser (the monogram#23/#24 class). Token-stream grammars
+  // are whitespace-insensitive and the flat grammar is exact, so their (large) self-recursive rule set
+  // is skipped: it adds no depth coverage and would dominate the time budget.
+  const depthMatters = !!(grammar.indent || grammar.markup);
+  const recursive = depthMatters ? w.selfRecursive() : [];
+
+  // 2) bounded-exhaustive ROOTED at each self-recursive rule: exercises every rule's own small shapes
+  //    (round-tripped against that rule as the entry), incl. the FIRST level of self-embedding.
+  for (const rn of recursive) {
+    if (timeUp()) break;
+    const r = w.ruleByName.get(rn)!;
+    for (let d = 1; d <= Math.min(nestDepth, 3); d++) for (const ems of w.enumTop(r.body, d)) push(ems, `nest:${rn}@${d}`, rn);
+  }
+
+  // 3) directed nesting: a clean, deterministic nested chain of each self-recursive rule (with one
+  //    inner sibling) at depth 1..N — monogram#24 is a BlockSequence inside a BlockSequence with an
+  //    inner sibling (`- - a\n  - b\n- c`), which the un-biased capped enumeration starves.
+  for (const rn of recursive) {
+    if (timeUp()) break;
+    const r = w.ruleByName.get(rn)!;
+    for (let d = 1; d <= nestDepth; d++) push(w.nestChain(r.body, rn, d), `dirnest:${rn}@${d}`, rn);
+  }
+
+  // 4) fuzzing for deeper / wider structures (random production choices), rooted at the entry AND at
+  //    each self-recursive rule so deep shapes are reached quickly.
+  for (let i = 0; i < fuzzRounds; i++) push(w.fuzz(entry.body, depth + 2), 'fuzz', entry.name);
+  for (const rn of recursive) {
+    if (timeUp()) break;
+    const r = w.ruleByName.get(rn)!;
+    for (let i = 0; i < Math.ceil(fuzzRounds / 8); i++) push(w.fuzz(r.body, depth + 2), `fuzz:${rn}`, rn);
+  }
+
+  return out.slice(0, maxInputs);
+}
diff --git a/test/parser-gap.ts b/test/parser-gap.ts
deleted file mode 100644
index a2f8226..0000000
--- a/test/parser-gap.ts
+++ /dev/null
@@ -1,254 +0,0 @@
-import { readFileSync, readdirSync, statSync } from 'node:fs';
-import { join, relative } from 'node:path';
-
-const TEST_DIR = '/tmp/ts-repo/tests/cases/conformance';
-
-// ── Collect test files ──
-
-function walkDir(dir: string): string[] {
-  const files: string[] = [];
-  for (const entry of readdirSync(dir)) {
-    const full = join(dir, entry);
-    if (statSync(full).isDirectory()) {
-      files.push(...walkDir(full));
-    } else if (full.endsWith('.ts') && !full.endsWith('.d.ts')) {
-      files.push(full);
-    }
-  }
-  return files;
-}
-
-const testFiles = walkDir(TEST_DIR).sort();
-
-// ── Gap patterns: syntax our grammar does NOT define rules for ──
-// Grouped by category, ordered by likely impact
-
-interface Gap {
-  name: string;
-  category: string;
-  test: (s: string) => boolean;
-  difficulty: 'easy' | 'medium' | 'hard';
-  covered?: boolean;
-}
-
-const gaps: Gap[] = [
-  // ── Destructuring (COVERED — rules added) ──
-  { name: 'Object destructuring binding',   category: 'Destructuring', difficulty: 'hard', covered: true,
-    test: s => /(?:let|const|var)\s+\{/.test(s) },
-  { name: 'Array destructuring binding',    category: 'Destructuring', difficulty: 'hard', covered: true,
-    test: s => /(?:let|const|var)\s+\[/.test(s) },
-  { name: 'Destructuring in params',        category: 'Destructuring', difficulty: 'hard', covered: true,
-    test: s => /\(\s*\{[^}]*\}\s*[,:)]/.test(s) || /\(\s*\[[^\]]*\]\s*[,:)]/.test(s) },
-  { name: 'Destructuring in for-of/in',     category: 'Destructuring', difficulty: 'hard', covered: true,
-    test: s => /for\s*\(\s*(?:const|let|var)\s+[\[{]/.test(s) },
-  { name: 'Default values in destructuring', category: 'Destructuring', difficulty: 'hard', covered: true,
-    test: s => /\{\s*\w+\s*=\s*[^=]/.test(s) && /(?:let|const|var|function|\()/.test(s) },
-
-  // ── Statements (COVERED) ──
-  { name: 'Labeled statement',              category: 'Statements', difficulty: 'easy', covered: true,
-    test: s => /^\s*[a-zA-Z_$]\w*\s*:\s*(?:for|while|do|switch)/m.test(s) },
-  { name: 'debugger statement',             category: 'Statements', difficulty: 'easy', covered: true,
-    test: s => /^\s*debugger\s*;?\s*$/m.test(s) },
-  { name: 'with statement',                 category: 'Statements', difficulty: 'easy', covered: true,
-    test: s => /\bwith\s*\(/.test(s) },
-  { name: 'Empty statement (bare ;)',        category: 'Statements', difficulty: 'easy', covered: true,
-    test: s => /^\s*;\s*$/m.test(s) },
-
-  // ── Type features ──
-  { name: 'Index signature [k: T]: V',      category: 'Types', difficulty: 'medium', covered: true,
-    test: s => /\[\s*\w+\s*:\s*(?:string|number|symbol)\s*\]\s*:/.test(s) },
-  { name: 'Conditional type extends?:',      category: 'Types', difficulty: 'medium', covered: true,
-    test: s => /\bextends\b[^{]*\?\s*\S[^;]*\s*:/.test(s) && /\btype\b/.test(s) },
-  { name: 'Mapped type {[K in T]: V}',      category: 'Types', difficulty: 'medium', covered: true,
-    test: s => /\{\s*\[?\s*\w+\s+in\s+/.test(s) },
-  { name: 'infer keyword',                  category: 'Types', difficulty: 'medium', covered: true,
-    test: s => /\binfer\s+[A-Z]/.test(s) },
-  { name: 'Template literal type',          category: 'Types', difficulty: 'medium', covered: true,
-    test: s => /type\s+\w+[^=]*=\s*`/.test(s) },
-  { name: 'Type predicate (x is T)',        category: 'Types', difficulty: 'easy', covered: true,
-    test: s => /\)\s*:\s*\w+\s+is\s+\w/.test(s) },
-  { name: 'asserts keyword',                category: 'Types', difficulty: 'easy', covered: true,
-    test: s => /\basserts\s+\w+/.test(s) },
-  { name: 'import type / export type',      category: 'Types', difficulty: 'easy', covered: true,
-    test: s => /\b(?:import|export)\s+type\s+[{A-Z]/.test(s) },
-  { name: 'satisfies operator',             category: 'Types', difficulty: 'easy', covered: true,
-    test: s => /\bsatisfies\s+\w/.test(s) },
-
-  // ── Expression features ──
-  { name: 'Template literal ${expr}',       category: 'Expressions', difficulty: 'hard', covered: true,
-    test: s => /`[^`]*\$\{/.test(s) },
-  { name: 'Default parameter value',        category: 'Expressions', difficulty: 'easy', covered: true,
-    test: s => /\(\s*\w+\s*(?::\s*\w[^)]*?)?\s*=[^=>][^)]*\)/.test(s) },
-  { name: 'Optional chaining ?.( / ?.[',    category: 'Expressions', difficulty: 'easy', covered: true,
-    test: s => /\?\.\s*[\[(]/.test(s) },
-  { name: 'Dynamic import()',               category: 'Expressions', difficulty: 'easy', covered: true,
-    test: s => /\bimport\s*\(/.test(s) },
-  { name: 'import.meta',                    category: 'Expressions', difficulty: 'easy', covered: true,
-    test: s => /\bimport\s*\.\s*meta\b/.test(s) },
-  { name: 'Tagged template f`...`',         category: 'Expressions', difficulty: 'medium', covered: true,
-    test: s => /\w\s*`/.test(s) && /`[^`]*\$\{/.test(s) },
-  { name: 'Comma operator',                 category: 'Expressions', difficulty: 'easy', covered: true,
-    test: s => /\breturn\s*\(.*,.*\)\s*;/.test(s) },
-  { name: 'Class expression',               category: 'Expressions', difficulty: 'medium', covered: true,
-    test: s => /=\s*class\s*(?:\w+\s*)?\{/.test(s) },
-  { name: 'Function expression',            category: 'Expressions', difficulty: 'easy', covered: true,
-    test: s => /=\s*function\s*\w*\s*[\(<]/.test(s) },
-  { name: 'void expression',                category: 'Expressions', difficulty: 'easy', covered: true,
-    test: s => /\bvoid\s+\w/.test(s) },
-
-  // ── Declaration features ──
-  { name: 'export default',                 category: 'Declarations', difficulty: 'easy', covered: true,
-    test: s => /\bexport\s+default\b/.test(s) },
-  { name: 'export * / re-export',           category: 'Declarations', difficulty: 'easy', covered: true,
-    test: s => /\bexport\s+\*/.test(s) || /\bexport\s+\{[^}]+\}\s+from\b/.test(s) },
-  { name: 'export = / import =',            category: 'Declarations', difficulty: 'easy', covered: true,
-    test: s => /\bexport\s*=/.test(s) || /\bimport\s+\w+\s*=\s*require/.test(s) },
-  { name: 'const enum',                     category: 'Declarations', difficulty: 'easy', covered: true,
-    test: s => /\bconst\s+enum\b/.test(s) },
-  { name: 'Class static block',             category: 'Declarations', difficulty: 'medium', covered: true,
-    test: s => /\bstatic\s*\{/.test(s) },
-  { name: 'Call/construct signature',        category: 'Declarations', difficulty: 'medium', covered: true,
-    test: s => /(?:interface|type)[^{]*\{[^}]*(?:new\s*\(|^\s*\()/ms.test(s) },
-  { name: 'Method overloads',               category: 'Declarations', difficulty: 'medium', covered: true,
-    test: s => /\w+\s*\([^)]*\)\s*:\s*\w[^{;]*;\s*\n\s*\w+\s*\(/m.test(s) },
-  { name: 'using / await using',            category: 'Declarations', difficulty: 'easy', covered: true,
-    test: s => /\b(?:await\s+)?using\s+\w+\s*=/.test(s) },
-  { name: 'accessor keyword',               category: 'Declarations', difficulty: 'easy', covered: true,
-    test: s => /\baccessor\s+\w+/.test(s) },
-
-  // ── Class features ──
-  { name: 'Parameter properties',           category: 'Classes', difficulty: 'easy', covered: true,
-    test: s => /constructor\s*\([^)]*\b(?:public|private|protected|readonly)\b/.test(s) },
-  { name: 'Decorators with args @f()',       category: 'Classes', difficulty: 'easy', covered: true,
-    test: s => /@\w+\s*\([^)]*\)/.test(s) },
-];
-
-// ── Scan ──
-
-console.log(`Scanning ${testFiles.length} conformance test files...\n`);
-
-const gapHits = new Map<string, { count: number; examples: string[] }>();
-for (const g of gaps) gapHits.set(g.name, { count: 0, examples: [] });
-
-let filesWithGaps = 0;
-
-for (const file of testFiles) {
-  const source = readFileSync(file, 'utf-8');
-  const rel = relative(TEST_DIR, file);
-  let hasGap = false;
-
-  for (const g of gaps) {
-    if (g.test(source)) {
-      const hit = gapHits.get(g.name)!;
-      hit.count++;
-      if (hit.examples.length < 3) hit.examples.push(rel);
-      hasGap = true;
-    }
-  }
-
-  if (hasGap) filesWithGaps++;
-}
-
-// ── Report ──
-
-console.log('═══════════════════════════════════════════════════════════════');
-console.log('  Monogram — Parser Gap Analysis');
-console.log(`  ${testFiles.length} TypeScript conformance tests → ${filesWithGaps} files with gaps`);
-console.log('═══════════════════════════════════════════════════════════════\n');
-
-const categories = [...new Set(gaps.map(g => g.category))];
-const allHitsRaw = gaps.map(g => ({ ...g, ...gapHits.get(g.name)! })).filter(g => g.count > 0);
-
-// Recompute filesWithGaps excluding covered constructs
-let filesWithRemainingGaps = 0;
-for (const file of testFiles) {
-  const source = readFileSync(file, 'utf-8');
-  let hasUncoveredGap = false;
-  for (const g of gaps) {
-    if (g.covered) continue;
-    if (g.test(source)) { hasUncoveredGap = true; break; }
-  }
-  if (hasUncoveredGap) filesWithRemainingGaps++;
-}
-
-// ── Covered constructs ──
-const coveredHits = allHitsRaw.filter(g => g.covered && g.count > 0);
-if (coveredHits.length > 0) {
-  const covTotal = coveredHits.reduce((s, g) => s + g.count, 0);
-  console.log(`── COVERED (in grammar) ──  (${covTotal} file hits)\n`);
-  for (const g of coveredHits.sort((a, b) => b.count - a.count)) {
-    const pct = ((g.count / testFiles.length) * 100).toFixed(1);
-    console.log(`  ✓ ${g.name.padEnd(38)} ${String(g.count).padStart(4)} files  (${pct}%)`);
-  }
-  console.log();
-}
-
-// ── Remaining gaps ──
-let totalGapFiles = 0;
-
-for (const cat of categories) {
-  const catGaps = gaps.filter(g => g.category === cat && !g.covered);
-  const catHits = catGaps
-    .map(g => ({ ...g, ...gapHits.get(g.name)! }))
-    .filter(g => g.count > 0)
-    .sort((a, b) => b.count - a.count);
-
-  if (catHits.length === 0) continue;
-
-  const catTotal = catHits.reduce((s, g) => s + g.count, 0);
-  totalGapFiles += catTotal;
-
-  console.log(`── ${cat} ──  (${catTotal} hits)\n`);
-
-  for (const g of catHits) {
-    const pct = ((g.count / testFiles.length) * 100).toFixed(1);
-    const diff = g.difficulty === 'easy' ? '●' : g.difficulty === 'medium' ? '◐' : '○';
-    console.log(`  ${diff} ${g.name.padEnd(38)} ${String(g.count).padStart(4)} files  (${pct}%)  [${g.difficulty}]`);
-  }
-  console.log();
-}
-
-// ── Difficulty summary ──
-const allHits = allHitsRaw.filter(g => !g.covered);
-const easy   = allHits.filter(g => g.difficulty === 'easy');
-const medium = allHits.filter(g => g.difficulty === 'medium');
-const hard   = allHits.filter(g => g.difficulty === 'hard');
-
-const easyFiles   = easy.reduce((s, g) => s + g.count, 0);
-const mediumFiles = medium.reduce((s, g) => s + g.count, 0);
-const hardFiles   = hard.reduce((s, g) => s + g.count, 0);
-
-console.log('═══════════════════════════════════════════════════════════════');
-console.log('  Summary');
-console.log('═══════════════════════════════════════════════════════════════\n');
-const coveredCount = coveredHits.reduce((s, g) => s + g.count, 0);
-console.log(`  Total test files:        ${testFiles.length}`);
-console.log(`  Files fully covered:     ${testFiles.length - filesWithRemainingGaps}  (${(((testFiles.length - filesWithRemainingGaps) / testFiles.length) * 100).toFixed(1)}%)`);
-console.log(`  Files with gaps:         ${filesWithRemainingGaps}  (${((filesWithRemainingGaps / testFiles.length) * 100).toFixed(1)}%)`);
-console.log(`  Recently covered:        ${coveredHits.length} constructs (${coveredCount} file hits)`);
-console.log();
-console.log(`  Remaining gaps:`);
-console.log(`  ● Easy:    ${easy.length.toString().padStart(2)} constructs  (${easyFiles} file hits)    — add rule/keyword`);
-console.log(`  ◐ Medium:  ${medium.length.toString().padStart(2)} constructs  (${mediumFiles} file hits)    — new rule + patterns`);
-console.log(`  ○ Hard:    ${hard.length.toString().padStart(2)} constructs  (${hardFiles} file hits)    — recursive patterns / new concepts`);
-console.log();
-
-// ── What closing easy gaps would achieve ──
-let onlyEasyGapFiles = 0;
-for (const file of testFiles) {
-  const source = readFileSync(file, 'utf-8');
-  let hasHard = false;
-  let hasMedium = false;
-  let hasAny = false;
-  for (const g of gaps) {
-    if (g.covered) continue;
-    if (!g.test(source)) continue;
-    hasAny = true;
-    if (g.difficulty === 'hard') hasHard = true;
-    if (g.difficulty === 'medium') hasMedium = true;
-  }
-  if (hasAny && !hasHard && !hasMedium) onlyEasyGapFiles++;
-}
-
-const afterEasy = testFiles.length - filesWithRemainingGaps + onlyEasyGapFiles;
-console.log(`\n  After closing ● easy:    ${afterEasy}/${testFiles.length} files covered (${((afterEasy / testFiles.length) * 100).toFixed(1)}%)`);
diff --git a/test/prof.ts b/test/prof.ts
deleted file mode 100644
index f896d0a..0000000
--- a/test/prof.ts
+++ /dev/null
@@ -1,10 +0,0 @@
-import { readFileSync } from 'fs';
-import { createParser } from '../src/gen-parser.ts';
-const grammar = (await import('../typescript.ts')).default;
-process.env.PROF = '1';
-const p: any = createParser(grammar);
-const code = readFileSync('/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts','utf-8');
-try { p.parse(code); } catch {}
-const g = (k:string)=>p.profCounts.get(k)??0;
-console.log('memo hit/miss:', g('$memoHit'), '/', g('$memoMiss'), '=> hit rate', (100*g('$memoHit')/(g('$memoHit')+g('$memoMiss'))).toFixed(0)+'%');
-console.log('LED loop: tries', g('$ledTry'), ' hits', g('$ledHit'), '=> wasted', (100*(1-g('$ledHit')/g('$ledTry'))).toFixed(0)+'% of led matchSeq attempts fail fast');
diff --git a/test/scope-gap-html.ts b/test/scope-gap-html.ts
deleted file mode 100644
index 31a9309..0000000
--- a/test/scope-gap-html.ts
+++ /dev/null
@@ -1,48 +0,0 @@
-// scope-gap-html.ts — HTML adapter for the unified scope-gap harness. The FIRST real
-// vscode#203212 comparative gap: VS Code's HTML grammar is the unmaintained textmate/html.tmbundle;
-// the oracle is parse5 (maintained, authoritative). Run (bare node): node test/scope-gap-html.ts
-//   Override the official grammar: MONOGRAM_OFFICIAL_HTML=/path/to/html.tmLanguage.json
-import { run } from './scope-gap.ts';
-import { htmlOracle } from './html-oracle.ts';
-import { cases as htmlIssueCases } from './html-issue-cases.ts';
-
-const OFFICIAL = process.env.MONOGRAM_OFFICIAL_HTML
-  ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/html/syntaxes/html.tmLanguage.json';
-
-// Realistic HTML (baseline) — tags, quoted/unquoted/boolean attrs, nesting, comments, voids.
-const GENERAL: string[] = [
-  '<div class="container" id="main"><p>Hello <a href="/x">world</a>.</p></div>',
-  '<ul><li>one</li><li>two</li><li>three</li></ul>',
-  '<img src="a.png" alt="a picture" width="100" height="80">',
-  '<input type="text" name="q" placeholder="Search" disabled>',
-  '<button type="submit" class="btn btn-primary" data-id="42">Go</button>',
-  '<section><h1>Title</h1><p>Body with <strong>bold</strong> and <em>italic</em>.</p></section>',
-  '<nav><a href="/">Home</a> | <a href="/about">About</a></nav>',
-  '<form action="/submit" method="post"><label for="n">Name</label><input id="n"></form>',
-  '<table><thead><tr><th>A</th><th>B</th></tr></thead><tbody><tr><td>1</td><td>2</td></tr></tbody></table>',
-  '<!-- a comment --><div><!-- inline --><span>x</span></div>',
-  '<meta charset="utf-8"><meta name="viewport" content="width=device-width, initial-scale=1">',
-  '<br><hr>',
-  '<select><option value="1">One</option><option value="2" selected>Two</option></select>',
-  '<video controls width="320"><source src="m.mp4" type="video/mp4"></video>',
-  '<article data-index=3 hidden><header>H</header><footer>F</footer></article>',
-  '<span class="a b c" title="x y z">text</span>',
-  '<div\n  class="multi-line"\n  id="tag"\n  data-x="1">body</div>',
-  '<a href="https://example.com?a=1&b=2" target="_blank" rel="noopener">link</a>',
-  '<label>Email <input type="email" required></label>',
-  '<figure><img src="p.jpg" alt="photo"><figcaption>cap</figcaption></figure>',
-];
-
-const corpus = [
-  ...GENERAL.map((text, i) => ({ name: `general#${i}`, text })),
-  ...htmlIssueCases.map((c: any, i: number) => ({ name: `issue:${c.title ?? i}`, text: c.src as string })),
-];
-
-await run({
-  name: 'HTML',
-  scopeName: 'text.html.basic',
-  officialPath: OFFICIAL,
-  monogramPath: 'html.tmLanguage.json',
-  loadCorpus: () => corpus,
-  roleOracle: htmlOracle,
-});
diff --git a/test/scope-gap-js.ts b/test/scope-gap-js.ts
deleted file mode 100644
index e483ed3..0000000
--- a/test/scope-gap-js.ts
+++ /dev/null
@@ -1,30 +0,0 @@
-// scope-gap-js.ts — JavaScript (.js) adapter for the unified scope-gap harness. Grades VS Code's
-// OFFICIAL JavaScript.tmLanguage.json AND Monogram's javascript.tmLanguage.json against the parser
-// oracle (oracle.ts with ScriptKind.JS). Both grammars declare scopeName `source.js`, so they load
-// + compare on one scale. Corpus = Test262 (tc39/test262), the canonical ECMAScript corpus — the TS
-// suite has ~no .js. Provision once:  git clone --depth 1 https://github.com/tc39/test262 /tmp/test262
-// Run (bare node): node test/scope-gap-js.ts [N|all]   (Test262 is huge; default sample 800)
-import ts from 'typescript';
-import { run } from './scope-gap.ts';
-import { oracle } from './oracle.ts';
-import { walkCorpus, subsetArg } from './src-coverage-tsfamily.ts';
-
-const BASE = '/tmp/test262/test/language'; // the syntax-relevant subtree of Test262
-const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM
-  ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/javascript/syntaxes/JavaScript.tmLanguage.json';
-
-// walkCorpus already drops .d.ts + multi-file (@filename) fixtures and stride-samples.
-const corpus = walkCorpus([BASE], ['.js'], subsetArg(800)).filter((c) => !c.file.endsWith('_FIXTURE.js'));
-
-await run({
-  name: 'JavaScript (.js)',
-  scopeName: 'source.js',
-  officialPath: OFFICIAL,
-  monogramPath: 'javascript.tmLanguage.json',
-  loadCorpus: () => corpus.map((c) => ({ name: c.file, text: c.code })),
-  roleOracle: (text) => oracle(text, ts.ScriptKind.JS),
-  isGradable: (text) => {
-    const sf = ts.createSourceFile('c.js', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.JS);
-    return (((sf as any).parseDiagnostics?.length ?? 0) === 0);
-  },
-});
diff --git a/test/scope-gap-run.ts b/test/scope-gap-run.ts
new file mode 100644
index 0000000..c754442
--- /dev/null
+++ b/test/scope-gap-run.ts
@@ -0,0 +1,130 @@
+// ─────────────────────────────────────────────────────────────────────────────
+//  scope-gap-run.ts — the UNIFIED, data-driven entry for the scope-gap metric (monogram#25
+//  part 2B). One driver + a per-language config TABLE, replacing the seven thin
+//  scope-gap-{ts,js,jsx,tsx,html,yaml,vue} adapter files: each was mostly the same `run(adapter)`
+//  literal differing only in corpus path / grammar path / scopeName / oracle / official path.
+//  Those vary as DATA here; the shared core stays scope-gap.ts's `run()`.
+//
+//  Run (bare node):  node test/scope-gap-run.ts <lang> [N|all]
+//                    lang ∈ ts | js | jsx | tsx | html | yaml | vue
+//
+//  Per-language entry is preserved as the `<lang>` PARAMETER (the npm scripts pass it). The
+//  thicker html / yaml specifics (multi-file official loader, fullSpan, differential) live in their
+//  TABLE ENTRY, not a separate file. VUE is genuinely different — it is an INJECTION grammar that
+//  needs vuejs/language-tools' own tokenizer (a bare Registry.loadGrammar never fires the directive
+//  / interpolation injections), so it cannot use `run()`; `<lang> vue` DELEGATES to scope-gap-vue.ts.
+// ─────────────────────────────────────────────────────────────────────────────
+import ts from 'typescript';
+import { readdirSync, readFileSync } from 'node:fs';
+import { dirname, join } from 'node:path';
+import { parse as yamlParse, parseAllDocuments } from 'yaml';
+import { run, type ScopeGapAdapter } from './scope-gap.ts';
+import { oracle } from './oracle.ts';
+import { yamlOracle } from './yaml-oracle.ts';
+import { htmlOracle } from './html-oracle.ts';
+import { walkCorpus } from './src-coverage-tsfamily.ts';
+import { JSX_CASES, HTML_GENERAL } from './curated-corpora.ts';
+import { cases as htmlIssueCases } from './html-issue-cases.ts';
+import { cases as yamlIssue12 } from './yaml-issue12-regressions.ts';
+
+const VSCODE_TM = '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions';
+// subset size from argv[3] (argv[2] is the language) / env SUBSET / default; `all` = the full corpus.
+const subN = (def = 400): number => { const a = process.argv[3]; return a === 'all' ? Infinity : Number(a ?? process.env.SUBSET ?? def); };
+const tsParseClean = (kind: ts.ScriptKind, fn: string) => (text: string): boolean => {
+  const sf = ts.createSourceFile(fn, text, ts.ScriptTarget.Latest, true, kind);
+  return (((sf as any).parseDiagnostics?.length ?? 0) === 0);
+};
+
+// One TS-family scope-gap adapter (TS/JS/JSX/TSX differ only by ScriptKind + corpus + paths).
+function tsFamily(o: { name: string; scopeName: string; kind: ts.ScriptKind; mono: string; officialEnv: string; officialDefault: string; fn: string; corpus: () => { name: string; text: string }[] }): ScopeGapAdapter {
+  return {
+    name: o.name, scopeName: o.scopeName,
+    officialPath: process.env[o.officialEnv] ?? o.officialDefault,
+    monogramPath: o.mono,
+    loadCorpus: o.corpus,
+    roleOracle: (text) => oracle(text, o.kind),
+    isGradable: tsParseClean(o.kind, o.fn),
+  };
+}
+
+// ── per-language config table ────────────────────────────────────────────────────────────────────
+const BUILDERS: Record<string, () => ScopeGapAdapter> = {
+  ts: () => {
+    // The TS entry strides over the FULL .ts file list then drops multi-file (@filename) fixtures —
+    // the original scope-gap-ts order (walk-all → stride-pick → filter), preserved so the metric is
+    // byte-identical (it differs subtly from walkCorpus, which filters before the stride).
+    const DIR = '/tmp/ts-repo/tests/cases/conformance/parser';
+    const all: string[] = [];
+    const walk = (d: string) => { for (const e of readdirSync(d, { withFileTypes: true })) { const f = join(d, e.name); if (e.isDirectory()) walk(f); else if (e.name.endsWith('.ts') && !e.name.endsWith('.d.ts')) all.push(f); } };
+    walk(DIR); all.sort();
+    return tsFamily({
+      name: 'TypeScript', scopeName: 'source.ts', kind: ts.ScriptKind.TS, mono: 'typescript.tmLanguage.json', fn: 'c.ts',
+      officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/typescript-basics/syntaxes/TypeScript.tmLanguage.json`,
+      corpus: () => { const N = subN(400); const pick = !isFinite(N) || N >= all.length ? all : Array.from({ length: N }, (_, i) => all[Math.floor(i * all.length / N)]); return pick.map((f) => ({ name: f, text: readFileSync(f, 'utf8') })).filter((x) => !/^\s*\/\/\s*@filename:/im.test(x.text)); },
+    });
+  },
+  js: () => tsFamily({
+    name: 'JavaScript (.js)', scopeName: 'source.js', kind: ts.ScriptKind.JS, mono: 'javascript.tmLanguage.json', fn: 'c.js',
+    officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/javascript/syntaxes/JavaScript.tmLanguage.json`,
+    corpus: () => walkCorpus(['/tmp/test262/test/language'], ['.js'], subN(800)).filter((c) => !c.file.endsWith('_FIXTURE.js')).map((c) => ({ name: c.file, text: c.code })),
+  }),
+  jsx: () => tsFamily({
+    name: 'JavaScriptReact (.jsx)', scopeName: 'source.js.jsx', kind: ts.ScriptKind.JSX, mono: 'javascriptreact.tmLanguage.json', fn: 'c.jsx',
+    officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/javascript/syntaxes/JavaScriptReact.tmLanguage.json`,
+    corpus: () => JSX_CASES.map((text, i) => ({ name: `<curated #${i}>`, text })),
+  }),
+  tsx: () => {
+    const BASE = '/tmp/ts-repo/tests/cases';
+    return tsFamily({
+      name: 'TypeScriptReact (.tsx)', scopeName: 'source.tsx', kind: ts.ScriptKind.TSX, mono: 'typescriptreact.tmLanguage.json', fn: 'c.tsx',
+      officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/typescript-basics/syntaxes/TypeScriptReact.tmLanguage.json`,
+      corpus: () => walkCorpus([`${BASE}/conformance`, `${BASE}/compiler`], ['.tsx'], subN(Infinity)).map((c) => ({ name: c.file, text: c.code })),
+    });
+  },
+  html: () => ({
+    name: 'HTML', scopeName: 'text.html.basic',
+    officialPath: process.env.MONOGRAM_OFFICIAL_HTML ?? `${VSCODE_TM}/html/syntaxes/html.tmLanguage.json`,
+    monogramPath: 'html.tmLanguage.json',
+    loadCorpus: () => [
+      ...HTML_GENERAL.map((text, i) => ({ name: `general#${i}`, text })),
+      ...htmlIssueCases.map((c: any, i: number) => ({ name: `issue:${c.title ?? i}`, text: c.src as string })),
+    ],
+    roleOracle: htmlOracle,
+  }),
+  yaml: () => {
+    // The "official" YAML baseline is the MAINTAINED RedCMD/VS Code grammar (microsoft/vscode#232244),
+    // a multi-file dispatcher that include()s version-specific sub-grammars in the same dir.
+    const OFFICIAL = process.env.MONOGRAM_OFFICIAL_YAML ?? '/tmp/redcmd-yaml/syntaxes/yaml.tmLanguage.json';
+    const SYN = dirname(OFFICIAL);
+    const SUITE = '/tmp/yaml-test-suite/src';
+    const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—+»/g, '\t').replace(/[↵∎]/g, '');
+    const corpus: { name: string; text: string }[] = [];
+    for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) {
+      try { const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8')); for (const t of (Array.isArray(meta) ? meta : [meta])) if (t && typeof t.yaml === 'string') corpus.push({ name: f, text: decode(t.yaml) }); } catch { /* skip */ }
+    }
+    for (const c of yamlIssue12) corpus.push({ name: `monogram#12 ${c.id}`, text: c.src });
+    return {
+      name: 'YAML', scopeName: 'source.yaml', officialPath: OFFICIAL, monogramPath: 'yaml.tmLanguage.json',
+      officialExtra: {
+        'source.yaml.1.2': join(SYN, 'yaml-1.2.tmLanguage.json'), 'source.yaml.1.1': join(SYN, 'yaml-1.1.tmLanguage.json'),
+        'source.yaml.1.0': join(SYN, 'yaml-1.0.tmLanguage.json'), 'source.yaml.1.3': join(SYN, 'yaml-1.3.tmLanguage.json'),
+        'source.yaml.embedded': join(SYN, 'yaml-embedded.tmLanguage.json'),
+      },
+      loadCorpus: () => corpus,
+      roleOracle: yamlOracle,
+      // Only grade valid YAML (the AST's key/value resolution is unreliable on malformed input); the
+      // invalid-input blind spot is covered by the asserted issue12 gate + the differential pass.
+      isGradable: (text) => { try { return parseAllDocuments(text).every((d: any) => d.errors.length === 0); } catch { return false; } },
+      fullSpan: true,       // YAML's oracle emits coarse, role-homogeneous spans — grade every char
+      differential: true,   // also report oracle-independent Monogram-vs-official divergences
+    };
+  },
+};
+
+const lang = process.argv[2];
+if (lang === 'vue') { await import('./scope-gap-vue.ts'); }   // injection grammar — its own tokenizer
+else {
+  const build = BUILDERS[lang];
+  if (!build) { console.error(`usage: node test/scope-gap-run.ts <ts|js|jsx|tsx|html|yaml|vue> [N|all]\nunknown language: ${lang ?? '(none)'}`); process.exit(1); }
+  await run(build());
+}
diff --git a/test/scope-gap-ts.ts b/test/scope-gap-ts.ts
deleted file mode 100644
index 92d4d72..0000000
--- a/test/scope-gap-ts.ts
+++ /dev/null
@@ -1,39 +0,0 @@
-// scope-gap-ts.ts — TypeScript adapter for the unified scope-gap harness. Demonstrates the
-// harness reproduces the official-vs-Monogram gap from a parser-role oracle
-// (oracle.ts = tsc → roles). Run (bare node): node test/scope-gap-ts.ts [N|all]
-import ts from 'typescript';
-import { readdirSync, readFileSync } from 'node:fs';
-import { join } from 'node:path';
-import { run } from './scope-gap.ts';
-import { oracle } from './oracle.ts';
-
-const PARSER_DIR = '/tmp/ts-repo/tests/cases/conformance/parser';
-const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM
-  ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/typescript-basics/syntaxes/TypeScript.tmLanguage.json';
-
-function walk(d: string): string[] {
-  let o: string[] = [];
-  for (const e of readdirSync(d, { withFileTypes: true })) {
-    const f = join(d, e.name);
-    if (e.isDirectory()) o = o.concat(walk(f));
-    else if (e.name.endsWith('.ts') && !e.name.endsWith('.d.ts')) o.push(f);
-  }
-  return o;
-}
-const arg = process.argv[2];
-const N = arg === 'all' ? Infinity : Number(arg ?? 400);
-const all = walk(PARSER_DIR).sort();
-const pick = !isFinite(N) || N >= all.length ? all : Array.from({ length: N }, (_, i) => all[Math.floor(i * all.length / N)]);
-
-await run({
-  name: 'TypeScript',
-  scopeName: 'source.ts',
-  officialPath: OFFICIAL,
-  monogramPath: 'typescript.tmLanguage.json',
-  loadCorpus: () => pick.map((f) => ({ name: f, text: readFileSync(f, 'utf8') })).filter((x) => !/^\s*\/\/\s*@filename:/im.test(x.text)),
-  roleOracle: (text) => oracle(text, ts.ScriptKind.TS),
-  isGradable: (text) => {
-    const sf = ts.createSourceFile('c.ts', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.TS);
-    return (((sf as any).parseDiagnostics?.length ?? 0) === 0);
-  },
-});
diff --git a/test/scope-gap-tsx.ts b/test/scope-gap-tsx.ts
deleted file mode 100644
index 270b832..0000000
--- a/test/scope-gap-tsx.ts
+++ /dev/null
@@ -1,28 +0,0 @@
-// scope-gap-tsx.ts — TSX (.tsx) adapter for the unified scope-gap harness. Grades VS Code's
-// OFFICIAL TypeScriptReact.tmLanguage.json AND Monogram's typescriptreact.tmLanguage.json against
-// the parser oracle (oracle.ts with ScriptKind.TSX). Both grammars declare scopeName `source.tsx`.
-// Corpus = the TypeScript repo's single-file .tsx tests (conformance/jsx + compiler).
-// Run (bare node): node test/scope-gap-tsx.ts [N|all]   (default: all — the .tsx set is small)
-import ts from 'typescript';
-import { run } from './scope-gap.ts';
-import { oracle } from './oracle.ts';
-import { walkCorpus, subsetArg } from './src-coverage-tsfamily.ts';
-
-const BASE = '/tmp/ts-repo/tests/cases';
-const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM
-  ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/typescript-basics/syntaxes/TypeScriptReact.tmLanguage.json';
-
-const corpus = walkCorpus([`${BASE}/conformance`, `${BASE}/compiler`], ['.tsx'], subsetArg(Infinity));
-
-await run({
-  name: 'TypeScriptReact (.tsx)',
-  scopeName: 'source.tsx',
-  officialPath: OFFICIAL,
-  monogramPath: 'typescriptreact.tmLanguage.json',
-  loadCorpus: () => corpus.map((c) => ({ name: c.file, text: c.code })),
-  roleOracle: (text) => oracle(text, ts.ScriptKind.TSX),
-  isGradable: (text) => {
-    const sf = ts.createSourceFile('c.tsx', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.TSX);
-    return (((sf as any).parseDiagnostics?.length ?? 0) === 0);
-  },
-});
diff --git a/test/scope-gap-yaml.ts b/test/scope-gap-yaml.ts
deleted file mode 100644
index 95e1e91..0000000
--- a/test/scope-gap-yaml.ts
+++ /dev/null
@@ -1,68 +0,0 @@
-// scope-gap-yaml.ts — YAML adapter for the unified scope-gap harness. NOTE: unlike most of the
-// vscode#203212 list, VS Code already switched YAML OFF the dead textmate/yaml.tmbundle TO the
-// maintained RedCMD/YAML-Syntax-Highlighter (microsoft/vscode#232244). So YAML's "official"
-// baseline here is that MAINTAINED grammar — this gap is Monogram vs a maintained competitor, not
-// a dead bundle. Default = RedCMD UPSTREAM; clone it first:
-//   git clone --depth 1 https://github.com/RedCMD/YAML-Syntax-Highlighter /tmp/redcmd-yaml
-// (VS Code's bundled YAML is the same grammar — identical result; set MONOGRAM_OFFICIAL_YAML to
-//  .../extensions/yaml/syntaxes/yaml.tmLanguage.json for that.) Oracle = the `yaml` package.
-// Run (bare node): node test/scope-gap-yaml.ts
-import { readdirSync, readFileSync } from 'node:fs';
-import { dirname, join } from 'node:path';
-import { parse as yamlParse, parseAllDocuments } from 'yaml';
-import { run } from './scope-gap.ts';
-import { yamlOracle } from './yaml-oracle.ts';
-import { cases as issue12 } from './yaml-issue12-regressions.ts';
-
-const OFFICIAL = process.env.MONOGRAM_OFFICIAL_YAML ?? '/tmp/redcmd-yaml/syntaxes/yaml.tmLanguage.json';
-// The RedCMD/VS Code YAML grammar is a dispatcher stub that include()s version-specific
-// sub-grammars in the same syntaxes/ dir — load them all, or the official scopes nothing.
-const SYN = dirname(OFFICIAL);
-const officialExtra: Record<string, string> = {
-  'source.yaml.1.2': join(SYN, 'yaml-1.2.tmLanguage.json'),
-  'source.yaml.1.1': join(SYN, 'yaml-1.1.tmLanguage.json'),
-  'source.yaml.1.0': join(SYN, 'yaml-1.0.tmLanguage.json'),
-  'source.yaml.1.3': join(SYN, 'yaml-1.3.tmLanguage.json'),
-  'source.yaml.embedded': join(SYN, 'yaml-embedded.tmLanguage.json'),
-};
-
-// Corpus: yaml-test-suite inputs (src meta-files; decode the visible-whitespace markers).
-const SUITE = '/tmp/yaml-test-suite/src';
-const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—+»/g, '\t').replace(/[↵∎]/g, '');
-const corpus: { name: string; text: string }[] = [];
-for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) {
-  try {
-    const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8'));
-    for (const t of (Array.isArray(meta) ? meta : [meta])) {
-      if (t && typeof t.yaml === 'string') corpus.push({ name: f, text: decode(t.yaml) });
-    }
-  } catch { /* skip */ }
-}
-// Plus the RedCMD monogram#12 repros (many are tiny edge/error inputs absent from the suite) so the
-// metric actually SEES the constructs the comment flagged. Asserted should-be scopes live in their
-// own gate (yaml-issue12-regressions.ts); here they just widen what the gap/differential pass covers.
-for (const c of issue12) corpus.push({ name: `monogram#12 ${c.id}`, text: c.src });
-
-await run({
-  name: 'YAML',
-  scopeName: 'source.yaml',
-  officialPath: OFFICIAL,
-  officialExtra,
-  monogramPath: 'yaml.tmLanguage.json',
-  loadCorpus: () => corpus,
-  roleOracle: yamlOracle,
-  // The GRADED headline stays valid-only: on malformed YAML the AST's key/value resolution is itself
-  // unreliable, so grading it would inject false "Monogram-wrong" tokens and poison the very signal
-  // we're making trustworthy. The invalid-input blind spot is instead closed by TWO mechanisms that
-  // stay honest there: (1) the asserted regression gate (yaml-issue12-regressions.ts) pins the
-  // should-be scope of the specific malformed repros (#4/#5/#8); (2) the differential pass below runs
-  // on ALL inputs and FLAGS invalid-input divergences for human review without auto-judging them.
-  isGradable: (text) => { try { return parseAllDocuments(text).every((d: any) => d.errors.length === 0); } catch { return false; } },
-  // YAML's oracle emits COARSE, role-homogeneous spans (a whole plain scalar, a block-scalar body, a
-  // directive line); grade every char so a bug mid-span (a `%YAML` folded into a scalar, a block line
-  // bailing to a comment) is caught instead of hidden behind a correct start. See scope-gap.ts.
-  fullSpan: true,
-  // Also report oracle-INDEPENDENT divergences (Monogram vs official, where the oracle is silent) so a
-  // construct the CST oracle doesn't model can't become a silent blind spot. See scope-gap.ts.
-  differential: true,
-});
diff --git a/test/src-coverage-js.ts b/test/src-coverage-js.ts
deleted file mode 100644
index f6c1ec4..0000000
--- a/test/src-coverage-js.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-// src-coverage-js.ts — JavaScript (.js, VS Code "javascript") entrypoint.
-// Official parser = typescript.js with ScriptKind.JS (this IS VS Code's built-in JS support);
-// Monogram grammar = javascript.ts. The TS test suite has ~no .js corpus, so we use Test262
-// (tc39/test262) — the canonical ECMAScript corpus, including negative parse tests (great
-// reject cases). Provision once:
-//   git clone --depth 1 https://github.com/tc39/test262 /tmp/test262
-// Run (bare node): node test/src-coverage-js.ts [N|all]   (Test262 is huge; default sample 800)
-//
-// Note: VS Code's `javascript` (ScriptKind.JS) ALLOWS JSX, but Monogram's javascript.ts models
-// no JSX (that lives in javascriptreact.ts). Test262 is pure ECMAScript with no JSX, so this
-// definitional gap doesn't trigger here — the comparison stays clean.
-
-import ts from 'typescript';
-import { run } from './src-coverage.ts';
-import { tsFamilyAdapter, walkCorpus, subsetArg } from './src-coverage-tsfamily.ts';
-
-const BASE = '/tmp/test262/test/language'; // the syntax-relevant subtree of Test262
-const corpus = walkCorpus([BASE], ['.js'], subsetArg(800)).filter((c) => !c.file.endsWith('_FIXTURE.js'));
-console.log(`JavaScript corpus: ${corpus.length} Test262 .js cases (test/language, stride-sampled).`);
-
-await run(tsFamilyAdapter({
-  name: 'JavaScript (.js)',
-  scriptKind: ts.ScriptKind.JS,
-  grammar: (await import('../javascript.ts')).default,
-  corpus,
-  originBase: '/tmp/test262',
-}));
diff --git a/test/src-coverage-jsx.ts b/test/src-coverage-jsx.ts
deleted file mode 100644
index 0cbeb2c..0000000
--- a/test/src-coverage-jsx.ts
+++ /dev/null
@@ -1,60 +0,0 @@
-// src-coverage-jsx.ts — JSX (.jsx, VS Code "javascriptreact") entrypoint.
-// Official parser = typescript.js with ScriptKind.JSX; Monogram grammar = javascriptreact.ts
-// (JS + JSX, NO TypeScript types). Neither the TS suite nor Test262 ships a .jsx corpus, so
-// this uses a CURATED set exercising both halves (plain JS + JSX constructs). It is small, so
-// completeness% is honestly low; a real .jsx corpus is a follow-up. Run: node test/src-coverage-jsx.ts
-
-import ts from 'typescript';
-import { run } from './src-coverage.ts';
-import { tsFamilyAdapter } from './src-coverage-tsfamily.ts';
-
-// No TS types — these are .jsx (JavaScript + JSX) only.
-const JSX_CASES: string[] = [
-  // --- plain JS half ---
-  'const x = 1, y = 2;',
-  'function f(a, b = 1, ...rest) { return a + b + rest.length; }',
-  'class C extends B { #p = 1; static s() {} get v() { return this.#p; } }',
-  'const g = async (x) => { for await (const v of x) console.log(v); };',
-  'const { a, b: { c } = {}, ...r } = obj;',
-  'label: for (let i = 0; i < 10; i++) { if (i) continue label; }',
-  'try { risky(); } catch { recover(); } finally { done(); }',
-  'a ??= b; c ||= d; e &&= f; g?.h?.[i]?.(j);',
-  'const t = `a${b}c${d}e`, n = 1_000_000n, hex = 0xFF, oct = 0o17, bin = 0b101;',
-  'export default function () {}; export const z = 1; export * from "m";',
-  'import def, { named as alias } from "mod"; import * as ns from "ns";',
-  'switch (x) { case 1: break; default: { let y = 2; } }',
-  'do { step(); } while (cond);',
-  'const re = /foo\\d+/giu; const s = "a\\u{1F600}b";',
-  'new.target; import.meta.url; function* gen() { yield* other(); }',
-  // --- JSX half ---
-  'const a = <div />;',
-  'const b = <div className="x" id={y} data-z={1} {...props}>text</div>;',
-  'const frag = <><Alpha /><Beta /></>;',
-  'const member = <Foo.Bar.Baz prop={1} />;',
-  'const ns = <svg:rect width="10" />;',
-  'const nested = <Outer header={<Inner title="x" />}>{children}</Outer>;',
-  'const cond = ok ? <Yes /> : <No />;',
-  'const list = items.map((it) => <li key={it.id}>{it.label}</li>);',
-  'const guard = <div>{show && <Modal />}{count || <Empty />}</div>;',
-  'const text = <p> leading {a} middle {b} trailing </p>;',
-  'const selfClosingVoid = <input type="text" disabled />;',
-  'const entity = <span>a &amp; b &lt; c &#x1F600;</span>;',
-  'const multiline = (\n  <section>\n    <h1>Title</h1>\n    <p>Body</p>\n  </section>\n);',
-  'const exprChild = <div>{/* comment */}{items.length}</div>;',
-  'const spreadChild = <List>{...rows}</List>;',
-  'function App() { return <main><Header /><Content /></main>; }',
-  'const attrExpr = <a href={"/" + slug} onClick={() => go()}>link</a>;',
-  'const deep = <a><b><c>deep</c></b></a>;',
-  'const stringAttr = <div title=\'single\' alt="double" />;',
-  'const boolAttr = <button autofocus formNoValidate>ok</button>;',
-];
-
-const corpus = JSX_CASES.map((code, i) => ({ file: `<curated #${i}>`, code }));
-console.log(`JSX corpus: ${corpus.length} curated .jsx snippets (no .jsx corpus exists in the TS suite / Test262; partial — completeness% will be low).`);
-
-await run(tsFamilyAdapter({
-  name: 'JavaScriptReact (.jsx)',
-  scriptKind: ts.ScriptKind.JSX,
-  grammar: (await import('../javascriptreact.ts')).default,
-  corpus,
-}));
diff --git a/test/src-coverage-run.ts b/test/src-coverage-run.ts
new file mode 100644
index 0000000..2be7dff
--- /dev/null
+++ b/test/src-coverage-run.ts
@@ -0,0 +1,52 @@
+// ─────────────────────────────────────────────────────────────────────────────
+//  src-coverage-run.ts — the UNIFIED, data-driven entry for the source-coverage parser-alignment
+//  metric (monogram#25 part 2B). One driver + a per-language config TABLE, replacing the four thin
+//  src-coverage-{ts,js,jsx,tsx} adapters: each was just a corpus + ScriptKind + grammar over the
+//  SHARED `tsFamilyAdapter` (the accept/reject oracle) and `run()` core (src-coverage.ts).
+//
+//  Run (bare node):  node test/src-coverage-run.ts <lang> [N|all]
+//                    lang ∈ ts | js | jsx | tsx | html | yaml
+//
+//  The thicker html / yaml adapters use a DIFFERENT oracle (html = parse5 STRUCTURAL tree-equality,
+//  yaml = the `yaml` package accept/reject) and their own corpus, so they keep their files;
+//  `<lang> html|yaml` DELEGATES to them. The per-language entry stays a `<lang>` parameter throughout.
+// ─────────────────────────────────────────────────────────────────────────────
+import ts from 'typescript';
+import { run } from './src-coverage.ts';
+import { tsFamilyAdapter, walkCorpus, type TsFamilyCase } from './src-coverage-tsfamily.ts';
+import { JSX_CASES } from './curated-corpora.ts';
+
+const subN = (def = 400): number => { const a = process.argv[3]; return a === 'all' ? Infinity : Number(a ?? process.env.SUBSET ?? def); };
+
+const lang = process.argv[2];
+
+// html / yaml use a different oracle + corpus → their own files; delegate (preserves the `<lang>` entry).
+if (lang === 'html') { await import('./src-coverage-html.ts'); }
+else if (lang === 'yaml') { await import('./src-coverage-yaml.ts'); }
+else {
+  // ── TS-family config table: ts/js/jsx/tsx differ only by ScriptKind + grammar + corpus ──
+  const TS_BASE = '/tmp/ts-repo/tests/cases';
+  const BUILDERS: Record<string, () => Promise<{ opts: Parameters<typeof tsFamilyAdapter>[0]; note: string }>> = {
+    ts: async () => {
+      const corpus = walkCorpus([`${TS_BASE}/conformance`], ['.ts'], subN(400));
+      return { opts: { name: 'TypeScript (.ts)', scriptKind: ts.ScriptKind.TS, grammar: (await import('../typescript.ts')).default, corpus, originBase: `${TS_BASE}/conformance` }, note: `${corpus.length} single-file .ts cases (tests/cases/conformance).` };
+    },
+    js: async () => {
+      const corpus = walkCorpus(['/tmp/test262/test/language'], ['.js'], subN(800)).filter((c) => !c.file.endsWith('_FIXTURE.js'));
+      return { opts: { name: 'JavaScript (.js)', scriptKind: ts.ScriptKind.JS, grammar: (await import('../javascript.ts')).default, corpus, originBase: '/tmp/test262' }, note: `${corpus.length} Test262 .js cases (test/language, stride-sampled).` };
+    },
+    jsx: async () => {
+      const corpus: TsFamilyCase[] = JSX_CASES.map((code, i) => ({ file: `<curated #${i}>`, code }));
+      return { opts: { name: 'JavaScriptReact (.jsx)', scriptKind: ts.ScriptKind.JSX, grammar: (await import('../javascriptreact.ts')).default, corpus }, note: `${corpus.length} curated .jsx cases.` };
+    },
+    tsx: async () => {
+      const corpus = walkCorpus([`${TS_BASE}/conformance`, `${TS_BASE}/compiler`], ['.tsx'], subN(Infinity));
+      return { opts: { name: 'TypeScriptReact (.tsx)', scriptKind: ts.ScriptKind.TSX, grammar: (await import('../typescriptreact.ts')).default, corpus, originBase: TS_BASE }, note: `${corpus.length} single-file .tsx cases (conformance + compiler).` };
+    },
+  };
+  const build = BUILDERS[lang];
+  if (!build) { console.error(`usage: node test/src-coverage-run.ts <ts|js|jsx|tsx|html|yaml> [N|all]\nunknown language: ${lang ?? '(none)'}`); process.exit(1); }
+  const { opts, note } = await build();
+  console.log(`${opts.name} corpus: ${note}`);
+  await run(tsFamilyAdapter(opts));
+}
diff --git a/test/src-coverage-ts.ts b/test/src-coverage-ts.ts
deleted file mode 100644
index b9f7a57..0000000
--- a/test/src-coverage-ts.ts
+++ /dev/null
@@ -1,27 +0,0 @@
-// src-coverage-ts.ts — TypeScript (.ts) entrypoint for the source-coverage alignment metric.
-// Thin: corpus + dialect knobs only; the TS-family adapter is in ./src-coverage-tsfamily.ts
-// and the coverage harness in ./src-coverage.ts.
-//
-// Oracle/corpus/Monogram-invocation mirror the accept/reject oracle: ts.createSourceFile (TS),
-// accept iff no parseDiagnostics; /tmp/ts-repo/tests/cases/conformance, single-file .ts.
-//
-// Run (Node 24+, bare node — NOT tsx):
-//   node test/src-coverage-ts.ts            # default subset (env SUBSET, default 400)
-//   node test/src-coverage-ts.ts 1000       # subset size as arg
-//   node test/src-coverage-ts.ts all        # full single-file corpus
-
-import ts from 'typescript';
-import { run } from './src-coverage.ts';
-import { tsFamilyAdapter, walkCorpus, subsetArg } from './src-coverage-tsfamily.ts';
-
-const BASE = '/tmp/ts-repo/tests/cases';
-const corpus = walkCorpus([`${BASE}/conformance`], ['.ts'], subsetArg());
-console.log(`TypeScript corpus: ${corpus.length} single-file .ts cases (tests/cases/conformance).`);
-
-await run(tsFamilyAdapter({
-  name: 'TypeScript (.ts)',
-  scriptKind: ts.ScriptKind.TS,
-  grammar: (await import('../typescript.ts')).default,
-  corpus,
-  originBase: `${BASE}/conformance`,
-}));
diff --git a/test/src-coverage-tsx.ts b/test/src-coverage-tsx.ts
deleted file mode 100644
index 2485da2..0000000
--- a/test/src-coverage-tsx.ts
+++ /dev/null
@@ -1,20 +0,0 @@
-// src-coverage-tsx.ts — TSX (.tsx, VS Code "typescriptreact") entrypoint.
-// Same official parser as TS (typescript.js) but ScriptKind.TSX + the typescriptreact grammar.
-// Corpus = the TypeScript repo's .tsx tests (conformance/jsx + compiler), single-file.
-// Run (bare node): node test/src-coverage-tsx.ts [N|all]   (default: all — the .tsx set is small)
-
-import ts from 'typescript';
-import { run } from './src-coverage.ts';
-import { tsFamilyAdapter, walkCorpus, subsetArg } from './src-coverage-tsfamily.ts';
-
-const BASE = '/tmp/ts-repo/tests/cases';
-const corpus = walkCorpus([`${BASE}/conformance`, `${BASE}/compiler`], ['.tsx'], subsetArg(Infinity));
-console.log(`TSX corpus: ${corpus.length} single-file .tsx cases (conformance + compiler).`);
-
-await run(tsFamilyAdapter({
-  name: 'TypeScriptReact (.tsx)',
-  scriptKind: ts.ScriptKind.TSX,
-  grammar: (await import('../typescriptreact.ts')).default,
-  corpus,
-  originBase: BASE,
-}));
diff --git a/test/ts-ast.ts b/test/ts-ast.ts
deleted file mode 100644
index a82b482..0000000
--- a/test/ts-ast.ts
+++ /dev/null
@@ -1,9 +0,0 @@
-import ts from 'typescript';
-const code = process.argv[2] ?? `x ? y => ({ y }) : z => ({ z })`;
-const sf = ts.createSourceFile('t.ts', code, ts.ScriptTarget.Latest, true);
-function show(n: ts.Node, d=0){
-  console.log('  '.repeat(d) + ts.SyntaxKind[n.kind] + (n.kind===ts.SyntaxKind.Identifier?`(${(n as any).text})`:''));
-  n.forEachChild(c=>show(c,d+1));
-}
-show(sf);
-console.log('parseDiagnostics:', (sf as any).parseDiagnostics?.length ?? 0);
diff --git a/test/yaml-diag.ts b/test/yaml-diag.ts
deleted file mode 100644
index 01e875a..0000000
--- a/test/yaml-diag.ts
+++ /dev/null
@@ -1,40 +0,0 @@
-// Throwaway diagnostic: categorize yaml-test-suite inputs as FN (yaml accepts, we reject) /
-// FP (yaml rejects, we accept) with Monogram's error, to drive grammar work. Run: node test/yaml-diag.ts
-import { readdirSync, readFileSync } from 'node:fs';
-import { parse as yamlParse, parseAllDocuments } from 'yaml';
-import { createParser } from '../src/gen-parser.ts';
-import grammar from '../yaml.ts';
-
-const { parse } = createParser(grammar);
-const SUITE = '/tmp/yaml-test-suite/src';
-const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—*»/g, '\t').replace(/[↵∎]/g, '');
-const corpus: { code: string; origin: string; name: string }[] = [];
-for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) {
-  try {
-    const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8'));
-    for (const t of (Array.isArray(meta) ? meta : [meta])) {
-      if (t && typeof t.yaml === 'string') corpus.push({ code: decode(t.yaml), origin: f, name: t.name ?? '' });
-    }
-  } catch { /* skip */ }
-}
-const oAccept = (c: string) => { try { return parseAllDocuments(c).every((d: any) => d.errors.length === 0); } catch { return false; } };
-const mRes = (c: string) => { try { parse(c); return { ok: true, err: '' }; } catch (e) { return { ok: false, err: String((e as Error).message).split('\n')[0] }; } };
-
-const FN: any[] = [], FP: any[] = [];
-let TP = 0, TN = 0;
-for (const x of corpus) {
-  const o = oAccept(x.code), m = mRes(x.code);
-  if (o && m.ok) TP++; else if (o && !m.ok) FN.push({ ...x, err: m.err }); else if (!o && m.ok) FP.push(x); else TN++;
-}
-console.log(`corpus ${corpus.length}: TP=${TP} FN=${FN.length} FP=${FP.length} TN=${TN}`);
-
-// Group FN by Monogram error message (the failure mode).
-const byErr = new Map<string, any[]>();
-for (const x of FN) { const k = x.err.replace(/offset \d+/, 'offset N'); (byErr.get(k) ?? byErr.set(k, []).get(k)!).push(x); }
-console.log(`\n=== FN grouped by error (${byErr.size} kinds) ===`);
-for (const [err, xs] of [...byErr.entries()].sort((a, b) => b[1].length - a[1].length)) {
-  console.log(`\n[${xs.length}] ${err}`);
-  for (const x of xs.slice(0, 4)) console.log(`   ${JSON.stringify(x.code.slice(0, 60))}`);
-}
-console.log(`\n=== FP sample (yaml rejects, we accept) — ${FP.length} ===`);
-for (const x of FP.slice(0, 18)) console.log(`   ${JSON.stringify(x.code.slice(0, 60))}`);
diff --git a/test/yaml-poc.ts b/test/yaml-poc.ts
deleted file mode 100644
index fd0635d..0000000
--- a/test/yaml-poc.ts
+++ /dev/null
@@ -1,33 +0,0 @@
-// Throwaway PoC: verify the indentation lexer emits correct INDENT/DEDENT/NEWLINE and that the
-// first-cut yaml.ts grammar parses common documents. Run: node test/yaml-poc.ts
-import { createLexer } from '../src/gen-lexer.ts';
-import { createParser } from '../src/gen-parser.ts';
-import grammar from '../yaml.ts';
-
-const { tokenize } = createLexer(grammar);
-const { parse } = createParser(grammar);
-
-const samples = [
-  'a: 1\nb: 2',
-  'a:\n  b: 1\n  c: 2\nd: 3',
-  '- one\n- two\n- three',
-  'key:\n  - a\n  - b',
-  'nested:\n  list:\n    - x\n    - y\n  val: z',
-  '{a: 1, b: 2}',
-  '[1, 2, 3]',
-  'name: "John"\nage: 30',
-  'list: [a, b, c]',
-  '# comment\nkey: value  # trailing',
-];
-
-const show = (t: any) =>
-  t.type === 'Indent' ? '»IND' : t.type === 'Dedent' ? '«DED' : t.type === 'Newline' ? '⏎NL'
-  : t.type === '' ? JSON.stringify(t.text) : `${t.type}(${JSON.stringify(t.text)})`;
-
-for (const s of samples) {
-  console.log('\n=== ' + JSON.stringify(s) + ' ===');
-  let toks: any[];
-  try { toks = tokenize(s); } catch (e) { console.log('  LEX THREW:', (e as Error).message); continue; }
-  console.log('  toks:', toks.map(show).join(' '));
-  try { parse(s); console.log('  PARSE: ok'); } catch (e) { console.log('  PARSE FAIL:', (e as Error).message.split('\n')[0]); }
-}

From 1a28ba28fc3da7830734dabe0b5249b26068b36a Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Tue, 9 Jun 2026 05:36:51 +0800
Subject: [PATCH 2/6] Generator: directed tokenCover strategy for deterministic
 per-token coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The generated legal corpus never reached whole scoped token classes the
scope≡role judge checks — for TypeScript, numerics (Hex/Octal/Binary/BigInt/
Number), because the legal corpus is shallow/structural and never lands on an
expression-position literal (proven: raising cap/fuzz still yields zero numerics).

Add a 5th strategy `tokenCover`: for each scoped, samplable token, descend the
SHORTEST path from the entry rule that references it (reusing the distTo/exprDist
BFS), build a minimal legal context (fillContent/minExpand), and substitute
sampleVariants. Deterministic and minimal-context, so it stays cheap on the large
TS grammar (no depth strategies for token-stream). Also sweep all top-level
token-pattern `alt` branches in sampleVariants (so a Number emits hex/oct/bin/
float/bigint, not just `0`), guarded against the interesting-literal embed for
decimal-start / start()-anchored tokens (no `-0x1`, no broken column-0 anchor).

TS declared-scope tokens checked 157→326 (numerics now graded); generative 7/7
consistent, depth-site 2/2 (#23/#24 intact); agnostic 9/9.
---
 test/grammar-gen.ts | 184 +++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 182 insertions(+), 2 deletions(-)

diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts
index 31eef55..4b539bf 100644
--- a/test/grammar-gen.ts
+++ b/test/grammar-gen.ts
@@ -24,6 +24,7 @@
 //   • fuzzing — random production choices, for deeper / wider structures.
 // ─────────────────────────────────────────────────────────────────────────────
 import type { CstGrammar, RuleExpr, RuleDecl, TokenDecl, TokenPattern, TokenCharClassItem } from '../src/types.ts';
+import { tokenPatternStartsWithDecimal, tokenPatternHasStartAnchor } from '../src/token-pattern.ts';
 
 // Max emissions in one derivation. A deep tree of 2-rep quantifiers grows the list multiplicatively;
 // copying huge lists (not the call count) is what makes a naive enumerator hang — cap it.
@@ -110,10 +111,30 @@ function sample(pat: TokenPattern, ctx: SampleCtx): string | null {
   }
 }
 
+// The number of branches in the SHALLOWEST `alt` reachable through the pattern's
+// leading seq/group/repeat spine — the branches that a different `variant` index makes
+// `sample` rotate through (it picks `variant % items.length` at each alt). A token whose
+// value is an alternation of forms (a Number's int / float branches, a string's escape
+// alternatives) needs at least this many variant indices for EVERY branch to be emitted,
+// not just branch 0 — otherwise the budget caps it at the first form (`0`, never `1.5`).
+function topAltBranches(pat: TokenPattern): number {
+  if (typeof pat === 'string') return 1;
+  switch (pat.type) {
+    case 'alt': return pat.items.length;
+    case 'seq': return Math.max(1, ...pat.items.map(topAltBranches));
+    case 'repeat': return topAltBranches(pat.body);
+    default: return 1;
+  }
+}
+
 // Sample several distinct, legal texts for a token (variants + interesting-literal embeds).
 function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[] }, n: number): string[] {
   const out = new Set<string>();
-  for (let v = 0; v < n + 2 && out.size < n; v++) {
+  // Cover every top-level alt branch: a token that is itself an alternation (hex/oct/bin/float
+  // forms) must emit ALL its branches, not stop at branch 0 once `n` distinct samples are reached —
+  // so the budget is at least the branch count, and the all-branch sweep is NOT capped by `out.size`.
+  const budget = Math.max(n + 2, topAltBranches(decl.pattern) + 2);
+  for (let v = 0; v < budget; v++) {
     const s = sample(decl.pattern, { ...ctx, variant: v });
     if (s !== null && s.length > 0) out.add(s);
   }
@@ -123,7 +144,13 @@ function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting:
   // the result is still a single legal instance of the token — this is what produces the
   // monogram#23 shape (a plain scalar whose text is `--- x`). Verified per-token by re-lexing
   // in the driver; an embed that doesn't re-lex to this token is simply dropped there.
-  if (base.length >= 1) {
+  // GUARD: a token whose pattern starts with a DECIMAL digit (`0x1F`, `1.5`) or carries a
+  // `start()` line/stream anchor (a shebang `^#!…`) must NOT get a leading-literal embed: gluing
+  // `-`/`#`/`---` on front re-lexes as a different token (`-0x1` = minus + number, `#0x1` ≠ hex)
+  // or breaks the column-0 anchor — so the embed would never round-trip back to THIS token. The
+  // pure-variant samples above already cover such tokens; only free-form tokens take the embeds.
+  const anchored = tokenPatternStartsWithDecimal(decl) || tokenPatternHasStartAnchor(decl);
+  if (base.length >= 1 && !anchored) {
     for (const lit of ctx.interesting) {
       if (lit.length === 0 || /[\n\r]/.test(lit)) continue;
       out.add(lit + base);            // glued leading boundary (`---` + `x` → `---x`)
@@ -156,6 +183,7 @@ class Walker {
   structKind = new Map<string, 'indent' | 'dedent' | 'newline'>();
   compactLits: Set<string>;
   reachMap = new Map<string, Set<string>>();   // rule → every rule it can transitively reach
+  tokenHostRules = new Map<string, string[]>(); // token name → rules whose body DIRECTLY references it
   ruleMin = new Map<string, Emission[] | null>();
   rand: () => number;
   cap: number;
@@ -179,9 +207,29 @@ class Walker {
     this.compactLits = new Set(grammar.indent?.compactIndicators ?? []);
     this.interesting = this.collectInteresting();
     this.computeReach();
+    this.computeTokenHosts();
     this.computeMins();
   }
 
+  // For each token, the rules whose body DIRECTLY references it (`ref` to a token name). This is the
+  // entry point of tokenCover's directed descent: a scoped token only ever appears at these rules, so
+  // building the shortest legal path to one of them and substituting the token covers it. A token with
+  // NO host rule (a lexer-trivia token the parser never consumes — a shebang / JSDoc comment, skipped
+  // before the token stream) is unreachable by ANY derivation and is left out (it is not a CST leaf).
+  computeTokenHosts(): void {
+    for (const r of this.grammar.rules) {
+      const toks = new Set<string>();
+      const go = (e: RuleExpr) => { switch (e.type) {
+        case 'ref': if (this.isToken(e.name)) toks.add(e.name); break;
+        case 'seq': case 'alt': e.items.forEach(go); break;
+        case 'quantifier': case 'group': case 'not': go(e.body); break;
+        case 'sep': go(e.element); break;
+      } };
+      go(r.body);
+      for (const tn of toks) (this.tokenHostRules.get(tn) ?? this.tokenHostRules.set(tn, []).get(tn)!).push(r.name);
+    }
+  }
+
   computeReach(): void {
     const refs = (e: RuleExpr, acc: Set<string>) => {
       switch (e.type) {
@@ -524,6 +572,114 @@ class Walker {
       case 'op': case 'prefix': case 'postfix': return [];
     }
   }
+
+  // ── DIRECTED TOKEN COVERAGE ──────────────────────────────────────────────────────────────────────
+  // The same directed-descent idea as nestChain, but the target is a scoped TOKEN, not a self-recursive
+  // RULE. A grammar-derived LEGAL corpus is shallow/structural and never reaches an expression-position
+  // literal: every numeric, every private field — the scoped leaves the scope≡role judge checks — appears
+  // ZERO times. tokenCover fixes that by, for each scoped token, building the SHORTEST legal path from the
+  // entry rule to a rule that references it (the SAME reversed-BFS the nesting strategies use, retargeted
+  // at a token via its host rules) and substituting real samples of the token there. Minimal context only
+  // (shortest path + minExpand filler), so it stays cheap on a 50-rule grammar.
+
+  // shortest rule-ref distance FROM each rule TO any rule that references `tokenName` (reversed-BFS, like
+  // distTo but seeded at the token's host rules). Memoised. Infinity-absent ⇒ the rule can't reach the token.
+  // A host rule starts at distance 1 (entering its body costs one ref step to reach the direct token use);
+  // a DIRECT `ref:token` in an expression is 0. The gap is what makes the descent STOP at the first direct
+  // token use instead of recursing into a self-recursive host (`Type` → `aa is Type → …` never terminating):
+  // `ref:token` (0) strictly beats `ref:host` (≥1), so a `seq`/`alt`'s shortest branch is the one that
+  // actually places the token here, not the one that re-enters a host rule that also eventually reaches it.
+  tokenDistCache = new Map<string, Map<string, number>>();
+  tokenDistTo(tokenName: string): Map<string, number> {
+    let m = this.tokenDistCache.get(tokenName); if (m) return m;
+    m = new Map<string, number>();
+    const back = new Map<string, string[]>();
+    for (const r of this.grammar.rules) for (const ref of this.directRuleRefs(r.body)) (back.get(ref) ?? back.set(ref, []).get(ref)!).push(r.name);
+    const queue: string[] = [];
+    for (const host of this.tokenHostRules.get(tokenName) ?? []) if (!m.has(host)) { m.set(host, 1); queue.push(host); }   // host rule body = 1 step from the direct token use
+    while (queue.length) { const cur = queue.shift()!; const d = m.get(cur)!; for (const pre of back.get(cur) ?? []) if (!m.has(pre)) { m.set(pre, d + 1); queue.push(pre); } }
+    this.tokenDistCache.set(tokenName, m); return m;
+  }
+  // min rule-ref distance from an expression to `tokenName` — 0 if it DIRECTLY refs the token (a direct
+  // use strictly beats re-entering a host rule, so the descent terminates at the token, see tokenDistTo).
+  exprDistToToken(e: RuleExpr, tokenName: string): number {
+    const dm = this.tokenDistTo(tokenName);
+    switch (e.type) {
+      case 'ref': return e.name === tokenName ? 0 : (dm.has(e.name) ? dm.get(e.name)! : Infinity);
+      case 'seq': case 'alt': return Math.min(Infinity, ...e.items.map((i) => this.exprDistToToken(i, tokenName)));
+      case 'quantifier': case 'group': case 'not': return this.exprDistToToken(e.body, tokenName);
+      case 'sep': return this.exprDistToToken(e.element, tokenName);
+      default: return Infinity;
+    }
+  }
+  exprReachesToken(e: RuleExpr, tokenName: string): boolean { return this.exprDistToToken(e, tokenName) < Infinity; }
+
+  // Scoped tokens that tokenCover CAN reach: a declared `.scope`, a samplable pattern (not a `never()`
+  // structural placeholder), and at least one host rule reachable from the entry. A trivia token the
+  // parser never consumes (no host rule — a shebang / doc comment) is excluded HERE: no rule path reaches
+  // it (it is handled, where it can be at all, by `prefixOnlyTokens`).
+  coverableTokens(entryName: string): TokenDecl[] {
+    return this.grammar.tokens.filter((t) => {
+      if (!t.scope) return false;
+      if (typeof t.pattern !== 'string' && t.pattern.type === 'never') return false;   // structural placeholder
+      const dm = this.tokenDistTo(t.name);
+      return dm.has(entryName) || (this.tokenHostRules.get(t.name) ?? []).includes(entryName);
+    });
+  }
+
+  // Scoped tokens NO rule references but that carry a `start()` line/stream anchor (a shebang `^#!…`) —
+  // the parser treats them as leading trivia (skipped, never a CST leaf), so coverableTokens can't reach
+  // them, yet they ARE a legal document PREFIX the highlighter scopes. We emit each as a stand-alone line
+  // so the generated corpus contains it; it can only be the first emission (the anchor), which a one-token
+  // input trivially satisfies. (Such a token is not a CST leaf, so the scope≡role gate does not grade it —
+  // this widens the round-trip corpus, not the leaf check.)
+  prefixOnlyTokens(): TokenDecl[] {
+    return this.grammar.tokens.filter((t) =>
+      !!t.scope &&
+      !(typeof t.pattern !== 'string' && t.pattern.type === 'never') &&
+      !this.tokenHostRules.has(t.name) &&
+      tokenPatternHasStartAnchor(t));
+  }
+
+  // Build the minimal legal context from `entry` down to `tokenName`, with the token rendered as
+  // `sampleText` at its position. Descends the SHORTEST branch toward the token at each node and
+  // minimal-fills everything else — the directed, deterministic analogue of nestChain for a token.
+  coverToken(entryBody: RuleExpr, tokenName: string, sampleText: string): Emission[] {
+    this.coverFuel = 400;
+    return this.coverRec(entryBody, tokenName, sampleText);
+  }
+  coverFuel = 0;
+  coverRec(e: RuleExpr, tokenName: string, sampleText: string): Emission[] {
+    if (--this.coverFuel <= 0 || !this.exprReachesToken(e, tokenName)) return this.minExpand(e) ?? [];
+    switch (e.type) {
+      case 'literal': return [{ t: 'lit', value: e.value }];
+      case 'ref': {
+        if (e.name === tokenName) return [{ t: 'tok', name: e.name, text: sampleText }];               // THE target token → the sample
+        if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }];
+        if (this.isToken(e.name)) { const v = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 }); return [{ t: 'tok', name: e.name, text: v || 'x' }]; }
+        return this.coverRec(this.ruleByName.get(e.name)!.body, tokenName, sampleText);                // descend into the rule
+      }
+      case 'seq': {
+        // descend the ONE item closest to the token; minimal-fill the rest → the shortest legal frame.
+        let idx = -1, best = Infinity;
+        e.items.forEach((it, i) => { const d = this.exprDistToToken(it, tokenName); if (d < best) { best = d; idx = i; } });
+        const out: Emission[] = [];
+        e.items.forEach((it, i) => { for (const x of (i === idx ? this.coverRec(it, tokenName, sampleText) : this.minExpand(it) ?? [])) out.push(x); });
+        return out;
+      }
+      case 'alt': {
+        // the branch that reaches the token soonest (so the frame actually contains it).
+        let pick = e.items[0], best = Infinity;
+        for (const it of e.items) { const d = this.exprDistToToken(it, tokenName); if (d < best) { best = d; pick = it; } }
+        return this.coverRec(pick, tokenName, sampleText);
+      }
+      case 'quantifier': return this.coverRec(e.body, tokenName, sampleText);   // fire exactly one rep (it carries the token)
+      case 'group': return this.coverRec(e.body, tokenName, sampleText);
+      case 'sep': return this.coverRec(e.element, tokenName, sampleText);       // one element (it carries the token)
+      case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore':
+      case 'op': case 'prefix': case 'postfix': return [];
+    }
+  }
 }
 
 // ─── MATERIALIZE: emissions → text + token spans ──────────────────────────────────
@@ -690,5 +846,29 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI
     for (let i = 0; i < Math.ceil(fuzzRounds / 8); i++) push(w.fuzz(r.body, depth + 2), `fuzz:${rn}`, rn);
   }
 
+  // 5) DIRECTED TOKEN COVERAGE — for each scoped token, the shortest legal context from the entry rule
+  //    with several real samples of the token at its position. The bounded-exhaustive / fuzz strategies
+  //    only reach a shallow structural skeleton, so an expression-position literal (every numeric, the
+  //    private field) — exactly the scoped leaves the scope≡role judge checks — is otherwise NEVER
+  //    generated. Each context is minimal (shortest path + minExpand filler), so this stays cheap even
+  //    on the 50-rule TS grammar and needs no depth budget. The samples are guard-filtered (sampleVariants
+  //    skips the leading-literal embeds for decimal-/anchor-led tokens, so `0x1F` is never mangled to `-0x1F`).
+  for (const tok of w.coverableTokens(entry.name)) {
+    if (timeUp()) break;
+    // CLEAN samples only (no interesting-literal embeds): tokenCover's job is to make the token APPEAR in
+    // a legal context, not to stress boundary collisions — that is the enum/fuzz strategies' role, where
+    // the embed belongs. Prepending a boundary sigil to a sigil-led token (`<` + `#name`, `>` + `@name`)
+    // just produces non-parsing junk here, so the directed contexts stay clean and ~100% legal.
+    for (const text of sampleVariants(tok, { rand: w.rand, interesting: [] }, 6)) {
+      push(w.coverToken(entry.body, tok.name, text), `tokenCover:${tok.name}`, entry.name);
+    }
+  }
+  // a position-anchored leading-trivia token (a shebang) as a stand-alone first line — see prefixOnlyTokens.
+  for (const tok of w.prefixOnlyTokens()) {
+    for (const text of sampleVariants(tok, { rand: w.rand, interesting: [] }, 3)) {
+      if (!/[\n\r]/.test(text)) push([{ t: 'tok', name: tok.name, text }], `tokenCover:${tok.name}`, entry.name);
+    }
+  }
+
   return out.slice(0, maxInputs);
 }

From 018959c3808c4e63d282ec3806a0293ac0a6b257 Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Tue, 9 Jun 2026 06:06:23 +0800
Subject: [PATCH 3/6] Generator: replace random fuzz with deterministic
 systematic coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Generation was seed-dependent — different opts.seed → different fuzz outputs →
different "discovered" divergences. That's fatal for a reproducible gap ledger
(random testing shows presence, not absence, and can't be tracked across commits)
and contradicts the project's own "systematic, not a representativeness bet" thesis.

The only random STRUCTURE was `fuzz` (this.rand for alt/quantifier choices); enum/
nestChain/tokenCover already rotate on a variant index. Replace fuzz with `cover`:
the same walk, but every production choice comes from a deterministic mixed-radix
Chooser indexed by round i alone — the first few choice points form a full base-N
cartesian (t-wise interaction coverage by construction: measured complete to
3-wise), the tail perturbed by rotations. this.rand is seeded from a fixed constant;
opts.seed is now a no-op. generateInputs(grammar) is a pure function of the grammar:
byte-identical across runs for all 7 languages.

7/7 consistent, depth-site 2/2 (#23/#24 intact); agnostic 9/9. Foundation for a
deterministic, commit-trackable gap ledger.
---
 test/grammar-gen.ts | 158 ++++++++++++++++++++++++++++++++++++--------
 1 file changed, 129 insertions(+), 29 deletions(-)

diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts
index 4b539bf..7eafb4f 100644
--- a/test/grammar-gen.ts
+++ b/test/grammar-gen.ts
@@ -15,13 +15,22 @@
 //  indicators) is read from the grammar's own config (`grammar.indent` / `.markup`),
 //  never hardcoded — the same discipline the engines follow.
 //
-//  Three production strategies, all over the SAME walker:
+//  Production strategies, all over the SAME walker — ALL DETERMINISTIC (no PRNG seed; the
+//  generator is a pure function of the grammar, so a gap ledger is reproducible across commits):
 //   • bounded-exhaustive — every derivation to a small depth N (provably complete at
 //     small scope; this is what makes coverage `grammar × bound` instead of imagination).
 //   • self-recursive nesting — for each rule that can contain itself, the nested shape
 //     at depth 1..N. Deep self-embedding is exactly where a flat highlighter loses to
 //     the stack-keeping parser (monogram#24 is `BlockSequence` inside `BlockSequence`).
-//   • fuzzing — random production choices, for deeper / wider structures.
+//   • directed token coverage — the shortest legal context for every scoped token.
+//   • systematic t-wise coverage (was random "fuzzing") — for deeper / wider structures: a
+//     DETERMINISTIC mixed-radix enumeration over the grammar's CHOICE POINTS (which `alt`
+//     branch, how many `quantifier`/`sep` reps). Round i → a choice vector derived from i
+//     alone (no external seed). A FULL cartesian over the first few choice-point digits
+//     covers every t-tuple (t≤digits) of (choice-point, value) among them BY CONSTRUCTION —
+//     so it reaches INTERACTION shapes (an explicit key × a `[` in its scalar, monogram's
+//     `[`-in-key leak) deterministically, not by the luck of a seed. Polynomial (C^D rounds),
+//     never the exponential full derivation tree.
 // ─────────────────────────────────────────────────────────────────────────────
 import type { CstGrammar, RuleExpr, RuleDecl, TokenDecl, TokenPattern, TokenCharClassItem } from '../src/types.ts';
 import { tokenPatternStartsWithDecimal, tokenPatternHasStartAnchor } from '../src/token-pattern.ts';
@@ -45,8 +54,13 @@ export interface GenInput {
   rule: string;        // the top rule the derivation started from (entry, or a self-recursive rule)
 }
 
-// ── deterministic PRNG (Date.now/Math.random are unavailable in workflow scripts and make
-//    a generator unreproducible anyway — seed it). xorshift32. ──
+// ── fixed-seed xorshift32. The generator has NO external randomness: every STRUCTURE choice is made
+//    by the deterministic t-wise schedule (the `cover` strategy / mixed-radix chooser), and every
+//    token-TEXT sample is indexed deterministically (`sample`/`sampleVariants` rotate on a `variant`
+//    INDEX, never on `rand`). This PRNG is retained only so any future text-sampling path that wants a
+//    tie-break has one; it is seeded from a FIXED constant so two `generateInputs(grammar)` calls are
+//    byte-identical regardless of any `opts.seed` (which is now a NO-OP, kept for back-compat). ──
+const FIXED_SEED = 0x9e3779b9 | 0;   // a constant (golden-ratio bits); NOT derived from time / opts.
 function rng(seed: number): () => number {
   let s = seed | 0 || 1;
   return () => { s ^= s << 13; s ^= s >>> 17; s ^= s << 5; return ((s >>> 0) % 1_000_000) / 1_000_000; };
@@ -165,13 +179,74 @@ function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting:
   return [...out];
 }
 
+// ─── DETERMINISTIC CHOICE SCHEDULE (t-wise systematic coverage) ────────────────────────────────────
+// A `Chooser` answers each production CHOICE POINT during a `cover` walk, in WALK ORDER. Two kinds:
+//   • `next(radix)`  — a STRUCTURAL choice (which alt branch · how many quantifier/sep reps). These drive
+//      the t-wise cartesian: because the walk is deterministic given the answers, the k-th structural call
+//      is always choice point k, so a Chooser IS a choice vector `(v_0, v_1, …)` and a derivation is a
+//      function of it. The shape of the tree (key-vs-seq, explicit-vs-plain, nesting) lives here.
+//   • `variant(n)` — a token-TEXT choice (which sampled lexeme for a token: `x` vs the boundary-embed
+//      `--- x`, an int vs a float form). These do NOT change the tree SHAPE, only a leaf's bytes, so they
+//      are kept on a SEPARATE fast counter — every token position (even a DEEP value scalar) then sweeps
+//      its variants across rounds, instead of being frozen by a slow high mixed-radix digit. That is what
+//      reliably lands a boundary-embed in VALUE position (`k: --- x`, monogram#23) — a structural-context
+//      × text-variant interaction the cartesian reaches the context for and the text counter the variant.
+export interface Chooser { next: (radix: number) => number; variant: (n: number) => number }
+
+// One round's choice vector, as a MIXED-RADIX reading of a round index `i` (NO external seed):
+//   structural digit k = ( ⌊ i / B^k ⌋ + k·rot ) mod radix_k
+// `B` is the schedule BASE. Reading `i` low-digit-first means the FIRST choice points (the structurally
+// decisive ones — which Node kind, key-vs-seq, explicit-vs-plain) move SLOWEST, so a contiguous block of
+// rounds holds a fixed prefix while the deeper tail varies. Enumerating i over `B^D` (the coverSchedule
+// loop) therefore walks the FULL cartesian product of the first D structural digits → every t-tuple
+// (t ≤ D) of (choice-point, value) among the first D points appears in SOME round, BY CONSTRUCTION. That
+// is the t-wise (here t≤D≈4) interaction guarantee — it covers an explicit-key × `[`-in-its-scalar pair,
+// monogram's `[`-in-key leak, deterministically, with no luck. `rot` (a per-schedule offset) perturbs
+// the deeper tail so a second/third pass reaches different deep shapes than the first; it does NOT affect
+// the prefix cartesian (it shifts every digit by a constant, a relabelling of values, so all tuples among
+// the first D points still occur — just at permuted round indices). Polynomial: B^D rounds, never the
+// exponential whole derivation tree (a structural point past digit D simply reads its slow-moving high
+// digit). The token-TEXT counter is an INDEPENDENT per-round walk index (j-th text choice = (i+j) mod n),
+// so it cycles every position's variants fast regardless of structural depth.
+function mixedRadixChooser(i: number, base: number, rot: number): Chooser {
+  let k = 0;   // structural choice-point index (drives the mixed-radix cartesian)
+  let j = 0;   // token-text choice index (independent fast counter)
+  return {
+    next(radix: number): number {
+      if (radix <= 1) return 0;                              // a forced single option consumes a (no-op) digit slot
+      const digit = Math.floor(i / Math.pow(base, k)) + k * rot;
+      k++;
+      return ((digit % radix) + radix) % radix;
+    },
+    variant(n: number): number {
+      if (n <= 1) return 0;
+      const idx = (i + j) % n;   // fast: sweeps each token position's variants across rounds, depth-agnostic
+      j++;
+      return idx;
+    },
+  };
+}
+
+// The deterministic schedule of choice vectors the `cover` strategy enumerates: the full cartesian over
+// the first D digits (radix `base`) — `base^D` rounds — optionally repeated under a few `rot` offsets so
+// the deep tail (past digit D) also varies. `rounds` caps it (polynomial, bounded). Pure function of its
+// args: identical every call, so `generateInputs` is reproducible. Yields `Chooser`s in order.
+function* coverSchedule(base: number, digits: number, rounds: number, rotations: number[]): Generator<Chooser> {
+  const span = Math.pow(base, digits);
+  let emitted = 0;
+  for (const rot of rotations) {
+    for (let i = 0; i < span && emitted < rounds; i++, emitted++) yield mixedRadixChooser(i, base, rot);
+    if (emitted >= rounds) return;
+  }
+}
+
 // ─── THE WALKER ──────────────────────────────────────────────────────────────────
 export interface GenOptions {
   depth?: number;       // bounded-exhaustive derivation depth (rule-ref recursion)
   cap?: number;         // max alternatives kept at each combinator node (anti-explosion)
   maxInputs?: number;   // global cap on emitted inputs per rule
-  fuzzRounds?: number;  // random derivations
-  seed?: number;
+  fuzzRounds?: number;  // budget (cap) on systematic-coverage rounds — DETERMINISTIC choice vectors, not random
+  seed?: number;        // NO-OP, retained for back-compat: the generator is a pure function of the grammar
   nestDepth?: number;   // self-recursive nesting depth
   timeBudgetMs?: number; // wall-clock cap for the depth strategies (large token-stream grammars)
 }
@@ -192,9 +267,9 @@ class Walker {
   maxCalls = 60_000;
   enumTop(e: RuleExpr, budget: number): Emission[][] { this.budgetCalls = 0; return this.enum(e, budget); }
 
-  constructor(grammar: CstGrammar, seed: number, cap: number) {
+  constructor(grammar: CstGrammar, cap: number) {
     this.grammar = grammar;
-    this.rand = rng(seed);
+    this.rand = rng(FIXED_SEED);   // FIXED — the walker is a pure function of the grammar (see rng note).
     this.cap = cap;
     for (const t of grammar.tokens) this.tokenByName.set(t.name, t);
     for (const r of grammar.rules) this.ruleByName.set(r.name, r);
@@ -470,35 +545,49 @@ class Walker {
     }
   }
 
-  // ── random derivation (fuzzing): one emission sequence, forced to terminate at budget 0 ──
-  fuzz(e: RuleExpr, budget: number): Emission[] {
-    const pick = <T,>(xs: T[]): T => xs[Math.floor(this.rand() * xs.length)];
+  // ── DETERMINISTIC SYSTEMATIC derivation (replaces random fuzzing): one emission sequence whose every
+  //    production CHOICE comes from a `Chooser`, not a PRNG. The walk is otherwise identical to the old
+  //    fuzz, so the SAME structures are reachable — but reproducibly. A Chooser is consulted at each
+  //    CHOICE POINT in walk order (alt branch · quantifier reps · sep reps · token-text variant); since
+  //    the walk is deterministic given the chooser's outputs, choice point k is ALWAYS the k-th call, so
+  //    a mixed-radix counter (slow-moving early digits, fast late ones) keeps a stable choice-point
+  //    PREFIX while sweeping the tail — which is what yields t-wise coverage over the prefix (see
+  //    coverSchedule). Forced to terminate at budget 0 (the minimal expansion), like fuzz. ──
+  cover(e: RuleExpr, budget: number, ch: Chooser): Emission[] {
     // bounded `for`-push (NOT spread on a possibly-huge array → stack overflow + size blowup)
-    const fappend = (out: Emission[], add: Emission[]) => { if (out.length < MAX_EMS) for (const x of add) out.push(x); };
+    const cappend = (out: Emission[], add: Emission[]) => { if (out.length < MAX_EMS) for (const x of add) out.push(x); };
     switch (e.type) {
       case 'literal': return [{ t: 'lit', value: e.value }];
       case 'ref': {
         if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }];
         if (this.isToken(e.name)) {
           const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 4);
-          return [{ t: 'tok', name: e.name, text: vs.length ? pick(vs) : 'x' }];
+          // pick a variant on the TOKEN-TEXT counter (ch.variant, not the structural ch.next), so the
+          // token TEXT (a plain scalar `--- x` vs `x`, a number's int vs float form) is swept fast at EVERY
+          // position regardless of structural depth — see the Chooser note (this lands #23's `k: --- x`).
+          return [{ t: 'tok', name: e.name, text: vs.length ? vs[ch.variant(vs.length)] : 'x' }];
         }
         if (budget <= 0) return this.ruleMin.get(e.name) ?? [];
-        return this.fuzz(this.ruleByName.get(e.name)!.body, budget - 1);
+        return this.cover(this.ruleByName.get(e.name)!.body, budget - 1, ch);
       }
-      case 'seq': { const out: Emission[] = []; for (const it of e.items) fappend(out, this.fuzz(it, budget)); return out; }
+      case 'seq': { const out: Emission[] = []; for (const it of e.items) cappend(out, this.cover(it, budget, ch)); return out; }
       case 'alt': {
-        if (budget <= 0) { const m = this.minExpand(e); if (m) return m; }
-        return this.fuzz(pick(e.items), budget);
+        if (budget <= 0) { const m = this.minExpand(e); if (m) return m; }      // no budget → shortest, no choice consumed
+        return this.cover(e.items[ch.next(e.items.length)], budget, ch);        // CHOICE POINT: which branch
       }
       case 'quantifier': {
-        const reps = budget <= 0 ? (e.kind === '+' ? 1 : 0) : (e.kind === '?' ? Math.floor(this.rand() * 2) : Math.floor(this.rand() * 3) + (e.kind === '+' ? 1 : 0));
-        const out: Emission[] = []; for (let i = 0; i < reps; i++) fappend(out, this.fuzz(e.body, budget - 1)); return out;
+        // CHOICE POINT: how many reps. `?`→{0,1} (radix 2), `*`/`+`→{0..2}/{1..3} (radix 3). At budget 0
+        // the count is forced to the minimum (radix 1 → digit is a fixed no-op, keeping schedules aligned).
+        const lo = e.kind === '+' ? 1 : 0;
+        const radix = budget <= 0 ? 1 : (e.kind === '?' ? 2 : 3);
+        const reps = lo + ch.next(radix);
+        const out: Emission[] = []; for (let i = 0; i < reps; i++) cappend(out, this.cover(e.body, budget - 1, ch)); return out;
       }
-      case 'group': return this.fuzz(e.body, budget);
+      case 'group': return this.cover(e.body, budget, ch);
       case 'sep': {
-        const reps = budget <= 0 ? 1 : Math.floor(this.rand() * 3) + 1; const out: Emission[] = [];
-        for (let i = 0; i < reps; i++) { if (i) out.push({ t: 'lit', value: e.delimiter }); fappend(out, this.fuzz(e.element, budget - 1)); }
+        // CHOICE POINT: element count (≥1). radix 3 → 1..3 elements; forced to 1 at budget 0.
+        const reps = 1 + (budget <= 0 ? 0 : ch.next(3)); const out: Emission[] = [];
+        for (let i = 0; i < reps; i++) { if (i) out.push({ t: 'lit', value: e.delimiter }); cappend(out, this.cover(e.element, budget - 1, ch)); }
         return out;
       }
       case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore':
@@ -781,10 +870,12 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI
   const depth = opts.depth ?? 5;
   const cap = opts.cap ?? 6;
   const maxInputs = opts.maxInputs ?? 400;
-  const fuzzRounds = opts.fuzzRounds ?? 300;
+  // `fuzzRounds` is honoured as the BUDGET (cap on systematic-coverage rounds), but the rounds are now
+  // DETERMINISTIC choice vectors, not random draws. `opts.seed` is a NO-OP (kept for back-compat): the
+  // generator is a pure function of the grammar, so two calls — with any seed or none — are identical.
+  const coverRounds = opts.fuzzRounds ?? 300;
   const nestDepth = opts.nestDepth ?? 5;
-  const seed = opts.seed ?? 12345;
-  const w = new Walker(grammar, seed, cap);
+  const w = new Walker(grammar, cap);
 
   const mode: MatOptions['mode'] = grammar.indent ? 'indent' : grammar.markup ? 'markup' : 'token-stream';
   const matOpts: MatOptions = { mode, indentStep: 2 };
@@ -837,13 +928,22 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI
     for (let d = 1; d <= nestDepth; d++) push(w.nestChain(r.body, rn, d), `dirnest:${rn}@${d}`, rn);
   }
 
-  // 4) fuzzing for deeper / wider structures (random production choices), rooted at the entry AND at
-  //    each self-recursive rule so deep shapes are reached quickly.
-  for (let i = 0; i < fuzzRounds; i++) push(w.fuzz(entry.body, depth + 2), 'fuzz', entry.name);
+  // 4) SYSTEMATIC t-wise coverage for deeper / wider structures (DETERMINISTIC choice vectors, was random
+  //    fuzzing), rooted at the entry AND at each self-recursive rule. The schedule is a full mixed-radix
+  //    cartesian over the first `COVER_DIGITS` choice points at `COVER_BASE` values each (covers every
+  //    t-tuple, t≤COVER_DIGITS, of those points BY CONSTRUCTION → reaches an explicit-key × `[`-in-scalar
+  //    interaction without a seed), with a few rotation offsets perturbing the deeper tail. `coverRounds`
+  //    caps it — polynomial (COVER_BASE^COVER_DIGITS ≈ 256), never the exponential whole derivation tree.
+  // NB the emitted strategy key stays `fuzz` (the driver buckets it as the EXPLORATORY tier — deeper/wider
+  // shapes that legitimately reach STANDING flat-TM frontier limits, so #24 is report-only there; the
+  // STRUCTURED strategies remain the by-construction gate). Only the MECHANISM changed (deterministic, not
+  // random); the bucket's meaning is the same, so the driver's gating semantics are untouched.
+  const COVER_BASE = 4, COVER_DIGITS = 4, ROTS = [0, 1, 2];
+  for (const ch of coverSchedule(COVER_BASE, COVER_DIGITS, coverRounds, ROTS)) push(w.cover(entry.body, depth + 2, ch), 'fuzz', entry.name);
   for (const rn of recursive) {
     if (timeUp()) break;
     const r = w.ruleByName.get(rn)!;
-    for (let i = 0; i < Math.ceil(fuzzRounds / 8); i++) push(w.fuzz(r.body, depth + 2), `fuzz:${rn}`, rn);
+    for (const ch of coverSchedule(COVER_BASE, COVER_DIGITS, Math.ceil(coverRounds / 8), ROTS)) push(w.cover(r.body, depth + 2, ch), `fuzz:${rn}`, rn);
   }
 
   // 5) DIRECTED TOKEN COVERAGE — for each scoped token, the shortest legal context from the entry rule

From 9acf16caafcd195b16cb41facdcb569a146a4e18 Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Tue, 9 Jun 2026 06:52:56 +0800
Subject: [PATCH 4/6] Generator precision: make the gap shape-classes
 systematically producible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Deterministic generation found 0 divergences — the gaps random fuzz hit were
luck, and the deterministic generator couldn't produce those shapes. Discovery is
bounded by generator PRECISION, not luck; so make the known gap shape-classes
producible (config-derived, no language names):

- markup: a NO-SPACE (tight) render variant + a directed `markupSelfCloseAttr`
  producer so `<img src="a"/>` (quoted attr flush against `/>`) forms. The HTML/Vue
  self-close `/` gap now surfaces deterministically under "discovered":
  «/» got «string.unquoted.html».
- indent: sample plain scalars from `blockPattern` + splice a flow bracket mid-token,
  and directed `indentExplicitKeyBracket` producer, so `? k [y : …` forms (round-trips).
- indent: `indentBlockScalar` synthesis for the `never()`-token block scalar `|`/`>`
  (introducer + deeper-indented body), so `string.unquoted.block` is covered (was 0%).

Deterministic preserved (generateInputs pure); 7/7 gated-clean; depth-site 2/2
(#23/#24 intact); agnostic 9/9. Honest finding: the YAML explicit-key `[` divergence
is a `name`-bucket scope (entity.name.tag), which the scope≡role gates (literal→content,
anchored-marker) structurally don't flag — a check-precision item for a follow-up,
distinct from producibility (which is now done). The HTML `/` is unambiguously gate-1.
---
 test/grammar-gen.ts | 227 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 219 insertions(+), 8 deletions(-)

diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts
index 7eafb4f..70a0b43 100644
--- a/test/grammar-gen.ts
+++ b/test/grammar-gen.ts
@@ -142,7 +142,10 @@ function topAltBranches(pat: TokenPattern): number {
 }
 
 // Sample several distinct, legal texts for a token (variants + interesting-literal embeds).
-function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[] }, n: number): string[] {
+// `blockEmbed` (indent grammars) are content literals that are STRUCTURAL in flow context but
+// plain-scalar CONTENT in BLOCK context (the flow brackets `[`/`{`/`]`/`}`) — see the internal-embed
+// note below; passed from the grammar's `indent.flowOpen`/`flowClose`, empty otherwise.
+function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[]; blockEmbed?: string[] }, n: number): string[] {
   const out = new Set<string>();
   // Cover every top-level alt branch: a token that is itself an alternation (hex/oct/bin/float
   // forms) must emit ALL its branches, not stop at branch 0 once `n` distinct samples are reached —
@@ -152,6 +155,24 @@ function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting:
     const s = sample(decl.pattern, { ...ctx, variant: v });
     if (s !== null && s.length > 0) out.add(s);
   }
+  // INTERNAL boundary-literal embeds (indent grammars with a block-context pattern): a flow bracket
+  // (`[`/`{`) is a flow INDICATOR inside `[ ]`/`{ }`, but ordinary plain-scalar CONTENT in block
+  // context — which is the whole reason a token carries a `blockPattern` (its body drops the flow
+  // exclusions). The default `.pattern` (flow-restricted) can NEVER sample such a char, so a block
+  // plain scalar like `k [y` — one scalar to the stack-keeping parser, but a phantom flow-open to a
+  // flat grammar — is otherwise unreachable. Sample the base from the BLOCK pattern and splice a
+  // bracket AFTER the head char (the head must stay a non-indicator, so the splice is mid-token, never
+  // leading); the parser re-lexes the result as ONE scalar (verified by the round-trip). This makes
+  // `? k [y : …` (the monogram `[`-in-key flow-leak) producible deterministically.
+  const blockBase = decl.blockPattern ? (sample(decl.blockPattern, { ...ctx, variant: 0 }) ?? '') : '';
+  if (blockBase.length >= 1 && ctx.blockEmbed?.length && !tokenPatternHasStartAnchor(decl) && !tokenPatternStartsWithDecimal(decl)) {
+    const head = blockBase[0], tail = blockBase.slice(1) || 'y';
+    for (const br of ctx.blockEmbed) {
+      if (br.length !== 1 || /[\n\r]/.test(br)) continue;
+      out.add(head + br + tail);          // glued mid-scalar (`k` + `[` + `y` → `k[y`)
+      out.add(head + ' ' + br + tail);    // space-led bracket (`k [y`) — the prompt's exact shape
+    }
+  }
   // a base sample to seed interesting-literal embeds
   const base = sample(decl.pattern, { ...ctx, variant: 0 }) ?? '';
   // Embed grammar-derived boundary literals into free-form (multi-char-capable) tokens, where
@@ -257,6 +278,7 @@ class Walker {
   interesting: string[];
   structKind = new Map<string, 'indent' | 'dedent' | 'newline'>();
   compactLits: Set<string>;
+  blockEmbed: string[];   // flow brackets (`[`/`{`/`]`/`}`) — flow indicators, but block-scalar CONTENT
   reachMap = new Map<string, Set<string>>();   // rule → every rule it can transitively reach
   tokenHostRules = new Map<string, string[]>(); // token name → rules whose body DIRECTLY references it
   ruleMin = new Map<string, Emission[] | null>();
@@ -280,6 +302,9 @@ class Walker {
       this.structKind.set(ind.newlineToken, 'newline');
     }
     this.compactLits = new Set(grammar.indent?.compactIndicators ?? []);
+    // flow brackets are flow indicators in `[ ]`/`{ }` but plain-scalar CONTENT in block context — the
+    // single-char ones seed the internal-embed that makes a `[`-in-block-scalar (`k [y`) producible.
+    this.blockEmbed = [...(ind?.flowOpen ?? []), ...(ind?.flowClose ?? [])].filter((b) => b.length === 1);
     this.interesting = this.collectInteresting();
     this.computeReach();
     this.computeTokenHosts();
@@ -493,7 +518,7 @@ class Walker {
       case 'ref': {
         if (this.isStruct(e.name)) return [[{ t: 'struct', kind: this.structKind.get(e.name)! }]];
         if (this.isToken(e.name)) {
-          const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 3);
+          const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting, blockEmbed: this.blockEmbed }, 3);
           return (vs.length ? vs : ['x']).slice(0, cap).map((t) => [{ t: 'tok', name: e.name, text: t }]);
         }
         if (budget <= 0) { const m = this.ruleMin.get(e.name); return m ? [m] : [[]]; }
@@ -561,7 +586,7 @@ class Walker {
       case 'ref': {
         if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }];
         if (this.isToken(e.name)) {
-          const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 4);
+          const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting, blockEmbed: this.blockEmbed }, 4);
           // pick a variant on the TOKEN-TEXT counter (ch.variant, not the structural ch.next), so the
           // token TEXT (a plain scalar `--- x` vs `x`, a number's int vs float form) is swept fast at EVERY
           // position regardless of structural depth — see the Chooser note (this lands #23's `k: --- x`).
@@ -730,6 +755,136 @@ class Walker {
       tokenPatternHasStartAnchor(t));
   }
 
+  // ── DIRECTED MARKUP SELF-CLOSE-WITH-ATTRIBUTE (markup grammars only) ──────────────────────────────
+  // The minimal self-closing element carrying ONE quoted attribute: `<name attr="v"/>`. Built DIRECTLY
+  // from `grammar.markup` (tagOpen / attributeAssign / attributeQuotes / closeMarker / tagClose) plus two
+  // generically-discovered tokens — a NAME token (an `identifier` token: the tag + attribute name) and a
+  // QUOTED-VALUE token (a `string` token whose sample opens with an `attributeQuote`) — so it stays
+  // language-agnostic (no `<`/`/`/HTML hardcoded; a markup grammar with different delimiters yields its
+  // own shape). The un-biased bounded-exhaustive enumeration STARVES this combination at a small `cap`
+  // (the cross of "an attribute has a quoted value" × "the optional self-close `/` fired" is past the
+  // first few derivations), so — exactly like nestChain forces a starved nesting and coverToken a starved
+  // token — this forces it deterministically. Its tight rendering (`name="v"/>` flush) is what exposes
+  // the flat grammar mis-scoping the self-close `/` as unquoted-value content (a STANDING flat-TM limit).
+  // Returns [] when the grammar lacks the needed tokens (no string/identifier token) — then it is a no-op.
+  markupSelfCloseAttr(): Emission[] {
+    const mk = this.grammar.markup;
+    if (!mk || !mk.closeMarker) return [];
+    const nameTok = this.grammar.tokens.find((t) => t.identifier);   // the tag / attribute NAME token
+    // a string token whose conservative sample is a QUOTED value (opens with one of the attribute quotes)
+    const quotes = mk.attributeQuotes ?? ['"', "'"];
+    const valTok = this.grammar.tokens.find((t) => {
+      if (!t.string && !t.scope) return false;
+      const s = sample(t.pattern, { rand: this.rand, interesting: [], variant: 0 });
+      return s !== null && s.length >= 2 && quotes.includes(s[0]);
+    });
+    if (!nameTok || !valTok) return [];
+    const nameTxt = sample(nameTok.pattern, { rand: this.rand, interesting: [], variant: 0 }) || 'a';
+    const valTxt = sample(valTok.pattern, { rand: this.rand, interesting: [], variant: 0 })!;
+    const assign = mk.attributeAssign ?? '=';
+    return [
+      { t: 'lit', value: mk.tagOpen },
+      { t: 'tok', name: nameTok.name, text: nameTxt },     // tag name
+      { t: 'tok', name: nameTok.name, text: nameTxt },     // attribute name
+      { t: 'lit', value: assign },
+      { t: 'tok', name: valTok.name, text: valTxt },       // quoted attribute value
+      { t: 'lit', value: mk.closeMarker },                 // self-close marker
+      { t: 'lit', value: mk.tagClose },
+    ];
+  }
+
+  // The leading literal of an alt arm's seq/group spine (the indicator a `? …`/`- …` arm starts with).
+  private armLeadLiteral(e: RuleExpr): string | null {
+    if (e.type === 'literal') return e.value;
+    if (e.type === 'seq') return e.items.length ? this.armLeadLiteral(e.items[0]) : null;
+    if (e.type === 'group') return this.armLeadLiteral(e.body);
+    return null;
+  }
+  private exprContainsLiteral(e: RuleExpr, v: string): boolean {
+    switch (e.type) {
+      case 'literal': return e.value === v;
+      case 'seq': case 'alt': return e.items.some((i) => this.exprContainsLiteral(i, v));
+      case 'quantifier': case 'group': case 'not': return this.exprContainsLiteral(e.body, v);
+      case 'sep': return this.exprContainsLiteral(e.element, v);
+      default: return false;
+    }
+  }
+  // The explicit-key indicator of an indent grammar (YAML `?`), found GENERICALLY: the `compactIndicator`
+  // that heads a rule arm which ALSO carries the key/value separator (`? key : value`), distinguishing it
+  // from the block-SEQUENCE indicator (`-`, whose arm leads to an item, not a `:` pair). Config-derived
+  // (compactIndicators × keyValueSeparator), so no token/rule name is hardcoded; null if none qualifies.
+  explicitKeyIndicator(): string | null {
+    const ind = this.grammar.indent; if (!ind?.compactIndicators) return null;
+    const kv = ind.keyValueSeparator ?? ':';
+    const ci = new Set(ind.compactIndicators);
+    for (const r of this.grammar.rules) {
+      const arms = r.body.type === 'alt' ? r.body.items : [r.body];
+      for (const arm of arms) { const lead = this.armLeadLiteral(arm); if (lead && ci.has(lead) && this.exprContainsLiteral(arm, kv)) return lead; }
+    }
+    return null;
+  }
+
+  // ── DIRECTED INDENT EXPLICIT-KEY WITH A FLOW-BRACKET PLAIN SCALAR (indent grammars only) ───────────
+  // The shape `? k [y :\n  - p\n  - q`: an EXPLICIT-key entry whose KEY is a plain scalar containing a flow
+  // bracket, with a block-SEQUENCE value. To the stack-keeping parser the key is ONE plain scalar (its
+  // `blockPattern` admits `[`/`{` outside flow) and the `-` items are sequence indicators; a flat grammar
+  // instead opens a phantom flow at the `[` that never closes, so the value `-`s leak to the key scope.
+  // Two structural facts STARVE this in the un-biased strategies: a plain-scalar key in EXPLICIT position
+  // is itself rare (the cover walk reaches `? *alias :`/`? {flow} :`/`?\n indented`, but not `? plain :`),
+  // and the bracket must additionally land in THAT key — so it is forced here, deterministically, the
+  // indent analogue of markupSelfCloseAttr. All pieces are config-derived (the explicit-key indicator, the
+  // key/value separator, the flow brackets, the seq indicator = the OTHER compactIndicator, and the indent
+  // struct tokens), with the scalar drawn from a `blockPattern` token — no YAML token/rule name hardcoded.
+  // Returns [] when the grammar lacks the config (no explicit-key indicator / flow brackets / block scalar).
+  indentExplicitKeyBracket(): Emission[] {
+    const ind = this.grammar.indent; if (!ind) return [];
+    const qmark = this.explicitKeyIndicator(); if (!qmark) return [];
+    const bracket = this.blockEmbed[0]; if (!bracket) return [];                  // a flow-bracket content char
+    const kv = ind.keyValueSeparator ?? ':';
+    const seqInd = (ind.compactIndicators ?? []).find((c) => c !== qmark);        // the block-sequence indicator
+    if (!seqInd) return [];
+    // a block plain-scalar token whose blockPattern admits the bracket (the KEY), and one for the items.
+    const scalarTok = this.grammar.tokens.find((t) => t.blockPattern && t.scope); if (!scalarTok) return [];
+    const head = sample(scalarTok.blockPattern!, { rand: this.rand, interesting: [], variant: 0 }) || 'k';
+    const keyTxt = head[0] + ' ' + bracket + (head.slice(1) || 'y');             // `k [y` — bracket mid-scalar
+    const itemTxt = (sample(scalarTok.blockPattern!, { rand: this.rand, interesting: [], variant: 0 }) || 'p');
+    return [
+      { t: 'lit', value: qmark },                                                // `?`
+      { t: 'tok', name: scalarTok.name, text: keyTxt },                          // `k [y`
+      { t: 'lit', value: kv },                                                   // `:`
+      { t: 'struct', kind: 'indent' },                                          // block value, more-indented
+      { t: 'lit', value: seqInd }, { t: 'tok', name: scalarTok.name, text: itemTxt },        // `- p`
+      { t: 'struct', kind: 'newline' },                                         // sibling item
+      { t: 'lit', value: seqInd }, { t: 'tok', name: scalarTok.name, text: itemTxt },        // `- p`
+      { t: 'struct', kind: 'dedent' },
+    ];
+  }
+
+  // ── DIRECTED BLOCK SCALAR (indent grammars with a block-scalar config) ─────────────────────────────
+  // A YAML block scalar `|\n  body\n  more`: an introducer (`|`/`>`, +optional chomping/indent indicators)
+  // then verbatim more-indented lines emitted as ONE token (like raw text, but bounded by indentation, not
+  // a close tag). Its token is `never()` (the LEXER emits it from indentation state), so `sample()` yields
+  // null and the ordinary strategies NEVER produce it — leaving its scope (`string.unquoted.block`) at 0%
+  // coverage. This synthesizes it directly from `indent.blockScalar` (the introducers + token name) as a
+  // single multi-line tok at the document root (the minimal legal frame — a bare block scalar parses as a
+  // one-token document). Body lines are STRICTLY more-indented (the `indentWidth` columns) and plain words,
+  // never a col-0 `documentMarker` (`---`/`...`), which would terminate the scalar early (a doc boundary
+  // outranks indentation). Emitted as one tok (not a `lit`+struct), so `compactify` — which only rewrites a
+  // compact-indicator literal followed by a struct indent — leaves it untouched. Returns [] without config.
+  indentBlockScalar(indentWidth: number): Emission[] {
+    const bs = this.grammar.indent?.blockScalar; if (!bs || !bs.introducers.length) return [];
+    const tok = this.grammar.tokens.find((t) => t.name === bs.token); if (!tok) return [];
+    const pad = ' '.repeat(Math.max(1, indentWidth));
+    const markers = new Set(bs.documentMarkers ?? []);
+    // a plain body word that is NOT a document marker (so it can't terminate the scalar at col-0; here it is
+    // indented anyway, but keep it marker-free for safety) — derived from a block plain-scalar token sample.
+    const scalarTok = this.grammar.tokens.find((t) => t.blockPattern && t.scope);
+    let body = (scalarTok && sample(scalarTok.blockPattern!, { rand: this.rand, interesting: [], variant: 0 })) || 'body';
+    if (markers.has(body)) body = body + 'x';
+    const intro = bs.introducers[0];                                            // `|`
+    return [{ t: 'tok', name: bs.token, text: `${intro}\n${pad}${body}\n${pad}${body}` }];
+  }
+
   // Build the minimal legal context from `entry` down to `tokenName`, with the token rendered as
   // `sampleText` at its position. Descends the SHORTEST branch toward the token at each node and
   // minimal-fills everything else — the directed, deterministic analogue of nestChain for a token.
@@ -776,7 +931,13 @@ class Walker {
 // space (whitespace-insensitive); indentation grammars (YAML) render struct emissions through an
 // indent STACK that mirrors the lexer (newline = same-column sibling, indent = deeper block,
 // compact = an inline indent for `- - a`); markup grammars keep tag punctuation adjacent.
-interface MatOptions { mode: 'token-stream' | 'indent' | 'markup'; indentStep: number }
+// `tight` (markup only) ALSO glues the attribute-internal punctuation — `name="value"` with no
+// spaces around the `attributeAssign`/quotes — so a quoted value sits FLUSH against the self-close
+// `/>` (the WHATWG-canonical `<img src="a"/>`). That adjacency is what the spaced rendering never
+// forms, and it is exactly where a flat TextMate grammar mis-scopes the `/` (it reads the closing
+// quote then the `/` as an unquoted-value char, not tag punctuation). A SECOND, legal rendering of
+// the same emission list — the markup analogue of indent's compactify — in the exploratory tier.
+interface MatOptions { mode: 'token-stream' | 'indent' | 'markup'; indentStep: number; tight?: boolean }
 
 function materialize(grammar: CstGrammar, ems: Emission[], opts: MatOptions): { text: string; tokens: GenInput['tokens'] } {
   let text = '';
@@ -822,12 +983,18 @@ function materialize(grammar: CstGrammar, ems: Emission[], opts: MatOptions): {
 
   if (opts.mode === 'markup') {
     const noSpaceBefore = new Set([grammar.markup?.tagClose, grammar.markup?.closeMarker].filter(Boolean) as string[]);
+    const assign = grammar.markup?.attributeAssign;   // `=`; in tight mode it glues `name=value`
     let prev = '';
     for (const e of ems) {
       if (e.t === 'struct' || e.t === 'compact') continue;
       const s = e.t === 'lit' ? e.value : e.text;
       if (s.length === 0) continue;
-      const adjacent = prev === grammar.markup?.tagOpen || prev === grammar.markup?.closeMarker || noSpaceBefore.has(s) || prev === '';
+      // TIGHT also glues the attribute `=` to its name and value: `name=` (cur is the assign) and
+      // `=value` (prev was the assign). Combined with `noSpaceBefore` already gluing the value→`/>`,
+      // this renders `<img src="a"/>`. The inter-attribute / name boundary still takes a space (the
+      // value isn't an assign, the next name isn't), so `a="x" b="y"` stays well-formed.
+      const tightGlue = !!opts.tight && !!assign && (s === assign || prev === assign);
+      const adjacent = prev === grammar.markup?.tagOpen || prev === grammar.markup?.closeMarker || noSpaceBefore.has(s) || tightGlue || prev === '';
       if (!adjacent) emit(' ');
       if (e.t === 'tok') emitTok(e.name, s); else emit(s);
       prev = s;
@@ -891,13 +1058,30 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI
 
   const seen = new Set<string>();
   const out: GenInput[] = [];
+  // The render JOBS for one emission list: each pairs an emission-variant with materialize options and
+  // the strategy label to file the resulting input under. Most modes have ONE job (the canonical
+  // rendering, same strategy). Two modes add a SECOND, equally-legal rendering of the same emissions:
+  //  • indent → a compactified copy (`- - a` inline), SAME strategy (a correct shape, still a gate).
+  //  • markup → a TIGHT copy (`name="value"/>` flush), filed in the EXPLORATORY (`fuzz`) tier. The
+  //    tight adjacency is where a flat grammar mis-scopes the self-close `/` — a STANDING flat-TM
+  //    limit in the unfixed grammar, not a regression of a structured shape — so, like a gnarly fuzz
+  //    derivation, it is report-only (`isGated` keys off the `fuzz` prefix). The spaced rendering keeps
+  //    the original strategy, so the structured round-trip guarantee is untouched.
+  const renderJobs = (ems: Emission[], strategy: string): { variant: Emission[]; mat: MatOptions; strat: string }[] => {
+    if (mode === 'indent') return [ems, compactify(ems, w.compactLits)].map((variant) => ({ variant, mat: matOpts, strat: strategy }));
+    if (mode === 'markup') return [
+      { variant: ems, mat: matOpts, strat: strategy },
+      { variant: ems, mat: { ...matOpts, tight: true }, strat: `fuzz:tight:${strategy}` },
+    ];
+    return [{ variant: ems, mat: matOpts, strat: strategy }];
+  };
   const push = (ems: Emission[], strategy: string, rule: string) => {
     if (out.length >= maxInputs * 4) return;
-    for (const variant of mode === 'indent' ? [ems, compactify(ems, w.compactLits)] : [ems]) {
-      const { text, tokens } = materialize(grammar, variant, matOpts);
+    for (const job of renderJobs(ems, strategy)) {
+      const { text, tokens } = materialize(grammar, job.variant, job.mat);
       if (!text.trim() || text.length > 2000 || seen.has(text)) continue;   // skip blank / over-long / duplicate
       seen.add(text);
-      out.push({ text, tokens, strategy, rule });
+      out.push({ text, tokens, strategy: job.strat, rule });
     }
   };
 
@@ -970,5 +1154,32 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI
     }
   }
 
+  // 6) DIRECTED MARKUP SELF-CLOSE-WITH-ATTRIBUTE (markup grammars) — `<name attr="v"/>`. The un-biased
+  //    enumeration starves the quoted-attribute × self-close cross at a small cap, so this forces it (the
+  //    markup analogue of nestChain/tokenCover). Filed in the EXPLORATORY (`fuzz`) tier: even the SPACED
+  //    rendering puts the quoted value FLUSH against the self-close `/` (the `/` is structural punctuation,
+  //    always glued), and that flush value→`/` adjacency is exactly the STANDING flat-TM limit (the grammar
+  //    reads the `/` as unquoted-value content) — a real highlighter bug in the unfixed grammar, not a
+  //    regression of a by-construction shape, so it is report-only like a gnarly fuzz derivation, not a gate.
+  if (mode === 'markup') {
+    const sc = w.markupSelfCloseAttr();
+    if (sc.length) push(sc, 'fuzz:markupSelfClose', entry.name);
+  }
+
+  // 7) DIRECTED INDENT EXPLICIT-KEY-WITH-BRACKET-SCALAR (indent grammars) — `? k [y :\n  - p\n  - q`. The
+  //    un-biased strategies starve a plain-scalar explicit key (let alone one carrying a `[`), so this forces
+  //    it — the indent analogue of markupSelfCloseAttr. Filed EXPLORATORY (`fuzz:`): it deliberately stresses
+  //    the block-vs-flow-stack limit a flat grammar lacks (the phantom flow a `[`-in-key opens), so any
+  //    divergence is a STANDING limit of the unfixed grammar, report-only, not a by-construction gate.
+  if (mode === 'indent') {
+    const ek = w.indentExplicitKeyBracket();
+    if (ek.length) push(ek, 'fuzz:explicitKeyBracket', entry.name);
+    // a block scalar (`|\n  body`): its token is lexer-emitted (never() pattern), so no ordinary strategy
+    // produces it — synthesize one so its `string.unquoted.block` scope is covered. A clean structured
+    // shape that round-trips (a one-token document), so it is a normal `nest`-tier input (no flat-TM limit).
+    const bs = w.indentBlockScalar(matOpts.indentStep);
+    if (bs.length) push(bs, 'nest:blockScalar', entry.name);
+  }
+
   return out.slice(0, maxInputs);
 }

From 5d06488652e25a3c7a70bb3a661b796b0d9a386d Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Tue, 9 Jun 2026 07:20:35 +0800
Subject: [PATCH 5/6] Gap ledger: deterministic, minimized, oracle-classified
 findings (KNOWN-GAPS.md)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Operationalize the scope≡role check's "discovered" divergences into a committed,
commit-trackable ledger instead of console output that vanishes. test/gap-ledger.ts:
for each language, collect the discovered divergences (reusing the EXACT detection,
factored into generative-detect.ts so generative.ts's gate is unchanged), MINIMIZE
each via delta-debugging to a stable minimal repro, CLASSIFY via the neutral oracle
(typescript/yaml/parse5) keeping only oracle-VALID-input gaps (over-accepts dropped),
and FINGERPRINT (content hash, stable across commits). Emits KNOWN-GAPS.md
(human + machine-readable), regenerated with `--write`, gated up-to-date with `--check`.

Deterministic: two runs → byte-identical ledger. Currently 2 gaps, 0 dropped — the
HTML/Vue self-close `/` mis-scope (`<aA aA = "a"/>` ddmin-minimized to `<A A=""/>`),
the floor-blind divergence the corpus-bound scope-gap metric can't see. CI runs the
selftest + `--check`. generative 7/7 unchanged; agnostic 9/9; deterministic.

The fixes for these gaps live on a separate branch (highlighter product changes), so
the ledger here demonstrates the tool FINDING them; a later layer can reconcile the
ledger into GitHub issues.
---
 .github/workflows/ci.yml    |   8 +
 KNOWN-GAPS.md               |  62 ++++++
 package.json                |   3 +
 test/gap-ledger-selftest.ts |  89 +++++++++
 test/gap-ledger.ts          | 380 ++++++++++++++++++++++++++++++++++++
 test/generative-detect.ts   | 181 +++++++++++++++++
 test/generative.ts          | 137 ++-----------
 7 files changed, 736 insertions(+), 124 deletions(-)
 create mode 100644 KNOWN-GAPS.md
 create mode 100644 test/gap-ledger-selftest.ts
 create mode 100644 test/gap-ledger.ts
 create mode 100644 test/generative-detect.ts

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 4962098..905a304 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -58,6 +58,14 @@ jobs:
           node test/yaml-issue12-regressions.ts
           node test/yaml-depth-witnesses.ts
           node test/generative.ts
+          # The gap ledger is the deterministic, oracle-classified record of the divergences the
+          # generative check DISCOVERS. Its self-test asserts the ddmin keep-path + the oracle
+          # drop-path + determinism; --check fails if the committed KNOWN-GAPS.md is stale (the
+          # ledger is a pure function of the grammar, so it must be regenerated when a grammar
+          # changes — `npm run gap-ledger`). This is the deterministic source of truth; a later
+          # layer can turn rows into issues, but the committed artifact is gated here first.
+          node test/gap-ledger-selftest.ts
+          node test/gap-ledger.ts --check
 
   # The derived tree-sitter highlighter is the strongest thesis proof (a real GLR
   # parser from the same grammar, beating the official hand-written one). Build its
diff --git a/KNOWN-GAPS.md b/KNOWN-GAPS.md
new file mode 100644
index 0000000..8cd7f44
--- /dev/null
+++ b/KNOWN-GAPS.md
@@ -0,0 +1,62 @@
+# KNOWN-GAPS — Monogram flat-highlighter divergences (auto-generated)
+
+<!-- AUTO-GENERATED by `node test/gap-ledger.ts --write`. Do not edit by hand. -->
+
+A **gap** is a position where, on **valid input** (accepted by the language’s external
+authority — typescript / yaml / parse5), the **flat TextMate highlighter** paints a token a
+different visual role than the **Monogram parser** assigns it by construction. These are the
+floor-blind divergences the generative scope≡role check (`test/generative.ts`) DISCOVERS over
+grammar-derived inputs — the monogram#23/#24 class — which the corpus-bound scope-gap metric is
+blind to (a small/clean corpus may never contain the shape, and the role-graded metric ignores
+punctuation-floor mis-paints). Each gap’s input is **minimized** (delta-debugged to a minimal
+repro that still parses and still diverges) and **fingerprinted** (a content hash, stable across
+commits) so the ledger is deterministic and commit-trackable.
+
+Regenerate: `node test/gap-ledger.ts --write` · verify up-to-date: `node test/gap-ledger.ts --check`.
+
+**2 gaps** across 7 grammars · 0 dropped.
+
+## `525e867dc205` — html: #24 structural-literal→content
+
+- **Language:** html
+- **Minimal repro:** `<A A=""/>`
+- **Divergent token:** `/` (parser token `$punct`)
+- **Role vs scope:** want **punct**, got **string** (highlighter scope `string.unquoted.html`)
+- **Fingerprint:** `525e867dc205`
+
+```json
+{
+  "id": "525e867dc205",
+  "language": "html",
+  "kind": "#24 structural-literal→content",
+  "repro": "<A A=\"\"/>",
+  "tokenType": "$punct",
+  "tokenText": "/",
+  "want": "punct",
+  "got": "string",
+  "gotScope": "string.unquoted.html"
+}
+```
+
+## `85c793d02a86` — vue: #24 structural-literal→content
+
+- **Language:** vue
+- **Minimal repro:** `<A A=""/>`
+- **Divergent token:** `/` (parser token `$punct`)
+- **Role vs scope:** want **punct**, got **string** (highlighter scope `string.unquoted.vue`)
+- **Fingerprint:** `85c793d02a86`
+
+```json
+{
+  "id": "85c793d02a86",
+  "language": "vue",
+  "kind": "#24 structural-literal→content",
+  "repro": "<A A=\"\"/>",
+  "tokenType": "$punct",
+  "tokenText": "/",
+  "want": "punct",
+  "got": "string",
+  "gotScope": "string.unquoted.vue"
+}
+```
+
diff --git a/package.json b/package.json
index b3937f7..ec8f9c8 100644
--- a/package.json
+++ b/package.json
@@ -6,6 +6,9 @@
     "gen": "node src/cli.ts typescript.ts && node src/cli.ts javascript.ts && node src/cli.ts typescriptreact.ts && node src/cli.ts javascriptreact.ts && node src/cli.ts html.ts && node src/cli.ts vue.ts && node src/cli.ts yaml.ts",
     "test": "node test/sanity-check.ts",
     "generative": "node test/generative.ts",
+    "gap-ledger": "node test/gap-ledger.ts --write",
+    "gap-ledger:check": "node test/gap-ledger.ts --check",
+    "gap-ledger:selftest": "node test/gap-ledger-selftest.ts",
     "conformance": "node test/run-conformance.ts",
     "conformance:js": "node test/js-conformance.ts",
     "conformance:tsx": "node test/tsx-conformance.ts",
diff --git a/test/gap-ledger-selftest.ts b/test/gap-ledger-selftest.ts
new file mode 100644
index 0000000..4c2125f
--- /dev/null
+++ b/test/gap-ledger-selftest.ts
@@ -0,0 +1,89 @@
+// ─────────────────────────────────────────────────────────────────────────────
+//  gap-ledger-selftest.ts — asserts the gap ledger's two load-bearing behaviours
+//  on the REAL HTML probe, independent of how many gaps happen to surface:
+//
+//   (A) DETERMINISM — `generateInputs` + ddmin + fingerprint are a pure function of
+//       the grammar, so two full ledger builds are byte-identical. Asserted here over
+//       the rendered KNOWN-GAPS.md (the committed artifact) by building it twice.
+//
+//   (B) the oracle CLASSIFY DROP-PATH — a divergence whose minimized repro the external
+//       oracle REJECTS (a parser over-accept, not a real highlighter gap) is DROPPED,
+//       not filed. We assert the ledger's keep/drop predicate (`oracleAccepts(repro)`)
+//       routes a parser OVER-ACCEPT (a markup the Monogram parser accepts but parse5
+//       REJECTS — `< a/>`, `<:a/>`) to DROP, and the oracle-VALID `<a b="c"/>`-shape to
+//       KEEP. (Note: the self-close `/` divergence itself only arises on WELL-FORMED tag
+//       shapes — which parse5 also accepts — so a single input that BOTH diverges AND is
+//       oracle-rejected does not exist for this gap; the drop-path is exercised by the
+//       classify predicate over real over-accept markup, which is what would gate it.)
+//
+//  Run (bare node):  node test/gap-ledger-selftest.ts
+// ─────────────────────────────────────────────────────────────────────────────
+import { execFileSync } from 'node:child_process';
+import { createParser } from '../src/gen-parser.ts';
+import type { CstGrammar } from '../src/types.ts';
+import { buildRoleMap, anchoredScopes, leafRoles, collectViolations, isGated } from './generative-detect.ts';
+import { loadTm, tmTokenize, reproStillDiverges, sig, minimize, LANGS, type Probe } from './gap-ledger.ts';
+
+let failures = 0;
+const ok = (cond: boolean, msg: string) => { console.log(`${cond ? '  ✓' : '  ✗ FAIL:'} ${msg}`); if (!cond) failures++; };
+
+// ── build the HTML probe (the cheapest grammar with a known divergence) ──
+const htmlCfg = LANGS.find((l) => l.name === 'html')!;
+const grammar = (await import(htmlCfg.module)).default as CstGrammar;
+const { parse } = createParser(grammar);
+const tm = await loadTm(htmlCfg.scopeName, { [htmlCfg.scopeName]: htmlCfg.tmPath, ...(htmlCfg.tmExtra ?? {}) });
+if (!tm) throw new Error('failed to load html grammar');
+const probe: Probe = { parse, tm, grammar, roleOf: buildRoleMap(grammar), anchored: anchoredScopes(grammar) };
+
+// the ledger's CLASSIFY predicate, verbatim: keep iff the oracle accepts the minimal repro as VALID.
+const classifyKeeps = (text: string) => htmlCfg.oracleAccepts(text);
+
+console.log('gap-ledger self-test\n');
+
+// ── (B1) the canonical KEPT case: `<a b="c"/>`-shape, oracle-valid, still diverges ──
+const keptInput = '<aA aA = "a"/>';   // the generator's tight-markup shape
+{
+  // detect the divergence on the real input, minimize, classify
+  const v0 = probeDivergence(keptInput);
+  ok(!!v0, `kept case: a self-close \`/\` divergence is detected on ${JSON.stringify(keptInput)}`);
+  if (v0) {
+    const repro = minimize(probe, keptInput, v0.target);
+    ok(!!reproStillDiverges(probe, repro, v0.target), `kept case: minimized repro ${JSON.stringify(repro)} still diverges`);
+    ok(classifyKeeps(repro), `kept case: parse5 ACCEPTS the minimized repro → KEEP (a real highlighter gap)`);
+  }
+}
+
+// ── (B2) the DROP case: real parser OVER-ACCEPTS (parser accepts, parse5 REJECTS) ──
+// markup the Monogram markup parser accepts but parse5 does NOT recover as an element — exactly the
+// "Monogram parses but the oracle rejects" class the ledger must DROP (a parser concern, not a
+// highlighter gap). We assert each is parser-accepted AND classify-DROPPED (oracleAccepts == false).
+const overAccepts = ['< a/>', '<:a/>'];
+let dropProven = false;
+for (const cand of overAccepts) {
+  let parserOk = false; try { parse(cand); parserOk = true; } catch { /* */ }
+  if (!parserOk) continue;
+  ok(!classifyKeeps(cand), `drop case: ${JSON.stringify(cand)} is parser-accepted but parse5-REJECTS → classify DROPS it`);
+  dropProven = true;
+}
+ok(dropProven, 'drop case: at least one real parser-over-accept is parser-accepted and confirmed dropped');
+// and the dual: the oracle-VALID minimal repro is KEPT (not dropped) — the keep/drop split is real.
+ok(classifyKeeps('<A A=""/>'), 'keep/drop split: the oracle-VALID `<A A="">`-shape repro is KEPT (not dropped)');
+
+// ── (A) determinism of the rendered artifact: two builds byte-identical ──
+console.log('\n  determinism (two full ledger builds)…');
+const run = () => execFileSync('node', ['test/gap-ledger.ts'], { encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'], maxBuffer: 64 * 1024 * 1024 });
+const a = run(), b = run();
+ok(a === b, `two \`node test/gap-ledger.ts\` runs produce byte-identical output (${a.length} bytes)`);
+
+console.log(failures ? `\n${failures} self-test failure(s).` : '\nAll gap-ledger self-tests passed.');
+process.exit(failures ? 1 : 0);
+
+// ── helper: detect the self-close `/` divergence on `text`, returning its signature ──
+function probeDivergence(text: string): { target: string } | null {
+  let cst; try { cst = parse(text); } catch { return null; }
+  let toks; try { toks = tmTokenize(probe.tm, text); } catch { return null; }
+  const leaves = leafRoles(grammar, cst, probe.roleOf);
+  const vs = collectViolations({ input: text, strategy: 'fuzz', cst, toks, leaves, anchored: probe.anchored });
+  const v = vs.find((x) => !isGated(x));
+  return v ? { target: sig(v) } : null;
+}
diff --git a/test/gap-ledger.ts b/test/gap-ledger.ts
new file mode 100644
index 0000000..7b3769d
--- /dev/null
+++ b/test/gap-ledger.ts
@@ -0,0 +1,380 @@
+// ─────────────────────────────────────────────────────────────────────────────
+//  gap-ledger.ts — a DETERMINISTIC, auto-maintained GAP LEDGER for Monogram.
+//
+//  The generative by-construction check (test/generative.ts) DISCOVERS divergences
+//  where the flat TextMate highlighter and the Monogram parser disagree on the
+//  visual role of a token in a grammar-DERIVED input — the floor-blind class the
+//  corpus-bound scope-gap metric is blind to (monogram#23/#24). That check REPORTS
+//  them; this ledger OPERATIONALIZES them into a stable, commit-trackable artifact:
+//
+//    1. DISCOVER — for each of the 7 grammars, generate inputs deterministically
+//       (grammar-gen.ts), tokenize with the flat grammar + parse with the parser,
+//       and collect the divergences using the SAME detector generative.ts uses
+//       (generative-detect.ts) — not a reimplementation.
+//    2. MINIMIZE — delta-debug (ddmin) each divergence's input down to a minimal
+//       repro that still parses AND still exhibits the SAME divergence (same parser
+//       role-bucket vs same highlighter bucket, identified by a position-independent
+//       signature). The generator + ddmin are deterministic, so the minimal repro is
+//       stable across runs and commits.
+//    3. CLASSIFY — parse the minimal repro with the language's EXTERNAL authority
+//       (typescript / yaml / parse5). File ONLY divergences the oracle accepts as
+//       VALID input (a real highlighter gap on valid input). A repro the parser
+//       accepts but the oracle rejects is a parser OVER-ACCEPT — a different concern;
+//       it is DROPPED from the gap list (its count is reported, not listed).
+//    4. FINGERPRINT — a stable id = hash(language, normalized repro, role, bucket),
+//       so the same gap keeps the same id across commits.
+//    5. EMIT — a sorted KNOWN-GAPS.md (committed artifact): per gap, the language,
+//       escaped minimal repro, role-vs-scope (want vs got), fingerprint, and a
+//       machine-readable JSON block.
+//
+//  DETERMINISM is the whole point (a commit-trackable ledger): two runs produce a
+//  BYTE-IDENTICAL KNOWN-GAPS.md. The generator is a pure function of the grammar
+//  (no seed), ddmin is deterministic, the oracle is deterministic, and the hash is
+//  content-only — so nothing varies run-to-run.
+//
+//  Run (bare node):
+//    node test/gap-ledger.ts            # print the ledger to stdout (don't write)
+//    node test/gap-ledger.ts --write    # (re)write KNOWN-GAPS.md
+//    node test/gap-ledger.ts --check    # fail if KNOWN-GAPS.md is stale (CI guard)
+//    node test/gap-ledger.ts yaml       # one language
+// ─────────────────────────────────────────────────────────────────────────────
+import { readFileSync, writeFileSync, existsSync } from 'node:fs';
+import { createHash } from 'node:crypto';
+import { createRequire } from 'node:module';
+import vsctm from 'vscode-textmate';
+import onig from 'vscode-oniguruma';
+import ts from 'typescript';
+import { parseAllDocuments } from 'yaml';
+import { parseFragment } from 'parse5';
+import sfcCompiler from '@vue/compiler-sfc';
+import { createParser, type CstNode } from '../src/gen-parser.ts';
+import type { CstGrammar } from '../src/types.ts';
+import { generateInputs } from './grammar-gen.ts';
+import {
+  type TmTok, type Violation,
+  buildRoleMap, leafRoles, anchoredScopes, collectViolations, isGated,
+} from './generative-detect.ts';
+
+// ── language registry — the SAME per-language DATA shape as generative.ts's LANGS, plus an
+//    `oracleAccepts(text)`: the external authority's verdict on whether the minimal repro is VALID
+//    input. THAT is the only per-language wiring (a config table, like generative.ts's LANGS); the
+//    ddmin / fingerprint / emit ENGINE below is language-agnostic. ──
+interface LangCfg {
+  name: string;
+  module: string;          // grammar module (default export = CstGrammar)
+  scopeName: string;       // TextMate scope, e.g. source.yaml
+  tmPath: string;          // the derived flat .tmLanguage.json
+  tmExtra?: Record<string, string>;  // extra scopeName → file for multi-file grammars
+  oracleAccepts: (text: string) => boolean;   // the neutral oracle's "is this VALID input?" verdict
+}
+
+// ── oracle validity verdicts (DATA) ──────────────────────────────────────────────────────────────
+// TS-family: tsc's own parser — zero parseDiagnostics means it accepts the text as valid source.
+const tsAccepts = (kind: ts.ScriptKind) => (text: string): boolean => {
+  try {
+    const sf = ts.createSourceFile('gap.ts', text, ts.ScriptTarget.Latest, /*setParentNodes*/ false, kind);
+    return ((sf as any).parseDiagnostics?.length ?? 0) === 0;
+  } catch { return false; }
+};
+// YAML: the `yaml` package — a document with zero `.errors` is valid (the same independent authority
+// the scope-gap YAML oracle uses). A throw or any error ⇒ not valid.
+const yamlAccepts = (text: string): boolean => {
+  try { const docs = parseAllDocuments(text); return docs.length > 0 && docs.every((d: any) => (d.errors?.length ?? 0) === 0); }
+  catch { return false; }
+};
+// HTML: parse5 is error-TOLERANT (never throws), so "valid" = it recovered a real element structure —
+// at least one element/tag node (not pure text / a dropped `</>`). This matches html-oracle.ts's own
+// emission gate (it only emits tag/attr roles when parse5 reports a tagName + location).
+const htmlAccepts = (text: string): boolean => {
+  try {
+    const frag: any = parseFragment(text, { sourceCodeLocationInfo: true });
+    const hasEl = (nodes: any[]): boolean => nodes.some((n) => (n.tagName && n.sourceCodeLocation) || (n.childNodes && hasEl(n.childNodes)));
+    return hasEl(frag.childNodes ?? []);
+  } catch { return false; }
+};
+// Vue SFC: the template markup sub-language IS HTML — vue-oracle.ts composes parse5 over the template
+// content as its markup authority. @vue/compiler-sfc only does SFC BLOCK splitting; a bare template-
+// level markup fragment (what the generator emits for the vue grammar — `<a b="c"/>`, not a full
+// `<template>…</template>` SFC) is NOT a top-level SFC block, so the SFC parser reports no template.
+// The right neutral verdict for such markup is therefore parse5's (the template arbiter): if the SFC
+// parser DID isolate a <template> block, validate ITS content with parse5; otherwise the whole input
+// is template-level markup → validate it directly with parse5. (A <script>/<style>-only SFC has no
+// markup to grade here; it is accepted as a valid block.)
+const vueAccepts = (text: string): boolean => {
+  try {
+    const { descriptor } = sfcCompiler.parse(text);
+    if (descriptor.template) return htmlAccepts(descriptor.template.content ?? '');
+    if (descriptor.script || descriptor.scriptSetup || (descriptor.styles ?? []).length) return true;
+    return htmlAccepts(text);   // bare template-level markup — parse5 is the arbiter (as in vue-oracle.ts)
+  } catch { return false; }
+};
+
+const LANGS: LangCfg[] = [
+  { name: 'yaml', module: '../yaml.ts', scopeName: 'source.yaml', tmPath: 'yaml.tmLanguage.json', oracleAccepts: yamlAccepts },
+  { name: 'typescript', module: '../typescript.ts', scopeName: 'source.ts', tmPath: 'typescript.tmLanguage.json', oracleAccepts: tsAccepts(ts.ScriptKind.TS) },
+  { name: 'javascript', module: '../javascript.ts', scopeName: 'source.js', tmPath: 'javascript.tmLanguage.json', oracleAccepts: tsAccepts(ts.ScriptKind.JS) },
+  { name: 'typescriptreact', module: '../typescriptreact.ts', scopeName: 'source.tsx', tmPath: 'typescriptreact.tmLanguage.json', oracleAccepts: tsAccepts(ts.ScriptKind.TSX) },
+  { name: 'javascriptreact', module: '../javascriptreact.ts', scopeName: 'source.js.jsx', tmPath: 'javascriptreact.tmLanguage.json', oracleAccepts: tsAccepts(ts.ScriptKind.JSX) },
+  { name: 'html', module: '../html.ts', scopeName: 'text.html.basic', tmPath: 'html.tmLanguage.json',
+    tmExtra: { 'source.js': 'javascript.tmLanguage.json', 'source.css': 'html.tmLanguage.json' }, oracleAccepts: htmlAccepts },
+  { name: 'vue', module: '../vue.ts', scopeName: 'text.html.vue', tmPath: 'vue.tmLanguage.json',
+    tmExtra: { 'text.html.basic': 'html.tmLanguage.json', 'source.js': 'javascript.tmLanguage.json', 'source.ts': 'typescript.tmLanguage.json', 'source.tsx': 'typescriptreact.tmLanguage.json' }, oracleAccepts: vueAccepts },
+];
+
+// ── shared vscode-textmate tokenizer (one WASM load) ─────────────────────────────────────────────
+const { INITIAL, Registry, parseRawGrammar } = vsctm;
+const { loadWASM, OnigScanner, OnigString } = onig;
+const require = createRequire(import.meta.url);
+const bin = readFileSync(require.resolve('vscode-oniguruma/release/onig.wasm'));
+await loadWASM(bin.buffer.slice(bin.byteOffset, bin.byteOffset + bin.byteLength));
+
+async function loadTm(scopeName: string, files: Record<string, string>) {
+  const cache: Record<string, string> = {};
+  const reg = new Registry({
+    onigLib: Promise.resolve({ createOnigScanner: (p: string[]) => new OnigScanner(p), createOnigString: (s: string) => new OnigString(s) }),
+    loadGrammar: async (sn: string) => { const p = files[sn]; if (!p) return null; const c = cache[sn] ?? (cache[sn] = readFileSync(p, 'utf8')); return parseRawGrammar(c, sn + '.json'); },
+  });
+  return reg.loadGrammar(scopeName);
+}
+function tmTokenize(grammar: vsctm.IGrammar, text: string): TmTok[] {
+  const toks: TmTok[] = []; let rs = INITIAL, off = 0;
+  for (const line of text.split('\n')) { const r = grammar.tokenizeLine(line, rs); for (const t of r.tokens) toks.push({ start: off + t.startIndex, end: off + t.endIndex, scopes: t.scopes }); rs = r.ruleStack; off += line.length + 1; }
+  return toks;
+}
+
+// ── a divergence's POSITION-INDEPENDENT signature — what "the SAME divergence" means across the
+//    shrinking candidates (their byte offsets all differ as the input shrinks). A divergence is the
+//    same iff it is the same KIND (#23/#24), on the same TOKEN TYPE, painting the same wrong visual
+//    BUCKET, on a leaf of the same TEXT (the `/` of a self-close, the `---` of a value-marker). The
+//    `pos` is deliberately EXCLUDED — that is exactly the coordinate ddmin changes. ──
+function sig(v: Violation): string { return `${v.kind}|${v.tokenType}|${v.got}|${v.text}`; }
+
+// One repro's check: parse it (parser must still ACCEPT — a shrink that breaks parsing is not a valid
+// candidate), tokenize, and report whether `target` (a divergence signature) still appears among the
+// detected violations. Returns the matching Violation (for its fresh offsets) or null.
+interface Probe { parse: (text: string, rule?: string) => CstNode; tm: vsctm.IGrammar; grammar: CstGrammar; roleOf: ReturnType<typeof buildRoleMap>; anchored: Map<string, Set<string>>; }
+function reproStillDiverges(p: Probe, text: string, target: string): Violation | null {
+  let cst: CstNode;
+  try { cst = p.parse(text); } catch { return null; }      // parser must accept the shrunk input
+  let toks: TmTok[];
+  try { toks = tmTokenize(p.tm, text); } catch { return null; }
+  const leaves = leafRoles(p.grammar, cst, p.roleOf);
+  const vs = collectViolations({ input: text, strategy: 'fuzz', cst, toks, leaves, anchored: p.anchored });
+  return vs.find((v) => sig(v) === target) ?? null;
+}
+
+// ── ddmin: delta-debugging minimization. Shrink `text` while `keeps(candidate)` stays true. Two
+//    passes, both deterministic: (1) LINE granularity (drop a contiguous block of lines), then
+//    (2) CHARACTER granularity (drop a contiguous run of chars). The classic ddmin schedule —
+//    halving the chunk size when no removal at the current size helps — gives a 1-minimal result
+//    (no single chunk at the finest granularity can be removed) and is order-deterministic. ──
+function ddminBy<T>(units: T[], join: (us: T[]) => string, keeps: (text: string) => boolean): T[] {
+  let cur = units;
+  let n = 2;
+  while (cur.length >= 2) {
+    const chunk = Math.ceil(cur.length / n);
+    let removedAny = false;
+    // try removing each contiguous chunk (left to right) — deterministic order
+    for (let i = 0; i < cur.length; i += chunk) {
+      const candidate = [...cur.slice(0, i), ...cur.slice(i + chunk)];
+      if (candidate.length && candidate.length < cur.length && keeps(join(candidate))) {
+        cur = candidate;
+        n = Math.max(n - 1, 2);
+        removedAny = true;
+        break;          // restart the schedule on the smaller input (deterministic)
+      }
+    }
+    if (!removedAny) {
+      if (n >= cur.length) break;       // finest granularity reached, nothing removable → 1-minimal
+      n = Math.min(cur.length, n * 2);
+    }
+  }
+  return cur;
+}
+
+function minimize(p: Probe, input: string, target: string): string {
+  // sanity: the unshrunk input must actually exhibit the target (it came from detection, so it does)
+  if (!reproStillDiverges(p, input, target)) return input;
+  const keeps = (text: string) => text.trim().length > 0 && !!reproStillDiverges(p, text, target);
+  // pass 1: drop whole lines (keeps the `\n` joins so indentation/line structure is preserved)
+  let lines = input.split('\n');
+  lines = ddminBy(lines, (us) => us.join('\n'), keeps);
+  let cur = lines.join('\n');
+  // pass 2: drop characters (the within-line minimization — trims `<aA aA = "a"/>` → `<a a="a"/>`)
+  const chars = ddminBy([...cur], (us) => us.join(''), keeps);
+  cur = chars.join('');
+  return cur;
+}
+
+// ── fingerprint: a stable content hash. Inputs are LANGUAGE + the NORMALIZED minimal repro + the
+//    parser ROLE side (expected buckets) + the highlighter BUCKET (got) — i.e. exactly the identity
+//    of the gap, nothing run-dependent. Normalization (trim trailing whitespace per line, LF) keeps
+//    the id stable against incidental formatting. 12 hex chars = 48 bits, ample for a small ledger. ──
+function normalizeRepro(text: string): string {
+  return text.replace(/\r\n?/g, '\n').split('\n').map((l) => l.replace(/[ \t]+$/, '')).join('\n').replace(/\n+$/, '');
+}
+function fingerprint(g: Gap): string {
+  const h = createHash('sha256');
+  h.update(`${g.language} ${normalizeRepro(g.repro)} ${g.expected} ${g.got}`);
+  return h.digest('hex').slice(0, 12);
+}
+
+// ── a filed gap (the ledger row) ──────────────────────────────────────────────────────────────────
+interface Gap {
+  id: string;            // fingerprint (filled after construction)
+  language: string;
+  kind: string;          // the divergence class (#23/#24)
+  repro: string;         // the minimized input
+  tokenType: string;     // the parser's token type for the divergent leaf
+  tokenText: string;     // the divergent leaf's text (`/`, `---`, …)
+  expected: string;      // by-construction declared role buckets (want)
+  got: string;           // the visual bucket the highlighter painted (the wrong one)
+  gotScope: string;      // the actual innermost scope the highlighter emitted
+}
+
+interface LangResult { name: string; kept: Gap[]; droppedOverAccept: number; discovered: number; }
+
+async function runLang(cfg: LangCfg): Promise<LangResult> {
+  if (!existsSync(cfg.tmPath)) return { name: cfg.name, kept: [], droppedOverAccept: 0, discovered: 0 };
+  const grammar = (await import(cfg.module)).default as CstGrammar;
+  const { parse } = createParser(grammar);
+  const tm = await loadTm(cfg.scopeName, { [cfg.scopeName]: cfg.tmPath, ...(cfg.tmExtra ?? {}) });
+  if (!tm) throw new Error(`failed to load ${cfg.tmPath}`);
+  const roleOf = buildRoleMap(grammar);
+  const anchored = anchoredScopes(grammar);
+  const probe: Probe = { parse, tm, grammar, roleOf, anchored };
+
+  // 1) DISCOVER — generate deterministically, then collect the DISCOVERED (report-only, !isGated)
+  //    divergences over the full-document inputs. (The gated ones are generative.ts's hard failures;
+  //    on this UNFIXED branch there are none. The ledger files the floor-blind DISCOVERED class.)
+  const inputs = generateInputs(grammar);
+  const discoveredVs: Violation[] = [];
+  for (const inp of inputs) {
+    let cst: CstNode, toks: TmTok[];
+    try { cst = parse(inp.text); } catch { continue; }       // only full-document (entry-rule) inputs
+    try { toks = tmTokenize(tm, inp.text); } catch { continue; }
+    const leaves = leafRoles(grammar, cst, roleOf);
+    const vs = collectViolations({ input: inp.text, strategy: inp.strategy, cst, toks, leaves, anchored });
+    for (const v of vs) if (!isGated(v)) discoveredVs.push(v);
+  }
+
+  // dedupe by signature — keep the SHORTEST-input witness per distinct divergence (ddmin shrinks it
+  // anyway, but starting from the shortest is faster and keeps the pre-ddmin choice deterministic).
+  const bySig = new Map<string, Violation>();
+  for (const v of discoveredVs) {
+    const k = sig(v);
+    const prev = bySig.get(k);
+    if (!prev || v.input.length < prev.input.length || (v.input.length === prev.input.length && v.input < prev.input)) bySig.set(k, v);
+  }
+
+  // 2) MINIMIZE + 3) CLASSIFY each distinct divergence
+  const kept: Gap[] = [];
+  let droppedOverAccept = 0;
+  for (const [target, v] of [...bySig.entries()].sort((a, b) => (a[0] < b[0] ? -1 : 1))) {
+    const repro = minimize(probe, v.input, target);
+    // re-detect on the minimized repro to read the divergent leaf's FINAL offsets/scope
+    const finalV = reproStillDiverges(probe, repro, target) ?? v;
+    // CLASSIFY: keep ONLY if the external oracle accepts the minimal repro as VALID input.
+    if (!cfg.oracleAccepts(repro)) { droppedOverAccept++; continue; }
+    const g: Gap = {
+      id: '', language: cfg.name, kind: finalV.kind, repro,
+      tokenType: finalV.tokenType, tokenText: finalV.text,
+      expected: finalV.expected, got: finalV.got, gotScope: finalV.gotScope,
+    };
+    g.id = fingerprint(g);
+    kept.push(g);
+  }
+  return { name: cfg.name, kept, droppedOverAccept, discovered: bySig.size };
+}
+
+// ── EMIT: KNOWN-GAPS.md (human-readable + machine-readable JSON block per gap) ────────────────────
+const esc = (s: string) => s.replace(/\\/g, '\\\\').replace(/\n/g, '\\n').replace(/\t/g, '\\t').replace(/\r/g, '\\r');
+
+function renderMarkdown(gaps: Gap[], dropped: number, langCount: number): string {
+  const lines: string[] = [];
+  lines.push('# KNOWN-GAPS — Monogram flat-highlighter divergences (auto-generated)');
+  lines.push('');
+  lines.push('<!-- AUTO-GENERATED by `node test/gap-ledger.ts --write`. Do not edit by hand. -->');
+  lines.push('');
+  lines.push('A **gap** is a position where, on **valid input** (accepted by the language’s external');
+  lines.push('authority — typescript / yaml / parse5), the **flat TextMate highlighter** paints a token a');
+  lines.push('different visual role than the **Monogram parser** assigns it by construction. These are the');
+  lines.push('floor-blind divergences the generative scope≡role check (`test/generative.ts`) DISCOVERS over');
+  lines.push('grammar-derived inputs — the monogram#23/#24 class — which the corpus-bound scope-gap metric is');
+  lines.push('blind to (a small/clean corpus may never contain the shape, and the role-graded metric ignores');
+  lines.push('punctuation-floor mis-paints). Each gap’s input is **minimized** (delta-debugged to a minimal');
+  lines.push('repro that still parses and still diverges) and **fingerprinted** (a content hash, stable across');
+  lines.push('commits) so the ledger is deterministic and commit-trackable.');
+  lines.push('');
+  lines.push('Regenerate: `node test/gap-ledger.ts --write` · verify up-to-date: `node test/gap-ledger.ts --check`.');
+  lines.push('');
+  lines.push(`**${gaps.length} gap${gaps.length === 1 ? '' : 's'}** across ${langCount} grammars` +
+    (dropped ? ` · ${dropped} divergence${dropped === 1 ? '' : 's'} dropped as parser over-accepts (oracle-rejected repro, not a highlighter gap)` : ' · 0 dropped') + '.');
+  lines.push('');
+  if (!gaps.length) {
+    lines.push('_No gaps currently surface._ The generative check reports no valid-input flat-highlighter');
+    lines.push('divergence on the derived corpus. (This is the ledger MECHANISM; it lists what the check finds.)');
+    lines.push('');
+    return lines.join('\n') + '\n';
+  }
+  for (const g of gaps) {
+    lines.push(`## \`${g.id}\` — ${g.language}: ${g.kind}`);
+    lines.push('');
+    lines.push(`- **Language:** ${g.language}`);
+    lines.push(`- **Minimal repro:** \`${esc(g.repro)}\``);
+    lines.push(`- **Divergent token:** \`${esc(g.tokenText)}\` (parser token \`${g.tokenType}\`)`);
+    lines.push(`- **Role vs scope:** want **${g.expected}**, got **${g.got}** (highlighter scope \`${g.gotScope}\`)`);
+    lines.push(`- **Fingerprint:** \`${g.id}\``);
+    lines.push('');
+    lines.push('```json');
+    lines.push(JSON.stringify({
+      id: g.id, language: g.language, kind: g.kind, repro: g.repro,
+      tokenType: g.tokenType, tokenText: g.tokenText,
+      want: g.expected, got: g.got, gotScope: g.gotScope,
+    }, null, 2));
+    lines.push('```');
+    lines.push('');
+  }
+  return lines.join('\n') + '\n';
+}
+
+// re-exports so the ENGINE is reusable/testable without re-running the driver (gap-ledger-selftest.ts
+// asserts the oracle drop-path); the driver itself only runs when this file is the entry module.
+export { runLang, minimize, reproStillDiverges, sig, fingerprint, normalizeRepro, loadTm, tmTokenize, LANGS };
+export type { LangCfg, Gap, Probe, LangResult };
+
+// ── driver (only when run directly, not when imported) ──────────────────────────────────────────────
+async function main(): Promise<void> {
+  const args = process.argv.slice(2);
+  const WRITE = args.includes('--write');
+  const CHECK = args.includes('--check');
+  const only = args.find((a) => !a.startsWith('--'));
+  const targets = only ? LANGS.filter((l) => l.name === only || (only === 'tsfamily' && /script/.test(l.name))) : LANGS;
+  if (!targets.length) { console.error(`unknown language: ${only}`); process.exit(1); }
+
+  const results: LangResult[] = [];
+  for (const cfg of targets) { console.error(`  gap-ledger: ${cfg.name}…`); results.push(await runLang(cfg)); }
+
+  // sort gaps deterministically: by language order (LANGS), then by fingerprint
+  const langOrder = new Map(LANGS.map((l, i) => [l.name, i]));
+  const allGaps = results.flatMap((r) => r.kept).sort((a, b) =>
+    (langOrder.get(a.language)! - langOrder.get(b.language)!) || (a.id < b.id ? -1 : a.id > b.id ? 1 : 0));
+  const droppedTotal = results.reduce((a, r) => a + r.droppedOverAccept, 0);
+  const md = renderMarkdown(allGaps, droppedTotal, targets.length);
+
+  // per-language summary to stderr (not part of the artifact, so it never affects determinism)
+  for (const r of results) console.error(`    ${r.name}: ${r.kept.length} kept · ${r.droppedOverAccept} dropped (over-accept) · ${r.discovered} distinct divergence(s)`);
+  console.error(`  TOTAL: ${allGaps.length} gaps · ${droppedTotal} dropped over-accepts`);
+
+  const OUT = 'KNOWN-GAPS.md';
+  if (CHECK) {
+    const existing = existsSync(OUT) ? readFileSync(OUT, 'utf8') : '';
+    if (existing !== md) { console.error(`\n${OUT} is STALE — run \`node test/gap-ledger.ts --write\`.`); process.exit(1); }
+    console.error(`\n${OUT} is up to date.`);
+    return;
+  }
+  if (WRITE) { writeFileSync(OUT, md); console.error(`\n✓ wrote ${OUT} (${allGaps.length} gaps).`); }
+  else { process.stdout.write(md); }
+}
+
+if ((import.meta as any).main) await main();
diff --git a/test/generative-detect.ts b/test/generative-detect.ts
new file mode 100644
index 0000000..c3e708f
--- /dev/null
+++ b/test/generative-detect.ts
@@ -0,0 +1,181 @@
+// ─────────────────────────────────────────────────────────────────────────────
+//  generative-detect.ts — the scope≡role DIVERGENCE DETECTION, factored out of
+//  test/generative.ts so a SECOND consumer (test/gap-ledger.ts) can reuse the exact
+//  same logic without re-implementing it (and without pulling in generative.ts's
+//  top-level WASM load / process.exit driver).
+//
+//  This module is PURE (no I/O, no global side effects): it takes an already-parsed
+//  CST and an already-tokenized flat-grammar token list and reports the positions
+//  where the flat highlighter's visual bucket disagrees with a token's by-construction
+//  role. test/generative.ts imports these and keeps its own gate behaviour; the gap
+//  ledger imports them and treats the SAME violations as the gap set to minimize.
+//
+//  The two divergence classes (monogram#23/#24) and the gate-1/gate-2 logic live here
+//  verbatim — see the comments inline; the original prose is in generative.ts's header.
+// ─────────────────────────────────────────────────────────────────────────────
+import { normScope } from './scope-roles.ts';
+import type { CstNode, CstChild } from '../src/gen-parser.ts';
+import type { CstGrammar, TokenPattern } from '../src/types.ts';
+import type { GenOptions } from './grammar-gen.ts';
+
+// The generation knobs BOTH consumers use, so the gap ledger sees the SAME derived corpus (hence the
+// SAME divergence set) as generative.ts's check. `seed` is a no-op (the generator is a pure function
+// of the grammar), kept only for back-compat; nothing here introduces run-to-run variation.
+export const GEN_OPTS: GenOptions = { depth: 5, nestDepth: 5, cap: 7, fuzzRounds: 250, maxInputs: 1500, seed: 5 };
+
+// ── flat-grammar token (one vscode-textmate token, absolute offsets) ──────────────
+export interface TmTok { start: number; end: number; scopes: string[] }
+// binary-search the innermost scope chain covering a byte position
+export function scopeAt(toks: TmTok[], pos: number): string[] {
+  let lo = 0, hi = toks.length - 1, ans = -1;
+  while (lo <= hi) { const mid = (lo + hi) >> 1; if (toks[mid].start <= pos) { ans = mid; lo = mid + 1; } else hi = mid - 1; }
+  return ans >= 0 && toks[ans].end > pos ? toks[ans].scopes : [];
+}
+export const innerOf = (s: string[]): string => (s.length ? s[s.length - 1] : '(none)');
+
+// ── visual bucket of a scope chain — the level at which a highlight difference is actually visible.
+//    Same partition the scope-gap differential pass uses; the consistency check compares buckets so a
+//    `-` painted as string (punct≠string) is caught even though punctuation is a lexical-floor role. ──
+export type Bucket = 'invalid' | 'comment' | 'string' | 'number' | 'keyword' | 'name' | 'punct' | 'none';
+export function scopeBucket(chain: string[]): Bucket {
+  for (let i = chain.length - 1; i >= 0; i--) {
+    const s = normScope(chain[i]);
+    if (/^invalid/.test(s)) return 'invalid';
+    if (/^comment/.test(s)) return 'comment';
+    if (/^constant\.numeric/.test(s)) return 'number';
+    if (/^(string|constant\.character|constant\.other\.symbol)/.test(s)) return 'string';
+    if (/^(keyword|storage|constant\.language|support\.constant|variable\.language)/.test(s)) return 'keyword';
+    if (/^(entity|variable|support|constant)/.test(s)) return 'name';
+    if (/^punctuation/.test(s)) return 'punct';
+  }
+  return 'none';
+}
+// every visual bucket a scope CHAIN spans (a YAML number is `string.unquoted constant.numeric` →
+// {string, number} — both are legitimate, since the same token folds to a multi-line string).
+export function chainBuckets(scope: string): Set<Bucket> {
+  const out = new Set<Bucket>();
+  for (const seg of scope.split(/\s+/)) if (seg) out.add(scopeBucket([seg]));
+  return out;
+}
+export const CONTENT = new Set<Bucket>(['string', 'comment', 'number']);   // a STRUCTURAL literal is never one of these
+
+// the visual buckets the highlighter actually painted ACROSS a leaf's span (whitespace skipped)
+export function spanBuckets(toks: TmTok[], text: string, start: number, end: number): Set<Bucket> {
+  const s = new Set<Bucket>();
+  for (let p = start; p < end; p++) { const c = text.charCodeAt(p); if (c === 32 || c === 9) continue; s.add(scopeBucket(scopeAt(toks, p))); }
+  return s.size ? s : new Set<Bucket>(['none']);
+}
+
+// ── by-construction expected role of a parsed leaf, from the grammar ALONE ──────────────────────
+// A leaf's token TYPE → the bucket SET the grammar DECLARES for it: a named token → its `scope`
+// chain's buckets; a `$punct`/`$keyword` literal → any `scopes` override, else punctuation / keyword.
+// `lit` marks a STRUCTURAL literal (`$punct`/`$keyword`) — one the parser placed as grammar structure,
+// so the highlighter painting it as CONTENT (string/comment/number) is always wrong (monogram#24).
+export interface LeafRole { start: number; end: number; text: string; tokenType: string; expected: Set<Bucket>; lit: boolean }
+export function buildRoleMap(grammar: CstGrammar): (leaf: { tokenType: string; text: string }) => { buckets: Set<Bucket>; lit: boolean } | null {
+  const tokScope = new Map<string, string | undefined>();
+  for (const t of grammar.tokens) tokScope.set(t.name, t.scope);
+  const skip = new Set<string>();
+  if (grammar.indent) { skip.add(grammar.indent.indentToken); skip.add(grammar.indent.dedentToken); skip.add(grammar.indent.newlineToken); }
+  if (grammar.newline) skip.add(grammar.newline.token);
+  const over = grammar.scopeOverrides;
+  return (leaf) => {
+    const ty = leaf.tokenType;
+    if (skip.has(ty)) return null;
+    if (ty === '$punct') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set<Bucket>(['punct']), lit: true }; }
+    if (ty === '$keyword') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set<Bucket>(['keyword']), lit: true }; }
+    if (ty.startsWith('$template')) return { buckets: new Set<Bucket>(['string']), lit: false };
+    if (tokScope.has(ty)) { const sc = tokScope.get(ty); return sc ? { buckets: chainBuckets(sc), lit: false } : null; }
+    return null;   // unscoped / contextual token (a bare identifier) → not checkable by-construction
+  };
+}
+export function leafRoles(grammar: CstGrammar, cst: CstNode, roleOf: (l: { tokenType: string; text: string }) => { buckets: Set<Bucket>; lit: boolean } | null): LeafRole[] {
+  const out: LeafRole[] = [];
+  const walk = (n: CstChild) => {
+    if (n.kind === 'leaf') {
+      if (n.end <= n.offset) return;
+      const r = roleOf(n);
+      if (r) out.push({ start: n.offset, end: n.end, text: n.text, tokenType: n.tokenType, expected: r.buckets, lit: r.lit });
+    } else for (const c of n.children) walk(c);
+  };
+  walk(cst);
+  return out;
+}
+
+// Scopes that belong to a POSITION-ANCHORED token — one whose pattern contains a `start()` anchor
+// (e.g. YAML's DocStart/DocEnd `^---`/`^...`). Such a scope is the parser's signal "a marker AT a
+// line/stream position"; the flat highlighter, retrying the pattern at every token boundary, may
+// paint it on a token the parser placed elsewhere (a value-leading `---`, monogram#23). Map each
+// such scope → the set of token names allowed to carry it, so a mismatch is detectable generically.
+export function anchoredScopes(grammar: CstGrammar): Map<string, Set<string>> {
+  const hasStart = (p: TokenPattern): boolean => {
+    if (typeof p === 'string') return false;
+    switch (p.type) {
+      case 'anchor': return p.kind === 'start';
+      case 'seq': case 'alt': return p.items.some(hasStart);
+      case 'repeat': case 'lookahead': case 'lookbehind': return hasStart(p.body);
+      default: return false;
+    }
+  };
+  const m = new Map<string, Set<string>>();
+  for (const t of grammar.tokens) if (t.scope && hasStart(t.pattern)) { const s = m.get(t.scope) ?? new Set(); s.add(t.name); m.set(t.scope, s); }
+  return m;
+}
+
+// ── a single scope≡role inconsistency (flat highlighter ≠ parser) at a position ──────────────────
+export interface Violation { input: string; strategy: string; pos: number; text: string; tokenType: string; expected: string; got: Bucket; gotScope: string; kind: string }
+
+// Per-input divergence detection — the gate-1 (structural-literal→content, #24) and gate-2
+// (anchored-marker misfire, #23) scans, factored out so both generative.ts and gap-ledger.ts run
+// the IDENTICAL check. Pure: it reads an already-parsed CST and already-tokenized flat tokens.
+//   • gate-1 — a `$punct`/`$keyword` the parser placed as grammar STRUCTURE, painted entirely as a
+//     CONTENT class (string/comment/number). A `-` indicator is never a string (#24). Floor-blind.
+//   • gate-2 — a leaf painted with a position-anchored token's scope when the parser did NOT place
+//     that token here (a value-leading `---` scoped document-marker, #23).
+// Leniency: a token is CONSISTENT when the highlighter paints ANY part of its span with a scope in the
+// token's declared-chain bucket SET (a quote-delimiter sub-scope, a number folded into a string are OK).
+export function collectViolations(args: {
+  input: string;
+  strategy: string;
+  cst: CstNode;
+  toks: TmTok[];
+  leaves: LeafRole[];
+  anchored: Map<string, Set<string>>;
+  cap?: number;          // stop after this many (generative.ts uses 200; the ledger leaves it open)
+  startCount?: number;   // current count toward the cap (so a caller accumulating across inputs can pass it)
+}): Violation[] {
+  const { input, strategy, toks, leaves, anchored } = args;
+  const cap = args.cap ?? Infinity;
+  let count = args.startCount ?? 0;
+  const out: Violation[] = [];
+  const leafCover = (pos: number) => leaves.find((l) => pos >= l.start && pos < l.end);
+  for (const lr of leaves) {
+    const got = spanBuckets(toks, input, lr.start, lr.end);
+    const overlap = [...lr.expected].some((b) => got.has(b));
+    if (overlap) continue;                                                  // highlighter painted the declared scope somewhere → consistent
+    const contentGot = [...got].find((b) => CONTENT.has(b));
+    if (lr.lit && contentGot && count < cap) {
+      out.push({ input, strategy, pos: lr.start, text: lr.text, tokenType: lr.tokenType, expected: [...lr.expected].join('|'), got: contentGot, gotScope: innerOf(scopeAt(toks, lr.start)), kind: '#24 structural-literal→content' });
+      count++;
+    }
+  }
+  if (anchored.size) for (const t of toks) {
+    if (t.end <= t.start) continue;
+    const inner = innerOf(t.scopes);
+    const owners = anchored.get(inner.replace(/\.[a-z0-9]+$/, '')) ?? anchored.get(inner);
+    if (!owners) continue;
+    const leaf = leafCover(t.start);
+    if (leaf && !owners.has(leaf.tokenType) && count < cap) {
+      out.push({ input, strategy, pos: t.start, text: input.slice(t.start, t.end), tokenType: leaf.tokenType, expected: [...owners].join('|'), got: 'name', gotScope: inner, kind: '#23 anchored-marker misfire' });
+      count++;
+    }
+  }
+  return out;
+}
+
+// What GATES vs what is a report-only DISCOVERY (generative.ts's exact predicate):
+//  • an ANCHORED-MARKER misfire (#23) ALWAYS gates.
+//  • a STRUCTURAL-LITERAL→content divergence (#24) gates on the STRUCTURED strategies, but is
+//    report-only on FUZZ inputs (a standing flat-TM frontier limit). The gap ledger lists the
+//    DISCOVERED ones (the !isGated set) — the floor-blind divergences a corpus metric is blind to.
+export const isGated = (v: { kind: string; strategy: string }): boolean => v.kind.startsWith('#23') || !v.strategy.startsWith('fuzz');
diff --git a/test/generative.ts b/test/generative.ts
index 02d964a..1b40fb0 100644
--- a/test/generative.ts
+++ b/test/generative.ts
@@ -30,10 +30,15 @@ import { readFileSync, existsSync } from 'node:fs';
 import { createRequire } from 'node:module';
 import vsctm from 'vscode-textmate';
 import onig from 'vscode-oniguruma';
-import { createParser, type CstNode, type CstChild } from '../src/gen-parser.ts';
-import type { CstGrammar, TokenPattern } from '../src/types.ts';
-import { normScope } from './scope-roles.ts';
+import { createParser, type CstNode } from '../src/gen-parser.ts';
+import type { CstGrammar } from '../src/types.ts';
 import { generateInputs, type GenInput } from './grammar-gen.ts';
+// The scope≡role divergence detection is factored into generative-detect.ts so the gap ledger
+// (test/gap-ledger.ts) reuses the EXACT same logic. This driver's gate behaviour is unchanged.
+import {
+  type TmTok, type Violation,
+  buildRoleMap, leafRoles, anchoredScopes, collectViolations, isGated, GEN_OPTS,
+} from './generative-detect.ts';
 
 // ── language registry: every per-language fact (grammar module, scope, flat grammar file,
 //    any multi-file sub-grammars) is DATA — the harness body is language-agnostic. ──
@@ -86,105 +91,13 @@ async function loadTm(scopeName: string, files: Record<string, string>) {
   });
   return reg.loadGrammar(scopeName);
 }
-interface TmTok { start: number; end: number; scopes: string[] }
 function tmTokenize(grammar: vsctm.IGrammar, text: string): TmTok[] {
   const toks: TmTok[] = []; let rs = INITIAL, off = 0;
   for (const line of text.split('\n')) { const r = grammar.tokenizeLine(line, rs); for (const t of r.tokens) toks.push({ start: off + t.startIndex, end: off + t.endIndex, scopes: t.scopes }); rs = r.ruleStack; off += line.length + 1; }
   return toks;
 }
-function scopeAt(toks: TmTok[], pos: number): string[] {
-  let lo = 0, hi = toks.length - 1, ans = -1;
-  while (lo <= hi) { const mid = (lo + hi) >> 1; if (toks[mid].start <= pos) { ans = mid; lo = mid + 1; } else hi = mid - 1; }
-  return ans >= 0 && toks[ans].end > pos ? toks[ans].scopes : [];
-}
-const innerOf = (s: string[]): string => (s.length ? s[s.length - 1] : '(none)');
-
-// ── visual bucket of a scope chain — the level at which a highlight difference is actually visible.
-//    Same partition the scope-gap differential pass uses; the consistency check compares buckets so a
-//    `-` painted as string (punct≠string) is caught even though punctuation is a lexical-floor role. ──
-type Bucket = 'invalid' | 'comment' | 'string' | 'number' | 'keyword' | 'name' | 'punct' | 'none';
-const DISTINCT = new Set<Bucket>(['invalid', 'comment', 'string', 'number', 'keyword']);
-function scopeBucket(chain: string[]): Bucket {
-  for (let i = chain.length - 1; i >= 0; i--) {
-    const s = normScope(chain[i]);
-    if (/^invalid/.test(s)) return 'invalid';
-    if (/^comment/.test(s)) return 'comment';
-    if (/^constant\.numeric/.test(s)) return 'number';
-    if (/^(string|constant\.character|constant\.other\.symbol)/.test(s)) return 'string';
-    if (/^(keyword|storage|constant\.language|support\.constant|variable\.language)/.test(s)) return 'keyword';
-    if (/^(entity|variable|support|constant)/.test(s)) return 'name';
-    if (/^punctuation/.test(s)) return 'punct';
-  }
-  return 'none';
-}
-// every visual bucket a scope CHAIN spans (a YAML number is `string.unquoted constant.numeric` →
-// {string, number} — both are legitimate, since the same token folds to a multi-line string).
-function chainBuckets(scope: string): Set<Bucket> {
-  const out = new Set<Bucket>();
-  for (const seg of scope.split(/\s+/)) if (seg) out.add(scopeBucket([seg]));
-  return out;
-}
-const CONTENT = new Set<Bucket>(['string', 'comment', 'number']);   // a STRUCTURAL literal is never one of these
-
-// ── by-construction expected role of a parsed leaf, from the grammar ALONE ──────────────────────
-// A leaf's token TYPE → the bucket SET the grammar DECLARES for it: a named token → its `scope`
-// chain's buckets; a `$punct`/`$keyword` literal → any `scopes` override, else punctuation / keyword.
-// `lit` marks a STRUCTURAL literal (`$punct`/`$keyword`) — one the parser placed as grammar structure,
-// so the highlighter painting it as CONTENT (string/comment/number) is always wrong (monogram#24).
-interface LeafRole { start: number; end: number; text: string; tokenType: string; expected: Set<Bucket>; lit: boolean }
-function buildRoleMap(grammar: CstGrammar): (leaf: { tokenType: string; text: string }) => { buckets: Set<Bucket>; lit: boolean } | null {
-  const tokScope = new Map<string, string | undefined>();
-  for (const t of grammar.tokens) tokScope.set(t.name, t.scope);
-  const skip = new Set<string>();
-  if (grammar.indent) { skip.add(grammar.indent.indentToken); skip.add(grammar.indent.dedentToken); skip.add(grammar.indent.newlineToken); }
-  if (grammar.newline) skip.add(grammar.newline.token);
-  const over = grammar.scopeOverrides;
-  return (leaf) => {
-    const ty = leaf.tokenType;
-    if (skip.has(ty)) return null;
-    if (ty === '$punct') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set<Bucket>(['punct']), lit: true }; }
-    if (ty === '$keyword') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set<Bucket>(['keyword']), lit: true }; }
-    if (ty.startsWith('$template')) return { buckets: new Set<Bucket>(['string']), lit: false };
-    if (tokScope.has(ty)) { const sc = tokScope.get(ty); return sc ? { buckets: chainBuckets(sc), lit: false } : null; }
-    return null;   // unscoped / contextual token (a bare identifier) → not checkable by-construction
-  };
-}
-function leafRoles(grammar: CstGrammar, cst: CstNode, roleOf: (l: { tokenType: string; text: string }) => { buckets: Set<Bucket>; lit: boolean } | null): LeafRole[] {
-  const out: LeafRole[] = [];
-  const walk = (n: CstChild) => {
-    if (n.kind === 'leaf') {
-      if (n.end <= n.offset) return;
-      const r = roleOf(n);
-      if (r) out.push({ start: n.offset, end: n.end, text: n.text, tokenType: n.tokenType, expected: r.buckets, lit: r.lit });
-    } else for (const c of n.children) walk(c);
-  };
-  walk(cst);
-  return out;
-}
-
-// Scopes that belong to a POSITION-ANCHORED token — one whose pattern contains a `start()` anchor
-// (e.g. YAML's DocStart/DocEnd `^---`/`^...`). Such a scope is the parser's signal "a marker AT a
-// line/stream position"; the flat highlighter, retrying the pattern at every token boundary, may
-// paint it on a token the parser placed elsewhere (a value-leading `---`, monogram#23). Map each
-// such scope → the set of token names allowed to carry it, so a mismatch is detectable generically.
-function anchoredScopes(grammar: CstGrammar): Map<string, Set<string>> {
-  const hasStart = (p: TokenPattern): boolean => {
-    if (typeof p === 'string') return false;
-    switch (p.type) {
-      case 'anchor': return p.kind === 'start';
-      case 'seq': case 'alt': return p.items.some(hasStart);
-      case 'repeat': case 'lookahead': case 'lookbehind': return hasStart(p.body);
-      default: return false;
-    }
-  };
-  const m = new Map<string, Set<string>>();
-  for (const t of grammar.tokens) if (t.scope && hasStart(t.pattern)) { const s = m.get(t.scope) ?? new Set(); s.add(t.name); m.set(t.scope, s); }
-  return m;
-}
 
 // ── the run ──────────────────────────────────────────────────────────────────────────────────────
-interface Violation { input: string; strategy: string; pos: number; text: string; tokenType: string; expected: string; got: Bucket; gotScope: string; kind: string }
-
 async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; violations: number; reason: string }> {
   if (!existsSync(cfg.tmPath)) { console.log(`  [skip ${cfg.name}: ${cfg.tmPath} not found — run npm run gen]`); return { name: cfg.name, ok: true, violations: 0 }; }
   const grammar = (await import(cfg.module)).default as CstGrammar;
@@ -225,38 +138,15 @@ async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; viola
   // and a context fold (a number folded into a multi-line string) are NOT false-positives.
   const violations: Violation[] = [];
   let checkedTokens = 0;
-  const spanBuckets = (toks: TmTok[], text: string, start: number, end: number): Set<Bucket> => {
-    const s = new Set<Bucket>();
-    for (let p = start; p < end; p++) { const c = text.charCodeAt(p); if (c === 32 || c === 9) continue; s.add(scopeBucket(scopeAt(toks, p))); }
-    return s.size ? s : new Set<Bucket>(['none']);
-  };
   for (const inp of entryLegal) {
     let cst: CstNode, toks: TmTok[];
     try { cst = parse(inp.text); toks = tmTokenize(tm, inp.text); } catch { continue; }
     const leaves = leafRoles(grammar, cst, roleOf);
-    const leafCover = (pos: number) => leaves.find((l) => pos >= l.start && pos < l.end);
-    for (const lr of leaves) {
-      checkedTokens++;
-      const got = spanBuckets(toks, inp.text, lr.start, lr.end);
-      const overlap = [...lr.expected].some((b) => got.has(b));
-      if (overlap) continue;                                                  // highlighter painted the declared scope somewhere → consistent
-      // gate-1: a structural literal painted entirely as a content class
-      const contentGot = [...got].find((b) => CONTENT.has(b));
-      if (lr.lit && contentGot && violations.length < 200) {
-        violations.push({ input: inp.text, strategy: inp.strategy, pos: lr.start, text: lr.text, tokenType: lr.tokenType, expected: [...lr.expected].join('|') as any, got: contentGot, gotScope: innerOf(scopeAt(toks, lr.start)), kind: '#24 structural-literal→content' });
-      }
-    }
-    // gate-2: scan the highlighter's tokens for an anchored-marker scope on a leaf that is NOT that token
-    if (anchored.size) for (const t of toks) {
-      if (t.end <= t.start) continue;
-      const inner = innerOf(t.scopes);
-      const owners = anchored.get(inner.replace(/\.[a-z0-9]+$/, '')) ?? anchored.get(inner);
-      if (!owners) continue;
-      const leaf = leafCover(t.start);
-      if (leaf && !owners.has(leaf.tokenType) && violations.length < 200) {
-        violations.push({ input: inp.text, strategy: inp.strategy, pos: t.start, text: inp.text.slice(t.start, t.end), tokenType: leaf.tokenType, expected: [...owners].join('|') as any, got: 'name', gotScope: inner, kind: '#23 anchored-marker misfire' });
-      }
-    }
+    checkedTokens += leaves.length;
+    // gate-1 (#24 structural-literal→content) + gate-2 (#23 anchored-marker misfire), via the shared
+    // detector — identical logic to before, now reused by the gap ledger. The 200-cap is the running
+    // total across all inputs (startCount), as the inline version was.
+    violations.push(...collectViolations({ input: inp.text, strategy: inp.strategy, cst, toks, leaves, anchored, cap: 200, startCount: violations.length }));
   }
 
   // ── report ──
@@ -281,7 +171,6 @@ async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; viola
   //    limits (a block plain scalar containing an unclosed flow indicator `[`/`{` — block-vs-flow
   //    disambiguation that needs the indent/flow stack a flat grammar lacks). Those are not
   //    regressions of a known-fixed shape, and #25 is the testing harness, not a fix for every limit.
-  const isGated = (v: Violation) => v.kind.startsWith('#23') || !v.strategy.startsWith('fuzz');
   const gated = violations.filter(isGated);
   const discovered = violations.filter((v) => !isGated(v));
   console.log(`  scope≡role: ${checkedTokens} declared-scope tokens checked · ${gated.length} gated inconsistenc${gated.length === 1 ? 'y' : 'ies'} · ${discovered.length} discovered (fuzz frontier-limit, report-only)`);

From b934b399c4f4e412ff2e2bef8b6f4fcb93cc817f Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Tue, 9 Jun 2026 07:44:55 +0800
Subject: [PATCH 6/6] Generative: close the comment coverage hole via injected
 witnesses
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Comments are skip:true tokens — the parser drops them, so they are never CST leaves,
so the scope≡role judge (which walks the parser's CST) never checked the highlighter's
comment scopes (0% covered). Closing it needs a witness the GENERATOR records, not a
parser leaf.

4a — deterministic comment injection at one safe position per mode (config-derived, no
`//`/`#`/`<!--` hardcoded): token-stream → a no-newline block comment at an inter-token
space; indent → end-of-line `# c` outside flow; markup → `<!-- c -->` after a tagClose.
A re-parse-and-drop net keeps round-trip clean; the injected comment is recorded as a
witness in GenInput.tokens (its first consumer), inheriting the host's tier.

4b — the judge grades each witness span: the flat highlighter must paint `comment`
somewhere in it (same scopeBucket partition + leniency); a comment painted non-comment
is unambiguous, so it GATES. Coverage hole closed 0→N graded per language (YAML 442,
TS 46, …), all 0 uncolored today; proven non-trivial — mutating a comment scope makes
every witness uncolored and the gate fail.

Deterministic preserved; 7/7 + depth-site 2/2 (#23/#24); gap-ledger --check clean;
agnostic 9/9.
---
 test/generative-detect.ts |  54 +++++++++++++-
 test/generative.ts        |  12 +++
 test/grammar-gen.ts       | 149 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 211 insertions(+), 4 deletions(-)

diff --git a/test/generative-detect.ts b/test/generative-detect.ts
index c3e708f..b9655ca 100644
--- a/test/generative-detect.ts
+++ b/test/generative-detect.ts
@@ -16,7 +16,7 @@
 import { normScope } from './scope-roles.ts';
 import type { CstNode, CstChild } from '../src/gen-parser.ts';
 import type { CstGrammar, TokenPattern } from '../src/types.ts';
-import type { GenOptions } from './grammar-gen.ts';
+import { commentSpec, type GenOptions, type GenInput } from './grammar-gen.ts';
 
 // The generation knobs BOTH consumers use, so the gap ledger sees the SAME derived corpus (hence the
 // SAME divergence set) as generative.ts's check. `seed` is a no-op (the generator is a pure function
@@ -173,9 +173,59 @@ export function collectViolations(args: {
   return out;
 }
 
+// ── COMMENT-WITNESS check (the last coverage hole: comment scopes) ───────────────────────────────
+// A comment is a `skip:true` token the parser DROPS — it never becomes a CST leaf, so the leaf walk above
+// (and `checkedTokens`) can NEVER reach a comment's highlighter scope (0% coverage). The generator closes
+// that by INJECTING a comment at a safe position and recording its span as a WITNESS in `GenInput.tokens`
+// (grammar-gen.ts §8). THIS is the consumer: for each recorded comment witness, the flat highlighter must
+// paint that span with the COMMENT bucket — graded with the SAME `scopeBucket` partition the rest of the
+// check uses. The judge compares against the GENERATOR'S witness, not a parser leaf (there is none) — the
+// crux the prompt names. Leniency mirrors `collectViolations`: a comment is CONSISTENT if the highlighter
+// paints ANY part of its span `comment` (so the `<!--`/`-->` punctuation sub-scope is fine, only the body
+// needs `comment`); a span with NO `comment` anywhere (painted entirely string / text / etc.) is the gap.
+
+// The comment-token NAMES of a grammar (so a witness in `tokens` is identifiable as a comment) — the SAME
+// generic discovery the generator uses, memoised per grammar object so repeated calls are cheap.
+const commentNamesCache = new WeakMap<object, Set<string>>();
+export function commentTokenNames(grammar: CstGrammar): Set<string> {
+  let s = commentNamesCache.get(grammar);
+  if (!s) { s = commentSpec(grammar)?.names ?? new Set<string>(); commentNamesCache.set(grammar, s); }
+  return s;
+}
+
+// The comment WITNESSES of one generated input: the `tokens` entries whose name is a comment token. ONLY
+// a comment-INJECTED input (its strategy carries the `comment:` marker) has authoritative, span-verified
+// witnesses — there `tokens` is exactly the comment(s) the generator spliced at a known offset. A BASE
+// input may ALSO carry `materialize`-recorded comment tokens (the grammar can emit a native `<!-- -->`),
+// but those spans are unreliable for markup fragments (a degenerate `> <!--x-->` mis-spans, an empty
+// `<!---->` is all-punctuation) and were never meant as a ground-truth witness — so they are NOT graded.
+export function commentWitnesses(grammar: CstGrammar, input: GenInput): GenInput['tokens'] {
+  if (!input.strategy.includes('comment:')) return [];
+  const names = commentTokenNames(grammar);
+  return names.size ? input.tokens.filter((t) => names.has(t.name) && t.end > t.start) : [];
+}
+
+// Grade each comment witness: a divergence iff the highlighter painted NO part of the witness span with the
+// `comment` bucket. Returns the divergences as `Violation`s (kind `#comment uncolored`), filed like the
+// others so they flow into the same report / gate / gap-ledger plumbing.
+export function collectCommentViolations(args: { grammar: CstGrammar; input: string; strategy: string; witnesses: GenInput['tokens']; toks: TmTok[] }): Violation[] {
+  const out: Violation[] = [];
+  for (const w of args.witnesses) {
+    const got = spanBuckets(args.toks, args.input, w.start, w.end);
+    if (got.has('comment')) continue;                         // painted as a comment somewhere → consistent
+    const gotBucket = [...got][0] ?? 'none';
+    out.push({ input: args.input, strategy: args.strategy, pos: w.start, text: w.text, tokenType: w.name, expected: 'comment', got: gotBucket, gotScope: innerOf(scopeAt(args.toks, w.start)), kind: '#comment uncolored' });
+  }
+  return out;
+}
+
 // What GATES vs what is a report-only DISCOVERY (generative.ts's exact predicate):
 //  • an ANCHORED-MARKER misfire (#23) ALWAYS gates.
 //  • a STRUCTURAL-LITERAL→content divergence (#24) gates on the STRUCTURED strategies, but is
 //    report-only on FUZZ inputs (a standing flat-TM frontier limit). The gap ledger lists the
 //    DISCOVERED ones (the !isGated set) — the floor-blind divergences a corpus metric is blind to.
-export const isGated = (v: { kind: string; strategy: string }): boolean => v.kind.startsWith('#23') || !v.strategy.startsWith('fuzz');
+//  • a COMMENT-uncolored divergence ALWAYS gates: a comment (parser-dropped, highlighter-scoped
+//    `comment.*` by construction) painted as a NON-comment is unambiguous — there is no legitimate
+//    "frontier limit" where an injected comment is not a comment (its position is chosen safe + it
+//    round-trips). On today's correct grammars this finds 0; it CATCHES a future scope regression.
+export const isGated = (v: { kind: string; strategy: string }): boolean => v.kind.startsWith('#23') || v.kind.startsWith('#comment') || !v.strategy.startsWith('fuzz');
diff --git a/test/generative.ts b/test/generative.ts
index 1b40fb0..06032dc 100644
--- a/test/generative.ts
+++ b/test/generative.ts
@@ -38,6 +38,7 @@ import { generateInputs, type GenInput } from './grammar-gen.ts';
 import {
   type TmTok, type Violation,
   buildRoleMap, leafRoles, anchoredScopes, collectViolations, isGated, GEN_OPTS,
+  commentWitnesses, collectCommentViolations,
 } from './generative-detect.ts';
 
 // ── language registry: every per-language fact (grammar module, scope, flat grammar file,
@@ -138,6 +139,7 @@ async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; viola
   // and a context fold (a number folded into a multi-line string) are NOT false-positives.
   const violations: Violation[] = [];
   let checkedTokens = 0;
+  let checkedComments = 0;   // comment WITNESSES graded — was structurally 0 (comments are skip→no CST leaf)
   for (const inp of entryLegal) {
     let cst: CstNode, toks: TmTok[];
     try { cst = parse(inp.text); toks = tmTokenize(tm, inp.text); } catch { continue; }
@@ -147,6 +149,13 @@ async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; viola
     // detector — identical logic to before, now reused by the gap ledger. The 200-cap is the running
     // total across all inputs (startCount), as the inline version was.
     violations.push(...collectViolations({ input: inp.text, strategy: inp.strategy, cst, toks, leaves, anchored, cap: 200, startCount: violations.length }));
+    // COMMENT-WITNESS arm — a comment is a skip token (no CST leaf), so it is NEVER in `leaves`/`checkedTokens`
+    // and the highlighter's comment scope was previously UNCHECKED (0% coverage). Grade the spans the
+    // generator recorded as witnesses (grammar-gen §8): the highlighter must paint each `comment`. This is
+    // the first consumer of `GenInput.tokens`. ALWAYS-gating (see isGated) — but ~0 on the correct grammars.
+    const witnesses = commentWitnesses(grammar, inp);
+    checkedComments += witnesses.length;
+    violations.push(...collectCommentViolations({ grammar, input: inp.text, strategy: inp.strategy, witnesses, toks }));
   }
 
   // ── report ──
@@ -174,6 +183,9 @@ async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; viola
   const gated = violations.filter(isGated);
   const discovered = violations.filter((v) => !isGated(v));
   console.log(`  scope≡role: ${checkedTokens} declared-scope tokens checked · ${gated.length} gated inconsistenc${gated.length === 1 ? 'y' : 'ies'} · ${discovered.length} discovered (fuzz frontier-limit, report-only)`);
+  // comment-witness coverage: how many injected comment spans were graded (was structurally 0 — a comment
+  // is a skip token, dropped by the parser, so it never reached the leaf-walking scope≡role check).
+  console.log(`  comment witnesses: ${checkedComments} comment span${checkedComments === 1 ? '' : 's'} graded (highlighter must paint COMMENT) · ${violations.filter((v) => v.kind.startsWith('#comment')).length} uncolored`);
   const show = (vs: Violation[], tag: string) => {
     const grouped = new Map<string, { v: Violation; n: number }>();
     for (const v of vs) { const key = `${v.kind} ${v.tokenType}`; const e = grouped.get(key); if (e) e.n++; else grouped.set(key, { v, n: 1 }); }
diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts
index 70a0b43..3a5c37b 100644
--- a/test/grammar-gen.ts
+++ b/test/grammar-gen.ts
@@ -33,7 +33,8 @@
 //     never the exponential full derivation tree.
 // ─────────────────────────────────────────────────────────────────────────────
 import type { CstGrammar, RuleExpr, RuleDecl, TokenDecl, TokenPattern, TokenCharClassItem } from '../src/types.ts';
-import { tokenPatternStartsWithDecimal, tokenPatternHasStartAnchor } from '../src/token-pattern.ts';
+import { tokenPatternStartsWithDecimal, tokenPatternHasStartAnchor, tokenPatternBlockDelimiters } from '../src/token-pattern.ts';
+import { createParser } from '../src/gen-parser.ts';
 
 // Max emissions in one derivation. A deep tree of 2-rep quantifiers grows the list multiplicatively;
 // copying huge lists (not the call count) is what makes a naive enumerator hang — cap it.
@@ -47,6 +48,11 @@ export type Emission =
   | { t: 'compact' };                                        // marks an indent that the lexer would emit INLINE (YAML compact `- - a`)
 
 // A finished input: rendered text + the real tokens it should lex back to (round-trip witnesses).
+// `tokens` also carries the COMMENT WITNESSES injected by `injectComments` — a comment is a `skip:true`
+// token the PARSER DROPS (it never becomes a CST leaf), so the scope≡role judge — which walks parser
+// leaves — can never check a comment's highlighter scope (0% coverage). The generator therefore RECORDS
+// each injected comment's span here as the GROUND TRUTH the judge grades the highlighter against; this
+// is the FIRST consumer of `tokens` (see the comment arm in generative-detect.ts / generative.ts).
 export interface GenInput {
   text: string;
   tokens: { start: number; end: number; name: string; text: string }[];
@@ -926,6 +932,104 @@ class Walker {
   }
 }
 
+// ─── COMMENT INJECTION + WITNESSES (the LAST coverage hole: comment scopes) ─────────────────────────
+// A comment is a `skip:true` token: the parser DROPS it (it never becomes a CST leaf), so the scope≡role
+// judge — which walks the parser's leaves — can NEVER reach a comment's highlighter scope. Injecting a
+// comment into the text does not, by itself, close that hole; the JUDGE must compare the highlighter
+// against a WITNESS the GENERATOR records (not against a parser leaf). This block is the generator side:
+// it discovers the comment delimiters from the grammar's OWN config (no `//`/`#`/`<!--` hardcoded),
+// injects a comment at a SAFE, DETERMINISTIC position per materialization mode, and records the comment's
+// span as a witness in `GenInput.tokens`. A re-parse-and-drop net keeps only injections that still parse.
+
+// The comment delimiters + the witness expectation, discovered GENERICALLY from the grammar config — the
+// SAME comment-classification gen-tm uses (`flags.includes('skip')` OR a `comment.`-prefixed scope), so
+// the set of `names` is exactly the tokens the highlighter paints with a `comment.*` scope. `mode` selects
+// the injection geometry (block delimiters sit mid-line; a line comment is end-of-line only). `names` lets
+// the judge filter the witnesses back out of `tokens`. Returns null when the grammar declares no comment.
+export interface CommentSpec {
+  open: string;          // the opening delimiter / line introducer (`/*`, `<!--`, `#`)
+  close: string;         // the closing delimiter (`*/`, `-->`); '' for a line comment
+  witnessName: string;   // the comment token's name, recorded on each witness
+  names: Set<string>;    // every comment-token name (so the judge can filter `tokens` to comment witnesses)
+  mode: 'token-stream' | 'indent' | 'markup';
+}
+export function commentSpec(grammar: CstGrammar): CommentSpec | null {
+  // every token the highlighter scopes as a comment (gen-tm's exact rule: a skip token, or an explicit
+  // comment.* scope). A skip token with NO `.scope` (TS/JS `BlockComment`/`LineComment`) is INCLUDED —
+  // gen-tm classifies it `comment.block`/`comment.line` from its flags, so the highlighter paints it.
+  const isCommentTok = (t: TokenDecl) => t.flags.includes('skip') || (t.scope?.startsWith('comment.') ?? false);
+  const names = new Set(grammar.tokens.filter(isCommentTok).map((t) => t.name));
+  if (!names.size) return null;
+
+  const mode: CommentSpec['mode'] = grammar.indent ? 'indent' : grammar.markup ? 'markup' : 'token-stream';
+  if (mode === 'markup') {
+    const c = grammar.markup!.comment; if (!c) return null;
+    return { open: c.open, close: c.close, witnessName: c.token, names, mode };
+  }
+  if (mode === 'indent') {
+    const intro = grammar.indent!.comment; if (!intro) return null;
+    // the comment TOKEN whose pattern starts with the introducer (YAML `#…`) — its `.scope` is the one
+    // the highlighter paints; a line comment (no closing delimiter) is EOL-only.
+    const tok = grammar.tokens.find((t) => isCommentTok(t) && (t.scope?.startsWith('comment.') ?? false));
+    return { open: intro, close: '', witnessName: tok?.name ?? [...names][0], names, mode };
+  }
+  // token-stream: a skip comment token DELIMITED on both sides (a block comment) — the only form safe to
+  // splice at an inter-token space in whitespace-insensitive text (a line comment would swallow the rest
+  // of the line; the generated text has no newlines, so it would eat the whole document). Prefer the
+  // SHORTEST opener (a plain `/*` over a doc `/**`), so the witness is an ordinary block comment. Returns
+  // null when the language has only line comments (no block form) — then no safe token-stream injection.
+  const blockToks = grammar.tokens.filter((t) => isCommentTok(t) && tokenPatternBlockDelimiters(t));
+  blockToks.sort((a, b) => tokenPatternBlockDelimiters(a)![0].length - tokenPatternBlockDelimiters(b)![0].length);
+  if (!blockToks.length) return null;
+  const [open, close] = tokenPatternBlockDelimiters(blockToks[0])!;
+  return { open, close, witnessName: blockToks[0].name, names, mode };
+}
+
+// Splice ONE comment into `text` at the first SAFE position for the mode (a FIXED rule — no randomness,
+// so the generator stays a pure function of the grammar), returning the new text + the comment's span.
+// Position rules (each measured to round-trip; the caller's re-parse net drops any that don't):
+//   • token-stream — at the first inter-token SPACE (a space with a non-space, non-newline neighbour on
+//     each side): a no-newline block comment there is 100% safe in whitespace-insensitive code.
+//   • indent (YAML) — appended at END-OF-LINE of the first non-blank line that doesn't already carry the
+//     comment introducer: a `# c` end-of-line comment is safe OUTSIDE flow / a block-scalar body, which
+//     the re-parse net rejects (a mid-line `#` is content in flow / ends a plain scalar, so never mid-line).
+//   • markup — right after the first `tagClose` (`>`): comment text BETWEEN tags / in content is safe;
+//     never inside a tag (the re-parse net rejects an in-tag splice).
+// The body is a minimal `' c '` (space-padded), legal in every comment grammar. `tagClose` (markup only)
+// is the grammar's tag-close delimiter (`>`), passed in so the function bakes in no HTML-specific literal.
+// Returns null when no safe position exists in this particular input (then no comment variant is produced).
+function injectComment(text: string, spec: CommentSpec, tagClose: string): { text: string; start: number; end: number; comment: string } | null {
+  if (spec.mode === 'token-stream') {
+    const comment = spec.open + ' c ' + spec.close;     // `/* c */`
+    for (let i = 1; i < text.length - 1; i++) {
+      if (text[i] === ' ' && text[i - 1] !== ' ' && text[i + 1] !== ' ' && text[i - 1] !== '\n' && text[i + 1] !== '\n') {
+        const start = i + 1;                            // splice the comment + a trailing space after the space
+        return { text: text.slice(0, start) + comment + ' ' + text.slice(start), start, end: start + comment.length, comment };
+      }
+    }
+    return null;
+  }
+  if (spec.mode === 'indent') {
+    const comment = spec.open + ' c';                   // `# c`
+    const lines = text.split('\n');
+    for (let li = 0; li < lines.length; li++) {
+      const ln = lines[li];
+      if (!ln.trim() || ln.includes(spec.open)) continue;        // skip blank / already-commented lines
+      const prefixLen = lines.slice(0, li).reduce((a, l) => a + l.length + 1, 0);   // chars before this line (+\n each)
+      const start = prefixLen + ln.length + 1;          // after the line text + the joining space
+      lines[li] = ln + ' ' + comment;
+      return { text: lines.join('\n'), start, end: start + comment.length, comment };
+    }
+    return null;
+  }
+  // markup — right after the first tagClose (`>`): comment text BETWEEN tags / in content is safe.
+  const comment = spec.open + ' c ' + spec.close;       // `<!-- c -->`
+  const gt = tagClose ? text.indexOf(tagClose) : -1;
+  if (gt < 0) return null;
+  const start = gt + tagClose.length;
+  return { text: text.slice(0, start) + comment + text.slice(start), start, end: start + comment.length, comment };
+}
+
 // ─── MATERIALIZE: emissions → text + token spans ──────────────────────────────────
 // The per-language structural-token materialization hook. Token-stream grammars join with a
 // space (whitespace-insensitive); indentation grammars (YAML) render struct emissions through an
@@ -1181,5 +1285,46 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI
     if (bs.length) push(bs, 'nest:blockScalar', entry.name);
   }
 
-  return out.slice(0, maxInputs);
+  // 8) COMMENT INJECTION (the last coverage hole). A comment is a `skip` token the parser DROPS, so the
+  //    scope≡role judge — which walks parser LEAVES — never reaches a comment's highlighter scope (0%
+  //    coverage). Inject a comment into a SAFE, DETERMINISTIC position of each already-generated input and
+  //    record its span as a WITNESS in `tokens` (the FIRST consumer of that field) — the judge grades the
+  //    highlighter against THAT witness, not a parser leaf (see the comment arm in generative-detect.ts).
+  //    Re-parse-and-DROP: an injection that breaks parsing is discarded (the un-injected input is always
+  //    kept — these are ADDITIONAL variants). DETERMINISTIC: a fixed position rule, no randomness, and we
+  //    iterate a SNAPSHOT of the inputs generated above (in their deterministic order), so two calls match.
+  //    Collected SEPARATELY and concatenated AFTER the base `maxInputs` slice, so the comment witnesses are
+  //    never starved by the cap (closing the coverage hole is the point); bounded by the same maxInputs.
+  const base = out.slice(0, maxInputs);
+  const commentInputs: GenInput[] = [];
+  const cspec = commentSpec(grammar);
+  if (cspec) {
+    const tagClose = grammar.markup?.tagClose ?? '';
+    const { parse } = createParser(grammar);   // lazy: only built when a comment can be injected
+    for (const inp of base) {                  // a snapshot — never inject into an already-injected variant
+      if (commentInputs.length >= maxInputs) break;
+      const inj = injectComment(inp.text, cspec, tagClose);
+      if (!inj) continue;
+      if (inj.text.slice(inj.start, inj.end) !== inj.comment) continue;   // span sanity (the splice put it exactly here)
+      // re-parse-and-DROP, at the ENTRY rule: the injected text must be a FULL DOCUMENT (the highlighter
+      // tokenizes the whole text as the entry scope, and the judge grades comment witnesses only on
+      // entry-legal inputs) — this single full-document parse is the authoritative net (a fragment host
+      // whose injected form doesn't parse as a document is simply dropped here).
+      try { parse(inj.text); } catch { continue; }
+      if (!inj.text.trim() || inj.text.length > 2000 || seen.has(inj.text)) continue;
+      seen.add(inj.text);
+      // `tokens` becomes EXACTLY the injected comment WITNESS — the span-verified ground truth the judge
+      // grades the highlighter against. We do NOT carry over the host's `materialize`-recorded tokens: that
+      // list was never consumed and is unreliable for markup fragments (a degenerate `> <!--x-->` mis-spans),
+      // so the only authoritative witness is the comment we just spliced at a known offset (text === slice).
+      // The variant INHERITS the host's tier: a `fuzz`-host (exploratory, may carry a STANDING flat-TM limit
+      // like the self-close `/` #24) stays `fuzz:comment:…` so that inherited #24 remains report-only — only
+      // the COMMENT-witness check itself always gates (isGated keys `#comment` on kind, not the tier). A
+      // structured host stays `comment:…`. Either way the strategy CONTAINS `comment:`, the judge's marker.
+      const tier = inp.strategy.startsWith('fuzz') ? 'fuzz:' : '';
+      commentInputs.push({ text: inj.text, tokens: [{ start: inj.start, end: inj.end, name: cspec.witnessName, text: inj.comment }], strategy: `${tier}comment:${cspec.witnessName}`, rule: inp.rule });
+    }
+  }
+
+  return [...base, ...commentInputs];
 }