From a64082aaaf9054d527f7539d2e8172d9517fad04 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 21:19:05 +0800 Subject: [PATCH 1/6] Generative testing: grammar-derived inputs + by-construction consistency (#25) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Walk the shared combinator IR to emit guaranteed-legal inputs for any Monogram grammar, replacing corpus sampling with systematic, bounded coverage — the lever a normal highlighter lacks (the source IS a grammar). Two by-construction judges, no external oracle: - round-trip: every generated derivation parses as the rule it was rooted at (parser self-consistency); the structured strategies are ~88% legal, fuzz is exploratory (random choices wander outside the IR's context constraints). - scope ≡ role: the flat highlighter's scope at each parsed token must agree with the token's by-construction role (the scope the grammar declares). Where they disagree is the #23/#24 class — a value-leading `---` the parser keeps a plain scalar but a flat grammar mis-scopes as a marker; an inner sequence `-` the parser knows is an indicator but a flat grammar folds into a string. Floor-blind (compares the punctuation class directly), so a `-` painted string is caught. The check independently re-surfaces both: a directed-nesting derivation produces `- - x\n - x` (#24); the anchored-marker scan catches a value-leading marker misfire (#23). Verified by reverting each fix — the gate fires — and depth-site coverage is asserted so generation can't silently stop exercising them. Test-suite cleanup alongside: - delete 9 dev-only scratch / superseded probes (each confirmed not a CI gate). - fold the per-language scope-gap + src-coverage adapters into two data-driven drivers (scope-gap-run.ts / src-coverage-run.ts) + a config table, the per-language entry preserved as a parameter. Output byte-identical to the old adapters; coverage-table.ts and package.json rewired. 
The thicker html / yaml / vue adapters keep their files and are delegated to. Adds: grammar-gen.ts (the walker), generative.ts (the judges), curated-corpora.ts. CI runs node test/generative.ts. --- .github/workflows/ci.yml | 1 + package.json | 27 +- test/bench-vs-ts-agg.ts | 19 - test/bench-vs-ts.ts | 38 - test/classify-ts.ts | 93 --- test/coverage-table.ts | 38 +- test/{scope-gap-jsx.ts => curated-corpora.ts} | 51 +- test/diag.ts | 13 - test/generative.ts | 324 ++++++++ test/grammar-gen.ts | 694 ++++++++++++++++++ test/parser-gap.ts | 254 ------- test/prof.ts | 10 - test/scope-gap-html.ts | 48 -- test/scope-gap-js.ts | 30 - test/scope-gap-run.ts | 130 ++++ test/scope-gap-ts.ts | 39 - test/scope-gap-tsx.ts | 28 - test/scope-gap-yaml.ts | 68 -- test/src-coverage-js.ts | 27 - test/src-coverage-jsx.ts | 60 -- test/src-coverage-run.ts | 52 ++ test/src-coverage-ts.ts | 27 - test/src-coverage-tsx.ts | 20 - test/ts-ast.ts | 9 - test/yaml-diag.ts | 40 - test/yaml-poc.ts | 33 - 26 files changed, 1260 insertions(+), 913 deletions(-) delete mode 100644 test/bench-vs-ts-agg.ts delete mode 100644 test/bench-vs-ts.ts delete mode 100644 test/classify-ts.ts rename test/{scope-gap-jsx.ts => curated-corpora.ts} (55%) delete mode 100644 test/diag.ts create mode 100644 test/generative.ts create mode 100644 test/grammar-gen.ts delete mode 100644 test/parser-gap.ts delete mode 100644 test/prof.ts delete mode 100644 test/scope-gap-html.ts delete mode 100644 test/scope-gap-js.ts create mode 100644 test/scope-gap-run.ts delete mode 100644 test/scope-gap-ts.ts delete mode 100644 test/scope-gap-tsx.ts delete mode 100644 test/scope-gap-yaml.ts delete mode 100644 test/src-coverage-js.ts delete mode 100644 test/src-coverage-jsx.ts create mode 100644 test/src-coverage-run.ts delete mode 100644 test/src-coverage-ts.ts delete mode 100644 test/src-coverage-tsx.ts delete mode 100644 test/ts-ast.ts delete mode 100644 test/yaml-diag.ts delete mode 100644 test/yaml-poc.ts diff --git 
a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4eaafd0..4962098 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -57,6 +57,7 @@ jobs: node test/vue-interp-expr.ts node test/yaml-issue12-regressions.ts node test/yaml-depth-witnesses.ts + node test/generative.ts # The derived tree-sitter highlighter is the strongest thesis proof (a real GLR # parser from the same grammar, beating the official hand-written one). Build its diff --git a/package.json b/package.json index 3491dd9..b3937f7 100644 --- a/package.json +++ b/package.json @@ -5,6 +5,7 @@ "scripts": { "gen": "node src/cli.ts typescript.ts && node src/cli.ts javascript.ts && node src/cli.ts typescriptreact.ts && node src/cli.ts javascriptreact.ts && node src/cli.ts html.ts && node src/cli.ts vue.ts && node src/cli.ts yaml.ts", "test": "node test/sanity-check.ts", + "generative": "node test/generative.ts", "conformance": "node test/run-conformance.ts", "conformance:js": "node test/js-conformance.ts", "conformance:tsx": "node test/tsx-conformance.ts", @@ -29,19 +30,19 @@ "bench:perf": "node test/perf-bench.ts", "coverage": "node test/scope-coverage.ts", "compat": "node test/repo-compat.ts", - "src-coverage:ts": "node test/src-coverage-ts.ts", - "src-coverage:js": "node test/src-coverage-js.ts", - "src-coverage:jsx": "node test/src-coverage-jsx.ts", - "src-coverage:tsx": "node test/src-coverage-tsx.ts", - "src-coverage:html": "node test/src-coverage-html.ts", - "src-coverage:yaml": "node test/src-coverage-yaml.ts", - "scope-gap:ts": "node test/scope-gap-ts.ts", - "scope-gap:js": "node test/scope-gap-js.ts", - "scope-gap:jsx": "node test/scope-gap-jsx.ts", - "scope-gap:tsx": "node test/scope-gap-tsx.ts", - "scope-gap:html": "node test/scope-gap-html.ts", - "scope-gap:yaml": "node test/scope-gap-yaml.ts", - "scope-gap:vue": "node test/scope-gap-vue.ts", + "src-coverage:ts": "node test/src-coverage-run.ts ts", + "src-coverage:js": "node test/src-coverage-run.ts js", + 
"src-coverage:jsx": "node test/src-coverage-run.ts jsx", + "src-coverage:tsx": "node test/src-coverage-run.ts tsx", + "src-coverage:html": "node test/src-coverage-run.ts html", + "src-coverage:yaml": "node test/src-coverage-run.ts yaml", + "scope-gap:ts": "node test/scope-gap-run.ts ts", + "scope-gap:js": "node test/scope-gap-run.ts js", + "scope-gap:jsx": "node test/scope-gap-run.ts jsx", + "scope-gap:tsx": "node test/scope-gap-run.ts tsx", + "scope-gap:html": "node test/scope-gap-run.ts html", + "scope-gap:yaml": "node test/scope-gap-run.ts yaml", + "scope-gap:vue": "node test/scope-gap-run.ts vue", "coverage:table": "node test/coverage-table.ts --write" }, "devDependencies": { diff --git a/test/bench-vs-ts-agg.ts b/test/bench-vs-ts-agg.ts deleted file mode 100644 index a9a42ad..0000000 --- a/test/bench-vs-ts-agg.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { createParser } from '../src/gen-parser.ts'; -import { readdir } from 'fs/promises'; -import { readFileSync } from 'fs'; -import { join } from 'path'; -import ts from 'typescript'; -const grammar = (await import('../typescript.ts')).default; -const { parse } = createParser(grammar); -const base = '/tmp/ts-repo/tests/cases/conformance'; -async function all(d: string): Promise { const o:string[]=[]; for(const e of await readdir(d,{withFileTypes:true})){const f=join(d,e.name); if(e.isDirectory())o.push(...await all(f)); else if(e.name.endsWith('.ts')&&!e.name.endsWith('.d.ts'))o.push(f);} return o; } -const files = (await all(base)).map(f => readFileSync(f,'utf-8')); -const totalKB = files.reduce((s,c)=>s+c.length,0)/1024; -// warm up -for(const c of files.slice(0,200)){ try{parse(c);}catch{} ts.createSourceFile('t.ts',c,ts.ScriptTarget.Latest,false,ts.ScriptKind.TS); } -let t0=process.hrtime.bigint(); for(const c of files){ try{parse(c);}catch{} } const ours=Number(process.hrtime.bigint()-t0)/1e6; -t0=process.hrtime.bigint(); for(const c of files){ 
ts.createSourceFile('t.ts',c,ts.ScriptTarget.Latest,false,ts.ScriptKind.TS); } const tsms=Number(process.hrtime.bigint()-t0)/1e6; -console.log(`${files.length} files, ${totalKB.toFixed(0)} KB total`); -console.log(` ours: ${ours.toFixed(0)} ms (${(totalKB/1024/(ours/1000)).toFixed(1)} MB/s)`); -console.log(` ts: ${tsms.toFixed(0)} ms (${(totalKB/1024/(tsms/1000)).toFixed(1)} MB/s)`); -console.log(` ours/ts: ×${(ours/tsms).toFixed(1)}`); diff --git a/test/bench-vs-ts.ts b/test/bench-vs-ts.ts deleted file mode 100644 index c15d4ea..0000000 --- a/test/bench-vs-ts.ts +++ /dev/null @@ -1,38 +0,0 @@ -// Compare our grammar-driven parser against TypeScript's own parser (ts.createSourceFile) -// on the same inputs. Both do a full from-scratch parse (no incremental reuse). -import { createParser } from '../src/gen-parser.ts'; -import { readFileSync } from 'fs'; -import ts from 'typescript'; - -const grammar = (await import('../typescript.ts')).default; -const { parse } = createParser(grammar); - -const tsParse = (code: string) => - ts.createSourceFile('t.ts', code, ts.ScriptTarget.Latest, /*setParentNodes*/ false, ts.ScriptKind.TS); - -function timeIt(fn: () => void, iters: number): number { - for (let i = 0; i < 3; i++) fn(); // warm up - const start = process.hrtime.bigint(); - for (let i = 0; i < iters; i++) fn(); - return Number(process.hrtime.bigint() - start) / 1e6 / iters; // ms/parse -} - -const files = [ - ['parserharness.ts', '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts'], - ['fixSignatureCaching.ts', '/tmp/ts-repo/tests/cases/conformance/fixSignatureCaching.ts'], - ['parserRealSource7.ts', '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/parserRealSource7.ts'], - ['parserindenter.ts', '/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserindenter.ts'], -]; - -console.log('file KB ours(ms) ts(ms) ours/ts'); -for (const [name, path] of files) { - const code = readFileSync(path, 'utf-8'); - const kb = 
(code.length / 1024).toFixed(0); - const ours = timeIt(() => { try { parse(code); } catch {} }, 30); - const tsms = timeIt(() => { tsParse(code); }, 30); - console.log( - name.padEnd(28) + kb.padStart(4) + - ours.toFixed(1).padStart(11) + tsms.toFixed(2).padStart(9) + - ('×' + (ours / tsms).toFixed(1)).padStart(10), - ); -} diff --git a/test/classify-ts.ts b/test/classify-ts.ts deleted file mode 100644 index 348a8de..0000000 --- a/test/classify-ts.ts +++ /dev/null @@ -1,93 +0,0 @@ -import { createParser } from '../src/gen-parser.ts'; -import { readdir, writeFile } from 'fs/promises'; -import { readFileSync } from 'fs'; -import { join } from 'path'; -import ts from 'typescript'; - -const grammar = (await import('../typescript.ts')).default; -const { parse } = createParser(grammar); -const baseDir = '/tmp/ts-repo/tests/cases/conformance'; - -async function getAllTsFiles(dir: string): Promise { - const files: string[] = []; - for (const entry of await readdir(dir, { withFileTypes: true })) { - const full = join(dir, entry.name); - if (entry.isDirectory()) files.push(...await getAllTsFiles(full)); - else if (entry.name.endsWith('.ts') && !entry.name.endsWith('.d.ts')) files.push(full); - } - return files; -} - -// Count syntactic parse diagnostics for a chunk of TS source. -function syntaxErrors(text: string, name = 't.ts'): number { - const sf = ts.createSourceFile(name, text, ts.ScriptTarget.Latest, true, ts.ScriptKind.TS); - return (sf as any).parseDiagnostics?.length ?? 0; -} - -// Split TS conformance file by `// @filename:` directives. 
-function splitMultiFile(text: string): string[] { - if (!/^\s*\/\/\s*@filename:/im.test(text)) return [text]; - const parts: string[] = []; - const re = /^\s*\/\/\s*@filename:.*$/gim; - let last = 0, m: RegExpExecArray | null, started = false; - const idxs: number[] = []; - while ((m = re.exec(text))) idxs.push(m.index); - if (idxs.length === 0) return [text]; - // preamble before first @filename (global directives) — ignore as its own chunk - for (let i = 0; i < idxs.length; i++) { - const start = idxs[i]; - const end = i + 1 < idxs.length ? idxs[i + 1] : text.length; - parts.push(text.slice(start, end)); - } - return parts; -} - -const files = await getAllTsFiles(baseDir); -files.sort(); - -interface Row { file: string; ourMsg: string; tsWhole: number; tsParts: number; multi: boolean; } -const rows: Row[] = []; - -for (const file of files) { - const code = readFileSync(file, 'utf-8'); - let ourFail = false, ourMsg = ''; - try { parse(code); } catch (e: any) { ourFail = true; ourMsg = e.message.replace(/\s*\[farthest.*/, ''); } - if (!ourFail) continue; - - const path = file.replace(baseDir + '/', ''); - const tsWhole = syntaxErrors(code); - const parts = splitMultiFile(code); - const multi = parts.length > 1; - const tsParts = multi ? 
parts.reduce((a, p) => a + syntaxErrors(p), 0) : tsWhole; - rows.push({ file: path, ourMsg, tsWhole, tsParts, multi }); -} - -// Categories: -// REAL: TS reports 0 syntax errors (on parts if multi, else whole) -> we should parse -// MULTI: multi-file, parts clean but whole dirty (concatenation issue, structural) -// ERRORTEST: TS reports syntax errors -> intentional -const real = rows.filter(r => !r.multi && r.tsWhole === 0); -const multiClean = rows.filter(r => r.multi && r.tsParts === 0); -const multiDirty = rows.filter(r => r.multi && r.tsParts > 0); -const errorTest = rows.filter(r => !r.multi && r.tsWhole > 0); - -const out: string[] = []; -out.push(`Total our failures: ${rows.length}`); -out.push(`REAL (TS clean, single-file) : ${real.length}`); -out.push(`MULTI-CLEAN (parts clean, concat fails): ${multiClean.length}`); -out.push(`MULTI-DIRTY (multi-file w/ syntax err) : ${multiDirty.length}`); -out.push(`ERROR-TEST (TS reports syntax error) : ${errorTest.length}`); -out.push(''); -out.push('===== REAL (should fix) ====='); -for (const r of real) out.push(` ${r.file}\n ${r.ourMsg}`); -out.push(''); -out.push('===== MULTI-CLEAN (structural, @filename concat) ====='); -for (const r of multiClean) out.push(` ${r.file}\n ${r.ourMsg}`); -out.push(''); -out.push('===== MULTI-DIRTY (has intentional errors in some part) ====='); -for (const r of multiDirty) out.push(` ${r.file} (tsParts=${r.tsParts})`); - -const text = out.join('\n'); -await writeFile('/tmp/classify.txt', text); -console.log(text.split('\n').slice(0, 6).join('\n')); -console.log('\nFull report: /tmp/classify.txt'); diff --git a/test/coverage-table.ts b/test/coverage-table.ts index 5484881..eb61dbb 100644 --- a/test/coverage-table.ts +++ b/test/coverage-table.ts @@ -18,27 +18,29 @@ function runAdapter(script: string, args: string[], marker: string, env?: NodeJS } catch { return null; } } -// TS/JS use deterministic stride subsets for speed; the rest run their full corpus. 
+// Both metrics now run through ONE data-driven driver each, parameterised by the `<lang>` code +// (test/scope-gap-run.ts, test/src-coverage-run.ts). TS/JS use deterministic stride subsets for +// speed; the rest run their full corpus. const COV = [ - { lang: 'TypeScript', script: 'test/src-coverage-ts.ts', args: ['1500'] }, - { lang: 'JavaScript', script: 'test/src-coverage-js.ts', args: ['800'] }, - { lang: 'JSX', script: 'test/src-coverage-jsx.ts', args: [] }, - { lang: 'TSX', script: 'test/src-coverage-tsx.ts', args: [] }, - { lang: 'HTML', script: 'test/src-coverage-html.ts', args: [] }, - { lang: 'YAML', script: 'test/src-coverage-yaml.ts', args: [] }, + { lang: 'TypeScript', script: 'test/src-coverage-run.ts', args: ['ts', '1500'] }, + { lang: 'JavaScript', script: 'test/src-coverage-run.ts', args: ['js', '800'] }, + { lang: 'JSX', script: 'test/src-coverage-run.ts', args: ['jsx'] }, + { lang: 'TSX', script: 'test/src-coverage-run.ts', args: ['tsx'] }, + { lang: 'HTML', script: 'test/src-coverage-run.ts', args: ['html'] }, + { lang: 'YAML', script: 'test/src-coverage-run.ts', args: ['yaml'] }, ]; -// The 4 TS-family scope-gap adapters all read ONE shared env var (MONOGRAM_OFFICIAL_TM) for -// the official grammar, so each needs its OWN grammar mapped in (CI sets MONOGRAM_OFFICIAL_TS/ -// TSX/JS/JSX). html/yaml read their own var (MONOGRAM_OFFICIAL_HTML/_YAML), inherited as-is; -// vue is vendored. Absent (local, no env) → each adapter's VS Code-install fallback path. +// The 4 TS-family scope-gap entries all read ONE shared env var (MONOGRAM_OFFICIAL_TM) for the +// official grammar, so each needs its OWN grammar mapped in (CI sets MONOGRAM_OFFICIAL_TS/TSX/JS/JSX). +// html/yaml read their own var (MONOGRAM_OFFICIAL_HTML/_YAML), inherited as-is; vue is vendored. +// Absent (local, no env) → the driver's VS Code-install fallback path. 
const GAP = [ - { lang: 'TypeScript', script: 'test/scope-gap-ts.ts', args: ['800'], officialEnv: 'MONOGRAM_OFFICIAL_TS' }, - { lang: 'JavaScript', script: 'test/scope-gap-js.ts', args: ['800'], officialEnv: 'MONOGRAM_OFFICIAL_JS' }, - { lang: 'JSX', script: 'test/scope-gap-jsx.ts', args: [], officialEnv: 'MONOGRAM_OFFICIAL_JSX' }, - { lang: 'TSX', script: 'test/scope-gap-tsx.ts', args: [], officialEnv: 'MONOGRAM_OFFICIAL_TSX' }, - { lang: 'HTML', script: 'test/scope-gap-html.ts', args: [] }, - { lang: 'YAML', script: 'test/scope-gap-yaml.ts', args: [] }, - { lang: 'Vue', script: 'test/scope-gap-vue.ts', args: [] }, + { lang: 'TypeScript', script: 'test/scope-gap-run.ts', args: ['ts', '800'], officialEnv: 'MONOGRAM_OFFICIAL_TS' }, + { lang: 'JavaScript', script: 'test/scope-gap-run.ts', args: ['js', '800'], officialEnv: 'MONOGRAM_OFFICIAL_JS' }, + { lang: 'JSX', script: 'test/scope-gap-run.ts', args: ['jsx'], officialEnv: 'MONOGRAM_OFFICIAL_JSX' }, + { lang: 'TSX', script: 'test/scope-gap-run.ts', args: ['tsx'], officialEnv: 'MONOGRAM_OFFICIAL_TSX' }, + { lang: 'HTML', script: 'test/scope-gap-run.ts', args: ['html'] }, + { lang: 'YAML', script: 'test/scope-gap-run.ts', args: ['yaml'] }, + { lang: 'Vue', script: 'test/scope-gap-run.ts', args: ['vue'] }, ] as { lang: string; script: string; args: string[]; officialEnv?: string }[]; const pct = (v: number | null | undefined) => (v == null ? '—' : v.toFixed(1) + '%'); diff --git a/test/scope-gap-jsx.ts b/test/curated-corpora.ts similarity index 55% rename from test/scope-gap-jsx.ts rename to test/curated-corpora.ts index 9a0a8bf..136c3e8 100644 --- a/test/scope-gap-jsx.ts +++ b/test/curated-corpora.ts @@ -1,18 +1,7 @@ -// scope-gap-jsx.ts — JSX (.jsx) adapter for the unified scope-gap harness. Grades VS Code's -// OFFICIAL JavaScriptReact.tmLanguage.json AND Monogram's javascriptreact.tmLanguage.json against -// the parser oracle (oracle.ts with ScriptKind.JSX). Both grammars declare scopeName `source.js.jsx`. 
-// Neither the TS suite nor Test262 ships a .jsx corpus, so this uses a CURATED set exercising both -// halves (plain JS + JSX), copied verbatim from src-coverage-jsx.ts. It is small, so token counts -// are low; a real .jsx corpus is a follow-up. Run (bare node): node test/scope-gap-jsx.ts -import ts from 'typescript'; -import { run } from './scope-gap.ts'; -import { oracle } from './oracle.ts'; +// curated-corpora.ts — small hand-written corpora shared by the folded scope-gap / src-coverage drivers. +// JSX (plain-JS + JSX halves) and realistic HTML — the languages with no public single-file corpus. -const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM - ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/javascript/syntaxes/JavaScriptReact.tmLanguage.json'; - -// No TS types — these are .jsx (JavaScript + JSX) only. Copied verbatim from src-coverage-jsx.ts. -const JSX_CASES: string[] = [ +export const JSX_CASES: string[] = [ // --- plain JS half --- 'const x = 1, y = 2;', 'function f(a, b = 1, ...rest) { return a + b + rest.length; }', @@ -52,15 +41,25 @@ const JSX_CASES: string[] = [ 'const boolAttr = ;', ]; -await run({ - name: 'JavaScriptReact (.jsx)', - scopeName: 'source.js.jsx', - officialPath: OFFICIAL, - monogramPath: 'javascriptreact.tmLanguage.json', - loadCorpus: () => JSX_CASES.map((code, i) => ({ name: ``, text: code })), - roleOracle: (text) => oracle(text, ts.ScriptKind.JSX), - isGradable: (text) => { - const sf = ts.createSourceFile('c.jsx', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.JSX); - return (((sf as any).parseDiagnostics?.length ?? 0) === 0); - }, -}); +export const HTML_GENERAL: string[] = [ + '

Hello world.

', + '
  • one
  • two
  • three
', + 'a picture', + '', + '', + '

Title

Body with bold and italic.

', + '', + '
', + '
AB
12
', + '
x
', + '', + '

', + '', + '', + '', + 'text', + 'body', + 'link', + '', + '
photo
cap
', +]; diff --git a/test/diag.ts b/test/diag.ts deleted file mode 100644 index 9fcd116..0000000 --- a/test/diag.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { createParser } from '../src/gen-parser.ts'; -import { readFileSync } from 'fs'; -const grammar = (await import('../typescript.ts')).default; -const { parse } = createParser(grammar); -for (const f of process.argv.slice(2)) { - const code = readFileSync(f, 'utf-8'); - try { parse(code); console.log(f.split('/').pop(), 'OK'); } - catch (e: any) { - console.log(f.split('/').pop(), '\n ', e.message); - const m = e.message.match(/farthest: offset (\d+)/); - if (m) { const o = +m[1]; console.log(' CTX:', JSON.stringify(code.slice(Math.max(0, o - 70), o + 30))); } - } -} diff --git a/test/generative.ts b/test/generative.ts new file mode 100644 index 0000000..02d964a --- /dev/null +++ b/test/generative.ts @@ -0,0 +1,324 @@ +// ───────────────────────────────────────────────────────────────────────────── +// generative.ts — monogram#25 parts (b): the JUDGING harness over grammar-DERIVED +// inputs (test/grammar-gen.ts). Two by-construction consistency checks, no external +// oracle, for every Monogram grammar: +// +// (2) ROUND-TRIP — every generated derivation parses (as the rule it was rooted at). +// Validates parser self-consistency: what the grammar's IR generates, the parser +// accepts. Reported per strategy; the structured strategies are the gate. +// +// (3) SCOPE ≡ ROLE — the flat highlighter's scope at every parsed token must agree +// with the token's BY-CONSTRUCTION role (the scope the grammar DECLARES for it). +// The parser resolves context with its full stack (indent / column / markup +// depth); the flat TextMate grammar can only approximate it. 
Where they disagree +// is exactly the monogram#23/#24 class — a value-leading `---` the parser lexes +// as a plain scalar (string) but a flat grammar mis-scopes as a document marker; +// an inner sequence `-` the parser knows is an indicator but a flat grammar folds +// into a string. The check is FLOOR-BLIND (it compares the visual bucket directly, +// incl. punctuation) so a `-` mis-painted as string is caught — the exact blind +// spot that hid #24 from the role-graded scope-gap metric. +// +// Coverage is grammar×bound, not a fixed corpus — so it surfaces the depth-bug CLASS +// without anyone naming the shape (the motivation for #25). The named regressions +// (yaml-depth-witnesses.ts, *-issue-cases.ts) stay — generation replaces their +// DISCOVERY function, not their value as documented gates. +// +// Run (bare node): node test/generative.ts # all languages +// node test/generative.ts yaml # one language +// ───────────────────────────────────────────────────────────────────────────── +import { readFileSync, existsSync } from 'node:fs'; +import { createRequire } from 'node:module'; +import vsctm from 'vscode-textmate'; +import onig from 'vscode-oniguruma'; +import { createParser, type CstNode, type CstChild } from '../src/gen-parser.ts'; +import type { CstGrammar, TokenPattern } from '../src/types.ts'; +import { normScope } from './scope-roles.ts'; +import { generateInputs, type GenInput } from './grammar-gen.ts'; + +// ── language registry: every per-language fact (grammar module, scope, flat grammar file, +// any multi-file sub-grammars) is DATA — the harness body is language-agnostic. ── +interface LangCfg { + name: string; + module: string; // grammar module to import (default export = CstGrammar) + scopeName: string; // TextMate scope, e.g. 
source.yaml + tmPath: string; // the derived flat .tmLanguage.json + tmExtra?: Record; // extra scopeName → file for multi-file grammars + gen?: Parameters[1]; // generation knobs override + // Depth-site CLASSES the generated legal corpus MUST contain — the shapes whose correct scope + // depends on cross-line parser state, so the scope≡role gate provably covers monogram#23/#24. The + // gate FAILS if generation stops producing them (a coverage regression). Asserted per shape. + mustCover?: { name: string; re: RegExp }[]; +} +const LANGS: LangCfg[] = [ + { name: 'yaml', module: '../yaml.ts', scopeName: 'source.yaml', tmPath: 'yaml.tmLanguage.json', + mustCover: [ + // #24: a nested compact block sequence with an inner sibling (`- - x\n - x`) — the inner `-`'s + // role (indicator vs plain-fold) depends on the indent stack a flat grammar lacks. + { name: '#24 nested-compact-sequence', re: /- - \S.*\n\s+- /m }, + // #23: a value-leading document-marker (`k: --- x`, `- --- x`) — string content, NOT a marker, + // a position the flat grammar's `^`-retried marker pattern can mis-fire on. + { name: '#23 value-leading-marker', re: /(?::|-) +(?:---|\.\.\.)(?:\s|$)/ }, + ] }, + { name: 'typescript', module: '../typescript.ts', scopeName: 'source.ts', tmPath: 'typescript.tmLanguage.json' }, + { name: 'javascript', module: '../javascript.ts', scopeName: 'source.js', tmPath: 'javascript.tmLanguage.json' }, + { name: 'typescriptreact', module: '../typescriptreact.ts', scopeName: 'source.tsx', tmPath: 'typescriptreact.tmLanguage.json' }, + { name: 'javascriptreact', module: '../javascriptreact.ts', scopeName: 'source.js.jsx', tmPath: 'javascriptreact.tmLanguage.json' }, + // HTML/Vue embed source.js/ts/tsx (script blocks, on* handlers); provide them so embedded regions + // tokenize instead of erroring. The consistency check reads the host markup tokens regardless. 
+ { name: 'html', module: '../html.ts', scopeName: 'text.html.basic', tmPath: 'html.tmLanguage.json', + tmExtra: { 'source.js': 'javascript.tmLanguage.json', 'source.css': 'html.tmLanguage.json' } }, + { name: 'vue', module: '../vue.ts', scopeName: 'text.html.vue', tmPath: 'vue.tmLanguage.json', + tmExtra: { 'text.html.basic': 'html.tmLanguage.json', 'source.js': 'javascript.tmLanguage.json', 'source.ts': 'typescript.tmLanguage.json', 'source.tsx': 'typescriptreact.tmLanguage.json' } }, +]; + +// ── vscode-textmate tokenizer (one shared WASM load) ───────────────────────────────────────────── +const { INITIAL, Registry, parseRawGrammar } = vsctm; +const { loadWASM, OnigScanner, OnigString } = onig; +const require = createRequire(import.meta.url); +const bin = readFileSync(require.resolve('vscode-oniguruma/release/onig.wasm')); +await loadWASM(bin.buffer.slice(bin.byteOffset, bin.byteOffset + bin.byteLength)); + +async function loadTm(scopeName: string, files: Record) { + const cache: Record = {}; + const reg = new Registry({ + onigLib: Promise.resolve({ createOnigScanner: (p: string[]) => new OnigScanner(p), createOnigString: (s: string) => new OnigString(s) }), + loadGrammar: async (sn: string) => { const p = files[sn]; if (!p) return null; const c = cache[sn] ?? 
(cache[sn] = readFileSync(p, 'utf8')); return parseRawGrammar(c, sn + '.json'); }, + }); + return reg.loadGrammar(scopeName); +} +interface TmTok { start: number; end: number; scopes: string[] } +function tmTokenize(grammar: vsctm.IGrammar, text: string): TmTok[] { + const toks: TmTok[] = []; let rs = INITIAL, off = 0; + for (const line of text.split('\n')) { const r = grammar.tokenizeLine(line, rs); for (const t of r.tokens) toks.push({ start: off + t.startIndex, end: off + t.endIndex, scopes: t.scopes }); rs = r.ruleStack; off += line.length + 1; } + return toks; +} +function scopeAt(toks: TmTok[], pos: number): string[] { + let lo = 0, hi = toks.length - 1, ans = -1; + while (lo <= hi) { const mid = (lo + hi) >> 1; if (toks[mid].start <= pos) { ans = mid; lo = mid + 1; } else hi = mid - 1; } + return ans >= 0 && toks[ans].end > pos ? toks[ans].scopes : []; +} +const innerOf = (s: string[]): string => (s.length ? s[s.length - 1] : '(none)'); + +// ── visual bucket of a scope chain — the level at which a highlight difference is actually visible. +// Same partition the scope-gap differential pass uses; the consistency check compares buckets so a +// `-` painted as string (punct≠string) is caught even though punctuation is a lexical-floor role. 
── +type Bucket = 'invalid' | 'comment' | 'string' | 'number' | 'keyword' | 'name' | 'punct' | 'none'; +const DISTINCT = new Set(['invalid', 'comment', 'string', 'number', 'keyword']); +function scopeBucket(chain: string[]): Bucket { + for (let i = chain.length - 1; i >= 0; i--) { + const s = normScope(chain[i]); + if (/^invalid/.test(s)) return 'invalid'; + if (/^comment/.test(s)) return 'comment'; + if (/^constant\.numeric/.test(s)) return 'number'; + if (/^(string|constant\.character|constant\.other\.symbol)/.test(s)) return 'string'; + if (/^(keyword|storage|constant\.language|support\.constant|variable\.language)/.test(s)) return 'keyword'; + if (/^(entity|variable|support|constant)/.test(s)) return 'name'; + if (/^punctuation/.test(s)) return 'punct'; + } + return 'none'; +} +// every visual bucket a scope CHAIN spans (a YAML number is `string.unquoted constant.numeric` → +// {string, number} — both are legitimate, since the same token folds to a multi-line string). +function chainBuckets(scope: string): Set { + const out = new Set(); + for (const seg of scope.split(/\s+/)) if (seg) out.add(scopeBucket([seg])); + return out; +} +const CONTENT = new Set(['string', 'comment', 'number']); // a STRUCTURAL literal is never one of these + +// ── by-construction expected role of a parsed leaf, from the grammar ALONE ────────────────────── +// A leaf's token TYPE → the bucket SET the grammar DECLARES for it: a named token → its `scope` +// chain's buckets; a `$punct`/`$keyword` literal → any `scopes` override, else punctuation / keyword. +// `lit` marks a STRUCTURAL literal (`$punct`/`$keyword`) — one the parser placed as grammar structure, +// so the highlighter painting it as CONTENT (string/comment/number) is always wrong (monogram#24). 
+interface LeafRole { start: number; end: number; text: string; tokenType: string; expected: Set; lit: boolean } +function buildRoleMap(grammar: CstGrammar): (leaf: { tokenType: string; text: string }) => { buckets: Set; lit: boolean } | null { + const tokScope = new Map(); + for (const t of grammar.tokens) tokScope.set(t.name, t.scope); + const skip = new Set(); + if (grammar.indent) { skip.add(grammar.indent.indentToken); skip.add(grammar.indent.dedentToken); skip.add(grammar.indent.newlineToken); } + if (grammar.newline) skip.add(grammar.newline.token); + const over = grammar.scopeOverrides; + return (leaf) => { + const ty = leaf.tokenType; + if (skip.has(ty)) return null; + if (ty === '$punct') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set(['punct']), lit: true }; } + if (ty === '$keyword') { const o = over.get(leaf.text); return { buckets: o ? new Set(o.flatMap((s) => [...chainBuckets(s)])) : new Set(['keyword']), lit: true }; } + if (ty.startsWith('$template')) return { buckets: new Set(['string']), lit: false }; + if (tokScope.has(ty)) { const sc = tokScope.get(ty); return sc ? { buckets: chainBuckets(sc), lit: false } : null; } + return null; // unscoped / contextual token (a bare identifier) → not checkable by-construction + }; +} +function leafRoles(grammar: CstGrammar, cst: CstNode, roleOf: (l: { tokenType: string; text: string }) => { buckets: Set; lit: boolean } | null): LeafRole[] { + const out: LeafRole[] = []; + const walk = (n: CstChild) => { + if (n.kind === 'leaf') { + if (n.end <= n.offset) return; + const r = roleOf(n); + if (r) out.push({ start: n.offset, end: n.end, text: n.text, tokenType: n.tokenType, expected: r.buckets, lit: r.lit }); + } else for (const c of n.children) walk(c); + }; + walk(cst); + return out; +} + +// Scopes that belong to a POSITION-ANCHORED token — one whose pattern contains a `start()` anchor +// (e.g. YAML's DocStart/DocEnd `^---`/`^...`). 
Such a scope is the parser's signal "a marker AT a
+// line/stream position"; the flat highlighter, retrying the pattern at every token boundary, may
+// paint it on a token the parser placed elsewhere (a value-leading `---`, monogram#23). Map each
+// such scope → the set of token names allowed to carry it, so a mismatch is detectable generically.
+function anchoredScopes(grammar: CstGrammar): Map<string, Set<string>> {
+  const hasStart = (p: TokenPattern): boolean => {
+    if (typeof p === 'string') return false;
+    switch (p.type) {
+      case 'anchor': return p.kind === 'start';
+      case 'seq': case 'alt': return p.items.some(hasStart);
+      case 'repeat': case 'lookahead': case 'lookbehind': return hasStart(p.body);
+      default: return false;
+    }
+  };
+  const m = new Map<string, Set<string>>();
+  for (const t of grammar.tokens) if (t.scope && hasStart(t.pattern)) { const s = m.get(t.scope) ?? new Set<string>(); s.add(t.name); m.set(t.scope, s); }
+  return m;
+}
+
+// ── the run ──────────────────────────────────────────────────────────────────────────────────────
+interface Violation { input: string; strategy: string; pos: number; text: string; tokenType: string; expected: string; got: Bucket; gotScope: string; kind: string }
+
+async function runLang(cfg: LangCfg): Promise<{ name: string; ok: boolean; violations: number; reason: string }> {
+  if (!existsSync(cfg.tmPath)) { console.log(` [skip ${cfg.name}: ${cfg.tmPath} not found — run npm run gen]`); return { name: cfg.name, ok: true, violations: 0, reason: '' }; }
+  const grammar = (await import(cfg.module)).default as CstGrammar;
+  const { parse } = createParser(grammar);
+  const tm = await loadTm(cfg.scopeName, { [cfg.scopeName]: cfg.tmPath, ...(cfg.tmExtra ?? {}) });
+  if (!tm) throw new Error(`failed to load ${cfg.tmPath}`);
+  const roleOf = buildRoleMap(grammar);
+  const anchored = anchoredScopes(grammar);
+
+  const inputs = generateInputs(grammar, cfg.gen ?? { depth: 5, nestDepth: 5, cap: 7, fuzzRounds: 250, maxInputs: 1500, seed: 5 });
+
+  // ── (2) round-trip: parse each input AS THE RULE it was rooted at ──
+  const byStrat = new Map<string, { ok: number; n: number }>(); // strategy family (text before the first `:`/`@`) → tally
+  const entryLegal: GenInput[] = [];
+  for (const inp of inputs) {
+    const k = inp.strategy.split(/[:@]/)[0];
+    const s = byStrat.get(k) ?? { ok: 0, n: 0 }; s.n++;
+    let rootOk = false;
+    try { parse(inp.text, inp.rule); rootOk = true; } catch { /* illegal derivation (IR over-permits vs the parser) */ }
+    if (rootOk) s.ok++;
+    byStrat.set(k, s);
+    // the consistency check needs FULL documents (highlighter tokenizes the whole text as the entry
+    // scope), so keep inputs that parse at the ENTRY rule.
+    try { parse(inp.text); entryLegal.push(inp); } catch { /* not a full document — skip for scope≡role */ }
+  }
+
+  // ── (3) scope ≡ role on the entry-legal inputs ──────────────────────────────────────────────────
+  // Two BY-CONSTRUCTION gates (each a flat-vs-stack disagreement that is unambiguously the
+  // highlighter's error), plus a lenient report-only differential for context refinements:
+  //   • gate-1 STRUCTURAL-LITERAL contradiction — a `$punct`/`$keyword` the parser placed as grammar
+  //     structure, painted as CONTENT (string/comment/number). A `-` indicator is never a string
+  //     (monogram#24). Floor-blind: it compares the punctuation class directly.
+  //   • gate-2 ANCHORED-MARKER misfire — a leaf painted with a position-anchored token's scope when
+  //     the parser did NOT place that token here (a value-leading `---` scoped document-marker,
+  //     monogram#23). The flat grammar retried the `^`-anchored pattern off-position.
+  // Leniency: a token is CONSISTENT when the highlighter paints ANY part of its span with a scope in
+  // the token's declared-chain bucket SET — so a quote-delimiter sub-scope (`"…"` opens punctuation)
+  // and a context fold (a number folded into a multi-line string) are NOT false-positives.
+  const violations: Violation[] = [];
+  let checkedTokens = 0;
+  const spanBuckets = (toks: TmTok[], text: string, start: number, end: number): Set<Bucket> => {
+    const s = new Set<Bucket>();
+    for (let p = start; p < end; p++) { const c = text.charCodeAt(p); if (c === 32 || c === 9) continue; s.add(scopeBucket(scopeAt(toks, p))); } // space / tab positions are not sampled
+    return s.size ? s : new Set<Bucket>(['none']);
+  };
+  for (const inp of entryLegal) {
+    let cst: CstNode, toks: TmTok[];
+    try { cst = parse(inp.text); toks = tmTokenize(tm, inp.text); } catch { continue; }
+    const leaves = leafRoles(grammar, cst, roleOf);
+    const leafCover = (pos: number) => leaves.find((l) => pos >= l.start && pos < l.end);
+    for (const lr of leaves) {
+      checkedTokens++;
+      const got = spanBuckets(toks, inp.text, lr.start, lr.end);
+      const overlap = [...lr.expected].some((b) => got.has(b));
+      if (overlap) continue; // highlighter painted the declared scope somewhere → consistent
+      // gate-1: a structural literal whose span carries a content class and none of its declared buckets
+      const contentGot = [...got].find((b) => CONTENT.has(b));
+      if (lr.lit && contentGot && (violations.length < 200 || !inp.strategy.startsWith('fuzz'))) { // the 200 cap bounds report-only fuzz entries only; a gated one (see isGated) is never dropped
+        violations.push({ input: inp.text, strategy: inp.strategy, pos: lr.start, text: lr.text, tokenType: lr.tokenType, expected: [...lr.expected].join('|'), got: contentGot, gotScope: innerOf(scopeAt(toks, lr.start)), kind: '#24 structural-literal→content' });
+      }
+    }
+    // gate-2: scan the highlighter's tokens for an anchored-marker scope on a leaf that is NOT that token
+    if (anchored.size) for (const t of toks) {
+      if (t.end <= t.start) continue;
+      const inner = innerOf(t.scopes);
+      const owners = anchored.get(inner.replace(/\.[a-z0-9]+$/, '')) ?? anchored.get(inner); // look up the scope with its last dot-segment stripped first, then as painted
+      if (!owners) continue;
+      const leaf = leafCover(t.start);
+      if (leaf && !owners.has(leaf.tokenType)) { // #23 always gates, so it is never subject to the report cap
+        violations.push({ input: inp.text, strategy: inp.strategy, pos: t.start, text: inp.text.slice(t.start, t.end), tokenType: leaf.tokenType, expected: [...owners].join('|'), got: 'name', gotScope: inner, kind: '#23 anchored-marker misfire' });
+      }
+    }
+  }
+
+  // ── report ──
+  const totalLegal = [...byStrat.values()].reduce((a, s) => a + s.ok, 0);
+  const totalN = [...byStrat.values()].reduce((a, s) => a + s.n, 0);
+  const structuredLegal = [...byStrat.entries()].filter(([k]) => k !== 'fuzz').reduce((a, [, s]) => a + s.ok, 0);
+  const structuredN = [...byStrat.entries()].filter(([k]) => k !== 'fuzz').reduce((a, [, s]) => a + s.n, 0);
+  const fuzzLegal = totalLegal - structuredLegal, fuzzN = totalN - structuredN;
+  const rate = (a: number, b: number) => b ? (100 * a / b).toFixed(0) + '%' : 'n/a';
+  console.log(`\n── ${cfg.name} ── ${inputs.length} generated · ${entryLegal.length} full-document`);
+  // STRUCTURED is the by-construction round-trip guarantee (every derivation parses as its rule);
+  // FUZZ is exploratory (random choices wander outside the IR's context constraints → many illegal,
+  // which is expected and filtered) and is what surfaces divergences beyond the structured shapes.
+  console.log(` round-trip (rule-rooted): structured ${structuredLegal}/${structuredN} (${rate(structuredLegal, structuredN)} — the by-construction gate) · fuzz ${fuzzLegal}/${fuzzN} (exploratory)` + ['', ...[...byStrat.entries()].filter(([k]) => k !== 'fuzz').map(([k, s]) => `${k} ${s.ok}/${s.n}`)].join(' '));
+  // What GATES vs what is a report-only DISCOVERY:
+  //   • an ANCHORED-MARKER misfire (#23) ALWAYS gates — a position-anchored marker scope on a token the
+  //     parser placed elsewhere is unambiguously the flat grammar mis-firing the pattern off-position;
+  //     there is no legitimate "frontier limit" version of it.
+  //   • a STRUCTURAL-LITERAL→content divergence (#24) gates on the STRUCTURED strategies (canonical,
+  //     clean nested shapes — the by-construction guarantee: the dirnest `- - x\n - x` reproduces #24),
+  //     but is report-only on gnarly FUZZ inputs, which legitimately reach STANDING flat-TM frontier
+  //     limits (a block plain scalar containing an unclosed flow indicator `[`/`{` — block-vs-flow
+  //     disambiguation that needs the indent/flow stack a flat grammar lacks). Those are not
+  //     regressions of a known-fixed shape, and #25 is the testing harness, not a fix for every limit.
+  const isGated = (v: Violation) => v.kind.startsWith('#23') || !v.strategy.startsWith('fuzz');
+  const gated = violations.filter(isGated);
+  const discovered = violations.filter((v) => !isGated(v));
+  console.log(` scope≡role: ${checkedTokens} declared-scope tokens checked · ${gated.length} gated inconsistenc${gated.length === 1 ? 'y' : 'ies'} · ${discovered.length} discovered (fuzz frontier-limit, report-only)`);
+  const show = (vs: Violation[], tag: string) => {
+    const grouped = new Map<string, { v: Violation; n: number }>(); // `kind tokenType` → first example + occurrence count
+    for (const v of vs) { const key = `${v.kind} ${v.tokenType}`; const e = grouped.get(key); if (e) e.n++; else grouped.set(key, { v, n: 1 }); }
+    for (const [key, { v, n }] of [...grouped.entries()].slice(0, 8)) console.log(` ${tag} ${key} ×${n} «${v.text.slice(0, 14).replace(/\n/g, '\\n')}» got «${v.gotScope}» in ${JSON.stringify(v.input.slice(0, 40))}`);
+  };
+  if (gated.length) show(gated, '✗');
+  if (discovered.length) show(discovered, '·');
+
+  // depth-site COVERAGE: the generated legal corpus must contain each declared depth-bug class, so the
+  // scope≡role gate provably exercises monogram#23/#24 (not just happens to be clean on a fixed corpus).
+  const legalTexts = entryLegal.map((i) => i.text);
+  const missing = (cfg.mustCover ?? []).filter((m) => !legalTexts.some((t) => m.re.test(t)));
+  if (cfg.mustCover?.length) {
+    const covered = cfg.mustCover.length - missing.length;
+    console.log(` depth-site coverage: ${covered}/${cfg.mustCover.length} classes present in the legal corpus` + (missing.length ? ` — MISSING: ${missing.map((m) => m.name).join(', ')}` : ` (${cfg.mustCover.map((m) => m.name).join(', ')})`));
+  }
+  // GATE: (a) the generator produced a real LEGAL corpus (a coverage floor — proves round-trip works:
+  // the grammar's IR generates inputs the parser accepts), and (b) ZERO scope≡role gated inconsistencies.
+  // The structured legal RATE is reported for visibility but not gated on a percentage — the generator
+  // legitimately over-produces (the IR over-permits vs the parser; markup materialisation is rough), and
+  // the validated corpus is the inputs that DO parse.
+  const enoughLegal = entryLegal.length >= 15;
+  const reason = gated.length ? `${gated.length} scope≡role` : !enoughLegal ? `only ${entryLegal.length} legal docs` : missing.length ? `missing ${missing.map((m) => m.name).join('/')}` : '';
+  return { name: cfg.name, ok: gated.length === 0 && enoughLegal && missing.length === 0, violations: gated.length, reason };
+}
+
+const only = process.argv[2];
+const targets = only ? LANGS.filter((l) => l.name === only || (only === 'tsfamily' && /script/.test(l.name))) : LANGS;
+if (!targets.length) { console.error(`unknown language: ${only}`); process.exit(1); }
+console.log('Generative consistency — grammar-derived inputs, by-construction round-trip + scope≡role');
+const results = [];
+for (const cfg of targets) results.push(await runLang(cfg));
+const bad = results.filter((r) => !r.ok);
+console.log(`\n${'='.repeat(70)}`);
+console.log(` ${results.length - bad.length}/${results.length} languages consistent` + (bad.length ? ` — FAILED: ${bad.map((b) => `${b.name} (${b.reason})`).join(', ')}` : ''));
+if (bad.length) { console.error('\nGENERATIVE GATE FAILED — a scope≡role inconsistency (flat highlighter ≠ parser), too small a legal corpus, or a missing depth-site class.'); process.exit(1); }
+console.log('\nDone.');
diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts
new file mode 100644
index 0000000..31eef55
--- /dev/null
+++ b/test/grammar-gen.ts
@@ -0,0 +1,694 @@
+// ─────────────────────────────────────────────────────────────────────────────
+// grammar-gen.ts — a GENERIC, grammar-derived input GENERATOR (monogram#25 part 1).
+//
+// The premise of the whole project is that the source IS a grammar: the same
+// combinator object (`yaml.ts`, `typescript.ts`, …) the parser / highlighter /
+// tree-sitter derive from is ALSO a generator. Walk its rule IR — `alt`=branch,
+// `seq`=concat, `*`/`+`/`?`=repeat, `ref`=descend, token=sample — and it emits
+// guaranteed-legal inputs. That replaces "hope the corpus contains the shape" (the
+// blind spot that hid monogram#23/#24 from a corpus-bound metric) with systematic,
+// bounded coverage derived from the grammar itself.
+//
+// This file is the ENGINE; the judging (round-trip + scope≡role) lives in the
+// drivers that import it (test/generative.ts). It is language-AGNOSTIC: every
+// per-language fact (indent tokens, flow brackets, markup delimiters, compact
+// indicators) is read from the grammar's own config (`grammar.indent` / `.markup`),
+// never hardcoded — the same discipline the engines follow.
+//
+// Three production strategies, all over the SAME walker:
+//   • bounded-exhaustive — every derivation to a small depth N (provably complete at
+//     small scope; this is what makes coverage `grammar × bound` instead of imagination).
+//   • self-recursive nesting — for each rule that can contain itself, the nested shape
+//     at depth 1..N.
Deep self-embedding is exactly where a flat highlighter loses to
+//     the stack-keeping parser (monogram#24 is `BlockSequence` inside `BlockSequence`).
+//   • fuzzing — random production choices, for deeper / wider structures.
+// ─────────────────────────────────────────────────────────────────────────────
+import type { CstGrammar, RuleExpr, RuleDecl, TokenDecl, TokenPattern, TokenCharClassItem } from '../src/types.ts';
+
+// Max emissions in one derivation. A deep tree of 2-rep quantifiers grows the list multiplicatively;
+// copying huge lists (not the call count) is what makes a naive enumerator hang — cap it.
+const MAX_EMS = 220;
+
+// ── An EMISSION: the atomic unit the walker produces; the materializer renders it. ──
+export type Emission =
+  | { t: 'tok'; name: string; text: string } // a real lexer token (text sampled from its pattern)
+  | { t: 'lit'; value: string } // a grammar literal (keyword or punctuation)
+  | { t: 'struct'; kind: 'indent' | 'dedent' | 'newline' } // indentation control (YAML indent mode)
+  | { t: 'compact' }; // marks an indent that the lexer would emit INLINE (YAML compact `- - a`)
+
+// A finished input: rendered text + the real tokens it should lex back to (round-trip witnesses).
+export interface GenInput {
+  text: string;
+  tokens: { start: number; end: number; name: string; text: string }[];
+  strategy: string;
+  rule: string; // the top rule the derivation started from (entry, or a self-recursive rule)
+}
+
+// ── deterministic PRNG (Date.now/Math.random are unavailable in workflow scripts and make
+// a generator unreproducible anyway — seed it). xorshift32; a seed that truncates to 0 is replaced by 1, and each call returns k / 1_000_000 for an integer k in [0, 999_999]. ──
+function rng(seed: number): () => number {
+  let s = seed | 0 || 1;
+  return () => { s ^= s << 13; s ^= s >>> 17; s ^= s << 5; return ((s >>> 0) % 1_000_000) / 1_000_000; };
+}
+
+// ─── TOKEN SAMPLING ──────────────────────────────────────────────────────────────
+// Produce a string that MATCHES a TokenPattern.
Conservative by default (a short, +// unambiguous lexeme) so the generated input round-trips; `interesting` injects +// grammar-derived boundary literals (e.g. `---`, `#`, `-`) into free-form tokens so a +// plain scalar can be sampled as `--- x` — legal as that token, but a shape that +// stresses the flat highlighter's context guessing (monogram#23). Returns null when +// the pattern can't be sampled (a `never()` placeholder — a structural token). +interface SampleCtx { rand: () => number; interesting: string[]; variant: number } + +function pickNonExcluded(items: TokenCharClassItem[]): string | null { + // a char NOT in the negated class — try common, readable candidates in order + const cands = ['a', 'b', 'c', 'x', 'y', 'z', 'A', 'M', '1', '5', '_', '.', '@', '~']; + const inClass = (ch: string) => items.some((it) => + it.type === 'char' ? it.value === ch : ch >= it.from && ch <= it.to); + for (const ch of cands) if (!inClass(ch)) return ch; + return null; +} +function firstOfClass(items: TokenCharClassItem[]): string | null { + for (const it of items) { + if (it.type === 'char') { if (it.value !== '\n' && it.value !== '\r') return it.value; } + else { const c = it.from; if (c !== '\n' && c !== '\r') return c; } + } + return null; +} + +function sample(pat: TokenPattern, ctx: SampleCtx): string | null { + if (typeof pat === 'string') return pat; + switch (pat.type) { + case 'never': return null; // structural-token placeholder + case 'anyChar': return 'x'; + case 'anchor': return ''; + case 'lookahead': case 'lookbehind': return ''; // zero-width; context handled by the materializer's separators + case 'charClass': { + const ch = pat.negate ? pickNonExcluded(pat.items) : firstOfClass(pat.items); + return ch ?? 
'x'; + } + case 'seq': { + let out = ''; + for (const it of pat.items) { const s = sample(it, ctx); if (s === null) return null; out += s; } + return out; + } + case 'alt': { + // bias toward branch 0 (usually the simplest); `variant` rotates for variety + const idx = pat.items.length ? ctx.variant % pat.items.length : 0; + for (let k = 0; k < pat.items.length; k++) { + const s = sample(pat.items[(idx + k) % pat.items.length], ctx); + if (s !== null) return s; + } + return null; + } + case 'repeat': { + const reps = pat.min === 0 ? (ctx.variant % 2 === 0 ? 1 : 0) : pat.min; // 0/1 for *, min for + + let out = ''; + for (let i = 0; i < Math.max(reps, pat.min); i++) { const s = sample(pat.body, ctx); if (s === null) return null; out += s; } + return out; + } + } +} + +// Sample several distinct, legal texts for a token (variants + interesting-literal embeds). +function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[] }, n: number): string[] { + const out = new Set(); + for (let v = 0; v < n + 2 && out.size < n; v++) { + const s = sample(decl.pattern, { ...ctx, variant: v }); + if (s !== null && s.length > 0) out.add(s); + } + // a base sample to seed interesting-literal embeds + const base = sample(decl.pattern, { ...ctx, variant: 0 }) ?? ''; + // Embed grammar-derived boundary literals into free-form (multi-char-capable) tokens, where + // the result is still a single legal instance of the token — this is what produces the + // monogram#23 shape (a plain scalar whose text is `--- x`). Verified per-token by re-lexing + // in the driver; an embed that doesn't re-lex to this token is simply dropped there. 
+ if (base.length >= 1) { + for (const lit of ctx.interesting) { + if (lit.length === 0 || /[\n\r]/.test(lit)) continue; + out.add(lit + base); // glued leading boundary (`---` + `x` → `---x`) + // a SPACE-separated form (`--- x`): a boundary literal that is line-structural only with a + // trailing space (a doc marker `---␣`, a comment `#␣`) re-fires its structural meaning here, so + // this is the form that exercises monogram#23 (a value-leading `--- x` the parser keeps a plain + // scalar but a flat grammar may mis-scope as a marker). Legal where the token body admits a space. + out.add(lit + ' ' + base); + if (out.size > n + ctx.interesting.length * 2) break; + } + } + return [...out]; +} + +// ─── THE WALKER ────────────────────────────────────────────────────────────────── +export interface GenOptions { + depth?: number; // bounded-exhaustive derivation depth (rule-ref recursion) + cap?: number; // max alternatives kept at each combinator node (anti-explosion) + maxInputs?: number; // global cap on emitted inputs per rule + fuzzRounds?: number; // random derivations + seed?: number; + nestDepth?: number; // self-recursive nesting depth + timeBudgetMs?: number; // wall-clock cap for the depth strategies (large token-stream grammars) +} + +class Walker { + tokenByName = new Map(); + ruleByName = new Map(); + interesting: string[]; + structKind = new Map(); + compactLits: Set; + reachMap = new Map>(); // rule → every rule it can transitively reach + ruleMin = new Map(); + rand: () => number; + cap: number; + grammar: CstGrammar; + budgetCalls = 0; // anti-explosion: enum() is a tree walk; cap the work PER top-level call + maxCalls = 60_000; + enumTop(e: RuleExpr, budget: number): Emission[][] { this.budgetCalls = 0; return this.enum(e, budget); } + + constructor(grammar: CstGrammar, seed: number, cap: number) { + this.grammar = grammar; + this.rand = rng(seed); + this.cap = cap; + for (const t of grammar.tokens) this.tokenByName.set(t.name, t); + for (const r of 
grammar.rules) this.ruleByName.set(r.name, r); + const ind = grammar.indent; + if (ind) { + this.structKind.set(ind.indentToken, 'indent'); + this.structKind.set(ind.dedentToken, 'dedent'); + this.structKind.set(ind.newlineToken, 'newline'); + } + this.compactLits = new Set(grammar.indent?.compactIndicators ?? []); + this.interesting = this.collectInteresting(); + this.computeReach(); + this.computeMins(); + } + + computeReach(): void { + const refs = (e: RuleExpr, acc: Set) => { + switch (e.type) { + case 'ref': if (this.ruleByName.has(e.name)) acc.add(e.name); break; + case 'seq': case 'alt': e.items.forEach((i) => refs(i, acc)); break; + case 'quantifier': case 'group': case 'not': refs(e.body, acc); break; + case 'sep': refs(e.element, acc); break; + } + }; + for (const r of this.grammar.rules) { const s = new Set(); refs(r.body, s); this.reachMap.set(r.name, s); } + for (let i = 0; i < this.grammar.rules.length; i++) + for (const r of this.grammar.rules) { const s = this.reachMap.get(r.name)!; for (const n of [...s]) for (const m of this.reachMap.get(n) ?? []) s.add(m); } + } + // does an expression (transitively) reference `target` — i.e. descending into it can reach target? + // memoised (per expr-object × target) — `nestChain` queries it on every item, so the cache matters. + reachesCache = new WeakMap>(); + exprReaches(e: RuleExpr, target: string): boolean { + if (typeof e === 'object') { + let m = this.reachesCache.get(e); if (!m) { m = new Map(); this.reachesCache.set(e, m); } + const c = m.get(target); if (c !== undefined) return c; + const v = this.exprReachesRaw(e, target); m.set(target, v); return v; + } + return this.exprReachesRaw(e, target); + } + exprReachesRaw(e: RuleExpr, target: string): boolean { + switch (e.type) { + case 'ref': return e.name === target || (this.reachMap.get(e.name)?.has(target) ?? 
false); + case 'seq': case 'alt': return e.items.some((i) => this.exprReaches(i, target)); + case 'quantifier': case 'group': case 'not': return this.exprReaches(e.body, target); + case 'sep': return this.exprReaches(e.element, target); + default: return false; + } + } + + // shortest rule-ref distance FROM each rule TO `target` (BFS on the reversed ref graph), memoised. + // `nestChain` uses it to descend the DIRECT path to target each level — picking merely "a branch + // that reaches target" loops forever through a long cycle that technically reaches it but never + // arrives (Node→[Indent,Node]→Node…), producing an empty indent chain instead of nested content. + distCache = new Map>(); + distTo(target: string): Map { + let m = this.distCache.get(target); if (m) return m; + m = new Map([[target, 0]]); + // reversed adjacency: who DIRECTLY refs each rule + const back = new Map(); + for (const r of this.grammar.rules) for (const ref of this.directRuleRefs(r.body)) { (back.get(ref) ?? back.set(ref, []).get(ref)!).push(r.name); } + const queue = [target]; + while (queue.length) { const cur = queue.shift()!; const d = m.get(cur)!; for (const pre of back.get(cur) ?? 
[]) if (!m.has(pre)) { m.set(pre, d + 1); queue.push(pre); } } + this.distCache.set(target, m); return m; + } + directRuleRefs(e: RuleExpr): string[] { + const out: string[] = []; + const go = (x: RuleExpr) => { switch (x.type) { + case 'ref': if (this.ruleByName.has(x.name)) out.push(x.name); break; + case 'seq': case 'alt': x.items.forEach(go); break; + case 'quantifier': case 'group': case 'not': go(x.body); break; + case 'sep': go(x.element); break; + } }; + go(e); return out; + } + // min distance an expression sits from re-entering `target` (Infinity if it can't reach it) + distExprCache = new WeakMap>(); + exprDist(e: RuleExpr, target: string): number { + if (typeof e === 'object') { let m = this.distExprCache.get(e); if (!m) { m = new Map(); this.distExprCache.set(e, m); } const c = m.get(target); if (c !== undefined) return c; const v = this.exprDistRaw(e, target); m.set(target, v); return v; } + return this.exprDistRaw(e, target); + } + exprDistRaw(e: RuleExpr, target: string): number { + const dm = this.distTo(target); + switch (e.type) { + case 'ref': return e.name === target ? 0 : (dm.has(e.name) ? dm.get(e.name)! : Infinity); + case 'seq': case 'alt': return Math.min(Infinity, ...e.items.map((i) => this.exprDist(i, target))); + case 'quantifier': case 'group': case 'not': return this.exprDist(e.body, target); + case 'sep': return this.exprDist(e.element, target); + default: return Infinity; + } + } + + // grammar-derived boundary literals: every literal in the rules + structural sigils that + // a free-form token could legally contain but that ALSO start another token (the collision + // shapes a flat highlighter mis-scopes). Short, non-alphabetic ones are the interesting ones. 
+ collectInteresting(): string[] { + const lits = new Set(); + const walk = (e: RuleExpr) => { + switch (e.type) { + case 'literal': lits.add(e.value); break; + case 'seq': case 'alt': e.items.forEach(walk); break; + case 'quantifier': case 'group': case 'not': walk(e.body); break; + case 'sep': walk(e.element); break; + } + }; + for (const r of this.grammar.rules) walk(r.body); + // doc markers / block-scalar introducers live in indent config, not the rules + const ind = this.grammar.indent; + for (const m of ind?.blockScalar?.documentMarkers ?? []) lits.add(m); + return [...lits].filter((l) => l.length > 0 && l.length <= 3 && !/^[A-Za-z]+$/.test(l)); + } + + isToken(name: string): boolean { return this.tokenByName.has(name); } + isStruct(name: string): boolean { return this.structKind.has(name); } + + // ── minimal terminating expansion (fixpoint), so any budget cut-off still produces legal text ── + computeMins(): void { + for (const r of this.grammar.rules) this.ruleMin.set(r.name, null); + for (let iter = 0; iter < this.grammar.rules.length + 2; iter++) { + let changed = false; + for (const r of this.grammar.rules) { + if (this.ruleMin.get(r.name)) continue; + const m = this.minExpand(r.body); + if (m) { this.ruleMin.set(r.name, m); changed = true; } + } + if (!changed) break; + } + } + minExpand(e: RuleExpr): Emission[] | null { + switch (e.type) { + case 'literal': return [{ t: 'lit', value: e.value }]; + case 'ref': { + if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }]; + if (this.isToken(e.name)) { + const txt = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 }); + return txt === null ? null : [{ t: 'tok', name: e.name, text: txt || 'x' }]; + } + return this.ruleMin.get(e.name) ?? 
null; + } + case 'seq': { + const out: Emission[] = []; + for (const it of e.items) { const m = this.minExpand(it); if (!m) return null; out.push(...m); } + return out; + } + case 'alt': { + let best: Emission[] | null = null; + for (const it of e.items) { const m = this.minExpand(it); if (m && (!best || m.length < best.length)) best = m; } + return best; + } + case 'quantifier': return e.kind === '+' ? this.minExpand(e.body) : []; + case 'group': return this.minExpand(e.body); + case 'sep': return this.minExpand(e.element); + case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore': + case 'op': case 'prefix': case 'postfix': return []; + } + } + + // Minimal-but-CONTENT-BEARING expansion: like minExpand, but `opt`/`*` fire ONE rep when their body + // can yield a token, and `alt` prefers a branch that produces a token — so a `- opt(Value)` becomes + // `- ` instead of a bare `-`. Bounded by `fuel`; falls back to minExpand at the floor. + fillBudget = 0; // global anti-explosion for fillContent's all-branches alt search (huge TS alts) + fillContent(e: RuleExpr, fuel: number): Emission[] { + if (--fuel <= 0 || --this.fillBudget <= 0) return this.minExpand(e) ?? []; + const hasTok = (xs: Emission[]) => xs.some((em) => em.t === 'tok'); + switch (e.type) { + case 'literal': return [{ t: 'lit', value: e.value }]; + case 'ref': { + if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! 
}]; + if (this.isToken(e.name)) { const v = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 }); return [{ t: 'tok', name: e.name, text: v || 'x' }]; } + return this.fillContent(this.ruleByName.get(e.name)!.body, fuel); + } + case 'seq': { const out: Emission[] = []; for (const it of e.items) for (const x of this.fillContent(it, fuel)) out.push(x); return out; } + case 'alt': { + // prefer a SHORT branch that yields PLAIN-STRING content — a clean scalar value (`- a`), not a + // sigil-led node (alias `*a`, flow `[…]`) or a multi-line fold. A plain string is what a + // sibling `-` line can (wrongly) fold into, which is the monogram#24 trigger. + let best: Emission[] | null = null, bestScore = -Infinity; + for (const it of e.items) { + const r = this.fillContent(it, fuel); + if (!hasTok(r)) continue; + const stringy = r.some((em) => em.t === 'tok' && /^string\.unquoted/.test(this.tokenByName.get(em.name)?.scope ?? '') && !/[&*!|>[\]{}#%'"]/.test(em.text[0] ?? '')); + const score = (stringy ? 100 : 0) - r.length; + if (score > bestScore) { bestScore = score; best = r; } + } + return best ?? this.fillContent(e.items[0], fuel); + } + case 'quantifier': { const r = this.fillContent(e.body, fuel); if ((e.kind === '?' || e.kind === '*') && !hasTok(r)) return []; return r; } + case 'group': return this.fillContent(e.body, fuel); + case 'sep': return this.fillContent(e.element, fuel); + default: return []; + } + } + + // ── bounded-exhaustive enumeration: a capped set of emission-sequences for `e` ── + enum(e: RuleExpr, budget: number): Emission[][] { + const cap = this.cap; + // global work cap: the walk is a tree whose SIZE (not just output) grows with depth×cap×rules; + // once exceeded, collapse to the minimal expansion so a run always terminates in bounded time. + if (++this.budgetCalls > this.maxCalls) { const m = this.minExpand(e); return m ? 
[m] : [[]]; } + switch (e.type) { + case 'literal': return [[{ t: 'lit', value: e.value }]]; + case 'ref': { + if (this.isStruct(e.name)) return [[{ t: 'struct', kind: this.structKind.get(e.name)! }]]; + if (this.isToken(e.name)) { + const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 3); + return (vs.length ? vs : ['x']).slice(0, cap).map((t) => [{ t: 'tok', name: e.name, text: t }]); + } + if (budget <= 0) { const m = this.ruleMin.get(e.name); return m ? [m] : [[]]; } + return this.enum(this.ruleByName.get(e.name)!.body, budget - 1); + } + case 'seq': { + let acc: Emission[][] = [[]]; + for (const it of e.items) { + const parts = this.enum(it, budget); + const next: Emission[][] = []; + // skip combos whose emission list would blow past MAX_EMS — a deep tree of 2-rep quantifiers + // grows the list multiplicatively, and copying huge lists (not the call count) is the cost. + for (const a of acc) for (const p of parts) { if (a.length + p.length <= MAX_EMS) next.push([...a, ...p]); if (next.length >= cap) break; } + acc = next.length ? next : acc; + if (acc.length >= cap) acc = acc.slice(0, cap); + } + return acc; + } + case 'alt': { + // round-robin across branches so a deep/recursive branch (usually LAST) is not starved by an + // earlier scalar branch filling the cap — the difference between ever generating `- - a` or not. + const perBranch = e.items.map((it) => this.enum(it, budget)); + const out: Emission[][] = []; + for (let i = 0; out.length < cap; i++) { + let any = false; + for (const b of perBranch) { if (i < b.length) { out.push(b[i]); any = true; if (out.length >= cap) break; } } + if (!any) break; + } + return out; + } + case 'quantifier': { + const body = this.enum(e.body, budget); + const out: Emission[][] = []; + if (e.kind !== '+') out.push([]); // 0 reps for ? 
and * + for (const b of body) { out.push(b); if (out.length >= cap) return out; } + if (e.kind !== '?') for (const b of body) { if (b.length * 2 <= MAX_EMS) { out.push([...b, ...b]); if (out.length >= cap) return out; } } // 2 reps for * and + + return out; + } + case 'group': return this.enum(e.body, budget); + case 'sep': { + const el = this.enum(e.element, budget); + const out: Emission[][] = []; + for (const b of el) { out.push(b); if (out.length >= cap) return out; } + for (const b of el) { if (b.length * 2 + 1 <= MAX_EMS) { out.push([...b, { t: 'lit', value: e.delimiter }, ...b]); if (out.length >= cap) return out; } } + return out; + } + case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore': + case 'op': case 'prefix': case 'postfix': return [[]]; + } + } + + // ── random derivation (fuzzing): one emission sequence, forced to terminate at budget 0 ── + fuzz(e: RuleExpr, budget: number): Emission[] { + const pick = (xs: T[]): T => xs[Math.floor(this.rand() * xs.length)]; + // bounded `for`-push (NOT spread on a possibly-huge array → stack overflow + size blowup) + const fappend = (out: Emission[], add: Emission[]) => { if (out.length < MAX_EMS) for (const x of add) out.push(x); }; + switch (e.type) { + case 'literal': return [{ t: 'lit', value: e.value }]; + case 'ref': { + if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }]; + if (this.isToken(e.name)) { + const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 4); + return [{ t: 'tok', name: e.name, text: vs.length ? pick(vs) : 'x' }]; + } + if (budget <= 0) return this.ruleMin.get(e.name) ?? 
[]; + return this.fuzz(this.ruleByName.get(e.name)!.body, budget - 1); + } + case 'seq': { const out: Emission[] = []; for (const it of e.items) fappend(out, this.fuzz(it, budget)); return out; } + case 'alt': { + if (budget <= 0) { const m = this.minExpand(e); if (m) return m; } + return this.fuzz(pick(e.items), budget); + } + case 'quantifier': { + const reps = budget <= 0 ? (e.kind === '+' ? 1 : 0) : (e.kind === '?' ? Math.floor(this.rand() * 2) : Math.floor(this.rand() * 3) + (e.kind === '+' ? 1 : 0)); + const out: Emission[] = []; for (let i = 0; i < reps; i++) fappend(out, this.fuzz(e.body, budget - 1)); return out; + } + case 'group': return this.fuzz(e.body, budget); + case 'sep': { + const reps = budget <= 0 ? 1 : Math.floor(this.rand() * 3) + 1; const out: Emission[] = []; + for (let i = 0; i < reps; i++) { if (i) out.push({ t: 'lit', value: e.delimiter }); fappend(out, this.fuzz(e.element, budget - 1)); } + return out; + } + case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore': + case 'op': case 'prefix': case 'postfix': return []; + } + } + + // Rules that can (transitively) contain themselves — the self-recursive nesting targets. + selfRecursive(): string[] { + return this.grammar.rules.filter((r) => this.reachMap.get(r.name)!.has(r.name)).map((r) => r.name); + } + + // ── DIRECTED nesting: a random derivation BIASED to descend back toward `target` until `depth` + // runs out, then to terminate — so a self-recursive rule is forced to NEST, and its repetitions + // fire to add SIBLINGS. This deterministically reaches the deep self-embedding shapes (a + // BlockSequence inside a BlockSequence with an inner sibling — monogram#24) that an un-biased, + // capped enumeration starves. Agnostic: `target` is any self-recursive rule, found generically. + siblingLeft = 0; + // Build a CLEAN, SHORT nested chain of `target` (a collection inside a collection, `nest` levels + // deep) with ONE inner sibling — the monogram#24 class. 
Fast and deterministic: at each rule, take + // the SINGLE first sub-path that re-enters `target` and minimal-fill everything else, so the output + // is the bare nested skeleton (no kitchen-sink filler). `target` is any self-recursive rule, found + // generically. The sibling (`- a`/`- b`) is added at the target's own repetition, innermost first. + nestChain(body: RuleExpr, target: string, nest: number): Emission[] { + this.siblingLeft = nest + 1; // one inner sibling per nesting level (the `- a`/`- b` pairs) + this.fillBudget = 200_000; // a high backstop (nestChain only runs on small indent/markup grammars now) + return this.nestRec(body, target, nest, 300, false); + } + nestRec(e: RuleExpr, target: string, nest: number, fuel: number, atTarget: boolean): Emission[] { + if (--fuel <= 0 || nest < 0) { return this.fillContent(e, 30) ?? []; } + // at the INNERMOST level (nest 0) fill with CONTENT (a scalar value) so a collection item is + // `- a`, not a bare `-` — monogram#24 needs a plain scalar for the sibling `-` to (wrongly) fold + // into. Off the recursive path at deeper levels → the minimal terminating filler (short chain). + if (nest === 0) { return this.fillContent(e, 30); } + if (!this.exprReaches(e, target)) { const m = this.minExpand(e); if (m) return m; } + switch (e.type) { + case 'literal': return [{ t: 'lit', value: e.value }]; + case 'ref': { + if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }]; + if (this.isToken(e.name)) { const v = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 }); return [{ t: 'tok', name: e.name, text: v || 'x' }]; } + const re = e.name === target; + return this.nestRec(this.ruleByName.get(e.name)!.body, target, re ? 
nest - 1 : nest, fuel, re); + } + case 'seq': { + // descend the item with the SHORTEST distance to re-entering target (the direct path), and — + // when at the target rule's own body — fire ONE shallow sibling from its repetition (the + // `- a`/`- b` inner pair, monogram#24). Minimal-fill everything else → a clean nested chain. + let idx = -1, best = Infinity; + e.items.forEach((it, i) => { const d = this.exprDist(it, target); if (d < best) { best = d; idx = i; } }); + const out: Emission[] = []; + e.items.forEach((it, i) => { + let part: Emission[]; + if (i === idx) part = this.nestRec(it, target, nest, fuel, atTarget); // deepen the chain + else if (atTarget && this.siblingLeft > 0 && it.type === 'quantifier' && this.exprReaches(it, target)) { + this.siblingLeft--; part = this.nestRec(it.body, target, 0, fuel, false); // one shallow SIBLING + } else part = this.minExpand(it) ?? []; + for (const x of part) out.push(x); + }); + return out; + } + case 'alt': { + // the branch that re-enters target SOONEST (min distance) — so the chain actually descends + let pickEl = e.items[0], best = Infinity; + for (const it of e.items) { const d = this.exprDist(it, target); if (d < best) { best = d; pickEl = it; } } + return this.nestRec(pickEl, target, nest, fuel, atTarget); + } + case 'quantifier': { const out: Emission[] = []; for (const x of this.nestRec(e.body, target, nest, fuel, atTarget)) out.push(x); return out; } + case 'group': return this.nestRec(e.body, target, nest, fuel, atTarget); + case 'sep': { const out: Emission[] = []; for (const x of this.nestRec(e.element, target, nest, fuel, atTarget)) out.push(x); return out; } + case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore': + case 'op': case 'prefix': case 'postfix': return []; + } + } +} + +// ─── MATERIALIZE: emissions → text + token spans ────────────────────────────────── +// The per-language structural-token materialization hook. 
Token-stream grammars join with a +// space (whitespace-insensitive); indentation grammars (YAML) render struct emissions through an +// indent STACK that mirrors the lexer (newline = same-column sibling, indent = deeper block, +// compact = an inline indent for `- - a`); markup grammars keep tag punctuation adjacent. +interface MatOptions { mode: 'token-stream' | 'indent' | 'markup'; indentStep: number } + +function materialize(grammar: CstGrammar, ems: Emission[], opts: MatOptions): { text: string; tokens: GenInput['tokens'] } { + let text = ''; + const tokens: GenInput['tokens'] = []; + // hard length cap: a pathological derivation (deep indent, many reps) must never grow text without + // bound — past the cap, appends are dropped (the input is over-long and discarded by the caller). + const emit = (s: string) => { if (text.length < 16_000) text += s; }; + const emitTok = (name: string, s: string) => { tokens.push({ start: text.length, end: text.length + s.length, name, text: s }); text += s; }; + + if (opts.mode === 'indent') { + const stack: number[] = [0]; // indentation columns; top = current block column + let atLineStart = true; + let pendingCompact = false; // the previous struct was a compact indicator's inline indent + const sp = (n: number) => ' '.repeat(n); + for (let i = 0; i < ems.length; i++) { + const e = ems[i]; + if (e.t === 'struct') { + if (e.kind === 'indent') { + const col = stack[stack.length - 1] + opts.indentStep; + stack.push(col); emit('\n' + sp(col)); atLineStart = true; + } else if (e.kind === 'dedent') { + if (stack.length > 1) stack.pop(); + } else { // newline — sibling at the current column + emit('\n' + sp(stack[stack.length - 1])); atLineStart = true; + } + continue; + } + if (e.t === 'compact') { + // an inline indent: the next content sits on the SAME line; defer the column PUSH until that + // content is emitted, so the pushed column is exactly where the inner indicator lands (the + // lexer's compactIndicators geometry — `- 
- a` pushes column 2, where the second `-` sits). + pendingCompact = true; continue; + } + const s = e.t === 'lit' ? e.value : e.text; + if (s.length === 0) continue; + if (pendingCompact) { emit(' '); stack.push(text.length - (text.lastIndexOf('\n') + 1)); pendingCompact = false; } // inner COLUMN (in-line), not absolute offset + else if (!atLineStart) emit(' '); // ordinary inline separator (`- a`, `key: v`) + if (e.t === 'tok') emitTok(e.name, s); else emit(s); + atLineStart = false; + } + return { text, tokens }; + } + + if (opts.mode === 'markup') { + const noSpaceBefore = new Set([grammar.markup?.tagClose, grammar.markup?.closeMarker].filter(Boolean) as string[]); + let prev = ''; + for (const e of ems) { + if (e.t === 'struct' || e.t === 'compact') continue; + const s = e.t === 'lit' ? e.value : e.text; + if (s.length === 0) continue; + const adjacent = prev === grammar.markup?.tagOpen || prev === grammar.markup?.closeMarker || noSpaceBefore.has(s) || prev === ''; + if (!adjacent) emit(' '); + if (e.t === 'tok') emitTok(e.name, s); else emit(s); + prev = s; + } + return { text, tokens }; + } + + // token-stream: join with a single space (always legal in a whitespace-insensitive language) + let first = true; + for (const e of ems) { + if (e.t === 'struct' || e.t === 'compact') continue; + const s = e.t === 'lit' ? e.value : e.text; + if (s.length === 0) continue; + if (!first) emit(' '); + if (e.t === 'tok') emitTok(e.name, s); else emit(s); + first = false; + } + return { text, tokens }; +} + +// Rewrite a YAML compact indicator's following `[Indent, …, Dedent]` so the indent renders INLINE +// (`- - a`) rather than next-line (`-\n - a`). Both are legal and parse identically; the compact +// form is what reproduces monogram#24's column geometry. Applied to a copy of the emission list. 
+function compactify(ems: Emission[], compactLits: Set): Emission[] { + const out: Emission[] = []; + for (let i = 0; i < ems.length; i++) { + const e = ems[i]; + out.push(e); + // a compact indicator literal (`-`/`?`) immediately followed by a struct indent → inline it + if (e.t === 'lit' && compactLits.has(e.value)) { + const nxt = ems[i + 1]; + if (nxt && nxt.t === 'struct' && nxt.kind === 'indent') { out.push({ t: 'compact' }); i++; } + } + } + return out; +} + +// ─── TOP LEVEL ──────────────────────────────────────────────────────────────────── +export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenInput[] { + const depth = opts.depth ?? 5; + const cap = opts.cap ?? 6; + const maxInputs = opts.maxInputs ?? 400; + const fuzzRounds = opts.fuzzRounds ?? 300; + const nestDepth = opts.nestDepth ?? 5; + const seed = opts.seed ?? 12345; + const w = new Walker(grammar, seed, cap); + + const mode: MatOptions['mode'] = grammar.indent ? 'indent' : grammar.markup ? 'markup' : 'token-stream'; + const matOpts: MatOptions = { mode, indentStep: 2 }; + const entry = grammar.rules[grammar.rules.length - 1]; + + // wall-clock budget: the depth strategies (nest / dirnest) over a LARGE token-stream grammar (the + // TS family — 50+ self-recursive rules, huge Pratt-expression alts) are heavy and add little, since + // those grammars have no indent/markup depth bugs for the scope≡role check to find. Cap total time + // so one driver stays tractable across all 7 languages; each per-rule loop checks it. + const t0 = Date.now(); + const timeBudgetMs = opts.timeBudgetMs ?? 9000; + const timeUp = () => Date.now() - t0 > timeBudgetMs; + + const seen = new Set(); + const out: GenInput[] = []; + const push = (ems: Emission[], strategy: string, rule: string) => { + if (out.length >= maxInputs * 4) return; + for (const variant of mode === 'indent' ? 
[ems, compactify(ems, w.compactLits)] : [ems]) { + const { text, tokens } = materialize(grammar, variant, matOpts); + if (!text.trim() || text.length > 2000 || seen.has(text)) continue; // skip blank / over-long / duplicate + seen.add(text); + out.push({ text, tokens, strategy, rule }); + } + }; + + // 1) bounded-exhaustive from the entry rule: the canonical small shapes (every derivation to depth N) + for (const ems of w.enumTop(entry.body, depth)) push(ems, 'exhaustive', entry.name); + + // The depth strategies (2,3) only matter for INDENTATION / MARKUP grammars — those are where a flat + // highlighter loses to the stack-keeping parser (the monogram#23/#24 class). Token-stream grammars + // are whitespace-insensitive and the flat grammar is exact, so their (large) self-recursive rule set + // is skipped: it adds no depth coverage and would dominate the time budget. + const depthMatters = !!(grammar.indent || grammar.markup); + const recursive = depthMatters ? w.selfRecursive() : []; + + // 2) bounded-exhaustive ROOTED at each self-recursive rule: exercises every rule's own small shapes + // (round-tripped against that rule as the entry), incl. the FIRST level of self-embedding. + for (const rn of recursive) { + if (timeUp()) break; + const r = w.ruleByName.get(rn)!; + for (let d = 1; d <= Math.min(nestDepth, 3); d++) for (const ems of w.enumTop(r.body, d)) push(ems, `nest:${rn}@${d}`, rn); + } + + // 3) directed nesting: a clean, deterministic nested chain of each self-recursive rule (with one + // inner sibling) at depth 1..N — monogram#24 is a BlockSequence inside a BlockSequence with an + // inner sibling (`- - a\n - b\n- c`), which the un-biased capped enumeration starves. 
+ for (const rn of recursive) { + if (timeUp()) break; + const r = w.ruleByName.get(rn)!; + for (let d = 1; d <= nestDepth; d++) push(w.nestChain(r.body, rn, d), `dirnest:${rn}@${d}`, rn); + } + + // 4) fuzzing for deeper / wider structures (random production choices), rooted at the entry AND at + // each self-recursive rule so deep shapes are reached quickly. + for (let i = 0; i < fuzzRounds; i++) push(w.fuzz(entry.body, depth + 2), 'fuzz', entry.name); + for (const rn of recursive) { + if (timeUp()) break; + const r = w.ruleByName.get(rn)!; + for (let i = 0; i < Math.ceil(fuzzRounds / 8); i++) push(w.fuzz(r.body, depth + 2), `fuzz:${rn}`, rn); + } + + return out.slice(0, maxInputs); +} diff --git a/test/parser-gap.ts b/test/parser-gap.ts deleted file mode 100644 index a2f8226..0000000 --- a/test/parser-gap.ts +++ /dev/null @@ -1,254 +0,0 @@ -import { readFileSync, readdirSync, statSync } from 'node:fs'; -import { join, relative } from 'node:path'; - -const TEST_DIR = '/tmp/ts-repo/tests/cases/conformance'; - -// ── Collect test files ── - -function walkDir(dir: string): string[] { - const files: string[] = []; - for (const entry of readdirSync(dir)) { - const full = join(dir, entry); - if (statSync(full).isDirectory()) { - files.push(...walkDir(full)); - } else if (full.endsWith('.ts') && !full.endsWith('.d.ts')) { - files.push(full); - } - } - return files; -} - -const testFiles = walkDir(TEST_DIR).sort(); - -// ── Gap patterns: syntax our grammar does NOT define rules for ── -// Grouped by category, ordered by likely impact - -interface Gap { - name: string; - category: string; - test: (s: string) => boolean; - difficulty: 'easy' | 'medium' | 'hard'; - covered?: boolean; -} - -const gaps: Gap[] = [ - // ── Destructuring (COVERED — rules added) ── - { name: 'Object destructuring binding', category: 'Destructuring', difficulty: 'hard', covered: true, - test: s => /(?:let|const|var)\s+\{/.test(s) }, - { name: 'Array destructuring binding', category: 
'Destructuring', difficulty: 'hard', covered: true, - test: s => /(?:let|const|var)\s+\[/.test(s) }, - { name: 'Destructuring in params', category: 'Destructuring', difficulty: 'hard', covered: true, - test: s => /\(\s*\{[^}]*\}\s*[,:)]/.test(s) || /\(\s*\[[^\]]*\]\s*[,:)]/.test(s) }, - { name: 'Destructuring in for-of/in', category: 'Destructuring', difficulty: 'hard', covered: true, - test: s => /for\s*\(\s*(?:const|let|var)\s+[\[{]/.test(s) }, - { name: 'Default values in destructuring', category: 'Destructuring', difficulty: 'hard', covered: true, - test: s => /\{\s*\w+\s*=\s*[^=]/.test(s) && /(?:let|const|var|function|\()/.test(s) }, - - // ── Statements (COVERED) ── - { name: 'Labeled statement', category: 'Statements', difficulty: 'easy', covered: true, - test: s => /^\s*[a-zA-Z_$]\w*\s*:\s*(?:for|while|do|switch)/m.test(s) }, - { name: 'debugger statement', category: 'Statements', difficulty: 'easy', covered: true, - test: s => /^\s*debugger\s*;?\s*$/m.test(s) }, - { name: 'with statement', category: 'Statements', difficulty: 'easy', covered: true, - test: s => /\bwith\s*\(/.test(s) }, - { name: 'Empty statement (bare ;)', category: 'Statements', difficulty: 'easy', covered: true, - test: s => /^\s*;\s*$/m.test(s) }, - - // ── Type features ── - { name: 'Index signature [k: T]: V', category: 'Types', difficulty: 'medium', covered: true, - test: s => /\[\s*\w+\s*:\s*(?:string|number|symbol)\s*\]\s*:/.test(s) }, - { name: 'Conditional type extends?:', category: 'Types', difficulty: 'medium', covered: true, - test: s => /\bextends\b[^{]*\?\s*\S[^;]*\s*:/.test(s) && /\btype\b/.test(s) }, - { name: 'Mapped type {[K in T]: V}', category: 'Types', difficulty: 'medium', covered: true, - test: s => /\{\s*\[?\s*\w+\s+in\s+/.test(s) }, - { name: 'infer keyword', category: 'Types', difficulty: 'medium', covered: true, - test: s => /\binfer\s+[A-Z]/.test(s) }, - { name: 'Template literal type', category: 'Types', difficulty: 'medium', covered: true, - test: s => 
/type\s+\w+[^=]*=\s*`/.test(s) }, - { name: 'Type predicate (x is T)', category: 'Types', difficulty: 'easy', covered: true, - test: s => /\)\s*:\s*\w+\s+is\s+\w/.test(s) }, - { name: 'asserts keyword', category: 'Types', difficulty: 'easy', covered: true, - test: s => /\basserts\s+\w+/.test(s) }, - { name: 'import type / export type', category: 'Types', difficulty: 'easy', covered: true, - test: s => /\b(?:import|export)\s+type\s+[{A-Z]/.test(s) }, - { name: 'satisfies operator', category: 'Types', difficulty: 'easy', covered: true, - test: s => /\bsatisfies\s+\w/.test(s) }, - - // ── Expression features ── - { name: 'Template literal ${expr}', category: 'Expressions', difficulty: 'hard', covered: true, - test: s => /`[^`]*\$\{/.test(s) }, - { name: 'Default parameter value', category: 'Expressions', difficulty: 'easy', covered: true, - test: s => /\(\s*\w+\s*(?::\s*\w[^)]*?)?\s*=[^=>][^)]*\)/.test(s) }, - { name: 'Optional chaining ?.( / ?.[', category: 'Expressions', difficulty: 'easy', covered: true, - test: s => /\?\.\s*[\[(]/.test(s) }, - { name: 'Dynamic import()', category: 'Expressions', difficulty: 'easy', covered: true, - test: s => /\bimport\s*\(/.test(s) }, - { name: 'import.meta', category: 'Expressions', difficulty: 'easy', covered: true, - test: s => /\bimport\s*\.\s*meta\b/.test(s) }, - { name: 'Tagged template f`...`', category: 'Expressions', difficulty: 'medium', covered: true, - test: s => /\w\s*`/.test(s) && /`[^`]*\$\{/.test(s) }, - { name: 'Comma operator', category: 'Expressions', difficulty: 'easy', covered: true, - test: s => /\breturn\s*\(.*,.*\)\s*;/.test(s) }, - { name: 'Class expression', category: 'Expressions', difficulty: 'medium', covered: true, - test: s => /=\s*class\s*(?:\w+\s*)?\{/.test(s) }, - { name: 'Function expression', category: 'Expressions', difficulty: 'easy', covered: true, - test: s => /=\s*function\s*\w*\s*[\(<]/.test(s) }, - { name: 'void expression', category: 'Expressions', difficulty: 'easy', covered: true, - 
test: s => /\bvoid\s+\w/.test(s) }, - - // ── Declaration features ── - { name: 'export default', category: 'Declarations', difficulty: 'easy', covered: true, - test: s => /\bexport\s+default\b/.test(s) }, - { name: 'export * / re-export', category: 'Declarations', difficulty: 'easy', covered: true, - test: s => /\bexport\s+\*/.test(s) || /\bexport\s+\{[^}]+\}\s+from\b/.test(s) }, - { name: 'export = / import =', category: 'Declarations', difficulty: 'easy', covered: true, - test: s => /\bexport\s*=/.test(s) || /\bimport\s+\w+\s*=\s*require/.test(s) }, - { name: 'const enum', category: 'Declarations', difficulty: 'easy', covered: true, - test: s => /\bconst\s+enum\b/.test(s) }, - { name: 'Class static block', category: 'Declarations', difficulty: 'medium', covered: true, - test: s => /\bstatic\s*\{/.test(s) }, - { name: 'Call/construct signature', category: 'Declarations', difficulty: 'medium', covered: true, - test: s => /(?:interface|type)[^{]*\{[^}]*(?:new\s*\(|^\s*\()/ms.test(s) }, - { name: 'Method overloads', category: 'Declarations', difficulty: 'medium', covered: true, - test: s => /\w+\s*\([^)]*\)\s*:\s*\w[^{;]*;\s*\n\s*\w+\s*\(/m.test(s) }, - { name: 'using / await using', category: 'Declarations', difficulty: 'easy', covered: true, - test: s => /\b(?:await\s+)?using\s+\w+\s*=/.test(s) }, - { name: 'accessor keyword', category: 'Declarations', difficulty: 'easy', covered: true, - test: s => /\baccessor\s+\w+/.test(s) }, - - // ── Class features ── - { name: 'Parameter properties', category: 'Classes', difficulty: 'easy', covered: true, - test: s => /constructor\s*\([^)]*\b(?:public|private|protected|readonly)\b/.test(s) }, - { name: 'Decorators with args @f()', category: 'Classes', difficulty: 'easy', covered: true, - test: s => /@\w+\s*\([^)]*\)/.test(s) }, -]; - -// ── Scan ── - -console.log(`Scanning ${testFiles.length} conformance test files...\n`); - -const gapHits = new Map(); -for (const g of gaps) gapHits.set(g.name, { count: 0, examples: [] }); - 
-let filesWithGaps = 0; - -for (const file of testFiles) { - const source = readFileSync(file, 'utf-8'); - const rel = relative(TEST_DIR, file); - let hasGap = false; - - for (const g of gaps) { - if (g.test(source)) { - const hit = gapHits.get(g.name)!; - hit.count++; - if (hit.examples.length < 3) hit.examples.push(rel); - hasGap = true; - } - } - - if (hasGap) filesWithGaps++; -} - -// ── Report ── - -console.log('═══════════════════════════════════════════════════════════════'); -console.log(' Monogram — Parser Gap Analysis'); -console.log(` ${testFiles.length} TypeScript conformance tests → ${filesWithGaps} files with gaps`); -console.log('═══════════════════════════════════════════════════════════════\n'); - -const categories = [...new Set(gaps.map(g => g.category))]; -const allHitsRaw = gaps.map(g => ({ ...g, ...gapHits.get(g.name)! })).filter(g => g.count > 0); - -// Recompute filesWithGaps excluding covered constructs -let filesWithRemainingGaps = 0; -for (const file of testFiles) { - const source = readFileSync(file, 'utf-8'); - let hasUncoveredGap = false; - for (const g of gaps) { - if (g.covered) continue; - if (g.test(source)) { hasUncoveredGap = true; break; } - } - if (hasUncoveredGap) filesWithRemainingGaps++; -} - -// ── Covered constructs ── -const coveredHits = allHitsRaw.filter(g => g.covered && g.count > 0); -if (coveredHits.length > 0) { - const covTotal = coveredHits.reduce((s, g) => s + g.count, 0); - console.log(`── COVERED (in grammar) ── (${covTotal} file hits)\n`); - for (const g of coveredHits.sort((a, b) => b.count - a.count)) { - const pct = ((g.count / testFiles.length) * 100).toFixed(1); - console.log(` ✓ ${g.name.padEnd(38)} ${String(g.count).padStart(4)} files (${pct}%)`); - } - console.log(); -} - -// ── Remaining gaps ── -let totalGapFiles = 0; - -for (const cat of categories) { - const catGaps = gaps.filter(g => g.category === cat && !g.covered); - const catHits = catGaps - .map(g => ({ ...g, ...gapHits.get(g.name)! 
})) - .filter(g => g.count > 0) - .sort((a, b) => b.count - a.count); - - if (catHits.length === 0) continue; - - const catTotal = catHits.reduce((s, g) => s + g.count, 0); - totalGapFiles += catTotal; - - console.log(`── ${cat} ── (${catTotal} hits)\n`); - - for (const g of catHits) { - const pct = ((g.count / testFiles.length) * 100).toFixed(1); - const diff = g.difficulty === 'easy' ? '●' : g.difficulty === 'medium' ? '◐' : '○'; - console.log(` ${diff} ${g.name.padEnd(38)} ${String(g.count).padStart(4)} files (${pct}%) [${g.difficulty}]`); - } - console.log(); -} - -// ── Difficulty summary ── -const allHits = allHitsRaw.filter(g => !g.covered); -const easy = allHits.filter(g => g.difficulty === 'easy'); -const medium = allHits.filter(g => g.difficulty === 'medium'); -const hard = allHits.filter(g => g.difficulty === 'hard'); - -const easyFiles = easy.reduce((s, g) => s + g.count, 0); -const mediumFiles = medium.reduce((s, g) => s + g.count, 0); -const hardFiles = hard.reduce((s, g) => s + g.count, 0); - -console.log('═══════════════════════════════════════════════════════════════'); -console.log(' Summary'); -console.log('═══════════════════════════════════════════════════════════════\n'); -const coveredCount = coveredHits.reduce((s, g) => s + g.count, 0); -console.log(` Total test files: ${testFiles.length}`); -console.log(` Files fully covered: ${testFiles.length - filesWithRemainingGaps} (${(((testFiles.length - filesWithRemainingGaps) / testFiles.length) * 100).toFixed(1)}%)`); -console.log(` Files with gaps: ${filesWithRemainingGaps} (${((filesWithRemainingGaps / testFiles.length) * 100).toFixed(1)}%)`); -console.log(` Recently covered: ${coveredHits.length} constructs (${coveredCount} file hits)`); -console.log(); -console.log(` Remaining gaps:`); -console.log(` ● Easy: ${easy.length.toString().padStart(2)} constructs (${easyFiles} file hits) — add rule/keyword`); -console.log(` ◐ Medium: ${medium.length.toString().padStart(2)} constructs (${mediumFiles} 
file hits) — new rule + patterns`); -console.log(` ○ Hard: ${hard.length.toString().padStart(2)} constructs (${hardFiles} file hits) — recursive patterns / new concepts`); -console.log(); - -// ── What closing easy gaps would achieve ── -let onlyEasyGapFiles = 0; -for (const file of testFiles) { - const source = readFileSync(file, 'utf-8'); - let hasHard = false; - let hasMedium = false; - let hasAny = false; - for (const g of gaps) { - if (g.covered) continue; - if (!g.test(source)) continue; - hasAny = true; - if (g.difficulty === 'hard') hasHard = true; - if (g.difficulty === 'medium') hasMedium = true; - } - if (hasAny && !hasHard && !hasMedium) onlyEasyGapFiles++; -} - -const afterEasy = testFiles.length - filesWithRemainingGaps + onlyEasyGapFiles; -console.log(`\n After closing ● easy: ${afterEasy}/${testFiles.length} files covered (${((afterEasy / testFiles.length) * 100).toFixed(1)}%)`); diff --git a/test/prof.ts b/test/prof.ts deleted file mode 100644 index f896d0a..0000000 --- a/test/prof.ts +++ /dev/null @@ -1,10 +0,0 @@ -import { readFileSync } from 'fs'; -import { createParser } from '../src/gen-parser.ts'; -const grammar = (await import('../typescript.ts')).default; -process.env.PROF = '1'; -const p: any = createParser(grammar); -const code = readFileSync('/tmp/ts-repo/tests/cases/conformance/parser/ecmascript5/RealWorld/parserharness.ts','utf-8'); -try { p.parse(code); } catch {} -const g = (k:string)=>p.profCounts.get(k)??0; -console.log('memo hit/miss:', g('$memoHit'), '/', g('$memoMiss'), '=> hit rate', (100*g('$memoHit')/(g('$memoHit')+g('$memoMiss'))).toFixed(0)+'%'); -console.log('LED loop: tries', g('$ledTry'), ' hits', g('$ledHit'), '=> wasted', (100*(1-g('$ledHit')/g('$ledTry'))).toFixed(0)+'% of led matchSeq attempts fail fast'); diff --git a/test/scope-gap-html.ts b/test/scope-gap-html.ts deleted file mode 100644 index 31a9309..0000000 --- a/test/scope-gap-html.ts +++ /dev/null @@ -1,48 +0,0 @@ -// scope-gap-html.ts — HTML adapter for the 
unified scope-gap harness. The FIRST real -// vscode#203212 comparative gap: VS Code's HTML grammar is the unmaintained textmate/html.tmbundle; -// the oracle is parse5 (maintained, authoritative). Run (bare node): node test/scope-gap-html.ts -// Override the official grammar: MONOGRAM_OFFICIAL_HTML=/path/to/html.tmLanguage.json -import { run } from './scope-gap.ts'; -import { htmlOracle } from './html-oracle.ts'; -import { cases as htmlIssueCases } from './html-issue-cases.ts'; - -const OFFICIAL = process.env.MONOGRAM_OFFICIAL_HTML - ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/html/syntaxes/html.tmLanguage.json'; - -// Realistic HTML (baseline) — tags, quoted/unquoted/boolean attrs, nesting, comments, voids. -const GENERAL: string[] = [ - '

Hello world.

', - '
  • one
  • two
  • three
', - 'a picture', - '', - '', - '

Title

Body with bold and italic.

', - '', - '
', - '
AB
12
', - '
x
', - '', - '

', - '', - '', - '', - 'text', - 'body', - 'link', - '', - '
photo
cap
', -]; - -const corpus = [ - ...GENERAL.map((text, i) => ({ name: `general#${i}`, text })), - ...htmlIssueCases.map((c: any, i: number) => ({ name: `issue:${c.title ?? i}`, text: c.src as string })), -]; - -await run({ - name: 'HTML', - scopeName: 'text.html.basic', - officialPath: OFFICIAL, - monogramPath: 'html.tmLanguage.json', - loadCorpus: () => corpus, - roleOracle: htmlOracle, -}); diff --git a/test/scope-gap-js.ts b/test/scope-gap-js.ts deleted file mode 100644 index e483ed3..0000000 --- a/test/scope-gap-js.ts +++ /dev/null @@ -1,30 +0,0 @@ -// scope-gap-js.ts — JavaScript (.js) adapter for the unified scope-gap harness. Grades VS Code's -// OFFICIAL JavaScript.tmLanguage.json AND Monogram's javascript.tmLanguage.json against the parser -// oracle (oracle.ts with ScriptKind.JS). Both grammars declare scopeName `source.js`, so they load -// + compare on one scale. Corpus = Test262 (tc39/test262), the canonical ECMAScript corpus — the TS -// suite has ~no .js. Provision once: git clone --depth 1 https://github.com/tc39/test262 /tmp/test262 -// Run (bare node): node test/scope-gap-js.ts [N|all] (Test262 is huge; default sample 800) -import ts from 'typescript'; -import { run } from './scope-gap.ts'; -import { oracle } from './oracle.ts'; -import { walkCorpus, subsetArg } from './src-coverage-tsfamily.ts'; - -const BASE = '/tmp/test262/test/language'; // the syntax-relevant subtree of Test262 -const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM - ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/javascript/syntaxes/JavaScript.tmLanguage.json'; - -// walkCorpus already drops .d.ts + multi-file (@filename) fixtures and stride-samples. 
-const corpus = walkCorpus([BASE], ['.js'], subsetArg(800)).filter((c) => !c.file.endsWith('_FIXTURE.js')); - -await run({ - name: 'JavaScript (.js)', - scopeName: 'source.js', - officialPath: OFFICIAL, - monogramPath: 'javascript.tmLanguage.json', - loadCorpus: () => corpus.map((c) => ({ name: c.file, text: c.code })), - roleOracle: (text) => oracle(text, ts.ScriptKind.JS), - isGradable: (text) => { - const sf = ts.createSourceFile('c.js', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.JS); - return (((sf as any).parseDiagnostics?.length ?? 0) === 0); - }, -}); diff --git a/test/scope-gap-run.ts b/test/scope-gap-run.ts new file mode 100644 index 0000000..c754442 --- /dev/null +++ b/test/scope-gap-run.ts @@ -0,0 +1,130 @@ +// ───────────────────────────────────────────────────────────────────────────── +// scope-gap-run.ts — the UNIFIED, data-driven entry for the scope-gap metric (monogram#25 +// part 2B). One driver + a per-language config TABLE, replacing the seven thin +// scope-gap-{ts,js,jsx,tsx,html,yaml,vue} adapter files: each was mostly the same `run(adapter)` +// literal differing only in corpus path / grammar path / scopeName / oracle / official path. +// Those vary as DATA here; the shared core stays scope-gap.ts's `run()`. +// +// Run (bare node): node test/scope-gap-run.ts [N|all] +// lang ∈ ts | js | jsx | tsx | html | yaml | vue +// +// Per-language entry is preserved as the `` PARAMETER (the npm scripts pass it). The +// thicker html / yaml specifics (multi-file official loader, fullSpan, differential) live in their +// TABLE ENTRY, not a separate file. VUE is genuinely different — it is an INJECTION grammar that +// needs vuejs/language-tools' own tokenizer (a bare Registry.loadGrammar never fires the directive +// / interpolation injections), so it cannot use `run()`; ` vue` DELEGATES to scope-gap-vue.ts. 
+// ───────────────────────────────────────────────────────────────────────────── +import ts from 'typescript'; +import { readdirSync, readFileSync } from 'node:fs'; +import { dirname, join } from 'node:path'; +import { parse as yamlParse, parseAllDocuments } from 'yaml'; +import { run, type ScopeGapAdapter } from './scope-gap.ts'; +import { oracle } from './oracle.ts'; +import { yamlOracle } from './yaml-oracle.ts'; +import { htmlOracle } from './html-oracle.ts'; +import { walkCorpus } from './src-coverage-tsfamily.ts'; +import { JSX_CASES, HTML_GENERAL } from './curated-corpora.ts'; +import { cases as htmlIssueCases } from './html-issue-cases.ts'; +import { cases as yamlIssue12 } from './yaml-issue12-regressions.ts'; + +const VSCODE_TM = '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions'; +// subset size from argv[3] (argv[2] is the language) / env SUBSET / default; `all` = the full corpus. +const subN = (def = 400): number => { const a = process.argv[3]; return a === 'all' ? Infinity : Number(a ?? process.env.SUBSET ?? def); }; +const tsParseClean = (kind: ts.ScriptKind, fn: string) => (text: string): boolean => { + const sf = ts.createSourceFile(fn, text, ts.ScriptTarget.Latest, true, kind); + return (((sf as any).parseDiagnostics?.length ?? 0) === 0); +}; + +// One TS-family scope-gap adapter (TS/JS/JSX/TSX differ only by ScriptKind + corpus + paths). +function tsFamily(o: { name: string; scopeName: string; kind: ts.ScriptKind; mono: string; officialEnv: string; officialDefault: string; fn: string; corpus: () => { name: string; text: string }[] }): ScopeGapAdapter { + return { + name: o.name, scopeName: o.scopeName, + officialPath: process.env[o.officialEnv] ?? 
o.officialDefault, + monogramPath: o.mono, + loadCorpus: o.corpus, + roleOracle: (text) => oracle(text, o.kind), + isGradable: tsParseClean(o.kind, o.fn), + }; +} + +// ── per-language config table ──────────────────────────────────────────────────────────────────── +const BUILDERS: Record ScopeGapAdapter> = { + ts: () => { + // The TS entry strides over the FULL .ts file list then drops multi-file (@filename) fixtures — + // the original scope-gap-ts order (walk-all → stride-pick → filter), preserved so the metric is + // byte-identical (it differs subtly from walkCorpus, which filters before the stride). + const DIR = '/tmp/ts-repo/tests/cases/conformance/parser'; + const all: string[] = []; + const walk = (d: string) => { for (const e of readdirSync(d, { withFileTypes: true })) { const f = join(d, e.name); if (e.isDirectory()) walk(f); else if (e.name.endsWith('.ts') && !e.name.endsWith('.d.ts')) all.push(f); } }; + walk(DIR); all.sort(); + return tsFamily({ + name: 'TypeScript', scopeName: 'source.ts', kind: ts.ScriptKind.TS, mono: 'typescript.tmLanguage.json', fn: 'c.ts', + officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/typescript-basics/syntaxes/TypeScript.tmLanguage.json`, + corpus: () => { const N = subN(400); const pick = !isFinite(N) || N >= all.length ? 
all : Array.from({ length: N }, (_, i) => all[Math.floor(i * all.length / N)]); return pick.map((f) => ({ name: f, text: readFileSync(f, 'utf8') })).filter((x) => !/^\s*\/\/\s*@filename:/im.test(x.text)); }, + }); + }, + js: () => tsFamily({ + name: 'JavaScript (.js)', scopeName: 'source.js', kind: ts.ScriptKind.JS, mono: 'javascript.tmLanguage.json', fn: 'c.js', + officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/javascript/syntaxes/JavaScript.tmLanguage.json`, + corpus: () => walkCorpus(['/tmp/test262/test/language'], ['.js'], subN(800)).filter((c) => !c.file.endsWith('_FIXTURE.js')).map((c) => ({ name: c.file, text: c.code })), + }), + jsx: () => tsFamily({ + name: 'JavaScriptReact (.jsx)', scopeName: 'source.js.jsx', kind: ts.ScriptKind.JSX, mono: 'javascriptreact.tmLanguage.json', fn: 'c.jsx', + officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/javascript/syntaxes/JavaScriptReact.tmLanguage.json`, + corpus: () => JSX_CASES.map((text, i) => ({ name: ``, text })), + }), + tsx: () => { + const BASE = '/tmp/ts-repo/tests/cases'; + return tsFamily({ + name: 'TypeScriptReact (.tsx)', scopeName: 'source.tsx', kind: ts.ScriptKind.TSX, mono: 'typescriptreact.tmLanguage.json', fn: 'c.tsx', + officialEnv: 'MONOGRAM_OFFICIAL_TM', officialDefault: `${VSCODE_TM}/typescript-basics/syntaxes/TypeScriptReact.tmLanguage.json`, + corpus: () => walkCorpus([`${BASE}/conformance`, `${BASE}/compiler`], ['.tsx'], subN(Infinity)).map((c) => ({ name: c.file, text: c.code })), + }); + }, + html: () => ({ + name: 'HTML', scopeName: 'text.html.basic', + officialPath: process.env.MONOGRAM_OFFICIAL_HTML ?? `${VSCODE_TM}/html/syntaxes/html.tmLanguage.json`, + monogramPath: 'html.tmLanguage.json', + loadCorpus: () => [ + ...HTML_GENERAL.map((text, i) => ({ name: `general#${i}`, text })), + ...htmlIssueCases.map((c: any, i: number) => ({ name: `issue:${c.title ?? 
i}`, text: c.src as string })), + ], + roleOracle: htmlOracle, + }), + yaml: () => { + // The "official" YAML baseline is the MAINTAINED RedCMD/VS Code grammar (microsoft/vscode#232244), + // a multi-file dispatcher that include()s version-specific sub-grammars in the same dir. + const OFFICIAL = process.env.MONOGRAM_OFFICIAL_YAML ?? '/tmp/redcmd-yaml/syntaxes/yaml.tmLanguage.json'; + const SYN = dirname(OFFICIAL); + const SUITE = '/tmp/yaml-test-suite/src'; + const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—+»/g, '\t').replace(/[↵∎]/g, ''); + const corpus: { name: string; text: string }[] = []; + for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) { + try { const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8')); for (const t of (Array.isArray(meta) ? meta : [meta])) if (t && typeof t.yaml === 'string') corpus.push({ name: f, text: decode(t.yaml) }); } catch { /* skip */ } + } + for (const c of yamlIssue12) corpus.push({ name: `monogram#12 ${c.id}`, text: c.src }); + return { + name: 'YAML', scopeName: 'source.yaml', officialPath: OFFICIAL, monogramPath: 'yaml.tmLanguage.json', + officialExtra: { + 'source.yaml.1.2': join(SYN, 'yaml-1.2.tmLanguage.json'), 'source.yaml.1.1': join(SYN, 'yaml-1.1.tmLanguage.json'), + 'source.yaml.1.0': join(SYN, 'yaml-1.0.tmLanguage.json'), 'source.yaml.1.3': join(SYN, 'yaml-1.3.tmLanguage.json'), + 'source.yaml.embedded': join(SYN, 'yaml-embedded.tmLanguage.json'), + }, + loadCorpus: () => corpus, + roleOracle: yamlOracle, + // Only grade valid YAML (the AST's key/value resolution is unreliable on malformed input); the + // invalid-input blind spot is covered by the asserted issue12 gate + the differential pass. 
+ isGradable: (text) => { try { return parseAllDocuments(text).every((d: any) => d.errors.length === 0); } catch { return false; } }, + fullSpan: true, // YAML's oracle emits coarse, role-homogeneous spans — grade every char + differential: true, // also report oracle-independent Monogram-vs-official divergences + }; + }, +}; + +const lang = process.argv[2]; +if (lang === 'vue') { await import('./scope-gap-vue.ts'); } // injection grammar — its own tokenizer +else { + const build = BUILDERS[lang]; + if (!build) { console.error(`usage: node test/scope-gap-run.ts [N|all]\nunknown language: ${lang ?? '(none)'}`); process.exit(1); } + await run(build()); +} diff --git a/test/scope-gap-ts.ts b/test/scope-gap-ts.ts deleted file mode 100644 index 92d4d72..0000000 --- a/test/scope-gap-ts.ts +++ /dev/null @@ -1,39 +0,0 @@ -// scope-gap-ts.ts — TypeScript adapter for the unified scope-gap harness. Demonstrates the -// harness reproduces the official-vs-Monogram gap from a parser-role oracle -// (oracle.ts = tsc → roles). Run (bare node): node test/scope-gap-ts.ts [N|all] -import ts from 'typescript'; -import { readdirSync, readFileSync } from 'node:fs'; -import { join } from 'node:path'; -import { run } from './scope-gap.ts'; -import { oracle } from './oracle.ts'; - -const PARSER_DIR = '/tmp/ts-repo/tests/cases/conformance/parser'; -const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM - ?? '/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/typescript-basics/syntaxes/TypeScript.tmLanguage.json'; - -function walk(d: string): string[] { - let o: string[] = []; - for (const e of readdirSync(d, { withFileTypes: true })) { - const f = join(d, e.name); - if (e.isDirectory()) o = o.concat(walk(f)); - else if (e.name.endsWith('.ts') && !e.name.endsWith('.d.ts')) o.push(f); - } - return o; -} -const arg = process.argv[2]; -const N = arg === 'all' ? Infinity : Number(arg ?? 400); -const all = walk(PARSER_DIR).sort(); -const pick = !isFinite(N) || N >= all.length ? 
all : Array.from({ length: N }, (_, i) => all[Math.floor(i * all.length / N)]); - -await run({ - name: 'TypeScript', - scopeName: 'source.ts', - officialPath: OFFICIAL, - monogramPath: 'typescript.tmLanguage.json', - loadCorpus: () => pick.map((f) => ({ name: f, text: readFileSync(f, 'utf8') })).filter((x) => !/^\s*\/\/\s*@filename:/im.test(x.text)), - roleOracle: (text) => oracle(text, ts.ScriptKind.TS), - isGradable: (text) => { - const sf = ts.createSourceFile('c.ts', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.TS); - return (((sf as any).parseDiagnostics?.length ?? 0) === 0); - }, -}); diff --git a/test/scope-gap-tsx.ts b/test/scope-gap-tsx.ts deleted file mode 100644 index 270b832..0000000 --- a/test/scope-gap-tsx.ts +++ /dev/null @@ -1,28 +0,0 @@ -// scope-gap-tsx.ts — TSX (.tsx) adapter for the unified scope-gap harness. Grades VS Code's -// OFFICIAL TypeScriptReact.tmLanguage.json AND Monogram's typescriptreact.tmLanguage.json against -// the parser oracle (oracle.ts with ScriptKind.TSX). Both grammars declare scopeName `source.tsx`. -// Corpus = the TypeScript repo's single-file .tsx tests (conformance/jsx + compiler). -// Run (bare node): node test/scope-gap-tsx.ts [N|all] (default: all — the .tsx set is small) -import ts from 'typescript'; -import { run } from './scope-gap.ts'; -import { oracle } from './oracle.ts'; -import { walkCorpus, subsetArg } from './src-coverage-tsfamily.ts'; - -const BASE = '/tmp/ts-repo/tests/cases'; -const OFFICIAL = process.env.MONOGRAM_OFFICIAL_TM - ?? 
'/Applications/Visual Studio Code.app/Contents/Resources/app/extensions/typescript-basics/syntaxes/TypeScriptReact.tmLanguage.json'; - -const corpus = walkCorpus([`${BASE}/conformance`, `${BASE}/compiler`], ['.tsx'], subsetArg(Infinity)); - -await run({ - name: 'TypeScriptReact (.tsx)', - scopeName: 'source.tsx', - officialPath: OFFICIAL, - monogramPath: 'typescriptreact.tmLanguage.json', - loadCorpus: () => corpus.map((c) => ({ name: c.file, text: c.code })), - roleOracle: (text) => oracle(text, ts.ScriptKind.TSX), - isGradable: (text) => { - const sf = ts.createSourceFile('c.tsx', text, ts.ScriptTarget.Latest, true, ts.ScriptKind.TSX); - return (((sf as any).parseDiagnostics?.length ?? 0) === 0); - }, -}); diff --git a/test/scope-gap-yaml.ts b/test/scope-gap-yaml.ts deleted file mode 100644 index 95e1e91..0000000 --- a/test/scope-gap-yaml.ts +++ /dev/null @@ -1,68 +0,0 @@ -// scope-gap-yaml.ts — YAML adapter for the unified scope-gap harness. NOTE: unlike most of the -// vscode#203212 list, VS Code already switched YAML OFF the dead textmate/yaml.tmbundle TO the -// maintained RedCMD/YAML-Syntax-Highlighter (microsoft/vscode#232244). So YAML's "official" -// baseline here is that MAINTAINED grammar — this gap is Monogram vs a maintained competitor, not -// a dead bundle. Default = RedCMD UPSTREAM; clone it first: -// git clone --depth 1 https://github.com/RedCMD/YAML-Syntax-Highlighter /tmp/redcmd-yaml -// (VS Code's bundled YAML is the same grammar — identical result; set MONOGRAM_OFFICIAL_YAML to -// .../extensions/yaml/syntaxes/yaml.tmLanguage.json for that.) Oracle = the `yaml` package. 
-// Run (bare node): node test/scope-gap-yaml.ts -import { readdirSync, readFileSync } from 'node:fs'; -import { dirname, join } from 'node:path'; -import { parse as yamlParse, parseAllDocuments } from 'yaml'; -import { run } from './scope-gap.ts'; -import { yamlOracle } from './yaml-oracle.ts'; -import { cases as issue12 } from './yaml-issue12-regressions.ts'; - -const OFFICIAL = process.env.MONOGRAM_OFFICIAL_YAML ?? '/tmp/redcmd-yaml/syntaxes/yaml.tmLanguage.json'; -// The RedCMD/VS Code YAML grammar is a dispatcher stub that include()s version-specific -// sub-grammars in the same syntaxes/ dir — load them all, or the official scopes nothing. -const SYN = dirname(OFFICIAL); -const officialExtra: Record = { - 'source.yaml.1.2': join(SYN, 'yaml-1.2.tmLanguage.json'), - 'source.yaml.1.1': join(SYN, 'yaml-1.1.tmLanguage.json'), - 'source.yaml.1.0': join(SYN, 'yaml-1.0.tmLanguage.json'), - 'source.yaml.1.3': join(SYN, 'yaml-1.3.tmLanguage.json'), - 'source.yaml.embedded': join(SYN, 'yaml-embedded.tmLanguage.json'), -}; - -// Corpus: yaml-test-suite inputs (src meta-files; decode the visible-whitespace markers). -const SUITE = '/tmp/yaml-test-suite/src'; -const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—+»/g, '\t').replace(/[↵∎]/g, ''); -const corpus: { name: string; text: string }[] = []; -for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) { - try { - const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8')); - for (const t of (Array.isArray(meta) ? meta : [meta])) { - if (t && typeof t.yaml === 'string') corpus.push({ name: f, text: decode(t.yaml) }); - } - } catch { /* skip */ } -} -// Plus the RedCMD monogram#12 repros (many are tiny edge/error inputs absent from the suite) so the -// metric actually SEES the constructs the comment flagged. Asserted should-be scopes live in their -// own gate (yaml-issue12-regressions.ts); here they just widen what the gap/differential pass covers. 
-for (const c of issue12) corpus.push({ name: `monogram#12 ${c.id}`, text: c.src }); - -await run({ - name: 'YAML', - scopeName: 'source.yaml', - officialPath: OFFICIAL, - officialExtra, - monogramPath: 'yaml.tmLanguage.json', - loadCorpus: () => corpus, - roleOracle: yamlOracle, - // The GRADED headline stays valid-only: on malformed YAML the AST's key/value resolution is itself - // unreliable, so grading it would inject false "Monogram-wrong" tokens and poison the very signal - // we're making trustworthy. The invalid-input blind spot is instead closed by TWO mechanisms that - // stay honest there: (1) the asserted regression gate (yaml-issue12-regressions.ts) pins the - // should-be scope of the specific malformed repros (#4/#5/#8); (2) the differential pass below runs - // on ALL inputs and FLAGS invalid-input divergences for human review without auto-judging them. - isGradable: (text) => { try { return parseAllDocuments(text).every((d: any) => d.errors.length === 0); } catch { return false; } }, - // YAML's oracle emits COARSE, role-homogeneous spans (a whole plain scalar, a block-scalar body, a - // directive line); grade every char so a bug mid-span (a `%YAML` folded into a scalar, a block line - // bailing to a comment) is caught instead of hidden behind a correct start. See scope-gap.ts. - fullSpan: true, - // Also report oracle-INDEPENDENT divergences (Monogram vs official, where the oracle is silent) so a - // construct the CST oracle doesn't model can't become a silent blind spot. See scope-gap.ts. - differential: true, -}); diff --git a/test/src-coverage-js.ts b/test/src-coverage-js.ts deleted file mode 100644 index f6c1ec4..0000000 --- a/test/src-coverage-js.ts +++ /dev/null @@ -1,27 +0,0 @@ -// src-coverage-js.ts — JavaScript (.js, VS Code "javascript") entrypoint. -// Official parser = typescript.js with ScriptKind.JS (this IS VS Code's built-in JS support); -// Monogram grammar = javascript.ts. 
The TS test suite has ~no .js corpus, so we use Test262 -// (tc39/test262) — the canonical ECMAScript corpus, including negative parse tests (great -// reject cases). Provision once: -// git clone --depth 1 https://github.com/tc39/test262 /tmp/test262 -// Run (bare node): node test/src-coverage-js.ts [N|all] (Test262 is huge; default sample 800) -// -// Note: VS Code's `javascript` (ScriptKind.JS) ALLOWS JSX, but Monogram's javascript.ts models -// no JSX (that lives in javascriptreact.ts). Test262 is pure ECMAScript with no JSX, so this -// definitional gap doesn't trigger here — the comparison stays clean. - -import ts from 'typescript'; -import { run } from './src-coverage.ts'; -import { tsFamilyAdapter, walkCorpus, subsetArg } from './src-coverage-tsfamily.ts'; - -const BASE = '/tmp/test262/test/language'; // the syntax-relevant subtree of Test262 -const corpus = walkCorpus([BASE], ['.js'], subsetArg(800)).filter((c) => !c.file.endsWith('_FIXTURE.js')); -console.log(`JavaScript corpus: ${corpus.length} Test262 .js cases (test/language, stride-sampled).`); - -await run(tsFamilyAdapter({ - name: 'JavaScript (.js)', - scriptKind: ts.ScriptKind.JS, - grammar: (await import('../javascript.ts')).default, - corpus, - originBase: '/tmp/test262', -})); diff --git a/test/src-coverage-jsx.ts b/test/src-coverage-jsx.ts deleted file mode 100644 index 0cbeb2c..0000000 --- a/test/src-coverage-jsx.ts +++ /dev/null @@ -1,60 +0,0 @@ -// src-coverage-jsx.ts — JSX (.jsx, VS Code "javascriptreact") entrypoint. -// Official parser = typescript.js with ScriptKind.JSX; Monogram grammar = javascriptreact.ts -// (JS + JSX, NO TypeScript types). Neither the TS suite nor Test262 ships a .jsx corpus, so -// this uses a CURATED set exercising both halves (plain JS + JSX constructs). It is small, so -// completeness% is honestly low; a real .jsx corpus is a follow-up. 
Run: node test/src-coverage-jsx.ts - -import ts from 'typescript'; -import { run } from './src-coverage.ts'; -import { tsFamilyAdapter } from './src-coverage-tsfamily.ts'; - -// No TS types — these are .jsx (JavaScript + JSX) only. -const JSX_CASES: string[] = [ - // --- plain JS half --- - 'const x = 1, y = 2;', - 'function f(a, b = 1, ...rest) { return a + b + rest.length; }', - 'class C extends B { #p = 1; static s() {} get v() { return this.#p; } }', - 'const g = async (x) => { for await (const v of x) console.log(v); };', - 'const { a, b: { c } = {}, ...r } = obj;', - 'label: for (let i = 0; i < 10; i++) { if (i) continue label; }', - 'try { risky(); } catch { recover(); } finally { done(); }', - 'a ??= b; c ||= d; e &&= f; g?.h?.[i]?.(j);', - 'const t = `a${b}c${d}e`, n = 1_000_000n, hex = 0xFF, oct = 0o17, bin = 0b101;', - 'export default function () {}; export const z = 1; export * from "m";', - 'import def, { named as alias } from "mod"; import * as ns from "ns";', - 'switch (x) { case 1: break; default: { let y = 2; } }', - 'do { step(); } while (cond);', - 'const re = /foo\\d+/giu; const s = "a\\u{1F600}b";', - 'new.target; import.meta.url; function* gen() { yield* other(); }', - // --- JSX half --- - 'const a =
;', - 'const b =
text
;', - 'const frag = <>;', - 'const member = ;', - 'const ns = ;', - 'const nested = }>{children};', - 'const cond = ok ? : ;', - 'const list = items.map((it) =>
  • {it.label}
  • );', - 'const guard =
    {show && }{count || }
    ;', - 'const text =

    leading {a} middle {b} trailing

    ;', - 'const selfClosingVoid = ;', - 'const entity = a & b < c 😀;', - 'const multiline = (\n
    \n

    Title

    \n

    Body

    \n
    \n);', - 'const exprChild =
    {/* comment */}{items.length}
    ;', - 'const spreadChild = {...rows};', - 'function App() { return
    ; }', - 'const attrExpr = go()}>link;', - 'const deep = deep;', - 'const stringAttr =
    ;', - 'const boolAttr = ;', -]; - -const corpus = JSX_CASES.map((code, i) => ({ file: ``, code })); -console.log(`JSX corpus: ${corpus.length} curated .jsx snippets (no .jsx corpus exists in the TS suite / Test262; partial — completeness% will be low).`); - -await run(tsFamilyAdapter({ - name: 'JavaScriptReact (.jsx)', - scriptKind: ts.ScriptKind.JSX, - grammar: (await import('../javascriptreact.ts')).default, - corpus, -})); diff --git a/test/src-coverage-run.ts b/test/src-coverage-run.ts new file mode 100644 index 0000000..2be7dff --- /dev/null +++ b/test/src-coverage-run.ts @@ -0,0 +1,52 @@ +// ───────────────────────────────────────────────────────────────────────────── +// src-coverage-run.ts — the UNIFIED, data-driven entry for the source-coverage parser-alignment +// metric (monogram#25 part 2B). One driver + a per-language config TABLE, replacing the four thin +// src-coverage-{ts,js,jsx,tsx} adapters: each was just a corpus + ScriptKind + grammar over the +// SHARED `tsFamilyAdapter` (the accept/reject oracle) and `run()` core (src-coverage.ts). +// +// Run (bare node): node test/src-coverage-run.ts [N|all] +// lang ∈ ts | js | jsx | tsx | html | yaml +// +// The thicker html / yaml adapters use a DIFFERENT oracle (html = parse5 STRUCTURAL tree-equality, +// yaml = the `yaml` package accept/reject) and their own corpus, so they keep their files; +// ` html|yaml` DELEGATES to them. The per-language entry stays a `` parameter throughout. +// ───────────────────────────────────────────────────────────────────────────── +import ts from 'typescript'; +import { run } from './src-coverage.ts'; +import { tsFamilyAdapter, walkCorpus, type TsFamilyCase } from './src-coverage-tsfamily.ts'; +import { JSX_CASES } from './curated-corpora.ts'; + +const subN = (def = 400): number => { const a = process.argv[3]; return a === 'all' ? Infinity : Number(a ?? process.env.SUBSET ?? 
def); }; + +const lang = process.argv[2]; + +// html / yaml use a different oracle + corpus → their own files; delegate (preserves the `` entry). +if (lang === 'html') { await import('./src-coverage-html.ts'); } +else if (lang === 'yaml') { await import('./src-coverage-yaml.ts'); } +else { + // ── TS-family config table: ts/js/jsx/tsx differ only by ScriptKind + grammar + corpus ── + const TS_BASE = '/tmp/ts-repo/tests/cases'; + const BUILDERS: Record Promise<{ opts: Parameters[0]; note: string }>> = { + ts: async () => { + const corpus = walkCorpus([`${TS_BASE}/conformance`], ['.ts'], subN(400)); + return { opts: { name: 'TypeScript (.ts)', scriptKind: ts.ScriptKind.TS, grammar: (await import('../typescript.ts')).default, corpus, originBase: `${TS_BASE}/conformance` }, note: `${corpus.length} single-file .ts cases (tests/cases/conformance).` }; + }, + js: async () => { + const corpus = walkCorpus(['/tmp/test262/test/language'], ['.js'], subN(800)).filter((c) => !c.file.endsWith('_FIXTURE.js')); + return { opts: { name: 'JavaScript (.js)', scriptKind: ts.ScriptKind.JS, grammar: (await import('../javascript.ts')).default, corpus, originBase: '/tmp/test262' }, note: `${corpus.length} Test262 .js cases (test/language, stride-sampled).` }; + }, + jsx: async () => { + const corpus: TsFamilyCase[] = JSX_CASES.map((code, i) => ({ file: ``, code })); + return { opts: { name: 'JavaScriptReact (.jsx)', scriptKind: ts.ScriptKind.JSX, grammar: (await import('../javascriptreact.ts')).default, corpus }, note: `${corpus.length} curated .jsx cases.` }; + }, + tsx: async () => { + const corpus = walkCorpus([`${TS_BASE}/conformance`, `${TS_BASE}/compiler`], ['.tsx'], subN(Infinity)); + return { opts: { name: 'TypeScriptReact (.tsx)', scriptKind: ts.ScriptKind.TSX, grammar: (await import('../typescriptreact.ts')).default, corpus, originBase: TS_BASE }, note: `${corpus.length} single-file .tsx cases (conformance + compiler).` }; + }, + }; + const build = BUILDERS[lang]; + if (!build) 
{ console.error(`usage: node test/src-coverage-run.ts [N|all]\nunknown language: ${lang ?? '(none)'}`); process.exit(1); } + const { opts, note } = await build(); + console.log(`${opts.name} corpus: ${note}`); + await run(tsFamilyAdapter(opts)); +} diff --git a/test/src-coverage-ts.ts b/test/src-coverage-ts.ts deleted file mode 100644 index b9f7a57..0000000 --- a/test/src-coverage-ts.ts +++ /dev/null @@ -1,27 +0,0 @@ -// src-coverage-ts.ts — TypeScript (.ts) entrypoint for the source-coverage alignment metric. -// Thin: corpus + dialect knobs only; the TS-family adapter is in ./src-coverage-tsfamily.ts -// and the coverage harness in ./src-coverage.ts. -// -// Oracle/corpus/Monogram-invocation mirror the accept/reject oracle: ts.createSourceFile (TS), -// accept iff no parseDiagnostics; /tmp/ts-repo/tests/cases/conformance, single-file .ts. -// -// Run (Node 24+, bare node — NOT tsx): -// node test/src-coverage-ts.ts # default subset (env SUBSET, default 400) -// node test/src-coverage-ts.ts 1000 # subset size as arg -// node test/src-coverage-ts.ts all # full single-file corpus - -import ts from 'typescript'; -import { run } from './src-coverage.ts'; -import { tsFamilyAdapter, walkCorpus, subsetArg } from './src-coverage-tsfamily.ts'; - -const BASE = '/tmp/ts-repo/tests/cases'; -const corpus = walkCorpus([`${BASE}/conformance`], ['.ts'], subsetArg()); -console.log(`TypeScript corpus: ${corpus.length} single-file .ts cases (tests/cases/conformance).`); - -await run(tsFamilyAdapter({ - name: 'TypeScript (.ts)', - scriptKind: ts.ScriptKind.TS, - grammar: (await import('../typescript.ts')).default, - corpus, - originBase: `${BASE}/conformance`, -})); diff --git a/test/src-coverage-tsx.ts b/test/src-coverage-tsx.ts deleted file mode 100644 index 2485da2..0000000 --- a/test/src-coverage-tsx.ts +++ /dev/null @@ -1,20 +0,0 @@ -// src-coverage-tsx.ts — TSX (.tsx, VS Code "typescriptreact") entrypoint. 
-// Same official parser as TS (typescript.js) but ScriptKind.TSX + the typescriptreact grammar. -// Corpus = the TypeScript repo's .tsx tests (conformance/jsx + compiler), single-file. -// Run (bare node): node test/src-coverage-tsx.ts [N|all] (default: all — the .tsx set is small) - -import ts from 'typescript'; -import { run } from './src-coverage.ts'; -import { tsFamilyAdapter, walkCorpus, subsetArg } from './src-coverage-tsfamily.ts'; - -const BASE = '/tmp/ts-repo/tests/cases'; -const corpus = walkCorpus([`${BASE}/conformance`, `${BASE}/compiler`], ['.tsx'], subsetArg(Infinity)); -console.log(`TSX corpus: ${corpus.length} single-file .tsx cases (conformance + compiler).`); - -await run(tsFamilyAdapter({ - name: 'TypeScriptReact (.tsx)', - scriptKind: ts.ScriptKind.TSX, - grammar: (await import('../typescriptreact.ts')).default, - corpus, - originBase: BASE, -})); diff --git a/test/ts-ast.ts b/test/ts-ast.ts deleted file mode 100644 index a82b482..0000000 --- a/test/ts-ast.ts +++ /dev/null @@ -1,9 +0,0 @@ -import ts from 'typescript'; -const code = process.argv[2] ?? `x ? y => ({ y }) : z => ({ z })`; -const sf = ts.createSourceFile('t.ts', code, ts.ScriptTarget.Latest, true); -function show(n: ts.Node, d=0){ - console.log(' '.repeat(d) + ts.SyntaxKind[n.kind] + (n.kind===ts.SyntaxKind.Identifier?`(${(n as any).text})`:'')); - n.forEachChild(c=>show(c,d+1)); -} -show(sf); -console.log('parseDiagnostics:', (sf as any).parseDiagnostics?.length ?? 0); diff --git a/test/yaml-diag.ts b/test/yaml-diag.ts deleted file mode 100644 index 01e875a..0000000 --- a/test/yaml-diag.ts +++ /dev/null @@ -1,40 +0,0 @@ -// Throwaway diagnostic: categorize yaml-test-suite inputs as FN (yaml accepts, we reject) / -// FP (yaml rejects, we accept) with Monogram's error, to drive grammar work. 
Run: node test/yaml-diag.ts -import { readdirSync, readFileSync } from 'node:fs'; -import { parse as yamlParse, parseAllDocuments } from 'yaml'; -import { createParser } from '../src/gen-parser.ts'; -import grammar from '../yaml.ts'; - -const { parse } = createParser(grammar); -const SUITE = '/tmp/yaml-test-suite/src'; -const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—*»/g, '\t').replace(/[↵∎]/g, ''); -const corpus: { code: string; origin: string; name: string }[] = []; -for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) { - try { - const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8')); - for (const t of (Array.isArray(meta) ? meta : [meta])) { - if (t && typeof t.yaml === 'string') corpus.push({ code: decode(t.yaml), origin: f, name: t.name ?? '' }); - } - } catch { /* skip */ } -} -const oAccept = (c: string) => { try { return parseAllDocuments(c).every((d: any) => d.errors.length === 0); } catch { return false; } }; -const mRes = (c: string) => { try { parse(c); return { ok: true, err: '' }; } catch (e) { return { ok: false, err: String((e as Error).message).split('\n')[0] }; } }; - -const FN: any[] = [], FP: any[] = []; -let TP = 0, TN = 0; -for (const x of corpus) { - const o = oAccept(x.code), m = mRes(x.code); - if (o && m.ok) TP++; else if (o && !m.ok) FN.push({ ...x, err: m.err }); else if (!o && m.ok) FP.push(x); else TN++; -} -console.log(`corpus ${corpus.length}: TP=${TP} FN=${FN.length} FP=${FP.length} TN=${TN}`); - -// Group FN by Monogram error message (the failure mode). -const byErr = new Map(); -for (const x of FN) { const k = x.err.replace(/offset \d+/, 'offset N'); (byErr.get(k) ?? 
byErr.set(k, []).get(k)!).push(x); } -console.log(`\n=== FN grouped by error (${byErr.size} kinds) ===`); -for (const [err, xs] of [...byErr.entries()].sort((a, b) => b[1].length - a[1].length)) { - console.log(`\n[${xs.length}] ${err}`); - for (const x of xs.slice(0, 4)) console.log(` ${JSON.stringify(x.code.slice(0, 60))}`); -} -console.log(`\n=== FP sample (yaml rejects, we accept) — ${FP.length} ===`); -for (const x of FP.slice(0, 18)) console.log(` ${JSON.stringify(x.code.slice(0, 60))}`); diff --git a/test/yaml-poc.ts b/test/yaml-poc.ts deleted file mode 100644 index fd0635d..0000000 --- a/test/yaml-poc.ts +++ /dev/null @@ -1,33 +0,0 @@ -// Throwaway PoC: verify the indentation lexer emits correct INDENT/DEDENT/NEWLINE and that the -// first-cut yaml.ts grammar parses common documents. Run: node test/yaml-poc.ts -import { createLexer } from '../src/gen-lexer.ts'; -import { createParser } from '../src/gen-parser.ts'; -import grammar from '../yaml.ts'; - -const { tokenize } = createLexer(grammar); -const { parse } = createParser(grammar); - -const samples = [ - 'a: 1\nb: 2', - 'a:\n b: 1\n c: 2\nd: 3', - '- one\n- two\n- three', - 'key:\n - a\n - b', - 'nested:\n list:\n - x\n - y\n val: z', - '{a: 1, b: 2}', - '[1, 2, 3]', - 'name: "John"\nage: 30', - 'list: [a, b, c]', - '# comment\nkey: value # trailing', -]; - -const show = (t: any) => - t.type === 'Indent' ? '»IND' : t.type === 'Dedent' ? '«DED' : t.type === 'Newline' ? '⏎NL' - : t.type === '' ? 
JSON.stringify(t.text) : `${t.type}(${JSON.stringify(t.text)})`; - -for (const s of samples) { - console.log('\n=== ' + JSON.stringify(s) + ' ==='); - let toks: any[]; - try { toks = tokenize(s); } catch (e) { console.log(' LEX THREW:', (e as Error).message); continue; } - console.log(' toks:', toks.map(show).join(' ')); - try { parse(s); console.log(' PARSE: ok'); } catch (e) { console.log(' PARSE FAIL:', (e as Error).message.split('\n')[0]); } -} From 1a28ba28fc3da7830734dabe0b5249b26068b36a Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Tue, 9 Jun 2026 05:36:51 +0800 Subject: [PATCH 2/6] Generator: directed tokenCover strategy for deterministic per-token coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The generated legal corpus never reached whole scoped token classes the scope≡role judge checks — for TypeScript, numerics (Hex/Octal/Binary/BigInt/ Number), because the legal corpus is shallow/structural and never lands on an expression-position literal (proven: raising cap/fuzz still yields zero numerics). Add a 5th strategy `tokenCover`: for each scoped, samplable token, descend the SHORTEST path from the entry rule that references it (reusing the distTo/exprDist BFS), build a minimal legal context (fillContent/minExpand), and substitute sampleVariants. Deterministic and minimal-context, so it stays cheap on the large TS grammar (no depth strategies for token-stream). Also sweep all top-level token-pattern `alt` branches in sampleVariants (so a Number emits hex/oct/bin/ float/bigint, not just `0`), guarded against the interesting-literal embed for decimal-start / start()-anchored tokens (no `-0x1`, no broken column-0 anchor). TS declared-scope tokens checked 157→326 (numerics now graded); generative 7/7 consistent, depth-site 2/2 (#23/#24 intact); agnostic 9/9. 
--- test/grammar-gen.ts | 184 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 182 insertions(+), 2 deletions(-) diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts index 31eef55..4b539bf 100644 --- a/test/grammar-gen.ts +++ b/test/grammar-gen.ts @@ -24,6 +24,7 @@ // • fuzzing — random production choices, for deeper / wider structures. // ───────────────────────────────────────────────────────────────────────────── import type { CstGrammar, RuleExpr, RuleDecl, TokenDecl, TokenPattern, TokenCharClassItem } from '../src/types.ts'; +import { tokenPatternStartsWithDecimal, tokenPatternHasStartAnchor } from '../src/token-pattern.ts'; // Max emissions in one derivation. A deep tree of 2-rep quantifiers grows the list multiplicatively; // copying huge lists (not the call count) is what makes a naive enumerator hang — cap it. @@ -110,10 +111,30 @@ function sample(pat: TokenPattern, ctx: SampleCtx): string | null { } } +// The number of branches in the SHALLOWEST `alt` reachable through the pattern's +// leading seq/group/repeat spine — the branches that a different `variant` index makes +// `sample` rotate through (it picks `variant % items.length` at each alt). A token whose +// value is an alternation of forms (a Number's int / float branches, a string's escape +// alternatives) needs at least this many variant indices for EVERY branch to be emitted, +// not just branch 0 — otherwise the budget caps it at the first form (`0`, never `1.5`). +function topAltBranches(pat: TokenPattern): number { + if (typeof pat === 'string') return 1; + switch (pat.type) { + case 'alt': return pat.items.length; + case 'seq': return Math.max(1, ...pat.items.map(topAltBranches)); + case 'repeat': return topAltBranches(pat.body); + default: return 1; + } +} + // Sample several distinct, legal texts for a token (variants + interesting-literal embeds). 
function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[] }, n: number): string[] { const out = new Set(); - for (let v = 0; v < n + 2 && out.size < n; v++) { + // Cover every top-level alt branch: a token that is itself an alternation (hex/oct/bin/float + // forms) must emit ALL its branches, not stop at branch 0 once `n` distinct samples are reached — + // so the budget is at least the branch count, and the all-branch sweep is NOT capped by `out.size`. + const budget = Math.max(n + 2, topAltBranches(decl.pattern) + 2); + for (let v = 0; v < budget; v++) { const s = sample(decl.pattern, { ...ctx, variant: v }); if (s !== null && s.length > 0) out.add(s); } @@ -123,7 +144,13 @@ function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: // the result is still a single legal instance of the token — this is what produces the // monogram#23 shape (a plain scalar whose text is `--- x`). Verified per-token by re-lexing // in the driver; an embed that doesn't re-lex to this token is simply dropped there. - if (base.length >= 1) { + // GUARD: a token whose pattern starts with a DECIMAL digit (`0x1F`, `1.5`) or carries a + // `start()` line/stream anchor (a shebang `^#!…`) must NOT get a leading-literal embed: gluing + // `-`/`#`/`---` on front re-lexes as a different token (`-0x1` = minus + number, `#0x1` ≠ hex) + // or breaks the column-0 anchor — so the embed would never round-trip back to THIS token. The + // pure-variant samples above already cover such tokens; only free-form tokens take the embeds. 
+ const anchored = tokenPatternStartsWithDecimal(decl) || tokenPatternHasStartAnchor(decl); + if (base.length >= 1 && !anchored) { for (const lit of ctx.interesting) { if (lit.length === 0 || /[\n\r]/.test(lit)) continue; out.add(lit + base); // glued leading boundary (`---` + `x` → `---x`) @@ -156,6 +183,7 @@ class Walker { structKind = new Map(); compactLits: Set; reachMap = new Map>(); // rule → every rule it can transitively reach + tokenHostRules = new Map(); // token name → rules whose body DIRECTLY references it ruleMin = new Map(); rand: () => number; cap: number; @@ -179,9 +207,29 @@ class Walker { this.compactLits = new Set(grammar.indent?.compactIndicators ?? []); this.interesting = this.collectInteresting(); this.computeReach(); + this.computeTokenHosts(); this.computeMins(); } + // For each token, the rules whose body DIRECTLY references it (`ref` to a token name). This is the + // entry point of tokenCover's directed descent: a scoped token only ever appears at these rules, so + // building the shortest legal path to one of them and substituting the token covers it. A token with + // NO host rule (a lexer-trivia token the parser never consumes — a shebang / JSDoc comment, skipped + // before the token stream) is unreachable by ANY derivation and is left out (it is not a CST leaf). + computeTokenHosts(): void { + for (const r of this.grammar.rules) { + const toks = new Set(); + const go = (e: RuleExpr) => { switch (e.type) { + case 'ref': if (this.isToken(e.name)) toks.add(e.name); break; + case 'seq': case 'alt': e.items.forEach(go); break; + case 'quantifier': case 'group': case 'not': go(e.body); break; + case 'sep': go(e.element); break; + } }; + go(r.body); + for (const tn of toks) (this.tokenHostRules.get(tn) ?? 
this.tokenHostRules.set(tn, []).get(tn)!).push(r.name); + } + } + computeReach(): void { const refs = (e: RuleExpr, acc: Set) => { switch (e.type) { @@ -524,6 +572,114 @@ class Walker { case 'op': case 'prefix': case 'postfix': return []; } } + + // ── DIRECTED TOKEN COVERAGE ────────────────────────────────────────────────────────────────────── + // The same directed-descent idea as nestChain, but the target is a scoped TOKEN, not a self-recursive + // RULE. A grammar-derived LEGAL corpus is shallow/structural and never reaches an expression-position + // literal: every numeric, every private field — the scoped leaves the scope≡role judge checks — appears + // ZERO times. tokenCover fixes that by, for each scoped token, building the SHORTEST legal path from the + // entry rule to a rule that references it (the SAME reversed-BFS the nesting strategies use, retargeted + // at a token via its host rules) and substituting real samples of the token there. Minimal context only + // (shortest path + minExpand filler), so it stays cheap on a 50-rule grammar. + + // shortest rule-ref distance FROM each rule TO any rule that references `tokenName` (reversed-BFS, like + // distTo but seeded at the token's host rules). Memoised. Infinity-absent ⇒ the rule can't reach the token. + // A host rule starts at distance 1 (entering its body costs one ref step to reach the direct token use); + // a DIRECT `ref:token` in an expression is 0. The gap is what makes the descent STOP at the first direct + // token use instead of recursing into a self-recursive host (`Type` → `aa is Type → …` never terminating): + // `ref:token` (0) strictly beats `ref:host` (≥1), so a `seq`/`alt`'s shortest branch is the one that + // actually places the token here, not the one that re-enters a host rule that also eventually reaches it. 
+ tokenDistCache = new Map>(); + tokenDistTo(tokenName: string): Map { + let m = this.tokenDistCache.get(tokenName); if (m) return m; + m = new Map(); + const back = new Map(); + for (const r of this.grammar.rules) for (const ref of this.directRuleRefs(r.body)) (back.get(ref) ?? back.set(ref, []).get(ref)!).push(r.name); + const queue: string[] = []; + for (const host of this.tokenHostRules.get(tokenName) ?? []) if (!m.has(host)) { m.set(host, 1); queue.push(host); } // host rule body = 1 step from the direct token use + while (queue.length) { const cur = queue.shift()!; const d = m.get(cur)!; for (const pre of back.get(cur) ?? []) if (!m.has(pre)) { m.set(pre, d + 1); queue.push(pre); } } + this.tokenDistCache.set(tokenName, m); return m; + } + // min rule-ref distance from an expression to `tokenName` — 0 if it DIRECTLY refs the token (a direct + // use strictly beats re-entering a host rule, so the descent terminates at the token, see tokenDistTo). + exprDistToToken(e: RuleExpr, tokenName: string): number { + const dm = this.tokenDistTo(tokenName); + switch (e.type) { + case 'ref': return e.name === tokenName ? 0 : (dm.has(e.name) ? dm.get(e.name)! : Infinity); + case 'seq': case 'alt': return Math.min(Infinity, ...e.items.map((i) => this.exprDistToToken(i, tokenName))); + case 'quantifier': case 'group': case 'not': return this.exprDistToToken(e.body, tokenName); + case 'sep': return this.exprDistToToken(e.element, tokenName); + default: return Infinity; + } + } + exprReachesToken(e: RuleExpr, tokenName: string): boolean { return this.exprDistToToken(e, tokenName) < Infinity; } + + // Scoped tokens that tokenCover CAN reach: a declared `.scope`, a samplable pattern (not a `never()` + // structural placeholder), and at least one host rule reachable from the entry. A trivia token the + // parser never consumes (no host rule — a shebang / doc comment) is excluded HERE: no rule path reaches + // it (it is handled, where it can be at all, by `prefixOnlyTokens`). 
+ coverableTokens(entryName: string): TokenDecl[] { + return this.grammar.tokens.filter((t) => { + if (!t.scope) return false; + if (typeof t.pattern !== 'string' && t.pattern.type === 'never') return false; // structural placeholder + const dm = this.tokenDistTo(t.name); + return dm.has(entryName) || (this.tokenHostRules.get(t.name) ?? []).includes(entryName); + }); + } + + // Scoped tokens NO rule references but that carry a `start()` line/stream anchor (a shebang `^#!…`) — + // the parser treats them as leading trivia (skipped, never a CST leaf), so coverableTokens can't reach + // them, yet they ARE a legal document PREFIX the highlighter scopes. We emit each as a stand-alone line + // so the generated corpus contains it; it can only be the first emission (the anchor), which a one-token + // input trivially satisfies. (Such a token is not a CST leaf, so the scope≡role gate does not grade it — + // this widens the round-trip corpus, not the leaf check.) + prefixOnlyTokens(): TokenDecl[] { + return this.grammar.tokens.filter((t) => + !!t.scope && + !(typeof t.pattern !== 'string' && t.pattern.type === 'never') && + !this.tokenHostRules.has(t.name) && + tokenPatternHasStartAnchor(t)); + } + + // Build the minimal legal context from `entry` down to `tokenName`, with the token rendered as + // `sampleText` at its position. Descends the SHORTEST branch toward the token at each node and + // minimal-fills everything else — the directed, deterministic analogue of nestChain for a token. + coverToken(entryBody: RuleExpr, tokenName: string, sampleText: string): Emission[] { + this.coverFuel = 400; + return this.coverRec(entryBody, tokenName, sampleText); + } + coverFuel = 0; + coverRec(e: RuleExpr, tokenName: string, sampleText: string): Emission[] { + if (--this.coverFuel <= 0 || !this.exprReachesToken(e, tokenName)) return this.minExpand(e) ?? 
[]; + switch (e.type) { + case 'literal': return [{ t: 'lit', value: e.value }]; + case 'ref': { + if (e.name === tokenName) return [{ t: 'tok', name: e.name, text: sampleText }]; // THE target token → the sample + if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }]; + if (this.isToken(e.name)) { const v = sample(this.tokenByName.get(e.name)!.pattern, { rand: this.rand, interesting: [], variant: 0 }); return [{ t: 'tok', name: e.name, text: v || 'x' }]; } + return this.coverRec(this.ruleByName.get(e.name)!.body, tokenName, sampleText); // descend into the rule + } + case 'seq': { + // descend the ONE item closest to the token; minimal-fill the rest → the shortest legal frame. + let idx = -1, best = Infinity; + e.items.forEach((it, i) => { const d = this.exprDistToToken(it, tokenName); if (d < best) { best = d; idx = i; } }); + const out: Emission[] = []; + e.items.forEach((it, i) => { for (const x of (i === idx ? this.coverRec(it, tokenName, sampleText) : this.minExpand(it) ?? [])) out.push(x); }); + return out; + } + case 'alt': { + // the branch that reaches the token soonest (so the frame actually contains it). 
+ let pick = e.items[0], best = Infinity; + for (const it of e.items) { const d = this.exprDistToToken(it, tokenName); if (d < best) { best = d; pick = it; } } + return this.coverRec(pick, tokenName, sampleText); + } + case 'quantifier': return this.coverRec(e.body, tokenName, sampleText); // fire exactly one rep (it carries the token) + case 'group': return this.coverRec(e.body, tokenName, sampleText); + case 'sep': return this.coverRec(e.element, tokenName, sampleText); // one element (it carries the token) + case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore': + case 'op': case 'prefix': case 'postfix': return []; + } + } } // ─── MATERIALIZE: emissions → text + token spans ────────────────────────────────── @@ -690,5 +846,29 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI for (let i = 0; i < Math.ceil(fuzzRounds / 8); i++) push(w.fuzz(r.body, depth + 2), `fuzz:${rn}`, rn); } + // 5) DIRECTED TOKEN COVERAGE — for each scoped token, the shortest legal context from the entry rule + // with several real samples of the token at its position. The bounded-exhaustive / fuzz strategies + // only reach a shallow structural skeleton, so an expression-position literal (every numeric, the + // private field) — exactly the scoped leaves the scope≡role judge checks — is otherwise NEVER + // generated. Each context is minimal (shortest path + minExpand filler), so this stays cheap even + // on the 50-rule TS grammar and needs no depth budget. The samples are guard-filtered (sampleVariants + // skips the leading-literal embeds for decimal-/anchor-led tokens, so `0x1F` is never mangled to `-0x1F`). + for (const tok of w.coverableTokens(entry.name)) { + if (timeUp()) break; + // CLEAN samples only (no interesting-literal embeds): tokenCover's job is to make the token APPEAR in + // a legal context, not to stress boundary collisions — that is the enum/fuzz strategies' role, where + // the embed belongs. 
Prepending a boundary sigil to a sigil-led token (`<` + `#name`, `>` + `@name`) + // just produces non-parsing junk here, so the directed contexts stay clean and ~100% legal. + for (const text of sampleVariants(tok, { rand: w.rand, interesting: [] }, 6)) { + push(w.coverToken(entry.body, tok.name, text), `tokenCover:${tok.name}`, entry.name); + } + } + // a position-anchored leading-trivia token (a shebang) as a stand-alone first line — see prefixOnlyTokens. + for (const tok of w.prefixOnlyTokens()) { + for (const text of sampleVariants(tok, { rand: w.rand, interesting: [] }, 3)) { + if (!/[\n\r]/.test(text)) push([{ t: 'tok', name: tok.name, text }], `tokenCover:${tok.name}`, entry.name); + } + } + return out.slice(0, maxInputs); } From 018959c3808c4e63d282ec3806a0293ac0a6b257 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Tue, 9 Jun 2026 06:06:23 +0800 Subject: [PATCH 3/6] Generator: replace random fuzz with deterministic systematic coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generation was seed-dependent — different opts.seed → different fuzz outputs → different "discovered" divergences. That's fatal for a reproducible gap ledger (random testing shows presence, not absence, and can't be tracked across commits) and contradicts the project's own "systematic, not a representativeness bet" thesis. The only random STRUCTURE was `fuzz` (this.rand for alt/quantifier choices); enum/ nestChain/tokenCover already rotate on a variant index. Replace fuzz with `cover`: the same walk, but every production choice comes from a deterministic mixed-radix Chooser indexed by round i alone — the first few choice points form a full base-N cartesian (t-wise interaction coverage by construction: measured complete to 3-wise), the tail perturbed by rotations. this.rand is seeded from a fixed constant; opts.seed is now a no-op. generateInputs(grammar) is a pure function of the grammar: byte-identical across runs for all 7 languages. 
7/7 consistent, depth-site 2/2 (#23/#24 intact); agnostic 9/9. Foundation for a deterministic, commit-trackable gap ledger. --- test/grammar-gen.ts | 158 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 129 insertions(+), 29 deletions(-) diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts index 4b539bf..7eafb4f 100644 --- a/test/grammar-gen.ts +++ b/test/grammar-gen.ts @@ -15,13 +15,22 @@ // indicators) is read from the grammar's own config (`grammar.indent` / `.markup`), // never hardcoded — the same discipline the engines follow. // -// Three production strategies, all over the SAME walker: +// Production strategies, all over the SAME walker — ALL DETERMINISTIC (no PRNG seed; the +// generator is a pure function of the grammar, so a gap ledger is reproducible across commits): // • bounded-exhaustive — every derivation to a small depth N (provably complete at // small scope; this is what makes coverage `grammar × bound` instead of imagination). // • self-recursive nesting — for each rule that can contain itself, the nested shape // at depth 1..N. Deep self-embedding is exactly where a flat highlighter loses to // the stack-keeping parser (monogram#24 is `BlockSequence` inside `BlockSequence`). -// • fuzzing — random production choices, for deeper / wider structures. +// • directed token coverage — the shortest legal context for every scoped token. +// • systematic t-wise coverage (was random "fuzzing") — for deeper / wider structures: a +// DETERMINISTIC mixed-radix enumeration over the grammar's CHOICE POINTS (which `alt` +// branch, how many `quantifier`/`sep` reps). Round i → a choice vector derived from i +// alone (no external seed). A FULL cartesian over the first few choice-point digits +// covers every t-tuple (t≤digits) of (choice-point, value) among them BY CONSTRUCTION — +// so it reaches INTERACTION shapes (an explicit key × a `[` in its scalar, monogram's +// `[`-in-key leak) deterministically, not by the luck of a seed. 
Polynomial (C^D rounds), +// never the exponential full derivation tree. // ───────────────────────────────────────────────────────────────────────────── import type { CstGrammar, RuleExpr, RuleDecl, TokenDecl, TokenPattern, TokenCharClassItem } from '../src/types.ts'; import { tokenPatternStartsWithDecimal, tokenPatternHasStartAnchor } from '../src/token-pattern.ts'; @@ -45,8 +54,13 @@ export interface GenInput { rule: string; // the top rule the derivation started from (entry, or a self-recursive rule) } -// ── deterministic PRNG (Date.now/Math.random are unavailable in workflow scripts and make -// a generator unreproducible anyway — seed it). xorshift32. ── +// ── fixed-seed xorshift32. The generator has NO external randomness: every STRUCTURE choice is made +// by the deterministic t-wise schedule (the `cover` strategy / mixed-radix chooser), and every +// token-TEXT sample is indexed deterministically (`sample`/`sampleVariants` rotate on a `variant` +// INDEX, never on `rand`). This PRNG is retained only so any future text-sampling path that wants a +// tie-break has one; it is seeded from a FIXED constant so two `generateInputs(grammar)` calls are +// byte-identical regardless of any `opts.seed` (which is now a NO-OP, kept for back-compat). ── +const FIXED_SEED = 0x9e3779b9 | 0; // a constant (golden-ratio bits); NOT derived from time / opts. function rng(seed: number): () => number { let s = seed | 0 || 1; return () => { s ^= s << 13; s ^= s >>> 17; s ^= s << 5; return ((s >>> 0) % 1_000_000) / 1_000_000; }; @@ -165,13 +179,74 @@ function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: return [...out]; } +// ─── DETERMINISTIC CHOICE SCHEDULE (t-wise systematic coverage) ──────────────────────────────────── +// A `Chooser` answers each production CHOICE POINT during a `cover` walk, in WALK ORDER. Two kinds: +// • `next(radix)` — a STRUCTURAL choice (which alt branch · how many quantifier/sep reps). 
These drive +// the t-wise cartesian: because the walk is deterministic given the answers, the k-th structural call +// is always choice point k, so a Chooser IS a choice vector `(v_0, v_1, …)` and a derivation is a +// function of it. The shape of the tree (key-vs-seq, explicit-vs-plain, nesting) lives here. +// • `variant(n)` — a token-TEXT choice (which sampled lexeme for a token: `x` vs the boundary-embed +// `--- x`, an int vs a float form). These do NOT change the tree SHAPE, only a leaf's bytes, so they +// are kept on a SEPARATE fast counter — every token position (even a DEEP value scalar) then sweeps +// its variants across rounds, instead of being frozen by a slow high mixed-radix digit. That is what +// reliably lands a boundary-embed in VALUE position (`k: --- x`, monogram#23) — a structural-context +// × text-variant interaction the cartesian reaches the context for and the text counter the variant. +export interface Chooser { next: (radix: number) => number; variant: (n: number) => number } + +// One round's choice vector, as a MIXED-RADIX reading of a round index `i` (NO external seed): +// structural digit k = ( ⌊ i / B^k ⌋ + k·rot ) mod radix_k +// `B` is the schedule BASE. Reading `i` low-digit-first means the FIRST choice points (the structurally +// decisive ones — which Node kind, key-vs-seq, explicit-vs-plain) move SLOWEST, so a contiguous block of +// rounds holds a fixed prefix while the deeper tail varies. Enumerating i over `B^D` (the coverSchedule +// loop) therefore walks the FULL cartesian product of the first D structural digits → every t-tuple +// (t ≤ D) of (choice-point, value) among the first D points appears in SOME round, BY CONSTRUCTION. That +// is the t-wise (here t≤D≈4) interaction guarantee — it covers an explicit-key × `[`-in-its-scalar pair, +// monogram's `[`-in-key leak, deterministically, with no luck. 
`rot` (a per-schedule offset) perturbs +// the deeper tail so a second/third pass reaches different deep shapes than the first; it does NOT affect +// the prefix cartesian (it shifts every digit by a constant, a relabelling of values, so all tuples among +// the first D points still occur — just at permuted round indices). Polynomial: B^D rounds, never the +// exponential whole derivation tree (a structural point past digit D simply reads its slow-moving high +// digit). The token-TEXT counter is an INDEPENDENT per-round walk index (j-th text choice = (i+j) mod n), +// so it cycles every position's variants fast regardless of structural depth. +function mixedRadixChooser(i: number, base: number, rot: number): Chooser { + let k = 0; // structural choice-point index (drives the mixed-radix cartesian) + let j = 0; // token-text choice index (independent fast counter) + return { + next(radix: number): number { + if (radix <= 1) return 0; // a forced single option consumes a (no-op) digit slot + const digit = Math.floor(i / Math.pow(base, k)) + k * rot; + k++; + return ((digit % radix) + radix) % radix; + }, + variant(n: number): number { + if (n <= 1) return 0; + const idx = (i + j) % n; // fast: sweeps each token position's variants across rounds, depth-agnostic + j++; + return idx; + }, + }; +} + +// The deterministic schedule of choice vectors the `cover` strategy enumerates: the full cartesian over +// the first D digits (radix `base`) — `base^D` rounds — optionally repeated under a few `rot` offsets so +// the deep tail (past digit D) also varies. `rounds` caps it (polynomial, bounded). Pure function of its +// args: identical every call, so `generateInputs` is reproducible. Yields `Chooser`s in order. 
+function* coverSchedule(base: number, digits: number, rounds: number, rotations: number[]): Generator { + const span = Math.pow(base, digits); + let emitted = 0; + for (const rot of rotations) { + for (let i = 0; i < span && emitted < rounds; i++, emitted++) yield mixedRadixChooser(i, base, rot); + if (emitted >= rounds) return; + } +} + // ─── THE WALKER ────────────────────────────────────────────────────────────────── export interface GenOptions { depth?: number; // bounded-exhaustive derivation depth (rule-ref recursion) cap?: number; // max alternatives kept at each combinator node (anti-explosion) maxInputs?: number; // global cap on emitted inputs per rule - fuzzRounds?: number; // random derivations - seed?: number; + fuzzRounds?: number; // budget (cap) on systematic-coverage rounds — DETERMINISTIC choice vectors, not random + seed?: number; // NO-OP, retained for back-compat: the generator is a pure function of the grammar nestDepth?: number; // self-recursive nesting depth timeBudgetMs?: number; // wall-clock cap for the depth strategies (large token-stream grammars) } @@ -192,9 +267,9 @@ class Walker { maxCalls = 60_000; enumTop(e: RuleExpr, budget: number): Emission[][] { this.budgetCalls = 0; return this.enum(e, budget); } - constructor(grammar: CstGrammar, seed: number, cap: number) { + constructor(grammar: CstGrammar, cap: number) { this.grammar = grammar; - this.rand = rng(seed); + this.rand = rng(FIXED_SEED); // FIXED — the walker is a pure function of the grammar (see rng note). 
this.cap = cap; for (const t of grammar.tokens) this.tokenByName.set(t.name, t); for (const r of grammar.rules) this.ruleByName.set(r.name, r); @@ -470,35 +545,49 @@ class Walker { } } - // ── random derivation (fuzzing): one emission sequence, forced to terminate at budget 0 ── - fuzz(e: RuleExpr, budget: number): Emission[] { - const pick = (xs: T[]): T => xs[Math.floor(this.rand() * xs.length)]; + // ── DETERMINISTIC SYSTEMATIC derivation (replaces random fuzzing): one emission sequence whose every + // production CHOICE comes from a `Chooser`, not a PRNG. The walk is otherwise identical to the old + // fuzz, so the SAME structures are reachable — but reproducibly. A Chooser is consulted at each + // CHOICE POINT in walk order (alt branch · quantifier reps · sep reps · token-text variant); since + // the walk is deterministic given the chooser's outputs, choice point k is ALWAYS the k-th call, so + // a mixed-radix counter (slow-moving early digits, fast late ones) keeps a stable choice-point + // PREFIX while sweeping the tail — which is what yields t-wise coverage over the prefix (see + // coverSchedule). Forced to terminate at budget 0 (the minimal expansion), like fuzz. ── + cover(e: RuleExpr, budget: number, ch: Chooser): Emission[] { // bounded `for`-push (NOT spread on a possibly-huge array → stack overflow + size blowup) - const fappend = (out: Emission[], add: Emission[]) => { if (out.length < MAX_EMS) for (const x of add) out.push(x); }; + const cappend = (out: Emission[], add: Emission[]) => { if (out.length < MAX_EMS) for (const x of add) out.push(x); }; switch (e.type) { case 'literal': return [{ t: 'lit', value: e.value }]; case 'ref': { if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }]; if (this.isToken(e.name)) { const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 4); - return [{ t: 'tok', name: e.name, text: vs.length ? 
pick(vs) : 'x' }]; + // pick a variant on the TOKEN-TEXT counter (ch.variant, not the structural ch.next), so the + // token TEXT (a plain scalar `--- x` vs `x`, a number's int vs float form) is swept fast at EVERY + // position regardless of structural depth — see the Chooser note (this lands #23's `k: --- x`). + return [{ t: 'tok', name: e.name, text: vs.length ? vs[ch.variant(vs.length)] : 'x' }]; } if (budget <= 0) return this.ruleMin.get(e.name) ?? []; - return this.fuzz(this.ruleByName.get(e.name)!.body, budget - 1); + return this.cover(this.ruleByName.get(e.name)!.body, budget - 1, ch); } - case 'seq': { const out: Emission[] = []; for (const it of e.items) fappend(out, this.fuzz(it, budget)); return out; } + case 'seq': { const out: Emission[] = []; for (const it of e.items) cappend(out, this.cover(it, budget, ch)); return out; } case 'alt': { - if (budget <= 0) { const m = this.minExpand(e); if (m) return m; } - return this.fuzz(pick(e.items), budget); + if (budget <= 0) { const m = this.minExpand(e); if (m) return m; } // no budget → shortest, no choice consumed + return this.cover(e.items[ch.next(e.items.length)], budget, ch); // CHOICE POINT: which branch } case 'quantifier': { - const reps = budget <= 0 ? (e.kind === '+' ? 1 : 0) : (e.kind === '?' ? Math.floor(this.rand() * 2) : Math.floor(this.rand() * 3) + (e.kind === '+' ? 1 : 0)); - const out: Emission[] = []; for (let i = 0; i < reps; i++) fappend(out, this.fuzz(e.body, budget - 1)); return out; + // CHOICE POINT: how many reps. `?`→{0,1} (radix 2), `*`/`+`→{0..2}/{1..3} (radix 3). At budget 0 + // the count is forced to the minimum (radix 1 → digit is a fixed no-op, keeping schedules aligned). + const lo = e.kind === '+' ? 1 : 0; + const radix = budget <= 0 ? 1 : (e.kind === '?' ? 
2 : 3); + const reps = lo + ch.next(radix); + const out: Emission[] = []; for (let i = 0; i < reps; i++) cappend(out, this.cover(e.body, budget - 1, ch)); return out; } - case 'group': return this.fuzz(e.body, budget); + case 'group': return this.cover(e.body, budget, ch); case 'sep': { - const reps = budget <= 0 ? 1 : Math.floor(this.rand() * 3) + 1; const out: Emission[] = []; - for (let i = 0; i < reps; i++) { if (i) out.push({ t: 'lit', value: e.delimiter }); fappend(out, this.fuzz(e.element, budget - 1)); } + // CHOICE POINT: element count (≥1). radix 3 → 1..3 elements; forced to 1 at budget 0. + const reps = 1 + (budget <= 0 ? 0 : ch.next(3)); const out: Emission[] = []; + for (let i = 0; i < reps; i++) { if (i) out.push({ t: 'lit', value: e.delimiter }); cappend(out, this.cover(e.element, budget - 1, ch)); } return out; } case 'not': case 'sameLine': case 'noCommentBefore': case 'noMultilineFlowBefore': @@ -781,10 +870,12 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI const depth = opts.depth ?? 5; const cap = opts.cap ?? 6; const maxInputs = opts.maxInputs ?? 400; - const fuzzRounds = opts.fuzzRounds ?? 300; + // `fuzzRounds` is honoured as the BUDGET (cap on systematic-coverage rounds), but the rounds are now + // DETERMINISTIC choice vectors, not random draws. `opts.seed` is a NO-OP (kept for back-compat): the + // generator is a pure function of the grammar, so two calls — with any seed or none — are identical. + const coverRounds = opts.fuzzRounds ?? 300; const nestDepth = opts.nestDepth ?? 5; - const seed = opts.seed ?? 12345; - const w = new Walker(grammar, seed, cap); + const w = new Walker(grammar, cap); const mode: MatOptions['mode'] = grammar.indent ? 'indent' : grammar.markup ? 
'markup' : 'token-stream'; const matOpts: MatOptions = { mode, indentStep: 2 }; @@ -837,13 +928,22 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI for (let d = 1; d <= nestDepth; d++) push(w.nestChain(r.body, rn, d), `dirnest:${rn}@${d}`, rn); } - // 4) fuzzing for deeper / wider structures (random production choices), rooted at the entry AND at - // each self-recursive rule so deep shapes are reached quickly. - for (let i = 0; i < fuzzRounds; i++) push(w.fuzz(entry.body, depth + 2), 'fuzz', entry.name); + // 4) SYSTEMATIC t-wise coverage for deeper / wider structures (DETERMINISTIC choice vectors, was random + // fuzzing), rooted at the entry AND at each self-recursive rule. The schedule is a full mixed-radix + // cartesian over the first `COVER_DIGITS` choice points at `COVER_BASE` values each (covers every + // t-tuple, t≤COVER_DIGITS, of those points BY CONSTRUCTION → reaches an explicit-key × `[`-in-scalar + // interaction without a seed), with a few rotation offsets perturbing the deeper tail. `coverRounds` + // caps it — polynomial (COVER_BASE^COVER_DIGITS ≈ 256), never the exponential whole derivation tree. + // NB the emitted strategy key stays `fuzz` (the driver buckets it as the EXPLORATORY tier — deeper/wider + // shapes that legitimately reach STANDING flat-TM frontier limits, so #24 is report-only there; the + // STRUCTURED strategies remain the by-construction gate). Only the MECHANISM changed (deterministic, not + // random); the bucket's meaning is the same, so the driver's gating semantics are untouched. 
+ const COVER_BASE = 4, COVER_DIGITS = 4, ROTS = [0, 1, 2]; + for (const ch of coverSchedule(COVER_BASE, COVER_DIGITS, coverRounds, ROTS)) push(w.cover(entry.body, depth + 2, ch), 'fuzz', entry.name); for (const rn of recursive) { if (timeUp()) break; const r = w.ruleByName.get(rn)!; - for (let i = 0; i < Math.ceil(fuzzRounds / 8); i++) push(w.fuzz(r.body, depth + 2), `fuzz:${rn}`, rn); + for (const ch of coverSchedule(COVER_BASE, COVER_DIGITS, Math.ceil(coverRounds / 8), ROTS)) push(w.cover(r.body, depth + 2, ch), `fuzz:${rn}`, rn); } // 5) DIRECTED TOKEN COVERAGE — for each scoped token, the shortest legal context from the entry rule From 9acf16caafcd195b16cb41facdcb569a146a4e18 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Tue, 9 Jun 2026 06:52:56 +0800 Subject: [PATCH 4/6] Generator precision: make the gap shape-classes systematically producible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Deterministic generation found 0 divergences — the gaps random fuzz hit were luck, and the deterministic generator couldn't produce those shapes. Discovery is bounded by generator PRECISION, not luck; so make the known gap shape-classes producible (config-derived, no language names): - markup: a NO-SPACE (tight) render variant + a directed `markupSelfCloseAttr` producer so `` (quoted attr flush against `/>`) forms. The HTML/Vue self-close `/` gap now surfaces deterministically under "discovered": «/» got «string.unquoted.html». - indent: sample plain scalars from `blockPattern` + splice a flow bracket mid-token, and directed `indentExplicitKeyBracket` producer, so `? k [y : …` forms (round-trips). - indent: `indentBlockScalar` synthesis for the `never()`-token block scalar `|`/`>` (introducer + deeper-indented body), so `string.unquoted.block` is covered (was 0%). Deterministic preserved (generateInputs pure); 7/7 gated-clean; depth-site 2/2 (#23/#24 intact); agnostic 9/9. 
Honest finding: the YAML explicit-key `[` divergence is a `name`-bucket scope (entity.name.tag), which the scope≡role gates (literal→content, anchored-marker) structurally don't flag — a check-precision item for a follow-up, distinct from producibility (which is now done). The HTML `/` is unambiguously gate-1. --- test/grammar-gen.ts | 227 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 219 insertions(+), 8 deletions(-) diff --git a/test/grammar-gen.ts b/test/grammar-gen.ts index 7eafb4f..70a0b43 100644 --- a/test/grammar-gen.ts +++ b/test/grammar-gen.ts @@ -142,7 +142,10 @@ function topAltBranches(pat: TokenPattern): number { } // Sample several distinct, legal texts for a token (variants + interesting-literal embeds). -function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[] }, n: number): string[] { +// `blockEmbed` (indent grammars) are content literals that are STRUCTURAL in flow context but +// plain-scalar CONTENT in BLOCK context (the flow brackets `[`/`{`/`]`/`}`) — see the internal-embed +// note below; passed from the grammar's `indent.flowOpen`/`flowClose`, empty otherwise. 
+function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: string[]; blockEmbed?: string[] }, n: number): string[] { const out = new Set(); // Cover every top-level alt branch: a token that is itself an alternation (hex/oct/bin/float // forms) must emit ALL its branches, not stop at branch 0 once `n` distinct samples are reached — @@ -152,6 +155,24 @@ function sampleVariants(decl: TokenDecl, ctx: { rand: () => number; interesting: const s = sample(decl.pattern, { ...ctx, variant: v }); if (s !== null && s.length > 0) out.add(s); } + // INTERNAL boundary-literal embeds (indent grammars with a block-context pattern): a flow bracket + // (`[`/`{`) is a flow INDICATOR inside `[ ]`/`{ }`, but ordinary plain-scalar CONTENT in block + // context — which is the whole reason a token carries a `blockPattern` (its body drops the flow + // exclusions). The default `.pattern` (flow-restricted) can NEVER sample such a char, so a block + // plain scalar like `k [y` — one scalar to the stack-keeping parser, but a phantom flow-open to a + // flat grammar — is otherwise unreachable. Sample the base from the BLOCK pattern and splice a + // bracket AFTER the head char (the head must stay a non-indicator, so the splice is mid-token, never + // leading); the parser re-lexes the result as ONE scalar (verified by the round-trip). This makes + // `? k [y : …` (the monogram `[`-in-key flow-leak) producible deterministically. + const blockBase = decl.blockPattern ? (sample(decl.blockPattern, { ...ctx, variant: 0 }) ?? 
'') : ''; + if (blockBase.length >= 1 && ctx.blockEmbed?.length && !tokenPatternHasStartAnchor(decl) && !tokenPatternStartsWithDecimal(decl)) { + const head = blockBase[0], tail = blockBase.slice(1) || 'y'; + for (const br of ctx.blockEmbed) { + if (br.length !== 1 || /[\n\r]/.test(br)) continue; + out.add(head + br + tail); // glued mid-scalar (`k` + `[` + `y` → `k[y`) + out.add(head + ' ' + br + tail); // space-led bracket (`k [y`) — the prompt's exact shape + } + } // a base sample to seed interesting-literal embeds const base = sample(decl.pattern, { ...ctx, variant: 0 }) ?? ''; // Embed grammar-derived boundary literals into free-form (multi-char-capable) tokens, where @@ -257,6 +278,7 @@ class Walker { interesting: string[]; structKind = new Map(); compactLits: Set; + blockEmbed: string[]; // flow brackets (`[`/`{`/`]`/`}`) — flow indicators, but block-scalar CONTENT reachMap = new Map>(); // rule → every rule it can transitively reach tokenHostRules = new Map(); // token name → rules whose body DIRECTLY references it ruleMin = new Map(); @@ -280,6 +302,9 @@ class Walker { this.structKind.set(ind.newlineToken, 'newline'); } this.compactLits = new Set(grammar.indent?.compactIndicators ?? []); + // flow brackets are flow indicators in `[ ]`/`{ }` but plain-scalar CONTENT in block context — the + // single-char ones seed the internal-embed that makes a `[`-in-block-scalar (`k [y`) producible. + this.blockEmbed = [...(ind?.flowOpen ?? []), ...(ind?.flowClose ?? [])].filter((b) => b.length === 1); this.interesting = this.collectInteresting(); this.computeReach(); this.computeTokenHosts(); @@ -493,7 +518,7 @@ class Walker { case 'ref': { if (this.isStruct(e.name)) return [[{ t: 'struct', kind: this.structKind.get(e.name)! 
}]]; if (this.isToken(e.name)) { - const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 3); + const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting, blockEmbed: this.blockEmbed }, 3); return (vs.length ? vs : ['x']).slice(0, cap).map((t) => [{ t: 'tok', name: e.name, text: t }]); } if (budget <= 0) { const m = this.ruleMin.get(e.name); return m ? [m] : [[]]; } @@ -561,7 +586,7 @@ class Walker { case 'ref': { if (this.isStruct(e.name)) return [{ t: 'struct', kind: this.structKind.get(e.name)! }]; if (this.isToken(e.name)) { - const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting }, 4); + const vs = sampleVariants(this.tokenByName.get(e.name)!, { rand: this.rand, interesting: this.interesting, blockEmbed: this.blockEmbed }, 4); // pick a variant on the TOKEN-TEXT counter (ch.variant, not the structural ch.next), so the // token TEXT (a plain scalar `--- x` vs `x`, a number's int vs float form) is swept fast at EVERY // position regardless of structural depth — see the Chooser note (this lands #23's `k: --- x`). @@ -730,6 +755,136 @@ class Walker { tokenPatternHasStartAnchor(t)); } + // ── DIRECTED MARKUP SELF-CLOSE-WITH-ATTRIBUTE (markup grammars only) ────────────────────────────── + // The minimal self-closing element carrying ONE quoted attribute: ``. Built DIRECTLY + // from `grammar.markup` (tagOpen / attributeAssign / attributeQuotes / closeMarker / tagClose) plus two + // generically-discovered tokens — a NAME token (an `identifier` token: the tag + attribute name) and a + // QUOTED-VALUE token (a `string` token whose sample opens with an `attributeQuote`) — so it stays + // language-agnostic (no `<`/`/`/HTML hardcoded; a markup grammar with different delimiters yields its + // own shape). 
The un-biased bounded-exhaustive enumeration STARVES this combination at a small `cap` + // (the cross of "an attribute has a quoted value" × "the optional self-close `/` fired" is past the + // first few derivations), so — exactly like nestChain forces a starved nesting and coverToken a starved + // token — this forces it deterministically. Its tight rendering (`name="v"/>` flush) is what exposes + // the flat grammar mis-scoping the self-close `/` as unquoted-value content (a STANDING flat-TM limit). + // Returns [] when the grammar lacks the needed tokens (no string/identifier token) — then it is a no-op. + markupSelfCloseAttr(): Emission[] { + const mk = this.grammar.markup; + if (!mk || !mk.closeMarker) return []; + const nameTok = this.grammar.tokens.find((t) => t.identifier); // the tag / attribute NAME token + // a string token whose conservative sample is a QUOTED value (opens with one of the attribute quotes) + const quotes = mk.attributeQuotes ?? ['"', "'"]; + const valTok = this.grammar.tokens.find((t) => { + if (!t.string && !t.scope) return false; + const s = sample(t.pattern, { rand: this.rand, interesting: [], variant: 0 }); + return s !== null && s.length >= 2 && quotes.includes(s[0]); + }); + if (!nameTok || !valTok) return []; + const nameTxt = sample(nameTok.pattern, { rand: this.rand, interesting: [], variant: 0 }) || 'a'; + const valTxt = sample(valTok.pattern, { rand: this.rand, interesting: [], variant: 0 })!; + const assign = mk.attributeAssign ?? '='; + return [ + { t: 'lit', value: mk.tagOpen }, + { t: 'tok', name: nameTok.name, text: nameTxt }, // tag name + { t: 'tok', name: nameTok.name, text: nameTxt }, // attribute name + { t: 'lit', value: assign }, + { t: 'tok', name: valTok.name, text: valTxt }, // quoted attribute value + { t: 'lit', value: mk.closeMarker }, // self-close marker + { t: 'lit', value: mk.tagClose }, + ]; + } + + // The leading literal of an alt arm's seq/group spine (the indicator a `? …`/`- …` arm starts with). 
+ private armLeadLiteral(e: RuleExpr): string | null { + if (e.type === 'literal') return e.value; + if (e.type === 'seq') return e.items.length ? this.armLeadLiteral(e.items[0]) : null; + if (e.type === 'group') return this.armLeadLiteral(e.body); + return null; + } + private exprContainsLiteral(e: RuleExpr, v: string): boolean { + switch (e.type) { + case 'literal': return e.value === v; + case 'seq': case 'alt': return e.items.some((i) => this.exprContainsLiteral(i, v)); + case 'quantifier': case 'group': case 'not': return this.exprContainsLiteral(e.body, v); + case 'sep': return this.exprContainsLiteral(e.element, v); + default: return false; + } + } + // The explicit-key indicator of an indent grammar (YAML `?`), found GENERICALLY: the `compactIndicator` + // that heads a rule arm which ALSO carries the key/value separator (`? key : value`), distinguishing it + // from the block-SEQUENCE indicator (`-`, whose arm leads to an item, not a `:` pair). Config-derived + // (compactIndicators × keyValueSeparator), so no token/rule name is hardcoded; null if none qualifies. + explicitKeyIndicator(): string | null { + const ind = this.grammar.indent; if (!ind?.compactIndicators) return null; + const kv = ind.keyValueSeparator ?? ':'; + const ci = new Set(ind.compactIndicators); + for (const r of this.grammar.rules) { + const arms = r.body.type === 'alt' ? r.body.items : [r.body]; + for (const arm of arms) { const lead = this.armLeadLiteral(arm); if (lead && ci.has(lead) && this.exprContainsLiteral(arm, kv)) return lead; } + } + return null; + } + + // ── DIRECTED INDENT EXPLICIT-KEY WITH A FLOW-BRACKET PLAIN SCALAR (indent grammars only) ─────────── + // The shape `? k [y :\n - p\n - q`: an EXPLICIT-key entry whose KEY is a plain scalar containing a flow + // bracket, with a block-SEQUENCE value. 
To the stack-keeping parser the key is ONE plain scalar (its + // `blockPattern` admits `[`/`{` outside flow) and the `-` items are sequence indicators; a flat grammar + // instead opens a phantom flow at the `[` that never closes, so the value `-`s leak to the key scope. + // Two structural facts STARVE this in the un-biased strategies: a plain-scalar key in EXPLICIT position + // is itself rare (the cover walk reaches `? *alias :`/`? {flow} :`/`?\n indented`, but not `? plain :`), + // and the bracket must additionally land in THAT key — so it is forced here, deterministically, the + // indent analogue of markupSelfCloseAttr. All pieces are config-derived (the explicit-key indicator, the + // key/value separator, the flow brackets, the seq indicator = the OTHER compactIndicator, and the indent + // struct tokens), with the scalar drawn from a `blockPattern` token — no YAML token/rule name hardcoded. + // Returns [] when the grammar lacks the config (no explicit-key indicator / flow brackets / block scalar). + indentExplicitKeyBracket(): Emission[] { + const ind = this.grammar.indent; if (!ind) return []; + const qmark = this.explicitKeyIndicator(); if (!qmark) return []; + const bracket = this.blockEmbed[0]; if (!bracket) return []; // a flow-bracket content char + const kv = ind.keyValueSeparator ?? ':'; + const seqInd = (ind.compactIndicators ?? []).find((c) => c !== qmark); // the block-sequence indicator + if (!seqInd) return []; + // a block plain-scalar token whose blockPattern admits the bracket (the KEY), and one for the items. 
+ const scalarTok = this.grammar.tokens.find((t) => t.blockPattern && t.scope); if (!scalarTok) return []; + const head = sample(scalarTok.blockPattern!, { rand: this.rand, interesting: [], variant: 0 }) || 'k'; + const keyTxt = head[0] + ' ' + bracket + (head.slice(1) || 'y'); // `k [y` — bracket mid-scalar + const itemTxt = (sample(scalarTok.blockPattern!, { rand: this.rand, interesting: [], variant: 0 }) || 'p'); + return [ + { t: 'lit', value: qmark }, // `?` + { t: 'tok', name: scalarTok.name, text: keyTxt }, // `k [y` + { t: 'lit', value: kv }, // `:` + { t: 'struct', kind: 'indent' }, // block value, more-indented + { t: 'lit', value: seqInd }, { t: 'tok', name: scalarTok.name, text: itemTxt }, // `- p` + { t: 'struct', kind: 'newline' }, // sibling item + { t: 'lit', value: seqInd }, { t: 'tok', name: scalarTok.name, text: itemTxt }, // `- p` + { t: 'struct', kind: 'dedent' }, + ]; + } + + // ── DIRECTED BLOCK SCALAR (indent grammars with a block-scalar config) ───────────────────────────── + // A YAML block scalar `|\n body\n more`: an introducer (`|`/`>`, +optional chomping/indent indicators) + // then verbatim more-indented lines emitted as ONE token (like raw text, but bounded by indentation, not + // a close tag). Its token is `never()` (the LEXER emits it from indentation state), so `sample()` yields + // null and the ordinary strategies NEVER produce it — leaving its scope (`string.unquoted.block`) at 0% + // coverage. This synthesizes it directly from `indent.blockScalar` (the introducers + token name) as a + // single multi-line tok at the document root (the minimal legal frame — a bare block scalar parses as a + // one-token document). Body lines are STRICTLY more-indented (the `indentWidth` columns) and plain words, + // never a col-0 `documentMarker` (`---`/`...`), which would terminate the scalar early (a doc boundary + // outranks indentation). 
Emitted as one tok (not a `lit`+struct), so `compactify` — which only rewrites a + // compact-indicator literal followed by a struct indent — leaves it untouched. Returns [] without config. + indentBlockScalar(indentWidth: number): Emission[] { + const bs = this.grammar.indent?.blockScalar; if (!bs || !bs.introducers.length) return []; + const tok = this.grammar.tokens.find((t) => t.name === bs.token); if (!tok) return []; + const pad = ' '.repeat(Math.max(1, indentWidth)); + const markers = new Set(bs.documentMarkers ?? []); + // a plain body word that is NOT a document marker (so it can't terminate the scalar at col-0; here it is + // indented anyway, but keep it marker-free for safety) — derived from a block plain-scalar token sample. + const scalarTok = this.grammar.tokens.find((t) => t.blockPattern && t.scope); + let body = (scalarTok && sample(scalarTok.blockPattern!, { rand: this.rand, interesting: [], variant: 0 })) || 'body'; + if (markers.has(body)) body = body + 'x'; + const intro = bs.introducers[0]; // `|` + return [{ t: 'tok', name: bs.token, text: `${intro}\n${pad}${body}\n${pad}${body}` }]; + } + // Build the minimal legal context from `entry` down to `tokenName`, with the token rendered as // `sampleText` at its position. Descends the SHORTEST branch toward the token at each node and // minimal-fills everything else — the directed, deterministic analogue of nestChain for a token. @@ -776,7 +931,13 @@ class Walker { // space (whitespace-insensitive); indentation grammars (YAML) render struct emissions through an // indent STACK that mirrors the lexer (newline = same-column sibling, indent = deeper block, // compact = an inline indent for `- - a`); markup grammars keep tag punctuation adjacent. 
-interface MatOptions { mode: 'token-stream' | 'indent' | 'markup'; indentStep: number } +// `tight` (markup only) ALSO glues the attribute-internal punctuation — `name="value"` with no +// spaces around the `attributeAssign`/quotes — so a quoted value sits FLUSH against the self-close +// `/>` (the WHATWG-canonical ``). That adjacency is what the spaced rendering never +// forms, and it is exactly where a flat TextMate grammar mis-scopes the `/` (it reads the closing +// quote then the `/` as an unquoted-value char, not tag punctuation). A SECOND, legal rendering of +// the same emission list — the markup analogue of indent's compactify — in the exploratory tier. +interface MatOptions { mode: 'token-stream' | 'indent' | 'markup'; indentStep: number; tight?: boolean } function materialize(grammar: CstGrammar, ems: Emission[], opts: MatOptions): { text: string; tokens: GenInput['tokens'] } { let text = ''; @@ -822,12 +983,18 @@ function materialize(grammar: CstGrammar, ems: Emission[], opts: MatOptions): { if (opts.mode === 'markup') { const noSpaceBefore = new Set([grammar.markup?.tagClose, grammar.markup?.closeMarker].filter(Boolean) as string[]); + const assign = grammar.markup?.attributeAssign; // `=`; in tight mode it glues `name=value` let prev = ''; for (const e of ems) { if (e.t === 'struct' || e.t === 'compact') continue; const s = e.t === 'lit' ? e.value : e.text; if (s.length === 0) continue; - const adjacent = prev === grammar.markup?.tagOpen || prev === grammar.markup?.closeMarker || noSpaceBefore.has(s) || prev === ''; + // TIGHT also glues the attribute `=` to its name and value: `name=` (cur is the assign) and + // `=value` (prev was the assign). Combined with `noSpaceBefore` already gluing the value→`/>`, + // this renders ``. The inter-attribute / name boundary still takes a space (the + // value isn't an assign, the next name isn't), so `a="x" b="y"` stays well-formed. 
+ const tightGlue = !!opts.tight && !!assign && (s === assign || prev === assign); + const adjacent = prev === grammar.markup?.tagOpen || prev === grammar.markup?.closeMarker || noSpaceBefore.has(s) || tightGlue || prev === ''; if (!adjacent) emit(' '); if (e.t === 'tok') emitTok(e.name, s); else emit(s); prev = s; @@ -891,13 +1058,30 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI const seen = new Set(); const out: GenInput[] = []; + // The render JOBS for one emission list: each pairs an emission-variant with materialize options and + // the strategy label to file the resulting input under. Most modes have ONE job (the canonical + // rendering, same strategy). Two modes add a SECOND, equally-legal rendering of the same emissions: + // • indent → a compactified copy (`- - a` inline), SAME strategy (a correct shape, still a gate). + // • markup → a TIGHT copy (`name="value"/>` flush), filed in the EXPLORATORY (`fuzz`) tier. The + // tight adjacency is where a flat grammar mis-scopes the self-close `/` — a STANDING flat-TM + // limit in the unfixed grammar, not a regression of a structured shape — so, like a gnarly fuzz + // derivation, it is report-only (`isGated` keys off the `fuzz` prefix). The spaced rendering keeps + // the original strategy, so the structured round-trip guarantee is untouched. 
+ const renderJobs = (ems: Emission[], strategy: string): { variant: Emission[]; mat: MatOptions; strat: string }[] => { + if (mode === 'indent') return [ems, compactify(ems, w.compactLits)].map((variant) => ({ variant, mat: matOpts, strat: strategy })); + if (mode === 'markup') return [ + { variant: ems, mat: matOpts, strat: strategy }, + { variant: ems, mat: { ...matOpts, tight: true }, strat: `fuzz:tight:${strategy}` }, + ]; + return [{ variant: ems, mat: matOpts, strat: strategy }]; + }; const push = (ems: Emission[], strategy: string, rule: string) => { if (out.length >= maxInputs * 4) return; - for (const variant of mode === 'indent' ? [ems, compactify(ems, w.compactLits)] : [ems]) { - const { text, tokens } = materialize(grammar, variant, matOpts); + for (const job of renderJobs(ems, strategy)) { + const { text, tokens } = materialize(grammar, job.variant, job.mat); if (!text.trim() || text.length > 2000 || seen.has(text)) continue; // skip blank / over-long / duplicate seen.add(text); - out.push({ text, tokens, strategy, rule }); + out.push({ text, tokens, strategy: job.strat, rule }); } }; @@ -970,5 +1154,32 @@ export function generateInputs(grammar: CstGrammar, opts: GenOptions = {}): GenI } } + // 6) DIRECTED MARKUP SELF-CLOSE-WITH-ATTRIBUTE (markup grammars) — ``. The un-biased + // enumeration starves the quoted-attribute × self-close cross at a small cap, so this forces it (the + // markup analogue of nestChain/tokenCover). Filed in the EXPLORATORY (`fuzz`) tier: even the SPACED + // rendering puts the quoted value FLUSH against the self-close `/` (the `/` is structural punctuation, + // always glued), and that flush value→`/` adjacency is exactly the STANDING flat-TM limit (the grammar + // reads the `/` as unquoted-value content) — a real highlighter bug in the unfixed grammar, not a + // regression of a by-construction shape, so it is report-only like a gnarly fuzz derivation, not a gate. 
+ if (mode === 'markup') { + const sc = w.markupSelfCloseAttr(); + if (sc.length) push(sc, 'fuzz:markupSelfClose', entry.name); + } + + // 7) DIRECTED INDENT EXPLICIT-KEY-WITH-BRACKET-SCALAR (indent grammars) — `? k [y :\n - p\n - q`. The + // un-biased strategies starve a plain-scalar explicit key (let alone one carrying a `[`), so this forces + // it — the indent analogue of markupSelfCloseAttr. Filed EXPLORATORY (`fuzz:`): it deliberately stresses + // the block-vs-flow-stack limit a flat grammar lacks (the phantom flow a `[`-in-key opens), so any + // divergence is a STANDING limit of the unfixed grammar, report-only, not a by-construction gate. + if (mode === 'indent') { + const ek = w.indentExplicitKeyBracket(); + if (ek.length) push(ek, 'fuzz:explicitKeyBracket', entry.name); + // a block scalar (`|\n body`): its token is lexer-emitted (never() pattern), so no ordinary strategy + // produces it — synthesize one so its `string.unquoted.block` scope is covered. A clean structured + // shape that round-trips (a one-token document), so it is a normal `nest`-tier input (no flat-TM limit). + const bs = w.indentBlockScalar(matOpts.indentStep); + if (bs.length) push(bs, 'nest:blockScalar', entry.name); + } + return out.slice(0, maxInputs); } From 5d06488652e25a3c7a70bb3a661b796b0d9a386d Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Tue, 9 Jun 2026 07:20:35 +0800 Subject: [PATCH 5/6] Gap ledger: deterministic, minimized, oracle-classified findings (KNOWN-GAPS.md) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operationalize the scope≡role check's "discovered" divergences into a committed, commit-trackable ledger instead of console output that vanishes. 
test/gap-ledger.ts: for each language, collect the discovered divergences (reusing the EXACT detection, factored into generative-detect.ts so generative.ts's gate is unchanged), MINIMIZE each via delta-debugging to a stable minimal repro, CLASSIFY via the neutral oracle (typescript/yaml/parse5) keeping only oracle-VALID-input gaps (over-accepts dropped), and FINGERPRINT (content hash, stable across commits). Emits KNOWN-GAPS.md (human + machine-readable), regenerated with `--write`, gated up-to-date with `--check`. Deterministic: two runs → byte-identical ledger. Currently 2 gaps, 0 dropped — the HTML/Vue self-close `/` mis-scope (`` ddmin-minimized to ``), the floor-blind divergence the corpus-bound scope-gap metric can't see. CI runs the selftest + `--check`. generative 7/7 unchanged; agnostic 9/9; deterministic. The fixes for these gaps live on a separate branch (highlighter product changes), so the ledger here demonstrates the tool FINDING them; a later layer can reconcile the ledger into GitHub issues. --- .github/workflows/ci.yml | 8 + KNOWN-GAPS.md | 62 ++++++ package.json | 3 + test/gap-ledger-selftest.ts | 89 +++++++++ test/gap-ledger.ts | 380 ++++++++++++++++++++++++++++++++++++ test/generative-detect.ts | 181 +++++++++++++++++ test/generative.ts | 137 ++----------- 7 files changed, 736 insertions(+), 124 deletions(-) create mode 100644 KNOWN-GAPS.md create mode 100644 test/gap-ledger-selftest.ts create mode 100644 test/gap-ledger.ts create mode 100644 test/generative-detect.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4962098..905a304 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -58,6 +58,14 @@ jobs: node test/yaml-issue12-regressions.ts node test/yaml-depth-witnesses.ts node test/generative.ts + # The gap ledger is the deterministic, oracle-classified record of the divergences the + # generative check DISCOVERS. 
Its self-test asserts the ddmin keep-path + the oracle + # drop-path + determinism; --check fails if the committed KNOWN-GAPS.md is stale (the + # ledger is a pure function of the grammar, so it must be regenerated when a grammar + # changes — `npm run gap-ledger`). This is the deterministic source of truth; a later + # layer can turn rows into issues, but the committed artifact is gated here first. + node test/gap-ledger-selftest.ts + node test/gap-ledger.ts --check # The derived tree-sitter highlighter is the strongest thesis proof (a real GLR # parser from the same grammar, beating the official hand-written one). Build its diff --git a/KNOWN-GAPS.md b/KNOWN-GAPS.md new file mode 100644 index 0000000..8cd7f44 --- /dev/null +++ b/KNOWN-GAPS.md @@ -0,0 +1,62 @@ +# KNOWN-GAPS — Monogram flat-highlighter divergences (auto-generated) + + + +A **gap** is a position where, on **valid input** (accepted by the language’s external +authority — typescript / yaml / parse5), the **flat TextMate highlighter** paints a token a +different visual role than the **Monogram parser** assigns it by construction. These are the +floor-blind divergences the generative scope≡role check (`test/generative.ts`) DISCOVERS over +grammar-derived inputs — the monogram#23/#24 class — which the corpus-bound scope-gap metric is +blind to (a small/clean corpus may never contain the shape, and the role-graded metric ignores +punctuation-floor mis-paints). Each gap’s input is **minimized** (delta-debugged to a minimal +repro that still parses and still diverges) and **fingerprinted** (a content hash, stable across +commits) so the ledger is deterministic and commit-trackable. + +Regenerate: `node test/gap-ledger.ts --write` · verify up-to-date: `node test/gap-ledger.ts --check`. + +**2 gaps** across 7 grammars · 0 dropped. 
+ +## `525e867dc205` — html: #24 structural-literal→content + +- **Language:** html +- **Minimal repro:** `` +- **Divergent token:** `/` (parser token `$punct`) +- **Role vs scope:** want **punct**, got **string** (highlighter scope `string.unquoted.html`) +- **Fingerprint:** `525e867dc205` + +```json +{ + "id": "525e867dc205", + "language": "html", + "kind": "#24 structural-literal→content", + "repro": "", + "tokenType": "$punct", + "tokenText": "/", + "want": "punct", + "got": "string", + "gotScope": "string.unquoted.html" +} +``` + +## `85c793d02a86` — vue: #24 structural-literal→content + +- **Language:** vue +- **Minimal repro:** `` +- **Divergent token:** `/` (parser token `$punct`) +- **Role vs scope:** want **punct**, got **string** (highlighter scope `string.unquoted.vue`) +- **Fingerprint:** `85c793d02a86` + +```json +{ + "id": "85c793d02a86", + "language": "vue", + "kind": "#24 structural-literal→content", + "repro": "", + "tokenType": "$punct", + "tokenText": "/", + "want": "punct", + "got": "string", + "gotScope": "string.unquoted.vue" +} +``` + diff --git a/package.json b/package.json index b3937f7..ec8f9c8 100644 --- a/package.json +++ b/package.json @@ -6,6 +6,9 @@ "gen": "node src/cli.ts typescript.ts && node src/cli.ts javascript.ts && node src/cli.ts typescriptreact.ts && node src/cli.ts javascriptreact.ts && node src/cli.ts html.ts && node src/cli.ts vue.ts && node src/cli.ts yaml.ts", "test": "node test/sanity-check.ts", "generative": "node test/generative.ts", + "gap-ledger": "node test/gap-ledger.ts --write", + "gap-ledger:check": "node test/gap-ledger.ts --check", + "gap-ledger:selftest": "node test/gap-ledger-selftest.ts", "conformance": "node test/run-conformance.ts", "conformance:js": "node test/js-conformance.ts", "conformance:tsx": "node test/tsx-conformance.ts", diff --git a/test/gap-ledger-selftest.ts b/test/gap-ledger-selftest.ts new file mode 100644 index 0000000..4c2125f --- /dev/null +++ b/test/gap-ledger-selftest.ts @@ -0,0 +1,89 
@@
+// ─────────────────────────────────────────────────────────────────────────────
+// gap-ledger-selftest.ts — asserts the gap ledger's two load-bearing behaviours
+// on the REAL HTML probe, independent of how many gaps happen to surface:
+//
+// (A) DETERMINISM — `generateInputs` + ddmin + fingerprint are a pure function of
+//     the grammar, so two full ledger builds are byte-identical. Asserted here over
+//     the rendered KNOWN-GAPS.md (the committed artifact) by building it twice.
+//
+// (B) the oracle CLASSIFY DROP-PATH — a divergence whose minimized repro the external
+//     oracle REJECTS (a parser over-accept, not a real highlighter gap) is DROPPED,
+//     not filed. We assert the ledger's keep/drop predicate (`oracleAccepts(repro)`)
+//     routes a parser OVER-ACCEPT (a markup the Monogram parser accepts but parse5
+//     REJECTS — `< a/>`, `<:a/>`) to DROP, and the oracle-VALID ``-shape to
+//     KEEP. (Note: the self-close `/` divergence itself only arises on WELL-FORMED tag
+//     shapes — which parse5 also accepts — so a single input that BOTH diverges AND is
+//     oracle-rejected does not exist for this gap; the drop-path is exercised by the
+//     classify predicate over real over-accept markup, which is what would gate it.)
+//
+// Run (bare node): node test/gap-ledger-selftest.ts
+// ─────────────────────────────────────────────────────────────────────────────
+import { execFileSync } from 'node:child_process';
+import { createParser } from '../src/gen-parser.ts';
+import type { CstGrammar } from '../src/types.ts';
+import { buildRoleMap, anchoredScopes, leafRoles, collectViolations, isGated } from './generative-detect.ts';
+import { loadTm, tmTokenize, reproStillDiverges, sig, minimize, LANGS, type Probe } from './gap-ledger.ts';
+
+// Running tally of failed assertions; decides the process exit code at the end of the script.
+let failures = 0;
+// Minimal assertion helper: prints one ✓ / ✗ line per check and counts failures instead of
+// throwing, so every check runs and is reported even after an earlier one has failed.
+const ok = (cond: boolean, msg: string) => { console.log(`${cond ? ' ✓' : ' ✗ FAIL:'} ${msg}`); if (!cond) failures++; };
+
+// ── build the HTML probe (the cheapest grammar with a known divergence) ──
+// Explicit guard (same style as the `tm` check below) so a missing registry entry fails with a
+// descriptive error rather than a TypeError on the first property access.
+const htmlCfg = LANGS.find((l) => l.name === 'html');
+if (!htmlCfg) throw new Error('html language config not found in LANGS');
+const grammar = (await import(htmlCfg.module)).default as CstGrammar;
+const { parse } = createParser(grammar);
+const tm = await loadTm(htmlCfg.scopeName, { [htmlCfg.scopeName]: htmlCfg.tmPath, ...(htmlCfg.tmExtra ?? {}) });
+if (!tm) throw new Error('failed to load html grammar');
+const probe: Probe = { parse, tm, grammar, roleOf: buildRoleMap(grammar), anchored: anchoredScopes(grammar) };
+
+// the ledger's CLASSIFY predicate, verbatim: keep iff the oracle accepts the minimal repro as VALID.
+const classifyKeeps = (text: string) => htmlCfg.oracleAccepts(text);
+
+console.log('gap-ledger self-test\n');
+
+// ── (B1) the canonical KEPT case: ``-shape, oracle-valid, still diverges ──
+// NOTE(review): this literal is EMPTY in this copy of the patch although the surrounding comments
+// and messages describe a self-closing tag; the angle-bracket markup appears to have been stripped
+// in transit (presumably `<a/>`, mirroring the `< a/>` / `<:a/>` over-accepts below) — confirm
+// against the repository before trusting the kept-case checks.
+const keptInput = ''; // the generator's tight-markup shape
+{
+  // detect the divergence on the real input, minimize, classify
+  const v0 = probeDivergence(keptInput);
+  ok(!!v0, `kept case: a self-close \`/\` divergence is detected on ${JSON.stringify(keptInput)}`);
+  if (v0) {
+    // minimize against the same signature, then re-check both halves of the KEEP condition:
+    // the repro still diverges, and the oracle accepts it.
+    const repro = minimize(probe, keptInput, v0.target);
+    ok(!!reproStillDiverges(probe, repro, v0.target), `kept case: minimized repro ${JSON.stringify(repro)} still diverges`);
+    ok(classifyKeeps(repro), `kept case: parse5 ACCEPTS the minimized repro → KEEP (a real highlighter gap)`);
+  }
+}
+
+// ── (B2) the DROP case: real parser OVER-ACCEPTS (parser accepts, parse5 REJECTS) ──
+// markup the Monogram markup parser accepts but parse5 does NOT recover as an element — exactly the
+// "Monogram parses but the oracle rejects" class the ledger must DROP (a parser concern, not a
+// highlighter gap). We assert each is parser-accepted AND classify-DROPPED (oracleAccepts == false).
+const overAccepts = ['< a/>', '<:a/>'];
+let dropProven = false;
+for (const cand of overAccepts) {
+  // a candidate the parser itself rejects proves nothing about the drop path — skip it silently
+  let parserOk = false; try { parse(cand); parserOk = true; } catch { /* */ }
+  if (!parserOk) continue;
+  ok(!classifyKeeps(cand), `drop case: ${JSON.stringify(cand)} is parser-accepted but parse5-REJECTS → classify DROPS it`);
+  dropProven = true;
+}
+// guards against the loop above having skipped every candidate (which would assert nothing)
+ok(dropProven, 'drop case: at least one real parser-over-accept is parser-accepted and confirmed dropped');
+// and the dual: the oracle-VALID minimal repro is KEPT (not dropped) — the keep/drop split is real.
+// NOTE(review): the argument below is also empty in this copy (same stripped-markup artifact as
+// `keptInput`, presumably the same self-closing tag) — confirm against the repository.
+ok(classifyKeeps(''), 'keep/drop split: the oracle-VALID ``-shape repro is KEPT (not dropped)');
+
+// ── (A) determinism of the rendered artifact: two builds byte-identical ──
+console.log('\n determinism (two full ledger builds)…');
+// Spawn the SAME interpreter that is running this script (process.execPath) instead of whatever
+// `node` resolves to on PATH: the ledger is a bare .ts entry point, so a different/older binary on
+// PATH could fail to run it. The relative script path assumes the cwd is the repository root.
+// stdout is captured for the comparison; stdin and stderr are ignored.
+const run = () => execFileSync(process.execPath, ['test/gap-ledger.ts'], { encoding: 'utf8', stdio: ['ignore', 'pipe', 'ignore'], maxBuffer: 64 * 1024 * 1024 });
+const a = run(), b = run();
+ok(a === b, `two \`node test/gap-ledger.ts\` runs produce byte-identical output (${a.length} bytes)`);
+
+console.log(failures ? `\n${failures} self-test failure(s).` : '\nAll gap-ledger self-tests passed.');
+process.exit(failures ? 1 : 0);
+
+// ── helper: detect the self-close `/` divergence on `text`, returning its signature ──
+/**
+ * Parses and tokenizes `text`, runs the shared violation detector over the result, and returns the
+ * signature (`sig`) of the FIRST violation that is not gated. The function does not filter by
+ * violation kind; it simply takes the first ungated one.
+ *
+ * Declared as a function declaration so it is hoisted and callable from the checks above.
+ *
+ * @param text markup to probe
+ * @returns `{ target }` carrying the violation signature, or `null` when parsing or tokenizing
+ *          throws, or when no ungated violation is found
+ */
+function probeDivergence(text: string): { target: string } | null {
+  let cst; try { cst = parse(text); } catch { return null; }
+  let toks; try { toks = tmTokenize(probe.tm, text); } catch { return null; }
+  const leaves = leafRoles(grammar, cst, probe.roleOf);
+  const vs = collectViolations({ input: text, strategy: 'fuzz', cst, toks, leaves, anchored: probe.anchored });
+  const v = vs.find((x) => !isGated(x));
+  return v ? { target: sig(v) } : null;
+}
diff --git a/test/gap-ledger.ts b/test/gap-ledger.ts
new file mode 100644
index 0000000..7b3769d
--- /dev/null
+++ b/test/gap-ledger.ts
@@ -0,0 +1,380 @@
+// ─────────────────────────────────────────────────────────────────────────────
+// gap-ledger.ts — a DETERMINISTIC, auto-maintained GAP LEDGER for Monogram.
+//
+// The generative by-construction check (test/generative.ts) DISCOVERS divergences
+// where the flat TextMate highlighter and the Monogram parser disagree on the
+// visual role of a token in a grammar-DERIVED input — the floor-blind class the
+// corpus-bound scope-gap metric is blind to (monogram#23/#24). That check REPORTS
+// them; this ledger OPERATIONALIZES them into a stable, commit-trackable artifact:
+//
+//   1. DISCOVER — for each of the 7 grammars, generate inputs deterministically
+//      (grammar-gen.ts), tokenize with the flat grammar + parse with the parser,
+//      and collect the divergences using the SAME detector generative.ts uses
+//      (generative-detect.ts) — not a reimplementation.
+//   2. MINIMIZE — delta-debug (ddmin) each divergence's input down to a minimal
+//      repro that still parses AND still exhibits the SAME divergence (same parser
+//      role-bucket vs same highlighter bucket, identified by a position-independent
+//      signature). The generator + ddmin are deterministic, so the minimal repro is
+//      stable across runs and commits.
+//   3. CLASSIFY — parse the minimal repro with the language's EXTERNAL authority
+//      (typescript / yaml / parse5). File ONLY divergences the oracle accepts as
+//      VALID input (a real highlighter gap on valid input). A repro the parser
+//      accepts but the oracle rejects is a parser OVER-ACCEPT — a different concern;
+//      it is DROPPED from the gap list (its count is reported, not listed).
+//   4. FINGERPRINT — a stable id = hash(language, normalized repro, role, bucket),
+//      so the same gap keeps the same id across commits.
+//   5.
EMIT — a sorted KNOWN-GAPS.md (committed artifact): per gap, the language, +// escaped minimal repro, role-vs-scope (want vs got), fingerprint, and a +// machine-readable JSON block. +// +// DETERMINISM is the whole point (a commit-trackable ledger): two runs produce a +// BYTE-IDENTICAL KNOWN-GAPS.md. The generator is a pure function of the grammar +// (no seed), ddmin is deterministic, the oracle is deterministic, and the hash is +// content-only — so nothing varies run-to-run. +// +// Run (bare node): +// node test/gap-ledger.ts # print the ledger to stdout (don't write) +// node test/gap-ledger.ts --write # (re)write KNOWN-GAPS.md +// node test/gap-ledger.ts --check # fail if KNOWN-GAPS.md is stale (CI guard) +// node test/gap-ledger.ts yaml # one language +// ───────────────────────────────────────────────────────────────────────────── +import { readFileSync, writeFileSync, existsSync } from 'node:fs'; +import { createHash } from 'node:crypto'; +import { createRequire } from 'node:module'; +import vsctm from 'vscode-textmate'; +import onig from 'vscode-oniguruma'; +import ts from 'typescript'; +import { parseAllDocuments } from 'yaml'; +import { parseFragment } from 'parse5'; +import sfcCompiler from '@vue/compiler-sfc'; +import { createParser, type CstNode } from '../src/gen-parser.ts'; +import type { CstGrammar } from '../src/types.ts'; +import { generateInputs } from './grammar-gen.ts'; +import { + type TmTok, type Violation, + buildRoleMap, leafRoles, anchoredScopes, collectViolations, isGated, +} from './generative-detect.ts'; + +// ── language registry — the SAME per-language DATA shape as generative.ts's LANGS, plus an +// `oracleAccepts(text)`: the external authority's verdict on whether the minimal repro is VALID +// input. THAT is the only per-language wiring (a config table, like generative.ts's LANGS); the +// ddmin / fingerprint / emit ENGINE below is language-agnostic. 
+/**
+ * One row of the language registry: where to find a language's grammar module and its flat
+ * TextMate grammar file(s), plus the external oracle's validity verdict for that language.
+ */
+interface LangCfg {
+  name: string;
+  module: string; // grammar module (default export = CstGrammar)
+  scopeName: string; // TextMate scope, e.g. source.yaml
+  tmPath: string; // the derived flat .tmLanguage.json
+  // extra scopeName → file for multi-file grammars. `Record` needs both type arguments to
+  // compile; string → string matches the field's "scopeName → file" contract.
+  tmExtra?: Record<string, string>;
+  oracleAccepts: (text: string) => boolean; // the neutral oracle's "is this VALID input?" verdict
+}
+
+// ── oracle validity verdicts (DATA) ──────────────────────────────────────────────────────────────
+// TS-family: tsc's own parser — zero parseDiagnostics means it accepts the text as valid source.
+/**
+ * Builds a validity predicate for one TypeScript-family script kind.
+ *
+ * @param kind the `ts.ScriptKind` the text is parsed as
+ * @returns a predicate that is true when the created source file carries no parse diagnostics,
+ *          and false when it carries any or when parsing throws
+ */
+const tsAccepts = (kind: ts.ScriptKind) => (text: string): boolean => {
+  try {
+    const sf = ts.createSourceFile('gap.ts', text, ts.ScriptTarget.Latest, /*setParentNodes*/ false, kind);
+    // `parseDiagnostics` is read through `any` (it is not reached via the typed SourceFile
+    // surface here); a missing property is treated as "no diagnostics".
+    return ((sf as any).parseDiagnostics?.length ?? 0) === 0;
+  } catch { return false; }
+};
+// YAML: the `yaml` package — a document with zero `.errors` is valid (the same independent authority
+// the scope-gap YAML oracle uses). A throw or any error ⇒ not valid.
+/**
+ * True when `text` yields at least one YAML document and none of the documents reports an error.
+ * An empty document list, any reported error, or a thrown exception all count as "not valid".
+ */
+const yamlAccepts = (text: string): boolean => {
+  try { const docs = parseAllDocuments(text); return docs.length > 0 && docs.every((d: any) => (d.errors?.length ?? 0) === 0); }
+  catch { return false; }
+};
+// HTML: parse5 is error-TOLERANT (never throws), so "valid" = it recovered a real element structure —
+// at least one element/tag node (not pure text / a dropped ``). This matches html-oracle.ts's own
+// emission gate (it only emits tag/attr roles when parse5 reports a tagName + location).
+/**
+ * True when the parse5 fragment for `text` contains, at any depth, a node that has both a
+ * `tagName` and a `sourceCodeLocation`. Returns false if no such node exists or if anything throws.
+ */
+const htmlAccepts = (text: string): boolean => {
+  try {
+    const frag: any = parseFragment(text, { sourceCodeLocationInfo: true });
+    // recursive search through `childNodes` for an element that carries source-location info
+    const hasEl = (nodes: any[]): boolean => nodes.some((n) => (n.tagName && n.sourceCodeLocation) || (n.childNodes && hasEl(n.childNodes)));
+    return hasEl(frag.childNodes ?? []);
+  } catch { return false; }
+};
+// Vue SFC: the template markup sub-language IS HTML — vue-oracle.ts composes parse5 over the template
+// content as its markup authority. @vue/compiler-sfc only does SFC BLOCK splitting; a bare template-
+// level markup fragment (what the generator emits for the vue grammar — ``, not a full
+// `` SFC) is NOT a top-level SFC block, so the SFC parser reports no template.
+// The right neutral verdict for such markup is therefore parse5's (the template arbiter): if the SFC
+// parser DID isolate a