From 368bb3bf8842c7c7cc31f8e6cf0b121cf60bb1f6 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 03:40:57 +0800 Subject: [PATCH 01/10] Make the derived YAML tree-sitter grammar generate + build (issue #3, pieces 1-2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `tree-sitter/yaml/grammar.js` previously did not `generate`. Two of the three blockers from the issue are now resolved, in `src/gen-treesitter.ts` only — every other derived grammar (TS/JS/ TSX/JSX/HTML/Vue) regenerates byte-identical, and `tsc` is clean. 1. Structural indent tokens → externals. INDENT / DEDENT / NEWLINE and the block-scalar body are engine-emitted (their token IR is `never()`), so they serialized as never-match token rules the parser could never match. `planScannerTokens` now routes them to tree-sitter `externals` (keyed off `grammar.indent`), the way the HTML markup path handles `raw_text`: they appear in the `externals` block and the scanner.c `TokenType` enum, and references become `$.indent` etc. 2. Nullable-rule elimination. tree-sitter rejects a non-start rule that matches the empty string, and an indentation grammar has several (a YAML node/entry may be null: `key:` with no value, `{a: }`, an empty document) — `node`/`flow_node`/`flow_map_entry`/`flow_seq_entry`/`after_doc_end`. A general ε-elimination (`makeNonEmpty` + `wrapNullableRefs`) makes each such rule's body non-empty and wraps every reference to it in `optional(...)`; the accepted language is unchanged and only the tree-sitter target is touched. Gated on a grammar actually having nullable non-start rules, so the others are untouched. The resulting LR conflicts (YAML is massively ambiguous — exactly what tree-sitter's GLR is for) are declared: 37 tuples added to `LR_CONFLICT_CLOSURE` (the fixpoint of tree-sitter's own analysis, via test/collect-conflicts.ts). 
The closure filter also accepts TOKEN names now, not only rule names, so a token-vs-token conflict like YAML's `key`/`plain` (both can precede a `:`) is declarable. Every tuple is YAML-specific (zero rule/token-name overlap with the other grammars), so each is inert elsewhere. `cd tree-sitter/yaml && npx tree-sitter generate && npx tree-sitter build --wasm .` now succeeds. The C external scanner is still a stub (returns false), so indentation isn't parsed yet — that is piece 3 (a real indent scanner) and is tracked separately. Refs #3 --- src/gen-treesitter.ts | 153 ++++++++++++++++++++++++++++++++- tree-sitter/yaml/grammar.js | 87 +++++++++++++------ tree-sitter/yaml/src/scanner.c | 5 +- 3 files changed, 218 insertions(+), 27 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index dda7dea..1d0ae3b 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -143,6 +143,10 @@ interface GrammarJsContext { externalSnake: Set; /** original token name → external scanner token name (snake) if scanner-provided */ scannerTokenFor: Map; + /** Non-start rules whose body can derive the empty string. tree-sitter rejects these, so their + * bodies are made non-empty and every reference to them is wrapped in optional() (ε-elimination, + * see makeNonEmpty / wrapNullableRefs). Empty for grammars with no nullable non-start rules. */ + nullableNonStart: Set; /** * If the grammar declares an interpolated-template token, the plan for turning it * into a `template` RULE (delimiters + the `${ … }` hole) backed by an external @@ -342,9 +346,116 @@ function buildPrattRule(rule: RuleDecl, ctx: GrammarJsContext): string { return `choice(\n ${branches.join(',\n ')}\n )`; } +// ── Nullable-rule elimination (ε-elimination) ──────────────────────────────── +// tree-sitter rejects a NON-START rule that can match the empty string. 
An indentation grammar like
+// YAML has several (a YAML node/entry may be NULL: `key:` with no value, `{a: }`, an empty doc), so
+// `node`/`flow_node`/`flow_map_entry`/… are nullable. We push the emptiness to the CALL SITES: make
+// each such rule's body NON-EMPTY (`makeNonEmpty`) and wrap every reference to it in `optional(...)`
+// (`wrapNullableRefs`). The accepted language is identical (rule-or-empty at each use), and ONLY the
+// tree-sitter target is touched — the parser and the other generators never see this. Computed once
+// and gated on the grammar actually having nullable non-start rules, so every grammar that already
+// `generate`s (no such rules) is byte-identical.
+
+/** Non-start rules whose body can derive ε. `isTerminal` flags tokens / external symbols (never nullable). */
+function computeNullableNonStart(grammar: CstGrammar, startName: string, isTerminal: (name: string) => boolean): Set<string> {
+  const ruleNames = new Set(grammar.rules.map(r => r.name));
+  const nullable = new Set<string>();
+  const exprNullable = (e: RuleExpr): boolean => {
+    switch (e.type) {
+      case 'literal': return e.value === '';
+      case 'ref': return ruleNames.has(e.name) && !isTerminal(e.name) && nullable.has(e.name);
+      case 'seq': return e.items.every(exprNullable);
+      case 'alt': return e.items.some(exprNullable);
+      case 'quantifier': return e.kind === '+' ? exprNullable(e.body) : true; // ?,* match empty
+      case 'group': return exprNullable(e.body);
+      case 'sep': return true; // renders to optional(seq(...))
+      default: return true; // not/sameLine/noCommentBefore/noMultilineFlowBefore/op/prefix/postfix → blank()
+    }
+  };
+  let changed = true;
+  while (changed) { changed = false; for (const r of grammar.rules) if (!nullable.has(r.name) && exprNullable(r.body)) { nullable.add(r.name); changed = true; } }
+  nullable.delete(startName); // the start rule MAY be nullable in tree-sitter
+  return nullable;
+}
+
+/** Wrap every reference to a made-non-empty (`nn`) rule in optional() — the may-be-empty form. */
+function wrapNullableRefs(e: RuleExpr, nn: Set<string>): RuleExpr {
+  switch (e.type) {
+    case 'ref': return nn.has(e.name) ? { type: 'quantifier', kind: '?', body: e } : e;
+    case 'seq': return { type: 'seq', items: e.items.map(i => wrapNullableRefs(i, nn)) };
+    case 'alt': return { type: 'alt', items: e.items.map(i => wrapNullableRefs(i, nn)) };
+    case 'quantifier': return { ...e, body: wrapNullableRefs(e.body, nn) };
+    case 'group': return { ...e, body: wrapNullableRefs(e.body, nn) };
+    case 'sep': return { ...e, element: wrapNullableRefs(e.element, nn) };
+    default: return e;
+  }
+}
+
+/** Whether e is nullable AFTER the transform (a ref to an `nn` rule is now wrapped optional → nullable;
+ * every other ref is non-nullable, since `nn` is exactly the made-non-empty set). */
+function exprNullableAfter(e: RuleExpr, nn: Set<string>): boolean {
+  switch (e.type) {
+    case 'literal': return e.value === '';
+    case 'ref': return nn.has(e.name);
+    case 'seq': return e.items.every(i => exprNullableAfter(i, nn));
+    case 'alt': return e.items.some(i => exprNullableAfter(i, nn));
+    case 'quantifier': return e.kind === '+' ? exprNullableAfter(e.body, nn) : true;
+    case 'group': return exprNullableAfter(e.body, nn);
+    case 'sep': return true;
+    default: return true;
+  }
+}
+
+/** The non-empty form of a (nullable) expr — its language minus ε.
`null` if that language is empty
+ * (a purely zero-width expr). The chosen non-empty position is rendered UNWRAPPED; the rest get the
+ * may-be-empty form `wrapNullableRefs`. */
+function makeNonEmpty(e: RuleExpr, nn: Set<string>): RuleExpr | null {
+  const T = (x: RuleExpr) => wrapNullableRefs(x, nn);
+  const NE = (x: RuleExpr) => makeNonEmpty(x, nn);
+  const nul = (x: RuleExpr) => exprNullableAfter(x, nn);
+  switch (e.type) {
+    case 'literal': return e.value === '' ? null : e;
+    case 'ref': return e; // an nn rule (now non-empty) or a non-nullable rule/terminal
+    case 'group': { const b = NE(e.body); return b ? { ...e, body: b } : null; }
+    case 'alt': {
+      const parts: RuleExpr[] = [];
+      for (const m of e.items) { const r = nul(m) ? NE(m) : T(m); if (r) parts.push(r); }
+      return parts.length === 0 ? null : parts.length === 1 ? parts[0] : { type: 'alt', items: parts };
+    }
+    case 'seq': {
+      if (e.items.some(i => !nul(i))) return T(e); // a non-nullable element already forces non-empty
+      const branches: RuleExpr[] = []; // all nullable → "first non-empty element is at i"
+      for (let i = 0; i < e.items.length; i++) {
+        const head = NE(e.items[i]);
+        if (!head) continue;
+        const tail = e.items.slice(i + 1).map(T);
+        branches.push(tail.length ? { type: 'seq', items: [head, ...tail] } : head);
+      }
+      return branches.length === 0 ? null : branches.length === 1 ? branches[0] : { type: 'alt', items: branches };
+    }
+    case 'quantifier': {
+      if (e.kind === '?') return nul(e.body) ? NE(e.body) : T(e.body); // optional(x) non-empty = x non-empty
+      const head = nul(e.body) ? NE(e.body) : T(e.body); // *,+ non-empty = one non-empty iter, then repeat
+      return head ? { type: 'seq', items: [head, { type: 'quantifier', kind: '*', body: T(e.body) }] } : null;
+    }
+    case 'sep': {
+      const head = nul(e.element) ? NE(e.element) : T(e.element);
+      if (!head) return null;
+      const d: RuleExpr = { type: 'literal', value: e.delimiter };
+      return { type: 'seq', items: [head, { type: 'quantifier', kind: '*', body: { type: 'seq', items: [d, T(e.element)] } }, { type: 'quantifier', kind: '?', body: d }] };
+    }
+    default: return null; // not/sameLine/…: zero-width, no non-empty form
+  }
+}
+
 /** Build a single rule's body string (Pratt or plain). */
 function buildRuleBody(rule: RuleDecl, ctx: GrammarJsContext): string {
   if (ctx.prattRules.has(rule.name)) return buildPrattRule(rule, ctx);
+  const nn = ctx.nullableNonStart;
+  if (nn.size > 0) {
+    const body = nn.has(rule.name) ? (makeNonEmpty(rule.body, nn) ?? rule.body) : wrapNullableRefs(rule.body, nn);
+    return renderExpr(body, ctx);
+  }
   return renderExpr(rule.body, ctx);
 }
 
@@ -423,6 +534,22 @@ const LR_CONFLICT_CLOSURE: string[][] = [
   // while completing the closure (CI builds only the typescript + html tree-sitters, so
   // tsx/jsx generate was never exercised). Each is inert for languages lacking the rule.
   ['type', 'class_heritage'], ['type_param', 'jsxtag_name'], ['expr', 'jsxcontainer'],
+  // YAML (issue #3): an indentation grammar is massively ambiguous — a newline may continue a node or
+  // start the next document, a `:` may open a value or be an empty-key map, a scalar may be a key or a
+  // leaf, a flow collection may be a value or an implicit block key. tree-sitter's GLR absorbs all of
+  // this once the states are declared. These 37 tuples are the fixpoint of its own analysis (collected
+  // via test/collect-conflicts.ts); every name is YAML-specific, so each is inert for the other
+  // languages (verified: zero rule-name overlap with the TS/JS/TSX/JSX grammars).
+ ['stream', 'node'], ['empty_key_mapping'], ['explicit_entry'], ['next_doc'], ['stream', 'next_doc'], + ['node'], ['key', 'plain'], ['scalar', 'doc_fold'], ['explicit_mapping'], ['block_sequence'], + ['map_value_scalar', 'map_value_node_scalar'], ['scalar', 'block_key_scalar'], + ['map_value', 'map_value_node'], ['flow_explicit'], ['flow_mapping'], ['flow_sequence'], + ['explicit_doc_body'], ['inline_doc_node'], ['alias_or_keyed'], ['doc_fold'], ['mapping_from_flow'], + ['mapping_or_scalar'], ['property', 'node'], ['seq_item'], ['property'], ['flow_node'], + ['node', 'explicit_doc_body'], ['node', 'after_doc_end'], ['after_doc_end'], ['map_entry'], + ['stream', 'explicit_doc_body'], ['map_entry_no_empty'], ['seq_value_node'], + ['mapping_or_scalar', 'doc_fold'], ['map_value_scalar', 'map_inline_scalar'], + ['content_node', 'mapping_from_flow'], ['mapping_or_scalar', 'map_value'], ]; /** @@ -475,9 +602,12 @@ function deriveConflicts(ctx: GrammarJsContext): string[][] { } // 3. The LR(1) closure tree-sitter's own analysis reports for this grammar. - // Applied only for tuples whose rules all exist here (inert otherwise). + // Applied only for tuples whose symbols ALL exist here (inert otherwise). A conflict symbol may + // be a RULE or a TOKEN (e.g. YAML's `key`/`plain` are tokens that conflict on a trailing `:`), so + // both name sets count — `$.key` is a valid conflict symbol whether key is a rule or a token. + const tokenSnakes = new Set(ctx.tokenSnake.values()); for (const tuple of LR_CONFLICT_CLOSURE) { - if (tuple.every(r => ruleSnakes.has(r))) push(tuple); + if (tuple.every(r => ruleSnakes.has(r) || tokenSnakes.has(r))) push(tuple); } return conflicts; @@ -587,6 +717,19 @@ function planScannerTokens(grammar: CstGrammar): Map { // stateless external token (the scanner emits it at each significant line boundary). Listed // FIRST so it heads the enum / externals order. 
if (grammar.newline) map.set(grammar.newline.token, toSnake(grammar.newline.token)); + // An indentation-sensitive grammar (YAML): INDENT / DEDENT / NEWLINE and the block-scalar body are + // engine-emitted — the lexer's indent stack (src/gen-lexer.ts) decides them, not a regex — so their + // token IR is `never()`. In tree-sitter they become EXTERNAL tokens the C scanner (src/scanner.c) + // provides; without this they would serialize as never-match token rules (`token(/[^\s\S]/)`) that + // the parser can never match (and the block-scalar body would orphan the scalar). Ordered + // indent/dedent/newline/body so grammar.js's `externals` and scanner.c's enum agree positionally. + if (grammar.indent) { + const ind = grammar.indent; + map.set(ind.indentToken, toSnake(ind.indentToken)); + map.set(ind.dedentToken, toSnake(ind.dedentToken)); + map.set(ind.newlineToken, toSnake(ind.newlineToken)); + if (ind.blockScalar) map.set(ind.blockScalar.token, toSnake(ind.blockScalar.token)); + } // The regex token: '/' is context-sensitive (regex vs division). The scanner // resolves it. const regexTok = grammar.tokens.find(t => t.flags.includes('regex')); @@ -778,8 +921,14 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree // queries for them. Same shape rule gen-tm.ts uses (inferIdentScope). const nameFields = collectNameFields(grammar); + // ε-elimination set (see makeNonEmpty): the start rule is the entry rule, emitted FIRST below. 
+ const entryName = grammar.rules[grammar.rules.length - 1].name; + const isTerminalName = (n: string) => tokenNames.has(n) || scannerTokenFor.has(n); + const nullableNonStart = computeNullableNonStart(grammar, entryName, isTerminalName); + const ctx: GrammarJsContext = { grammar, tokenNames, ruleSnake, tokenSnake, prattRules, externalSnake, scannerTokenFor, + nullableNonStart, templatePlan, interpolationPlans, nameFieldNodes: nameFields.nodes, diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js index 1e852fc..5d693f9 100644 --- a/tree-sitter/yaml/grammar.js +++ b/tree-sitter/yaml/grammar.js @@ -16,14 +16,61 @@ module.exports = grammar({ $.comment ], + externals: $ => [ + $.indent, + $.dedent, + $.newline, + $.block_scalar + ], + + conflicts: $ => [ + [$.stream, $.node], + [$.empty_key_mapping], + [$.explicit_entry], + [$.next_doc], + [$.stream, $.next_doc], + [$.node], + [$.key, $.plain], + [$.scalar, $.doc_fold], + [$.explicit_mapping], + [$.block_sequence], + [$.map_value_scalar, $.map_value_node_scalar], + [$.scalar, $.block_key_scalar], + [$.map_value, $.map_value_node], + [$.flow_explicit], + [$.flow_mapping], + [$.flow_sequence], + [$.explicit_doc_body], + [$.inline_doc_node], + [$.alias_or_keyed], + [$.doc_fold], + [$.mapping_from_flow], + [$.mapping_or_scalar], + [$.property, $.node], + [$.seq_item], + [$.property], + [$.flow_node], + [$.node, $.explicit_doc_body], + [$.node, $.after_doc_end], + [$.after_doc_end], + [$.map_entry], + [$.stream, $.explicit_doc_body], + [$.map_entry_no_empty], + [$.seq_value_node], + [$.mapping_or_scalar, $.doc_fold], + [$.map_value_scalar, $.map_inline_scalar], + [$.content_node, $.mapping_from_flow], + [$.mapping_or_scalar, $.map_value], + ], + rules: { - stream: $ => choice(seq(repeat1(seq(choice($.yaml_directive, $.directive), optional($.newline))), optional(seq($.doc_start, optional($.explicit_doc_body), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), 
optional($.newline)))), seq(optional($.indent), optional(choice($.doc_fold, $.node)), optional($.dedent), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), optional($.newline))), + stream: $ => choice(seq(repeat1(seq(choice($.yaml_directive, $.directive), optional($.newline))), optional(seq($.doc_start, optional($.explicit_doc_body), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), optional($.newline)))), seq(optional($.indent), optional(choice($.doc_fold, optional($.node))), optional($.dedent), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), optional($.newline))), property: $ => choice(seq($.anchor, optional($.tag)), seq($.tag, optional($.anchor))), content_node: $ => choice($.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.flow_mapping, $.flow_sequence, $.mapping_from_flow, $.alias_or_keyed, $.mapping_or_scalar), - node: $ => choice(seq(optional($.anchor), optional($.tag), optional(choice(seq($.indent, $.node, $.dedent), seq($.newline, $.node), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), seq($.tag, $.anchor, optional(choice(seq($.indent, $.node, $.dedent), seq($.newline, $.node), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), $.block_sequence), + node: $ => choice(choice(seq($.anchor, optional($.tag), optional(choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), seq($.tag, optional(choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, 
$.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar)), seq($.tag, $.anchor, optional(choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), $.block_sequence), mapping_or_scalar: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.block_key_scalar, ":", optional($.map_value_scalar), repeat(seq($.newline, $.map_entry))), $.scalar), @@ -41,11 +88,11 @@ module.exports = grammar({ empty_key_mapping: $ => seq(":", optional($.map_value_scalar), repeat(seq($.newline, $.map_entry_no_empty))), - value: $ => choice(seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, $.node, $.dedent), $.seq_value_node), + value: $ => choice(seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, optional($.node), $.dedent), $.seq_value_node), - map_value: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, $.node, $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node), + map_value: $ => 
choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, optional($.node), $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node), - map_value_scalar: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, $.node, $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node_scalar), + map_value_scalar: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, optional($.node), $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node_scalar), indented_value_node: $ => choice(seq($.property, choice(seq($.indent, $.indented_value_node, $.dedent), $.collection_content)), $.content_node), @@ -71,19 +118,19 @@ module.exports = grammar({ seq_item: $ => seq("-", optional($.value)), - flow_node: $ => seq(optional($.property), optional(choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar))), + flow_node: $ => 
choice(seq($.property, optional(choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar))), choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar)), - flow_explicit: $ => seq("?", optional($.flow_node)), + flow_explicit: $ => seq("?", optional(optional($.flow_node))), - flow_map_entry: $ => seq(optional($.flow_explicit), optional($.flow_node), optional(seq(":", optional($.flow_node)))), + flow_map_entry: $ => choice(seq($.flow_explicit, optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq($.flow_node, optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node)))), - flow_mapping: $ => seq("{", optional(seq($.flow_map_entry, repeat(seq(",", $.flow_map_entry)))), optional(","), "}"), + flow_mapping: $ => seq("{", optional(seq(optional($.flow_map_entry), repeat(seq(",", optional($.flow_map_entry))))), optional(","), "}"), - flow_seq_entry: $ => choice(seq($.flow_seq_key, ":", optional($.flow_node)), seq("?", optional($.flow_node), optional(seq(":", optional($.flow_node)))), seq(":", optional($.flow_node)), $.flow_node), + flow_seq_entry: $ => choice(seq($.flow_seq_key, ":", optional(optional($.flow_node))), seq("?", optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node))), $.flow_node), flow_seq_key: $ => choice(seq(optional($.property), choice($.flow_mapping, $.flow_sequence, $.dquote_key, $.squote_key, $.key)), $.alias), - flow_sequence: $ => seq("[", optional(seq($.flow_seq_entry, repeat(seq(",", $.flow_seq_entry)))), optional(","), "]"), + flow_sequence: $ => seq("[", optional(seq(optional($.flow_seq_entry), repeat(seq(",", optional($.flow_seq_entry))))), optional(","), "]"), scalar: $ => choice($.dquote_key, $.squote_key, $.dquote, $.squote, $.block_scalar, $.key, $.num, $.bool_null, $.plain), @@ -91,13 +138,13 @@ module.exports = grammar({ doc_fold: $ => seq(choice($.num, $.bool_null, $.plain), 
repeat1(choice(seq($.newline, choice($.plain, $.yaml_directive, $.directive)), seq($.indent, choice($.plain, $.yaml_directive, $.directive), repeat(seq($.newline, choice($.plain, $.yaml_directive, $.directive))), $.dedent)))), - inline_doc_node: $ => choice(seq($.property, optional(choice(seq($.indent, $.doc_fold, $.dedent), seq($.indent, $.node, $.dedent), seq($.newline, $.doc_fold), seq($.newline, $.node), $.flow_mapping, $.flow_sequence, $.alias, $.scalar))), $.doc_fold, choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar)), + inline_doc_node: $ => choice(seq($.property, optional(choice(seq($.indent, $.doc_fold, $.dedent), seq($.indent, optional($.node), $.dedent), seq($.newline, $.doc_fold), seq($.newline, optional($.node)), $.flow_mapping, $.flow_sequence, $.alias, $.scalar))), $.doc_fold, choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar)), - explicit_doc_body: $ => choice(seq($.newline, optional($.indent), optional(choice($.doc_fold, $.node)), optional($.dedent)), $.inline_doc_node), + explicit_doc_body: $ => choice(seq($.newline, optional($.indent), optional(choice($.doc_fold, optional($.node))), optional($.dedent)), $.inline_doc_node), - after_doc_end: $ => choice(seq(repeat(seq(choice($.yaml_directive, $.directive), optional($.newline))), $.doc_start, optional($.explicit_doc_body)), seq(optional($.indent), choice($.doc_fold, $.node), optional($.dedent))), + after_doc_end: $ => choice(seq(repeat(seq(choice($.yaml_directive, $.directive), optional($.newline))), $.doc_start, optional($.explicit_doc_body)), choice(seq($.indent, choice($.doc_fold, optional($.node)), optional($.dedent)), seq(choice($.doc_fold, $.node), optional($.dedent)), $.dedent)), - next_doc: $ => choice(seq($.doc_start, optional($.explicit_doc_body)), seq($.doc_end, optional(seq($.newline, optional($.after_doc_end))))), + next_doc: $ => choice(seq($.doc_start, optional($.explicit_doc_body)), seq($.doc_end, optional(seq($.newline, optional(optional($.after_doc_end)))))), 
doc_start: $ => token(/---/), @@ -123,21 +170,13 @@ module.exports = grammar({ tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/), - block_scalar: $ => token(/[^\s\S]/), - key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/), num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/), bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/), - plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/), - - indent: $ => token(/[^\s\S]/), - - dedent: $ => token(/[^\s\S]/), - - newline: $ => token(/[^\s\S]/) + plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/) } }); diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index 7653989..2167de9 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -13,7 +13,10 @@ #include enum TokenType { - NO_EXTERNAL_TOKENS, + INDENT, + DEDENT, + NEWLINE, + BLOCK_SCALAR, }; // The scanner is stateless — tree-sitter's `valid_symbols` already encodes From 49a05b12a385780304c4ad5da04dfb6600661c1e Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 04:02:19 +0800 Subject: [PATCH 02/10] =?UTF-8?q?Add=20the=20YAML=20indentation=20external?= =?UTF-8?q?=20scanner=20(issue=20#3,=20piece=203=20=E2=80=94=20WIP)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `buildIndentScannerC` (src/gen-treesitter.ts) generates a real C external scanner for the YAML indent tokens, replacing the stub. It mirrors src/gen-lexer.ts's indent-stack state machine: - An indent stack in the Scanner struct, (de)serialized for incremental re-parsing. 
- At each line boundary it measures the next content line's column and emits INDENT (deeper → push), DEDENT (shallower → pop, one per call until the stack top is reached), or NEWLINE (same column → sibling separator); blank and comment-only lines are skipped; open blocks are closed at EOF. - A block-scalar body (`|`/`>`) is scanned verbatim up to the first line at or below the parent indentation. - Flow needs no special case: inside `[`/`{` the grammar never references the indent tokens, so valid_symbols is false and the line break falls through to `extras`. - All language data (comment introducer, block-scalar introducers) is DERIVED from `grammar.indent`. `buildTokenBody` now emits a token's BLOCK pattern when it has one (YAML's scalar tokens), since the tree-sitter grammar is block-context at the top level. (YAML is the only grammar with a blockPattern, so the other six are byte-identical.) Verified parsing (`tree-sitter parse`): nested mappings, nested sequences, block scalars, and flow collections parse with no ERROR — the indent stack, INDENT/DEDENT/NEWLINE, and block-scalar bodies all work. KNOWN REMAINING: a flat single-line `key: value` / `- item` still mis-tokenizes — the `plain`/`key` block patterns must stop at a `: ` separator via a lookahead (`:(?=\S)`), but tree-sitter's token DFA forbids lookahead, so `sanitizeTreeSitterRegex` strips it and `plain` greedily eats `a: 1`. The official tree-sitter-yaml scans scalars in C for exactly this reason. The fix (next) is to rewrite the in-loop `:(?=\S)` boundary into an extent-equivalent consuming form (`:[^\s]`) for block-token emission, or to scan plain/key scalars in the external scanner. 
Refs #3 --- src/gen-treesitter.ts | 207 ++++++++++++++++++++++++++++++++- tree-sitter/yaml/grammar.js | 4 +- tree-sitter/yaml/src/scanner.c | 168 ++++++++++++++++++++++---- 3 files changed, 353 insertions(+), 26 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index 1d0ae3b..437f805 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -478,7 +478,13 @@ function buildTokenBody(name: string, ctx: GrammarJsContext): string | null { // rule reference — but we still emit them so highlights can capture comments. // tree-sitter's token() DFA rejects zero-width assertions, so strip them first. if (tokenPatternIsNever(tok)) return 'token(/[^\\s\\S]/)'; - return `token(${jsRegexLiteral(sanitizeTreeSitterRegex(tokenPatternSource(tok)))})`; + // A token with a BLOCK-context variant (YAML's scalar tokens: a block plain/key stops at a `: ` + // separator and a value end, where the flow variant runs through them) — emit the block pattern. + // The tree-sitter grammar is block-context at the top level; flow collections are their own rules. + // Block-only (no `pattern`) and dual tokens both resolve here; YAML is the only grammar with a + // blockPattern, so every other language is unaffected (byte-identical). + const src = tok.blockPattern ? tokenPatternSource({ pattern: tok.blockPattern }) : tokenPatternSource(tok); + return `token(${jsRegexLiteral(sanitizeTreeSitterRegex(src))})`; } // ── conflicts ──────────────────────────────────────────────────────────────── @@ -1776,11 +1782,210 @@ function cCharList(s: string): string { return [...s].map(c => `'${c === '\\' || c === "'" ? '\\' + c : c}'`).join(', '); } +// ── Indentation external scanner (YAML) ────────────────────────────────────── +// An indentation-sensitive grammar emits INDENT / DEDENT / NEWLINE from a line-leading-column state +// machine that a regex lexer cannot express, so they become external tokens scanned here. 
This C +// scanner mirrors the indent-stack logic of src/gen-lexer.ts: at each line boundary it measures the +// next content line's column and emits INDENT (deeper → push), DEDENT (shallower → pop, one per call +// until the stack top is reached), or NEWLINE (same column → a sibling separator). Flow context needs +// no special handling: inside `[`/`{` the grammar never references these tokens, so valid_symbols is +// false there and the line break falls through to `extras`. The indent stack lives in the Scanner +// struct and is (de)serialized for incremental re-parsing. Block-scalar bodies are scanned verbatim +// up to the first line at or below the parent indentation. All language data (the comment introducer, +// the block-scalar introducer chars, the document markers) is DERIVED from `grammar.indent`. +function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammarName: string): { scannerC: string; externalTokens: string[] } { + const ind = grammar.indent!; + const externalTokens = externalSymbols(ctx); // order MUST match grammar.js externals + const sym = (tokenName: string) => ctx.scannerTokenFor.get(tokenName)!.toUpperCase(); + const INDENT = sym(ind.indentToken), DEDENT = sym(ind.dedentToken), NEWLINE = sym(ind.newlineToken); + const BLOCK = ind.blockScalar ? sym(ind.blockScalar.token) : null; + const cmt = ind.comment ?? '#'; + const cmtC = cmt.length === 1 ? (cmt === '\\' || cmt === "'" ? `'\\${cmt}'` : `'${cmt}'`) : null; + const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0'; + const enumBody = externalTokens.map(s => ` ${s.toUpperCase()},`).join('\n'); + const G = grammarName; + + const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path). 
+// +// Mirrors the indent-stack state machine of src/gen-lexer.ts: INDENT / DEDENT / NEWLINE are emitted +// from the line-leading column relative to an indent stack; a block-scalar body is scanned verbatim. +// All language data (comment introducer, block-scalar introducers, document markers) is DERIVED from +// the grammar's \`indent\` config — nothing below is hand-tuned for a specific language. + +#include "tree_sitter/parser.h" +#include "tree_sitter/alloc.h" +#include <string.h> + +enum TokenType { +${enumBody} +}; + +typedef struct { + uint32_t len; // indent-stack depth (>= 1; stack[0] == 0, the document level) + uint32_t cap; + int16_t *stack; // indentation columns + int16_t pending_col; // column of the line boundary mid-processing (-1 = none) + bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col + bool started; // any content lexed yet (suppresses a leading NEWLINE) +} Scanner; + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static void push_indent(Scanner *s, int16_t col) { + if (s->len == s->cap) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); } + s->stack[s->len++] = col; +} + +void *tree_sitter_${G}_external_scanner_create(void) { + Scanner *s = ts_malloc(sizeof(Scanner)); + s->cap = 16; s->len = 1; + s->stack = ts_malloc(s->cap * sizeof(int16_t)); + s->stack[0] = 0; + s->pending_col = -1; s->pending_newline = false; s->started = false; + return s; +} + +void tree_sitter_${G}_external_scanner_destroy(void *payload) { + Scanner *s = (Scanner *)payload; + ts_free(s->stack); + ts_free(s); +} + +unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer) { + Scanner *s = (Scanner *)payload; + unsigned n = 0; + buffer[n++] = s->started ? 1 : 0; + buffer[n++] = s->pending_newline ? 
1 : 0; + memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t); + uint32_t count = s->len; + while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--; + memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t); + memcpy(&buffer[n], s->stack, count * sizeof(int16_t)); n += count * sizeof(int16_t); + return n; +} + +void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { + Scanner *s = (Scanner *)payload; + s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; + if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return; + unsigned n = 0; + s->started = buffer[n++] != 0; + s->pending_newline = buffer[n++] != 0; + memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t); + uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t); + if (count == 0) return; // keep stack[0] = 0 + while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); } + memcpy(s->stack, &buffer[n], count * sizeof(int16_t)); + s->len = count; +} + +${BLOCK ? `// A block scalar (\`|\` / \`>\`): the introducer + indicators + the verbatim more-indented body, as +// one token. The body runs while a line is blank or indented MORE than the parent block level (the +// stack top); it ends at the first non-blank line at or below the parent, or a column-0 document +// marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's +// indentation is left for the normal boundary logic. 
+static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { + int parent = s->stack[s->len - 1]; + advance(lexer); // the introducer (| or >) + while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer); + lexer->mark_end(lexer); // the header line belongs to the scalar + for (;;) { + if (lexer->lookahead == '\\r') { advance(lexer); if (lexer->lookahead == '\\n') advance(lexer); } + else if (lexer->lookahead == '\\n') advance(lexer); + else break; // EOF + int col = 0; + while (lexer->lookahead == ' ') { advance(lexer); col++; } + int32_t c = lexer->lookahead; + if (c == 0 || c == '\\n' || c == '\\r') { lexer->mark_end(lexer); continue; } // blank line → body + if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node) + while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer); + lexer->mark_end(lexer); + } + lexer->result_symbol = ${BLOCK}; + return true; +} +` : ''} +bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { + Scanner *s = (Scanner *)payload; + bool want_indent = valid_symbols[${INDENT}]; + bool want_dedent = valid_symbols[${DEDENT}]; + bool want_newline = valid_symbols[${NEWLINE}];${BLOCK ? `\n bool want_block = valid_symbols[${BLOCK}];` : ''} + + // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the + // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here. 
+ if (s->pending_col >= 0) { + int top = s->stack[s->len - 1]; + if (s->pending_col < top) { + if (want_dedent) { s->len--; lexer->result_symbol = ${DEDENT}; return true; } + s->pending_col = -1; s->pending_newline = false; return false; + } + bool owed = s->pending_newline; + int16_t col = s->pending_col; + s->pending_col = -1; s->pending_newline = false; + if (col == top && owed && want_newline && s->started) { lexer->result_symbol = ${NEWLINE}; return true; } + return false; + } +${BLOCK ? ` + // A block scalar value (\`key: |\`): scan its body before the indent logic — its more-indented + // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first. + if (want_block) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); + if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; } + } +` : ''} + if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid + + // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was + // crossed (only a real boundary drives the indent logic). + bool found_eol = false; + for (;;) { + int32_t c = lexer->lookahead; + if (c == '\\n') { skip(lexer); found_eol = true; } + else if (c == '\\r') { skip(lexer); if (lexer->lookahead == '\\n') skip(lexer); found_eol = true; } + else if (c == ' ' || c == '\\t') { skip(lexer); } + ${cmtC ? 
`else if (c == ${cmtC}) { while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') skip(lexer); }` : ''} + else break; // content or EOF + } + + if (lexer->eof(lexer)) { + if (s->len > 1 && want_dedent) { s->len--; lexer->result_symbol = ${DEDENT}; return true; } // close open blocks at EOF + return false; + } + bool was_started = s->started; + s->started = true; // content lies ahead — mark started even on a mid-line return + if (!found_eol) return false; // not at a line boundary + + int16_t col = (int16_t)lexer->get_column(lexer); + lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column + int top = s->stack[s->len - 1]; + + if (col > top) { + if (want_indent) { push_indent(s, col); lexer->result_symbol = ${INDENT}; return true; } + return false; + } + if (col == top) { + if (want_newline && was_started) { lexer->result_symbol = ${NEWLINE}; return true; } + return false; + } + // col < top: a dedent. Emit one DEDENT now; queue the rest + the trailing NEWLINE for re-entry. 
+ if (want_dedent) { + s->pending_col = col; s->pending_newline = true; + s->len--; lexer->result_symbol = ${DEDENT}; return true; + } + return false; +} +`; + return { scannerC, externalTokens }; +} + function buildScannerC( grammar: CstGrammar, ctx: GrammarJsContext, grammarName: string, ): { scannerC: string; externalTokens: string[] } { + if (grammar.indent) return buildIndentScannerC(grammar, ctx, grammarName); const regexTok = grammar.tokens.find(t => t.flags.includes('regex')); const tp = ctx.templatePlan; diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js index 5d693f9..78df6a8 100644 --- a/tree-sitter/yaml/grammar.js +++ b/tree-sitter/yaml/grammar.js @@ -170,13 +170,13 @@ module.exports = grammar({ tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/), - key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/), + key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/), num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/), bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/), - plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/) + plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/) } }); diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index 2167de9..9b006ca 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -1,16 +1,13 @@ -// Tree-sitter external scanner generated by monogram. +// Tree-sitter external scanner generated by monogram (indentation path). // -// COMPLETE — the regex-literal scan and the template-literal scan are both -// wired from the grammar's token hints (`regexContext` and `template`). 
-// -// All language-specific data below is DERIVED from the CstGrammar, not -// hardcoded: the regex flag chars and the template delimiters all come from -// the grammar's token hints. +// Mirrors the indent-stack state machine of src/gen-lexer.ts: INDENT / DEDENT / NEWLINE are emitted +// from the line-leading column relative to an indent stack; a block-scalar body is scanned verbatim. +// All language data (comment introducer, block-scalar introducers, document markers) is DERIVED from +// the grammar's `indent` config — nothing below is hand-tuned for a specific language. #include "tree_sitter/parser.h" #include "tree_sitter/alloc.h" #include -#include enum TokenType { INDENT, @@ -19,35 +16,160 @@ enum TokenType { BLOCK_SCALAR, }; -// The scanner is stateless — tree-sitter's `valid_symbols` already encodes -// the parse context (inside a regex slot? inside a template span?), and the -// `${ … }` brace nesting is handled by the template_substitution rule in the -// CFG, so there is nothing to (de)serialize. 
-typedef struct { char unused; } Scanner; +typedef struct { + uint32_t len; // indent-stack depth (>= 1; stack[0] == 0, the document level) + uint32_t cap; + int16_t *stack; // indentation columns + int16_t pending_col; // column of the line boundary mid-processing (-1 = none) + bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col + bool started; // any content lexed yet (suppresses a leading NEWLINE) +} Scanner; + +static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } +static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } + +static void push_indent(Scanner *s, int16_t col) { + if (s->len == s->cap) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); } + s->stack[s->len++] = col; +} void *tree_sitter_yaml_external_scanner_create(void) { - return ts_calloc(1, sizeof(Scanner)); + Scanner *s = ts_malloc(sizeof(Scanner)); + s->cap = 16; s->len = 1; + s->stack = ts_malloc(s->cap * sizeof(int16_t)); + s->stack[0] = 0; + s->pending_col = -1; s->pending_newline = false; s->started = false; + return s; } void tree_sitter_yaml_external_scanner_destroy(void *payload) { - ts_free(payload); + Scanner *s = (Scanner *)payload; + ts_free(s->stack); + ts_free(s); } unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer) { - (void)payload; (void)buffer; - return 0; + Scanner *s = (Scanner *)payload; + unsigned n = 0; + buffer[n++] = s->started ? 1 : 0; + buffer[n++] = s->pending_newline ? 
1 : 0; + memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t); + uint32_t count = s->len; + while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--; + memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t); + memcpy(&buffer[n], s->stack, count * sizeof(int16_t)); n += count * sizeof(int16_t); + return n; } void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { - (void)payload; (void)buffer; (void)length; + Scanner *s = (Scanner *)payload; + s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; + if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return; + unsigned n = 0; + s->started = buffer[n++] != 0; + s->pending_newline = buffer[n++] != 0; + memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t); + uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t); + if (count == 0) return; // keep stack[0] = 0 + while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); } + memcpy(s->stack, &buffer[n], count * sizeof(int16_t)); + s->len = count; } -static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } -static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); } +// A block scalar (`|` / `>`): the introducer + indicators + the verbatim more-indented body, as +// one token. The body runs while a line is blank or indented MORE than the parent block level (the +// stack top); it ends at the first non-blank line at or below the parent, or a column-0 document +// marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's +// indentation is left for the normal boundary logic. 
+static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { + int parent = s->stack[s->len - 1]; + advance(lexer); // the introducer (| or >) + while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer); + lexer->mark_end(lexer); // the header line belongs to the scalar + for (;;) { + if (lexer->lookahead == '\r') { advance(lexer); if (lexer->lookahead == '\n') advance(lexer); } + else if (lexer->lookahead == '\n') advance(lexer); + else break; // EOF + int col = 0; + while (lexer->lookahead == ' ') { advance(lexer); col++; } + int32_t c = lexer->lookahead; + if (c == 0 || c == '\n' || c == '\r') { lexer->mark_end(lexer); continue; } // blank line → body + if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node) + while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer); + lexer->mark_end(lexer); + } + lexer->result_symbol = BLOCK_SCALAR; + return true; +} + +bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { + Scanner *s = (Scanner *)payload; + bool want_indent = valid_symbols[INDENT]; + bool want_dedent = valid_symbols[DEDENT]; + bool want_newline = valid_symbols[NEWLINE]; + bool want_block = valid_symbols[BLOCK_SCALAR]; + + // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the + // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here. 
+ if (s->pending_col >= 0) { + int top = s->stack[s->len - 1]; + if (s->pending_col < top) { + if (want_dedent) { s->len--; lexer->result_symbol = DEDENT; return true; } + s->pending_col = -1; s->pending_newline = false; return false; + } + bool owed = s->pending_newline; + int16_t col = s->pending_col; + s->pending_col = -1; s->pending_newline = false; + if (col == top && owed && want_newline && s->started) { lexer->result_symbol = NEWLINE; return true; } + return false; + } + + // A block scalar value (`key: |`): scan its body before the indent logic — its more-indented + // lines are content, not nested structure. Skip the inline space after the `:`/`-` first. + if (want_block) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); + if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; } + } + + if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid + + // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was + // crossed (only a real boundary drives the indent logic). 
+ bool found_eol = false; + for (;;) { + int32_t c = lexer->lookahead; + if (c == '\n') { skip(lexer); found_eol = true; } + else if (c == '\r') { skip(lexer); if (lexer->lookahead == '\n') skip(lexer); found_eol = true; } + else if (c == ' ' || c == '\t') { skip(lexer); } + else if (c == '#') { while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') skip(lexer); } + else break; // content or EOF + } + + if (lexer->eof(lexer)) { + if (s->len > 1 && want_dedent) { s->len--; lexer->result_symbol = DEDENT; return true; } // close open blocks at EOF + return false; + } + bool was_started = s->started; + s->started = true; // content lies ahead — mark started even on a mid-line return + if (!found_eol) return false; // not at a line boundary -bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, - const bool *valid_symbols) { - (void)payload; + int16_t col = (int16_t)lexer->get_column(lexer); + lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column + int top = s->stack[s->len - 1]; + if (col > top) { + if (want_indent) { push_indent(s, col); lexer->result_symbol = INDENT; return true; } + return false; + } + if (col == top) { + if (want_newline && was_started) { lexer->result_symbol = NEWLINE; return true; } + return false; + } + // col < top: a dedent. Emit one DEDENT now; queue the rest + the trailing NEWLINE for re-entry. 
+ if (want_dedent) { + s->pending_col = col; s->pending_newline = true; + s->len--; lexer->result_symbol = DEDENT; return true; + } return false; } From e644798f9f75b949818f9bea2ecb039e94e87fdb Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 04:53:23 +0800 Subject: [PATCH 03/10] =?UTF-8?q?Scan=20YAML=20plain/key=20scalars=20in=20?= =?UTF-8?q?the=20C=20scanner=20=E2=80=94=20flat=20scalars=20now=20tokenize?= =?UTF-8?q?=20(issue=20#3,=20piece=203)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tree-sitter token DFAs cannot use look-around, so a YAML plain scalar's boundary (`:` is content unless followed by space; `#` is a comment only after a space) could not be a regex token — `plain` greedily ate `a: 1`. `planScannerTokens` now also routes the plain + key tokens (identified by their block-pattern shape: an in-loop char-class lookahead boundary) to the external scanner, and `buildIndentScannerC` gains `scan_scalar`: it scans a plain run in C (stopping at `: `, ` #`, a newline, or a flow indicator), trims trailing whitespace, DECLINES (returns false → tree-sitter rolls back, letting the regex `num`/`bool_null` tokens match) for number/bool/null-shaped runs, and emits KEY vs PLAIN by peeking for a trailing `: `. All derived from the grammar; the six other grammars stay byte-identical and `gate:treesitter` is unaffected (96.0%, still beats official 92.5%). Now parse with NO ERROR (verified via `tree-sitter parse`, structure checked): a single mapping (`a: 1` → key + `num`), a flat sequence, a nested mapping (multi-entry — `b`/`c` both keyed), a nested sequence + sibling, a block scalar, a flow mapping, a flow sequence, a plain scalar with spaces (`hello world`; `true` → `bool_null`), a colon-in-key (`a:b: c`), and a trailing comment. 
KNOWN REMAINING: a TOP-LEVEL multi-entry block mapping (`x: 1\ny: 2\nz: 3` — the most common YAML shape) still mis-parses: the first entry's value is dropped and 3+ entries ERROR. NESTED multi-entry mappings parse correctly, so this is specific to document-level NEWLINE-separated chaining — a grammar/GLR-runtime issue in the `mapping_or_scalar`/`node`/`stream` rules (likely the ε-elimination making a mapping value optional and GLR committing to the wrong split), NOT the scalar scanner. Next. Refs #3 --- src/gen-treesitter.ts | 181 ++++++++++++++++++++++++++++++++- tree-sitter/yaml/grammar.js | 10 +- tree-sitter/yaml/src/scanner.c | 118 +++++++++++++++++++++ 3 files changed, 302 insertions(+), 7 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index 437f805..eee0bca 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -1,4 +1,4 @@ -import type { CstGrammar, RuleExpr, RuleDecl } from './types.ts'; +import type { CstGrammar, RuleExpr, RuleDecl, TokenPattern } from './types.ts'; import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; import { tokenPatternIsNever, tokenPatternSource, tokenPatternStartsWithDecimal, tokenPatternStringDelimiters, tokenPatternTrailingCharClass } from './token-pattern.ts'; @@ -716,6 +716,55 @@ function planInterpolations(grammar: CstGrammar): InterpolationPlan[] { return plans; } +/** + * A "plain-family" scalar token (indentation grammars only): one whose boundary is a look-around + * INSIDE its body loop — the `:(?=\S)` colon-is-content rule of a YAML plain/key scalar. Concretely, + * its `blockPattern` contains, somewhere under a `repeat`, a `seq` ending in a POSITIVE char-class + * lookahead. That in-loop assertion is exactly what a tree-sitter `token()` DFA cannot honour (it + * needs look-ahead to decide where the scalar ends), so such a token must be scanned in C. 
+ * + * A typed look-alike (num / bool-null) ALSO carries a `blockPattern`, but its boundary is a + * TOP-LEVEL trailing lookahead (`(?=…)` — not inside a repeat), which the DFA enforces + * structurally; those are NOT plain-family and stay regex `token()` rules so the parser still + * classifies `1` as a number and `true` as a bool. The test is therefore purely STRUCTURAL — it + * never names a token — so any grammar without this shape is unaffected. + */ +function isPlainFamilyToken(tok: CstGrammar['tokens'][number]): boolean { + const p = tok.blockPattern; + if (!p || typeof p === 'string') return false; + let found = false; + const walk = (node: TokenPattern, inRepeat: boolean): void => { + if (typeof node === 'string') return; + switch (node.type) { + case 'repeat': walk(node.body, true); break; + case 'seq': { + if (inRepeat) { + const last = node.items[node.items.length - 1]; + if (last && typeof last !== 'string' && last.type === 'lookahead' && !last.negate) found = true; + } + for (const it of node.items) walk(it, inRepeat); + break; + } + case 'alt': for (const it of node.items) walk(it, inRepeat); break; + case 'lookahead': case 'lookbehind': walk(node.body, inRepeat); break; + default: break; + } + }; + walk(p, false); + return found; +} + +/** The plain-family tokens of an indentation grammar, split into the PLAIN scalar and the KEY scalar + * by their scope leaf (a cross-check on the structural detection): a plain scalar is scoped + * `string.unquoted…`, a key scalar `entity.name.tag…`. Either may be absent. */ +function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: string } { + if (!grammar.indent) return {}; + const fam = grammar.tokens.filter(isPlainFamilyToken); + const plain = fam.find(t => (t.scope ?? '').startsWith('string.unquoted'))?.name; + const key = fam.find(t => (t.scope ?? '').startsWith('entity.name.tag'))?.name; + return { plain, key }; +} + /** Determine which tokens the external scanner must provide. 
*/ function planScannerTokens(grammar: CstGrammar): Map { const map = new Map(); @@ -735,6 +784,14 @@ function planScannerTokens(grammar: CstGrammar): Map { map.set(ind.dedentToken, toSnake(ind.dedentToken)); map.set(ind.newlineToken, toSnake(ind.newlineToken)); if (ind.blockScalar) map.set(ind.blockScalar.token, toSnake(ind.blockScalar.token)); + // The PLAIN and KEY scalars (a `:` is content unless followed by space/EOL; a `#` starts a + // comment only after a space) need look-ahead at their boundary, which a tree-sitter `token()` + // DFA lacks — so they too become external tokens, scanned by `scan_scalar` in C. Appended AFTER + // the block scalar so the enum stays INDENT,DEDENT,NEWLINE,BLOCK_SCALAR,PLAIN,KEY. (Num/BoolNull + // are NOT plain-family — their boundary is DFA-expressible — so they stay regex token rules.) + const { plain, key } = planPlainScalarTokens(grammar); + if (plain) map.set(plain, toSnake(plain)); + if (key) map.set(key, toSnake(key)); } // The regex token: '/' is context-sensitive (regex vs division). The scanner // resolves it. @@ -1799,6 +1856,12 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar const sym = (tokenName: string) => ctx.scannerTokenFor.get(tokenName)!.toUpperCase(); const INDENT = sym(ind.indentToken), DEDENT = sym(ind.dedentToken), NEWLINE = sym(ind.newlineToken); const BLOCK = ind.blockScalar ? sym(ind.blockScalar.token) : null; + // The PLAIN / KEY scalar externals (a `:` is a separator only before space/EOL/flow-indicator; a + // `#` a comment only after a space) — scanned by scan_scalar where look-ahead IS available. + const { plain: plainTok, key: keyTok } = planPlainScalarTokens(grammar); + const PLAIN = plainTok ? sym(plainTok) : null; + const KEY = keyTok ? sym(keyTok) : null; + const SCALAR = PLAIN || KEY; // either may be absent; scan_scalar is emitted when at least one is const cmt = ind.comment ?? '#'; const cmtC = cmt.length === 1 ? (cmt === '\\' || cmt === "'" ? 
`'\\${cmt}'` : `'${cmt}'`) : null; const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0'; @@ -1906,6 +1969,105 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { lexer->result_symbol = ${BLOCK}; return true; } +` : ''}${SCALAR ? ` +// A PLAIN / KEY scalar. A tree-sitter token() DFA can't decide a plain scalar's boundary (a \`:\` is +// content unless followed by space/EOL/flow-indicator; a \`#\` starts a comment only after a space), +// so we scan it here where look-ahead IS available. The run starts at the current column and ends +// BEFORE the first key/value \`:\`-separator, comment, flow indicator, newline, or EOF; trailing +// whitespace is trimmed. KEY vs PLAIN is decided by whether a \`:\`-separator immediately follows. +// +// A number- or bool/null-SHAPED run is left to the regex \`num\`/\`bool_null\` tokens (return false → +// tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is +// valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so +// a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num. +static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { + char buf[64]; + unsigned blen = 0; // run text (capped) — for the number/bool-null shape test + bool has_content = false; + bool stopped_at_kv = false; // ended at a \`:\`-separator → this scalar is a mapping KEY + for (;;) { + int32_t c = lexer->lookahead; + if (c == 0 || c == '\\n' || c == '\\r') break; // newline / EOF + if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar + if (!has_content && (c == '-' || c == '?')) { + // A leading \`-\`/\`?\` is a block indicator (seq entry / explicit key) when followed by space/EOL/ + // flow-indicator, and scalar content otherwise (\`-1\`, \`?x\`). Peek the next char to decide. 
+ lexer->advance(lexer, false); + int32_t n = lexer->lookahead; + if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' || + n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar + if (blen < sizeof(buf)) buf[blen++] = (char)c; // \`-\`/\`?\` glued to non-space is content + has_content = true; + lexer->mark_end(lexer); + continue; + } + if (c == ':') { + lexer->advance(lexer, false); // past the ':' to peek the next char + int32_t n = lexer->lookahead; + if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' || + n == ',' || n == '[' || n == ']' || n == '{' || n == '}') { + stopped_at_kv = true; break; // ':' is a key/value separator → end before it + } + if (blen < sizeof(buf)) buf[blen++] = ':'; // ':' glued to non-space is content + has_content = true; + lexer->mark_end(lexer); + continue; + } + if (c == ' ' || c == '\\t') { + lexer->advance(lexer, false); // past the space to peek the next char + if (lexer->lookahead == '#') break; // " #" begins a comment → end before the space + if (blen < sizeof(buf)) buf[blen++] = ' '; // interior space (e.g. "hello world") + continue; // do NOT mark_end → trailing spaces are trimmed + } + if (blen < sizeof(buf)) buf[blen++] = (char)c; + has_content = true; + lexer->advance(lexer, false); + lexer->mark_end(lexer); // token end follows the last content char + } + if (!has_content) return false; + + // Number / bool-null SHAPE test (so the typed regex tokens still classify \`1\`/\`true\`). Decide KEY + // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY + // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a + // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN. + bool is_key = stopped_at_kv ? want_key : false;
+ // numeric: every char is in [0-9 . + - e E x o A-F] or a letter of inf/nan (NOTE: lowercase hex b/c/d + // are not accepted, so \`0xbc\` is never deferred). The regex \`num\` makes the precise decision, but once + // we return false tree-sitter cannot fall back to us — keep the test TIGHT so no real plain string is deferred. + bool numeric = blen > 0; + for (unsigned i = 0; i < blen; i++) { + char ch = buf[i]; + bool ok = (ch >= '0' && ch <= '9') || ch == '.' || ch == '+' || ch == '-' || ch == 'e' || ch == 'E' || + ch == 'x' || ch == 'o' || ch == 'n' /* inf/nan */ || ch == 'a' || ch == 'f' || ch == 'i' || + ch == 'I' || ch == 'N' || ch == 'F' || (ch >= 'A' && ch <= 'F'); + if (!ok) { numeric = false; break; } + } + // also require at least one digit so a bare "e"/"a" word isn't called numeric (NOTE: this keeps .inf/.nan PLAIN too) + if (numeric) { + bool any_digit = false; + for (unsigned i = 0; i < blen; i++) if (buf[i] >= '0' && buf[i] <= '9') { any_digit = true; break; } + if (!any_digit) numeric = false; + } + bool boolnull = false; + { + static const char *WORDS[] = { "true","True","TRUE","false","False","FALSE","null","Null","NULL","~" }; + for (unsigned w = 0; w < sizeof(WORDS)/sizeof(WORDS[0]); w++) { + const char *p = WORDS[w]; unsigned i = 0; + while (i < blen && p[i] && buf[i] == p[i]) i++; + if (i == blen && p[i] == 0) { boolnull = true; break; } + } + } + // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible + // here. \`want_key\` (a key slot) always admits num/bool_null; a value slot does too. A plain-ONLY + // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value + // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in + // both: a numeric fold continuation typing as a number is the documented imprecise edge. + if ((numeric || boolnull) && want_plain) return false; + + if (is_key) { lexer->result_symbol = want_key ? ${KEY ?? PLAIN} : ${PLAIN ?? 
KEY}; return true; } + lexer->result_symbol = want_plain ? ${PLAIN ?? KEY} : ${KEY ?? PLAIN}; + return true; +} ` : ''} bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { Scanner *s = (Scanner *)payload; @@ -1934,6 +2096,23 @@ ${BLOCK ? ` while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; } } +` : ''}${SCALAR ? ` + // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through + // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline spaces/tabs, then if + // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — \`-\`/\`?\`/\`:\` + // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator), + // scan it where look-ahead is available. + if (valid_symbols[${KEY ?? PLAIN}] || valid_symbols[${PLAIN ?? KEY}]) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); + int32_t h = lexer->lookahead; + bool indicator = h == 0 || h == '\\n' || h == '\\r' || h == ',' || h == '[' || h == ']' || h == '{' || + h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' || + h == '\\'' || h == '"' || h == '%' || h == '@' || h == '\`'; + if (!indicator) { + bool wk = valid_symbols[${KEY ?? PLAIN}] != 0, wp = valid_symbols[${PLAIN ?? 
KEY}] != 0; + if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; } + } + } ` : ''} if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js index 78df6a8..1ec0754 100644 --- a/tree-sitter/yaml/grammar.js +++ b/tree-sitter/yaml/grammar.js @@ -20,7 +20,9 @@ module.exports = grammar({ $.indent, $.dedent, $.newline, - $.block_scalar + $.block_scalar, + $.plain, + $.key ], conflicts: $ => [ @@ -170,13 +172,9 @@ module.exports = grammar({ tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/), - key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/), - num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/), - bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/), - - plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/) + bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/) } }); diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index 9b006ca..7d55271 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -14,6 +14,8 @@ enum TokenType { DEDENT, NEWLINE, BLOCK_SCALAR, + PLAIN, + KEY, }; typedef struct { @@ -103,6 +105,105 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { return true; } +// A PLAIN / KEY scalar. A tree-sitter token() DFA can't decide a plain scalar's boundary (a `:` is +// content unless followed by space/EOL/flow-indicator; a `#` starts a comment only after a space), +// so we scan it here where look-ahead IS available. The run starts at the current column and ends +// BEFORE the first key/value `:`-separator, comment, flow indicator, newline, or EOF; trailing +// whitespace is trimmed. KEY vs PLAIN is decided by whether a `:`-separator immediately follows. 
+// +// A number- or bool/null-SHAPED run is left to the regex `num`/`bool_null` tokens (return false → +// tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is +// valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so +// a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num. +static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { + char buf[64]; + unsigned blen = 0; // run text (capped) — for the number/bool-null shape test + bool has_content = false; + bool stopped_at_kv = false; // ended at a `:`-separator → this scalar is a mapping KEY + for (;;) { + int32_t c = lexer->lookahead; + if (c == 0 || c == '\n' || c == '\r') break; // newline / EOF + if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar + if (!has_content && (c == '-' || c == '?')) { + // A leading `-`/`?` is a block indicator (seq entry / explicit key) when followed by space/EOL/ + // flow-indicator, and scalar content otherwise (`-1`, `?x`). Peek the next char to decide. 
+ lexer->advance(lexer, false); + int32_t n = lexer->lookahead; + if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' || + n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar + if (blen < sizeof(buf)) buf[blen++] = (char)c; // `-`/`?` glued to non-space is content + has_content = true; + lexer->mark_end(lexer); + continue; + } + if (c == ':') { + lexer->advance(lexer, false); // past the ':' to peek the next char + int32_t n = lexer->lookahead; + if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' || + n == ',' || n == '[' || n == ']' || n == '{' || n == '}') { + stopped_at_kv = true; break; // ':' is a key/value separator → end before it + } + if (blen < sizeof(buf)) buf[blen++] = ':'; // ':' glued to non-space is content + has_content = true; + lexer->mark_end(lexer); + continue; + } + if (c == ' ' || c == '\t') { + lexer->advance(lexer, false); // past the space to peek the next char + if (lexer->lookahead == '#') break; // " #" begins a comment → end before the space + if (blen < sizeof(buf)) buf[blen++] = ' '; // interior space (e.g. "hello world") + continue; // do NOT mark_end → trailing spaces are trimmed + } + if (blen < sizeof(buf)) buf[blen++] = (char)c; + has_content = true; + lexer->advance(lexer, false); + lexer->mark_end(lexer); // token end follows the last content char + } + if (!has_content) return false; + + // Number / bool-null SHAPE test (so the typed regex tokens still classify `1`/`true`). Decide KEY + // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY + // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a + // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN. + bool is_key = stopped_at_kv ? want_key : false; + // numeric: only [0-9 . 
+ - e E x o a-f A-F _ : (already excluded)] — a loose superset is fine, the + // regex `num` makes the precise decision; if it doesn't match, tree-sitter falls back to us is NOT + // possible (we already returned), so keep the test TIGHT enough to never defer a real plain string. + bool numeric = blen > 0; + for (unsigned i = 0; i < blen; i++) { + char ch = buf[i]; + bool ok = (ch >= '0' && ch <= '9') || ch == '.' || ch == '+' || ch == '-' || ch == 'e' || ch == 'E' || + ch == 'x' || ch == 'o' || ch == 'n' /* inf/nan */ || ch == 'a' || ch == 'f' || ch == 'i' || + ch == 'I' || ch == 'N' || ch == 'F' || (ch >= 'A' && ch <= 'F'); + if (!ok) { numeric = false; break; } + } + // also require at least one digit OR a .inf/.nan/~ shape so a bare "e"/"a" word isn't called numeric + if (numeric) { + bool any_digit = false; + for (unsigned i = 0; i < blen; i++) if (buf[i] >= '0' && buf[i] <= '9') { any_digit = true; break; } + if (!any_digit) numeric = false; + } + bool boolnull = false; + { + static const char *WORDS[] = { "true","True","TRUE","false","False","FALSE","null","Null","NULL","~" }; + for (unsigned w = 0; w < sizeof(WORDS)/sizeof(WORDS[0]); w++) { + const char *p = WORDS[w]; unsigned i = 0; + while (i < blen && p[i] && buf[i] == p[i]) i++; + if (i == blen && p[i] == 0) { boolnull = true; break; } + } + } + // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible + // here. `want_key` (a key slot) always admits num/bool_null; a value slot does too. A plain-ONLY + // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value + // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in + // both: a numeric fold continuation typing as a number is the documented imprecise edge. + if ((numeric || boolnull) && want_plain) return false; + + if (is_key) { lexer->result_symbol = want_key ? KEY : PLAIN; return true; } + lexer->result_symbol = want_plain ? 
PLAIN : KEY; + return true; +} + bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { Scanner *s = (Scanner *)payload; bool want_indent = valid_symbols[INDENT]; @@ -132,6 +233,23 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; } } + // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through + // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline spaces/tabs, then if + // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — `-`/`?`/`:` + // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator), + // scan it where look-ahead is available. + if (valid_symbols[KEY] || valid_symbols[PLAIN]) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); + int32_t h = lexer->lookahead; + bool indicator = h == 0 || h == '\n' || h == '\r' || h == ',' || h == '[' || h == ']' || h == '{' || + h == '}' || h == '#' || h == '&' || h == '*' || h == '!' 
|| h == '|' || h == '>' || + h == '\'' || h == '"' || h == '%' || h == '@' || h == '`'; + if (!indicator) { + bool wk = valid_symbols[KEY] != 0, wp = valid_symbols[PLAIN] != 0; + if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; } + } + } + if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was From 1d9e19bae6e61723157b8fc8849195f764a0b46c Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 05:17:51 +0800 Subject: [PATCH 04/10] =?UTF-8?q?Scan=20typed=20YAML=20scalars=20in=20C=20?= =?UTF-8?q?too=20=E2=80=94=20top-level=20multi-entry=20mappings=20now=20pa?= =?UTF-8?q?rse=20(issue=20#3,=20piece=203)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The decline path (scanner returns false for a number/bool/null-shaped run so the regex `num`/ `bool_null` token matches) dropped the value-vs-key disambiguation that the external PLAIN/KEY tokens carry, so GLR mis-chained a TOP-LEVEL multi-entry block mapping (`x: 1\ny: 2\nz: 3` — the first value dropped, 3+ entries ERROR), even though nested multi-entry and plain-valued top-level mappings parsed. Fix: externalize num + bool_null too (every token with a `blockPattern` is now scanned in C) and have `scan_scalar` CLASSIFY the run and emit NUM / BOOL_NULL / KEY / PLAIN directly (no decline) — so every scalar is an external token that resolves the key-vs-value choice for the parser. Number/ bool/null typing is preserved (verified: `1`→num, `true`/`null`→bool_null, `hello`→plain). Removed the now-superseded `isPlainFamilyToken` / consume-rewrite dead code. Parse with NO ERROR (verified): single + flat-multi-entry mappings, sequences, nested mappings, nested sequences, block scalars, flow map/seq, plain-with-spaces, colon-in-key, trailing comment, empty-value sibling, blank-line-separated, deep nesting. 
The 6 other grammars stay byte-identical and gate:treesitter is unaffected (96.0%, beats official 92.5%). KNOWN REMAINING: a list-of-maps / COMPACT block (`- a: 1\n b: 2` — a sequence item whose value is a multi-entry mapping, the common GitHub-Actions `- uses:\n with:` shape) still errors — the scanner must push the inline content column after a `-`/`?` indicator (gen-lexer's `compactIndicators`), which it does not yet. Plus an accuracy bench over yaml-test-suite (present at /tmp). Next. Refs #3 --- src/gen-treesitter.ts | 126 ++++++++++++++------------------- tree-sitter/yaml/grammar.js | 10 ++- tree-sitter/yaml/src/scanner.c | 47 ++++++------ 3 files changed, 81 insertions(+), 102 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index eee0bca..bb84a12 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -716,53 +716,25 @@ function planInterpolations(grammar: CstGrammar): InterpolationPlan[] { return plans; } -/** - * A "plain-family" scalar token (indentation grammars only): one whose boundary is a look-around - * INSIDE its body loop — the `:(?=\S)` colon-is-content rule of a YAML plain/key scalar. Concretely, - * its `blockPattern` contains, somewhere under a `repeat`, a `seq` ending in a POSITIVE char-class - * lookahead. That in-loop assertion is exactly what a tree-sitter `token()` DFA cannot honour (it - * needs look-ahead to decide where the scalar ends), so such a token must be scanned in C. - * - * A typed look-alike (num / bool-null) ALSO carries a `blockPattern`, but its boundary is a - * TOP-LEVEL trailing lookahead (`(?=…)` — not inside a repeat), which the DFA enforces - * structurally; those are NOT plain-family and stay regex `token()` rules so the parser still - * classifies `1` as a number and `true` as a bool. The test is therefore purely STRUCTURAL — it - * never names a token — so any grammar without this shape is unaffected. 
- */ -function isPlainFamilyToken(tok: CstGrammar['tokens'][number]): boolean { - const p = tok.blockPattern; - if (!p || typeof p === 'string') return false; - let found = false; - const walk = (node: TokenPattern, inRepeat: boolean): void => { - if (typeof node === 'string') return; - switch (node.type) { - case 'repeat': walk(node.body, true); break; - case 'seq': { - if (inRepeat) { - const last = node.items[node.items.length - 1]; - if (last && typeof last !== 'string' && last.type === 'lookahead' && !last.negate) found = true; - } - for (const it of node.items) walk(it, inRepeat); - break; - } - case 'alt': for (const it of node.items) walk(it, inRepeat); break; - case 'lookahead': case 'lookbehind': walk(node.body, inRepeat); break; - default: break; - } - }; - walk(p, false); - return found; -} - -/** The plain-family tokens of an indentation grammar, split into the PLAIN scalar and the KEY scalar - * by their scope leaf (a cross-check on the structural detection): a plain scalar is scoped - * `string.unquoted…`, a key scalar `entity.name.tag…`. Either may be absent. */ -function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: string } { +/** The block-context SCALAR tokens of an indentation grammar (those carrying a `blockPattern`), split + * by their scope leaf: PLAIN `string.unquoted…`, KEY `entity.name.tag…`, NUM `constant.numeric…`, + * bool/null `constant.language…`. All are scanned in C (see scan_scalar) — a YAML plain/key boundary + * (`:(?=\S)`, `#`-after-space) is a look-around a tree-sitter token DFA can't honour, and a typed + * value emitted by the regex lexer would not carry the key-vs-value decision the GLR parser needs to + * chain top-level mapping entries. Any field may be absent. 
*/ +function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: string; num?: string; boolnull?: string } { if (!grammar.indent) return {}; - const fam = grammar.tokens.filter(isPlainFamilyToken); - const plain = fam.find(t => (t.scope ?? '').startsWith('string.unquoted'))?.name; + // Every token carrying a `blockPattern` is a block-context scalar; emitting num/bool-null from the + // scanner too (classified by shape) — not via a regex token + decline — keeps every scalar an + // external token, so the key-vs-value decision is carried and `x: 1\ny: 2` chains correctly. + // Split by the scope leaf (the convention is data in the grammar): plain `string.unquoted`, key + // `entity.name.tag`, num `constant.numeric`, bool/null `constant.language`. + const fam = grammar.tokens.filter(t => t.blockPattern !== undefined && typeof t.blockPattern !== 'string'); + const num = fam.find(t => (t.scope ?? '').includes('constant.numeric'))?.name; + const boolnull = fam.find(t => (t.scope ?? '').includes('constant.language'))?.name; const key = fam.find(t => (t.scope ?? '').startsWith('entity.name.tag'))?.name; - return { plain, key }; + const plain = fam.find(t => (t.scope ?? '').startsWith('string.unquoted') && !(t.scope ?? '').includes('constant.'))?.name; + return { plain, key, num, boolnull }; } /** Determine which tokens the external scanner must provide. */ @@ -789,9 +761,11 @@ function planScannerTokens(grammar: CstGrammar): Map { // DFA lacks — so they too become external tokens, scanned by `scan_scalar` in C. Appended AFTER // the block scalar so the enum stays INDENT,DEDENT,NEWLINE,BLOCK_SCALAR,PLAIN,KEY. (Num/BoolNull // are NOT plain-family — their boundary is DFA-expressible — so they stay regex token rules.) 
- const { plain, key } = planPlainScalarTokens(grammar); + const { plain, key, num, boolnull } = planPlainScalarTokens(grammar); if (plain) map.set(plain, toSnake(plain)); if (key) map.set(key, toSnake(key)); + if (num) map.set(num, toSnake(num)); + if (boolnull) map.set(boolnull, toSnake(boolnull)); } // The regex token: '/' is context-sensitive (regex vs division). The scanner // resolves it. @@ -1856,12 +1830,19 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar const sym = (tokenName: string) => ctx.scannerTokenFor.get(tokenName)!.toUpperCase(); const INDENT = sym(ind.indentToken), DEDENT = sym(ind.dedentToken), NEWLINE = sym(ind.newlineToken); const BLOCK = ind.blockScalar ? sym(ind.blockScalar.token) : null; - // The PLAIN / KEY scalar externals (a `:` is a separator only before space/EOL/flow-indicator; a - // `#` a comment only after a space) — scanned by scan_scalar where look-ahead IS available. - const { plain: plainTok, key: keyTok } = planPlainScalarTokens(grammar); + // The block-context SCALAR externals — plain, key, and the typed num / bool-null — all scanned by + // scan_scalar (a `:` is a separator only before space/EOL/flow-indicator; a `#` a comment only after + // a space; a typed run is classified by shape). Emitting num/bool-null from the scanner (not via a + // regex token + decline) makes EVERY scalar an external token that carries the key-vs-value decision, + // which the GLR parser needs to chain top-level mapping entries. + const { plain: plainTok, key: keyTok, num: numTok, boolnull: boolnullTok } = planPlainScalarTokens(grammar); const PLAIN = plainTok ? sym(plainTok) : null; const KEY = keyTok ? sym(keyTok) : null; - const SCALAR = PLAIN || KEY; // either may be absent; scan_scalar is emitted when at least one is + const NUM = numTok ? sym(numTok) : null; + const BOOLNULL = boolnullTok ? 
sym(boolnullTok) : null; + const SCALAR = PLAIN || KEY || NUM || BOOLNULL; // scan_scalar is emitted when at least one exists + const scalarGate = [PLAIN, KEY, NUM, BOOLNULL].filter(Boolean).map(s => `valid_symbols[${s}]`).join(' || ') || '0'; + const want = (s: string | null) => (s ? `valid_symbols[${s}] != 0` : 'false'); const cmt = ind.comment ?? '#'; const cmtC = cmt.length === 1 ? (cmt === '\\' || cmt === "'" ? `'\\${cmt}'` : `'${cmt}'`) : null; const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0'; @@ -1980,7 +1961,7 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is // valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num. -static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { +static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) { char buf[64]; unsigned blen = 0; // run text (capped) — for the number/bool-null shape test bool has_content = false; @@ -2030,10 +2011,9 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN. - bool is_key = stopped_at_kv ? want_key : false; - // numeric: only [0-9 . 
+ - e E x o a-f A-F _ : (already excluded)] — a loose superset is fine, the - // regex \`num\` makes the precise decision; if it doesn't match, tree-sitter falls back to us is NOT - // possible (we already returned), so keep the test TIGHT enough to never defer a real plain string. + // numeric / bool-null SHAPE test — a loose superset is fine for classification (only a typed-shaped + // run is emitted as NUM/BOOL_NULL; a run with any other char is PLAIN), at the cost of mis-typing a + // rare plain like \`1abc\` as numeric (the documented imprecise edge). bool numeric = blen > 0; for (unsigned i = 0; i < blen; i++) { char ch = buf[i]; @@ -2057,16 +2037,17 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { if (i == blen && p[i] == 0) { boolnull = true; break; } } } - // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible - // here. \`want_key\` (a key slot) always admits num/bool_null; a value slot does too. A plain-ONLY - // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value - // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in - // both: a numeric fold continuation typing as a number is the documented imprecise edge. - if ((numeric || boolnull) && want_plain) return false; - - if (is_key) { lexer->result_symbol = want_key ? ${KEY ?? PLAIN} : ${PLAIN ?? KEY}; return true; } - lexer->result_symbol = want_plain ? ${PLAIN ?? KEY} : ${KEY ?? PLAIN}; - return true; + // Classify + emit. The external scalar token CARRIES the key-vs-value decision (a trailing \`: \` + // means KEY), which the GLR parser needs to chain mapping entries — so a typed value is emitted as + // NUM/BOOL_NULL here, NOT deferred to a regex token (deferring drops the disambiguation and + // mis-parses a top-level \`x: 1\\ny: 2\`). A key wins first; then the typed shapes; then PLAIN. 
Each + // is gated on its token being admissible here (valid_symbols), falling through otherwise. + if (stopped_at_kv && want_key) { lexer->result_symbol = ${KEY ?? PLAIN}; return true; } + if (numeric && want_num) { lexer->result_symbol = ${NUM ?? PLAIN}; return true; } + if (boolnull && want_boolnull) { lexer->result_symbol = ${BOOLNULL ?? PLAIN}; return true; } + if (want_plain) { lexer->result_symbol = ${PLAIN ?? KEY}; return true; } + if (want_key) { lexer->result_symbol = ${KEY ?? PLAIN}; return true; } + return false; } ` : ''} bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { @@ -2097,20 +2078,19 @@ ${BLOCK ? ` if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; } } ` : ''}${SCALAR ? ` - // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through - // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline spaces/tabs, then if - // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — \`-\`/\`?\`/\`:\` - // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator), - // scan it where look-ahead is available. - if (valid_symbols[${KEY ?? PLAIN}] || valid_symbols[${PLAIN ?? KEY}]) { + // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading + // newline falls through to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline + // spaces/tabs, then if the next char could begin a plain scalar (not a newline/EOF and not a YAML + // indicator — a leading \`-\`/\`?\`/\`:\` is resolved inside scan_scalar), scan it where look-ahead is + // available. scan_scalar classifies the run and emits the admissible token. 
+ if (${scalarGate}) { while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); int32_t h = lexer->lookahead; bool indicator = h == 0 || h == '\\n' || h == '\\r' || h == ',' || h == '[' || h == ']' || h == '{' || h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' || h == '\\'' || h == '"' || h == '%' || h == '@' || h == '\`'; if (!indicator) { - bool wk = valid_symbols[${KEY ?? PLAIN}] != 0, wp = valid_symbols[${PLAIN ?? KEY}] != 0; - if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; } + if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; } } } ` : ''} diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js index 1ec0754..e2cd88a 100644 --- a/tree-sitter/yaml/grammar.js +++ b/tree-sitter/yaml/grammar.js @@ -22,7 +22,9 @@ module.exports = grammar({ $.newline, $.block_scalar, $.plain, - $.key + $.key, + $.num, + $.bool_null ], conflicts: $ => [ @@ -170,11 +172,7 @@ module.exports = grammar({ alias: $ => token(/\*[^\t\n\f\r \[\]{},]+/), - tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/), - - num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/), - - bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/) + tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/) } }); diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index 7d55271..e792844 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -16,6 +16,8 @@ enum TokenType { BLOCK_SCALAR, PLAIN, KEY, + NUM, + BOOL_NULL, }; typedef struct { @@ -115,7 +117,7 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is // valid. 
A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num. -static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { +static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) { char buf[64]; unsigned blen = 0; // run text (capped) — for the number/bool-null shape test bool has_content = false; @@ -165,10 +167,9 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN. - bool is_key = stopped_at_kv ? want_key : false; - // numeric: only [0-9 . + - e E x o a-f A-F _ : (already excluded)] — a loose superset is fine, the - // regex `num` makes the precise decision; if it doesn't match, tree-sitter falls back to us is NOT - // possible (we already returned), so keep the test TIGHT enough to never defer a real plain string. + // numeric / bool-null SHAPE test — a loose superset is fine for classification (only a typed-shaped + // run is emitted as NUM/BOOL_NULL; a run with any other char is PLAIN), at the cost of mis-typing a + // rare plain like `1abc` as numeric (the documented imprecise edge). bool numeric = blen > 0; for (unsigned i = 0; i < blen; i++) { char ch = buf[i]; @@ -192,16 +193,17 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) { if (i == blen && p[i] == 0) { boolnull = true; break; } } } - // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible - // here. `want_key` (a key slot) always admits num/bool_null; a value slot does too. 
A plain-ONLY - // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value - // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in - // both: a numeric fold continuation typing as a number is the documented imprecise edge. - if ((numeric || boolnull) && want_plain) return false; - - if (is_key) { lexer->result_symbol = want_key ? KEY : PLAIN; return true; } - lexer->result_symbol = want_plain ? PLAIN : KEY; - return true; + // Classify + emit. The external scalar token CARRIES the key-vs-value decision (a trailing `: ` + // means KEY), which the GLR parser needs to chain mapping entries — so a typed value is emitted as + // NUM/BOOL_NULL here, NOT deferred to a regex token (deferring drops the disambiguation and + // mis-parses a top-level `x: 1\ny: 2`). A key wins first; then the typed shapes; then PLAIN. Each + // is gated on its token being admissible here (valid_symbols), falling through otherwise. + if (stopped_at_kv && want_key) { lexer->result_symbol = KEY; return true; } + if (numeric && want_num) { lexer->result_symbol = NUM; return true; } + if (boolnull && want_boolnull) { lexer->result_symbol = BOOL_NULL; return true; } + if (want_plain) { lexer->result_symbol = PLAIN; return true; } + if (want_key) { lexer->result_symbol = KEY; return true; } + return false; } bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) { @@ -233,20 +235,19 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; } } - // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through - // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). 
Skip inline spaces/tabs, then if - // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — `-`/`?`/`:` - // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator), - // scan it where look-ahead is available. - if (valid_symbols[KEY] || valid_symbols[PLAIN]) { + // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading + // newline falls through to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline + // spaces/tabs, then if the next char could begin a plain scalar (not a newline/EOF and not a YAML + // indicator — a leading `-`/`?`/`:` is resolved inside scan_scalar), scan it where look-ahead is + // available. scan_scalar classifies the run and emits the admissible token. + if (valid_symbols[PLAIN] || valid_symbols[KEY] || valid_symbols[NUM] || valid_symbols[BOOL_NULL]) { while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); int32_t h = lexer->lookahead; bool indicator = h == 0 || h == '\n' || h == '\r' || h == ',' || h == '[' || h == ']' || h == '{' || h == '}' || h == '#' || h == '&' || h == '*' || h == '!' 
|| h == '|' || h == '>' || h == '\'' || h == '"' || h == '%' || h == '@' || h == '`'; if (!indicator) { - bool wk = valid_symbols[KEY] != 0, wp = valid_symbols[PLAIN] != 0; - if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; } + if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0)) { s->started = true; return true; } } } From 67a947ce04d2abf664d465f8293edb1b09ecdbc7 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 06:26:02 +0800 Subject: [PATCH 05/10] Scan YAML compact block notation (list-of-maps) in C (issue #3, piece 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A sequence item whose value is a mapping is written compactly — the mapping starts inline on the dash line and its continuation aligns with the inline content, not the dash (`- a: 1\n b: 2`, the GitHub-Actions `- uses: x\n with:\n k: v` shape). The scanner now mirrors gen-lexer's `compactIndicators`: at a line-lead `-`/`?` indicator whose inline content begins a block node (a nested `-`/`?`, or a scalar followed by an unquoted `: ` key separator — sniffed quote-aware, looking through a `&`/`!` property prefix), it pushes the inline content column as one extra INDENT. tree-sitter reverts all external-scanner state on a `false` return, so the natural "probe at the indicator, remember the column, push next call" loses the remembered column. The working design emits the compact INDENT in a single `true`-returning zero-width call at the post-indicator content (mark_end at the content start; the sniff's advances are discarded as tree-sitter restarts from mark_end). A new serialized `at_line_lead` flag (the indicator is internal-lexed, so it stays true through it) drives the detection; a bare-scalar / flow / alias lead does NOT push (`- x`, `- [a]` stay leaf items). 
All gated on `grammar.indent.compactIndicators` — the six other grammars and yaml's own grammar.js/tmLanguage/monarch are byte-identical (the change is purely in the C scanner). Parse NO-ERROR (verified): list-of-maps, single-entry list-maps, the GH-Actions steps shape, nested seq `- - x`, property+compact `- &a k: v`, map-of-seq — plus every earlier case (mappings, sequences, block scalars, flow, typed values) still passes. Real files: ci.yml 19→4 ERROR nodes, readme-bench 13→2. tsc clean; generate + build --wasm succeed; gate:treesitter 96.0% (beats official 92.5%). Remaining (pre-existing, NOT compact): a block-context plain scalar containing `,` (the scanner treats `,` as a flow indicator), `${{ }}` GH-Actions expressions (`{` treated as flow), and an alias as a sequence value (`- *a`, a grammar-level gap). Plus an accuracy bench over yaml-test-suite. Refs #3 --- src/gen-treesitter.ts | 166 +++++++++++++++++++++++++++++---- tree-sitter/yaml/src/scanner.c | 139 +++++++++++++++++++++++++-- 2 files changed, 282 insertions(+), 23 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index bb84a12..fd95dcc 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -1848,6 +1848,14 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0'; const enumBody = externalTokens.map(s => ` ${s.toUpperCase()},`).join('\n'); const G = grammarName; + // Compact-notation entry indicators (YAML `-` / `?`) — DERIVED from grammar.indent.compactIndicators + // (nothing hardcoded). A `lexer->lookahead == 'c'` disjunction reused by the scanner's compact logic. 
+ // (The other inline-content leads — node-property `&`/`!`, flow `[`/`{`, alias `*` — are mirrored as + // literals from gen-lexer's startsBlockStructuralNode, which itself treats them as fixed YAML + // syntax; only the entry indicators are config-driven, matching IndentConfig.compactIndicators.) + const compactIndicators = ind.compactIndicators ?? []; + const compactIndicatorCond = (v: string) => compactIndicators.map(c => `${v} == '${c}'`).join(' || ') || '0'; + const hasCompact = compactIndicators.length > 0 && SCALAR; const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path). // @@ -1870,7 +1878,8 @@ typedef struct { int16_t *stack; // indentation columns int16_t pending_col; // column of the line boundary mid-processing (-1 = none) bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col - bool started; // any content lexed yet (suppresses a leading NEWLINE) + bool started; // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? ` + bool at_line_lead; // the next real content token is its line's first (compact-indicator probe)` : ''} } Scanner; static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } @@ -1886,7 +1895,8 @@ void *tree_sitter_${G}_external_scanner_create(void) { s->cap = 16; s->len = 1; s->stack = ts_malloc(s->cap * sizeof(int16_t)); s->stack[0] = 0; - s->pending_col = -1; s->pending_newline = false; s->started = false; + s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? ` + s->at_line_lead = true;` : ''} return s; } @@ -1900,7 +1910,8 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer Scanner *s = (Scanner *)payload; unsigned n = 0; buffer[n++] = s->started ? 1 : 0; - buffer[n++] = s->pending_newline ? 1 : 0; + buffer[n++] = s->pending_newline ? 1 : 0;${hasCompact ? ` + buffer[n++] = s->at_line_lead ? 
1 : 0;` : ''} memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t); uint32_t count = s->len; while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--; @@ -1911,11 +1922,13 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { Scanner *s = (Scanner *)payload; - s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; - if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return; + s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? ` + s->at_line_lead = true;` : ''} + if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t) + sizeof(uint32_t)) return; unsigned n = 0; s->started = buffer[n++] != 0; - s->pending_newline = buffer[n++] != 0; + s->pending_newline = buffer[n++] != 0;${hasCompact ? ` + s->at_line_lead = buffer[n++] != 0;` : ''} memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t); uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t); if (count == 0) return; // keep stack[0] = 0 @@ -1950,6 +1963,61 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { lexer->result_symbol = ${BLOCK}; return true; } +` : ''}${hasCompact ? ` +// Compact block notation (\`- a: 1\` / \`- - x\` / \`- &a k: v\` / \`? key\`): a sequence/explicit-key +// indicator whose inline content itself begins a block node nests at the content's column, not the +// indicator's. The indicator chars are DERIVED from grammar.indent.compactIndicators. 
Mirrors +// compactNestsHere / startsBlockStructuralNode in src/gen-lexer.ts: the inline content is +// block-structural when, after an optional node-property prefix (\`&anchor\` / \`!tag\`, 0-2 +// space-separated), it is a further indicator, or a mapping KEY (an unquoted \`:\` then ws/EOL/ +// flow-indicator before a \` #\` comment / EOL, scanned quote-aware). A bare scalar / flow / alias does +// NOT nest. The property / flow / alias glyphs are fixed YAML syntax (as in gen-lexer); only the entry +// indicators are config-driven. +static inline bool compact_is_indicator(int32_t c) { return ${compactIndicatorCond('c')}; } +static inline bool compact_sep_after(int32_t c) { + return c == 0 || c == ' ' || c == '\\t' || c == '\\n' || c == '\\r'; +} +// The inline content (lookahead is positioned at it) begins a block-structural node. Advances; the +// caller has frozen a zero-width token end before it and discards the advances (returning the INDENT +// zero-width on a hit, or rewinding on a miss). 
+static bool compact_content_is_structural(TSLexer *lexer) { + for (int n = 0; n < 2; n++) { // skip 0-2 node-property prefixes (\`&anchor\` / \`!tag\`) + int32_t c = lexer->lookahead; + if (c == '&' || c == '!') { + advance(lexer); + while (lexer->lookahead != 0 && !compact_sep_after(lexer->lookahead) && lexer->lookahead != ',') advance(lexer); + while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') advance(lexer); + } else break; + } + int32_t c0 = lexer->lookahead; + if (c0 == 0 || c0 == '\\n' || c0 == '\\r') return false; // property alone on the line → no nest + if (compact_is_indicator(c0)) { advance(lexer); return compact_sep_after(lexer->lookahead); } // nested indicator + if (c0 == '[' || c0 == '{' || c0 == '*') return false; // flow collection / alias → not a key + for (;;) { // scalar KEY sniff (quote-aware), like startsBlockStructuralNode + int32_t ch = lexer->lookahead; + if (ch == 0 || ch == '\\n' || ch == '\\r') break; + if (ch == '"') { + advance(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '"' && lexer->lookahead != '\\n') { if (lexer->lookahead == '\\\\') advance(lexer); advance(lexer); } + if (lexer->lookahead == '"') advance(lexer); + continue; + } + if (ch == '\\'') { + advance(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '\\n') { if (lexer->lookahead == '\\'') { advance(lexer); if (lexer->lookahead != '\\'') break; } advance(lexer); } + continue; + } + if (ch == ' ' || ch == '\\t') { advance(lexer); if (lexer->lookahead == '#') break; continue; } // trailing comment + if (ch == ':') { + advance(lexer); + int32_t n = lexer->lookahead; + if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' || n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return true; + continue; + } + advance(lexer); + } + return false; +} ` : ''}${SCALAR ? ` // A PLAIN / KEY scalar. 
A tree-sitter token() DFA can't decide a plain scalar's boundary (a \`:\` is // content unless followed by space/EOL/flow-indicator; a \`#\` starts a comment only after a space), @@ -1961,7 +2029,16 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is // valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num. -static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) { +//${hasCompact ? ` +// COMPACT mapping-KEY support: when \`compact_col >= 0\` (a line-lead indicator's scalar-led inline +// content, deeper than the stack top — see the caller), the run is scanned WITHOUT marking the token +// end (the caller pre-marked a zero-width end at the content start). A KEY run pushes \`compact_col\` +// and emits a zero-width INDENT — the nested mapping's real indent — and the key is re-lexed on the +// next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no \`:\`) +// is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.` : ''} +static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasCompact ? `, + Scanner *s, int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? 
` + bool cm = compact_col >= 0; // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)` : ''} char buf[64]; unsigned blen = 0; // run text (capped) — for the number/bool-null shape test bool has_content = false; @@ -1979,7 +2056,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar if (blen < sizeof(buf)) buf[blen++] = (char)c; // \`-\`/\`?\` glued to non-space is content has_content = true; - lexer->mark_end(lexer); + ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer); continue; } if (c == ':') { @@ -1991,7 +2068,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan } if (blen < sizeof(buf)) buf[blen++] = ':'; // ':' glued to non-space is content has_content = true; - lexer->mark_end(lexer); + ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer); continue; } if (c == ' ' || c == '\\t') { @@ -2003,10 +2080,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan if (blen < sizeof(buf)) buf[blen++] = (char)c; has_content = true; lexer->advance(lexer, false); - lexer->mark_end(lexer); // token end follows the last content char + ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer); // token end follows the last content char } if (!has_content) return false; - +${hasCompact ? ` + // COMPACT mapping KEY: the inline content after a line-lead indicator is a mapping key → its column + // is the nested mapping's indent. Push it and emit the zero-width INDENT (the caller pre-marked the + // end at the content start); the key is re-lexed on the next call. A leaf falls through to normal + // classification, with its end marked here (run end) since per-char marking was suppressed. 
+ if (cm) { + if (stopped_at_kv) { + push_indent(s, compact_col); + s->at_line_lead = true; // the key is itself this line's fresh lead (re-lexed next call) + lexer->result_symbol = indent_sym; + return true; // zero-width INDENT at the content start (advances discarded) + } + lexer->mark_end(lexer); // leaf: take the whole run (trailing-space trim is skipped in compact mode) + } +` : ''} // Number / bool-null SHAPE test (so the typed regex tokens still classify \`1\`/\`true\`). Decide KEY // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a @@ -2070,12 +2161,45 @@ bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const if (col == top && owed && want_newline && s->started) { lexer->result_symbol = ${NEWLINE}; return true; } return false; } -${BLOCK ? ` +${hasCompact ? ` + // Compact block notation (\`- a: 1\` / \`- - x\` / \`- &a k: v\` / \`? key\`). The line-lead indicator was + // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no + // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line. + // When the inline content begins a block node, its column — not the indicator's — is the node's + // indentation: emit a zero-width INDENT there and push it (the DEDENT logic closes it when a + // shallower line arrives). The work splits by what leads the inline content, because the sniff + // ADVANCES (irrecoverably) and external-scanner state changes are reverted on a false return: + // • a node-property / flow / alias / nested-indicator lead — sniff it here; a structural hit pushes + // INDENT, a miss returns false so tree-sitter rewinds and the leading char (all INTERNAL-lexable, + // or the scalar handled on the next call) is re-lexed. 
+ // • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact + // INDENT itself when the run is a mapping KEY. (A bare scalar must NOT be sniffed-then-rewound: a + // plain scalar is external-only, so a false return here would loop.) + if (want_indent && s->at_line_lead) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); + int32_t c = lexer->lookahead; + bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c); + if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + int16_t col = (int16_t)lexer->get_column(lexer); + lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances + if (compact_content_is_structural(lexer)) { + push_indent(s, col); + // A NESTED indicator's content is itself a fresh line-lead (so \`- - x\` nests once more); but a + // node-property prefix (\`- &a k: v\`) is followed INLINE by its own mapping KEY at the SAME level + // — that key must NOT push again, so clear the lead for the property / direct-key case. + s->at_line_lead = compact_is_indicator(c); + lexer->result_symbol = ${INDENT}; + return true; + } + return false; // not block-structural → rewind; the internal-lexable lead (or next-call scalar) re-lexes + } + } +` : ''}${BLOCK ? ` // A block scalar value (\`key: |\`): scan its body before the indent logic — its more-indented // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first. if (want_block) { while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); - if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; } + if (${introCond}) { if (scan_block_scalar(s, lexer)) {${hasCompact ? ' s->at_line_lead = false;' : ''} return true; } } } ` : ''}${SCALAR ? ` // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading @@ -2089,8 +2213,17 @@ ${BLOCK ? 
` bool indicator = h == 0 || h == '\\n' || h == '\\r' || h == ',' || h == '[' || h == ']' || h == '{' || h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' || h == '\\'' || h == '"' || h == '%' || h == '@' || h == '\`'; - if (!indicator) { - if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; } + if (!indicator) {${hasCompact ? ` + // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the + // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if + // the run is a KEY, or emits the leaf scalar otherwise (so \`- x\` stays a plain item, no push). + int16_t compact_col = -1; + if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + compact_col = (int16_t)lexer->get_column(lexer); + lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) + } + if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : ` + if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; }`} } } ` : ''} @@ -2118,7 +2251,8 @@ ${BLOCK ? ` int16_t col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column - int top = s->stack[s->len - 1]; + int top = s->stack[s->len - 1];${hasCompact ? 
` + s->at_line_lead = true; // a real line boundary — the next real token leads its line` : ''} if (col > top) { if (want_indent) { push_indent(s, col); lexer->result_symbol = ${INDENT}; return true; } diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index e792844..ede3d3b 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -27,6 +27,7 @@ typedef struct { int16_t pending_col; // column of the line boundary mid-processing (-1 = none) bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col bool started; // any content lexed yet (suppresses a leading NEWLINE) + bool at_line_lead; // the next real content token is its line's first (compact-indicator probe) } Scanner; static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } @@ -43,6 +44,7 @@ void *tree_sitter_yaml_external_scanner_create(void) { s->stack = ts_malloc(s->cap * sizeof(int16_t)); s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; + s->at_line_lead = true; return s; } @@ -57,6 +59,7 @@ unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer unsigned n = 0; buffer[n++] = s->started ? 1 : 0; buffer[n++] = s->pending_newline ? 1 : 0; + buffer[n++] = s->at_line_lead ? 
1 : 0; memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t); uint32_t count = s->len; while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--; @@ -68,10 +71,12 @@ unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { Scanner *s = (Scanner *)payload; s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; - if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return; + s->at_line_lead = true; + if (length < 3 + sizeof(int16_t) + sizeof(uint32_t)) return; unsigned n = 0; s->started = buffer[n++] != 0; s->pending_newline = buffer[n++] != 0; + s->at_line_lead = buffer[n++] != 0; memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t); uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t); if (count == 0) return; // keep stack[0] = 0 @@ -107,6 +112,61 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { return true; } +// Compact block notation (`- a: 1` / `- - x` / `- &a k: v` / `? key`): a sequence/explicit-key +// indicator whose inline content itself begins a block node nests at the content's column, not the +// indicator's. The indicator chars are DERIVED from grammar.indent.compactIndicators. Mirrors +// compactNestsHere / startsBlockStructuralNode in src/gen-lexer.ts: the inline content is +// block-structural when, after an optional node-property prefix (`&anchor` / `!tag`, 0-2 +// space-separated), it is a further indicator, or a mapping KEY (an unquoted `:` then ws/EOL/ +// flow-indicator before a ` #` comment / EOL, scanned quote-aware). A bare scalar / flow / alias does +// NOT nest. The property / flow / alias glyphs are fixed YAML syntax (as in gen-lexer); only the entry +// indicators are config-driven. 
+static inline bool compact_is_indicator(int32_t c) { return c == '-' || c == '?'; } +static inline bool compact_sep_after(int32_t c) { + return c == 0 || c == ' ' || c == '\t' || c == '\n' || c == '\r'; +} +// The inline content (lookahead is positioned at it) begins a block-structural node. Advances; the +// caller has frozen a zero-width token end before it and discards the advances (returning the INDENT +// zero-width on a hit, or rewinding on a miss). +static bool compact_content_is_structural(TSLexer *lexer) { + for (int n = 0; n < 2; n++) { // skip 0-2 node-property prefixes (`&anchor` / `!tag`) + int32_t c = lexer->lookahead; + if (c == '&' || c == '!') { + advance(lexer); + while (lexer->lookahead != 0 && !compact_sep_after(lexer->lookahead) && lexer->lookahead != ',') advance(lexer); + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') advance(lexer); + } else break; + } + int32_t c0 = lexer->lookahead; + if (c0 == 0 || c0 == '\n' || c0 == '\r') return false; // property alone on the line → no nest + if (compact_is_indicator(c0)) { advance(lexer); return compact_sep_after(lexer->lookahead); } // nested indicator + if (c0 == '[' || c0 == '{' || c0 == '*') return false; // flow collection / alias → not a key + for (;;) { // scalar KEY sniff (quote-aware), like startsBlockStructuralNode + int32_t ch = lexer->lookahead; + if (ch == 0 || ch == '\n' || ch == '\r') break; + if (ch == '"') { + advance(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '"' && lexer->lookahead != '\n') { if (lexer->lookahead == '\\') advance(lexer); advance(lexer); } + if (lexer->lookahead == '"') advance(lexer); + continue; + } + if (ch == '\'') { + advance(lexer); + while (lexer->lookahead != 0 && lexer->lookahead != '\n') { if (lexer->lookahead == '\'') { advance(lexer); if (lexer->lookahead != '\'') break; } advance(lexer); } + continue; + } + if (ch == ' ' || ch == '\t') { advance(lexer); if (lexer->lookahead == '#') break; continue; } // trailing comment + 
if (ch == ':') { + advance(lexer); + int32_t n = lexer->lookahead; + if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' || n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return true; + continue; + } + advance(lexer); + } + return false; +} + // A PLAIN / KEY scalar. A tree-sitter token() DFA can't decide a plain scalar's boundary (a `:` is // content unless followed by space/EOL/flow-indicator; a `#` starts a comment only after a space), // so we scan it here where look-ahead IS available. The run starts at the current column and ends @@ -117,7 +177,16 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is // valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num. -static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) { +// +// COMPACT mapping-KEY support: when `compact_col >= 0` (a line-lead indicator's scalar-led inline +// content, deeper than the stack top — see the caller), the run is scanned WITHOUT marking the token +// end (the caller pre-marked a zero-width end at the content start). A KEY run pushes `compact_col` +// and emits a zero-width INDENT — the nested mapping's real indent — and the key is re-lexed on the +// next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no `:`) +// is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm. 
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull, + Scanner *s, int16_t compact_col, int indent_sym) { + bool cm = compact_col >= 0; // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY) char buf[64]; unsigned blen = 0; // run text (capped) — for the number/bool-null shape test bool has_content = false; @@ -135,7 +204,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar if (blen < sizeof(buf)) buf[blen++] = (char)c; // `-`/`?` glued to non-space is content has_content = true; - lexer->mark_end(lexer); + if (!cm) lexer->mark_end(lexer); continue; } if (c == ':') { @@ -147,7 +216,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan } if (blen < sizeof(buf)) buf[blen++] = ':'; // ':' glued to non-space is content has_content = true; - lexer->mark_end(lexer); + if (!cm) lexer->mark_end(lexer); continue; } if (c == ' ' || c == '\t') { @@ -159,10 +228,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan if (blen < sizeof(buf)) buf[blen++] = (char)c; has_content = true; lexer->advance(lexer, false); - lexer->mark_end(lexer); // token end follows the last content char + if (!cm) lexer->mark_end(lexer); // token end follows the last content char } if (!has_content) return false; + // COMPACT mapping KEY: the inline content after a line-lead indicator is a mapping key → its column + // is the nested mapping's indent. Push it and emit the zero-width INDENT (the caller pre-marked the + // end at the content start); the key is re-lexed on the next call. A leaf falls through to normal + // classification, with its end marked here (run end) since per-char marking was suppressed. 
+ if (cm) { + if (stopped_at_kv) { + push_indent(s, compact_col); + s->at_line_lead = true; // the key is itself this line's fresh lead (re-lexed next call) + lexer->result_symbol = indent_sym; + return true; // zero-width INDENT at the content start (advances discarded) + } + lexer->mark_end(lexer); // leaf: take the whole run (trailing-space trim is skipped in compact mode) + } + // Number / bool-null SHAPE test (so the typed regex tokens still classify `1`/`true`). Decide KEY // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a @@ -228,11 +311,44 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const return false; } + // Compact block notation (`- a: 1` / `- - x` / `- &a k: v` / `? key`). The line-lead indicator was + // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no + // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line. + // When the inline content begins a block node, its column — not the indicator's — is the node's + // indentation: emit a zero-width INDENT there and push it (the DEDENT logic closes it when a + // shallower line arrives). The work splits by what leads the inline content, because the sniff + // ADVANCES (irrecoverably) and external-scanner state changes are reverted on a false return: + // • a node-property / flow / alias / nested-indicator lead — sniff it here; a structural hit pushes + // INDENT, a miss returns false so tree-sitter rewinds and the leading char (all INTERNAL-lexable, + // or the scalar handled on the next call) is re-lexed. + // • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact + // INDENT itself when the run is a mapping KEY. 
(A bare scalar must NOT be sniffed-then-rewound: a + // plain scalar is external-only, so a false return here would loop.) + if (want_indent && s->at_line_lead) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); + int32_t c = lexer->lookahead; + bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c); + if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + int16_t col = (int16_t)lexer->get_column(lexer); + lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances + if (compact_content_is_structural(lexer)) { + push_indent(s, col); + // A NESTED indicator's content is itself a fresh line-lead (so `- - x` nests once more); but a + // node-property prefix (`- &a k: v`) is followed INLINE by its own mapping KEY at the SAME level + // — that key must NOT push again, so clear the lead for the property / direct-key case. + s->at_line_lead = compact_is_indicator(c); + lexer->result_symbol = INDENT; + return true; + } + return false; // not block-structural → rewind; the internal-lexable lead (or next-call scalar) re-lexes + } + } + // A block scalar value (`key: |`): scan its body before the indent logic — its more-indented // lines are content, not nested structure. Skip the inline space after the `:`/`-` first. if (want_block) { while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); - if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; } + if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) { s->at_line_lead = false; return true; } } } // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading @@ -247,7 +363,15 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const h == '}' || h == '#' || h == '&' || h == '*' || h == '!' 
|| h == '|' || h == '>' || h == '\'' || h == '"' || h == '%' || h == '@' || h == '`'; if (!indicator) { - if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0)) { s->started = true; return true; } + // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the + // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if + // the run is a KEY, or emits the leaf scalar otherwise (so `- x` stays a plain item, no push). + int16_t compact_col = -1; + if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + compact_col = (int16_t)lexer->get_column(lexer); + lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) + } + if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; } } } @@ -276,6 +400,7 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const int16_t col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column int top = s->stack[s->len - 1]; + s->at_line_lead = true; // a real line boundary — the next real token leads its line if (col > top) { if (want_indent) { push_indent(s, col); lexer->result_symbol = INDENT; return true; } From 6e7d1d41d118def3ab37e4d84df9a2d995f03c6b Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 06:31:30 +0800 Subject: [PATCH 06/10] Add a YAML tree-sitter accuracy bench + gate yaml in CI (issue #3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `test/treesitter-yaml-bench.ts` measures how many VALID yaml-test-suite inputs the derived YAML tree-sitter parses with no 
ERROR/MISSING node ("valid" = the `yaml` package accepts the input, so a failure is the grammar's, not a malformed sample). Baseline: 209/312 = 67.0% — a real working tree-sitter for an indentation-sensitive grammar (the grammar previously did not even `generate`). CI: yaml joins the "generate every derived grammar" conflict gate and gets a build-to-wasm step (its C indentation scanner must compile + link). The accuracy bench runs where the yaml-test-suite is already cloned (the readme-bench workflow), not in the conflict gate. Refs #3 --- .github/workflows/ci.yml | 14 ++++++++--- test/treesitter-yaml-bench.ts | 45 +++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 test/treesitter-yaml-bench.ts diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7b0380..c4ba0d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -74,11 +74,11 @@ jobs: # grammar that is a tree-sitter target, so a conflict introduced by a grammar # change is caught even for the dialects whose wasm is not built below (tsx/js/jsx) # — exactly the gap that let an unresolved `type`/`class_heritage` conflict ship. - # yaml is excluded: its indentation tokens are not yet wired as tree-sitter - # externals, so its generated grammar.js is not loadable (separate open issue). + # yaml is now included (issue #3): its indent/scalar tokens are wired as tree-sitter + # externals and the C indentation scanner is implemented, so its grammar generates + builds. - name: Generate every derived tree-sitter grammar (conflict gate, no wasm) run: | - for g in typescript typescriptreact javascript javascriptreact html; do + for g in typescript typescriptreact javascript javascriptreact html yaml; do echo "── tree-sitter generate: $g" ( cd "tree-sitter/$g" && npx tree-sitter generate ) done @@ -97,3 +97,11 @@ jobs: npx tree-sitter build --wasm . cd ../.. 
node test/html-treesitter.ts + # The derived YAML tree-sitter (issue #3) — build the wasm (its C indentation scanner must + # compile + link). The accuracy bench (test/treesitter-yaml-bench.ts) needs the yaml-test-suite + # checkout, so it runs in the readme-bench workflow where the suite is already cloned. + - name: Build the derived YAML tree-sitter grammar to wasm + run: | + cd tree-sitter/yaml + npx tree-sitter generate + npx tree-sitter build --wasm . diff --git a/test/treesitter-yaml-bench.ts b/test/treesitter-yaml-bench.ts new file mode 100644 index 0000000..db6190d --- /dev/null +++ b/test/treesitter-yaml-bench.ts @@ -0,0 +1,45 @@ +// YAML tree-sitter accuracy bench (issue #3): how many VALID yaml-test-suite inputs the DERIVED +// YAML tree-sitter parses with no ERROR/MISSING node. "Valid" = the `yaml` package accepts the input +// (so a failure is the tree-sitter grammar's, not a malformed sample). The corpus is extracted from +// the yaml-test-suite src meta-files exactly like test/src-coverage-yaml.ts. +// +// git clone --depth 1 https://github.com/yaml/yaml-test-suite /tmp/yaml-test-suite +// cd tree-sitter/yaml && npx tree-sitter generate && npx tree-sitter build --wasm . 
+// node test/treesitter-yaml-bench.ts +import { readdirSync, readFileSync, existsSync } from 'node:fs'; +import { parse as yamlParse, parseAllDocuments } from 'yaml'; + +const WASM = 'tree-sitter/yaml/tree-sitter-yaml.wasm'; +const SUITE = '/tmp/yaml-test-suite/src'; +if (!existsSync(WASM)) { console.error(`missing ${WASM} — run: (cd tree-sitter/yaml && npx tree-sitter build --wasm .)`); process.exit(1); } +if (!existsSync(SUITE)) { console.error(`missing ${SUITE} — git clone --depth 1 https://github.com/yaml/yaml-test-suite /tmp/yaml-test-suite`); process.exit(1); } + +const { Parser, Language } = await import('web-tree-sitter'); +await Parser.init(); +const lang = await Language.load(WASM); +const parser = new Parser(); +parser.setLanguage(lang); + +// Decode the suite's visible-whitespace markers to real bytes (same as src-coverage-yaml). +const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—*»/g, '\t').replace(/[↵∎]/g, ''); +const corpus: string[] = []; +for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) { + try { + const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8')); + for (const t of (Array.isArray(meta) ? 
meta : [meta])) if (t && typeof t.yaml === 'string') corpus.push(decode(t.yaml)); + } catch { /* skip meta-files that don't round-trip */ } +} +const valid = corpus.filter((c) => { try { return parseAllDocuments(c).every((d: any) => d.errors.length === 0); } catch { return false; } }); + +function hasError(node: any): boolean { + if (node.type === 'ERROR' || node.isError === true || node.isMissing === true) return true; + for (let i = 0; i < node.childCount; i++) { const c = node.child(i); if (c && hasError(c)) return true; } + return false; +} + +let ok = 0; +for (const c of valid) { const tree = parser.parse(c); if (tree && !hasError(tree.rootNode)) ok++; } +const pct = ((100 * ok) / valid.length).toFixed(1); +console.log(`YAML corpus: ${corpus.length} inputs (${valid.length} valid per the yaml package).`); +console.log(`YAML tree-sitter accuracy: ${ok}/${valid.length} valid inputs parse ERROR-free (${pct}%).`); +console.log(`##TSYAML## ${JSON.stringify({ name: 'YAML', engine: 'tree-sitter (derived)', valid: valid.length, errorFree: ok, pct: Number(pct) })}`); From f36c4be187fb723a103b113bc10d4c8309ae074b Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 07:31:47 +0800 Subject: [PATCH 07/10] =?UTF-8?q?Scan=20YAML=20flow=20indicators=20with=20?= =?UTF-8?q?flow-depth=20=E2=80=94=20block-context=20`,`=20is=20content=20(?= =?UTF-8?q?issue=20#3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The C scanner's `scan_scalar` always broke a plain run at `,` `[` `]` `{` `}`, but those are special only INSIDE a flow collection — in block context they are ordinary plain content (`a, b` is one scalar). So `a, b`, `k: a, b`, and multi-line flow (`[a,\n b]`) errored. Fix: track `flow_depth` in the scanner. 
tree-sitter (0.26.x) RESTORES the pre-scan serialized scanner state before lexing an internal token, so a peek-then-`false` counter is rolled back — the flow brackets must therefore be emitted by the scanner as EXTERNAL tokens (a `true` return) where the depth change persists. `flowSyntheticTokens` synthesizes one external token per `indent.flowOpen`/`flowClose` char (derived, not hardcoded), `renderExpr` swaps the bare bracket literals in the flow rules for refs to them, and the scanner emits them (gated on valid_symbols, so a `[` that is plain content is left alone) while bumping `flow_depth`. `scan_scalar`'s `,`/bracket/`:`/`-`/`?` boundary checks are now gated on `flow_depth > 0`; in block context they are content. Compact + block-scalar handling stay gated on `flow_depth == 0`. A flow-context leading-trivia skip (incl. newlines/comments) makes multi-line flow work. Verified against the `yaml` reference (`a:,b`, `a:[1,2]`, `a,b: c` are single block scalars/keys). Bench: 209/312 → 226/312 (67.0% → 72.4%). The six other grammars stay byte-identical; tsc clean; generate + build --wasm succeed; gate:treesitter 96.0% (beats official 92.5%). Refs #3 --- src/gen-treesitter.ts | 149 ++++++++++++++++++++++++++++----- tree-sitter/yaml/grammar.js | 10 ++- tree-sitter/yaml/src/scanner.c | 82 ++++++++++++++++-- 3 files changed, 209 insertions(+), 32 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index fd95dcc..9a3ba1f 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -143,6 +143,10 @@ interface GrammarJsContext { externalSnake: Set; /** original token name → external scanner token name (snake) if scanner-provided */ scannerTokenFor: Map; + /** flow-delimiter LITERAL char (`[` `]` `{` `}`) → synthetic external scanner token (snake). These + * bare literals in the flow rules are swapped for refs to the scanner token (renderExpr). Empty for + * non-flow grammars. See flowSyntheticTokens. 
*/ + flowLiteralTokens: Map<string, string>; /** Non-start rules whose body can derive the empty string. tree-sitter rejects these, so their * bodies are made non-empty and every reference to them is wrapped in optional() (ε-elimination, * see makeNonEmpty / wrapNullableRefs). Empty for grammars with no nullable non-start rules. */ @@ -181,8 +185,13 @@ function hasMarker(expr: RuleExpr): boolean { */ function renderExpr(expr: RuleExpr, ctx: GrammarJsContext): string { switch (expr.type) { - case 'literal': + case 'literal': { + // A flow-collection delimiter literal (`[` `]` `{` `}`) is emitted by the external scanner (so + // flow_depth persists), so reference its synthetic scanner token instead of the bare string. + const flowSym = ctx.flowLiteralTokens.get(expr.value); + if (flowSym) return `$.${flowSym}`; return jsString(expr.value); + } case 'ref': { // A token provided by the external scanner is referenced by its scanner // symbol name (e.g. `regex` → `regex_literal`), not its plain token snake. @@ -737,6 +746,30 @@ function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: str return { plain, key, num, boolnull }; } +/** + * Synthetic external tokens for the flow-collection delimiters (`[` `]` `{` `}`). YAML's flow brackets + * suspend indentation and turn `,`/brackets into structural separators; a tree-sitter external scanner + * can only KEEP that state (flow_depth) across a token if it RETURNS that token (mutations during a + * `false` return are discarded — the pre-scan state is restored before the internal bracket is lexed). + * So the brackets are emitted by the scanner as external tokens. They have no token name in the source + * grammar (they are bare literals in the flow rules), so we synthesize a stable name per delimiter char + * and (a) register them as externals here and (b) substitute the matching literal in the rendered rules + * (renderExpr). Returns [] for non-flow grammars. 
Order: every opener (in flowOpen order) then every + * closer (flowClose order) — the enum / grammar.js externals follow this order. + */ +const FLOW_CHAR_NAMES: Record<string, string> = { + '[': 'lbracket', ']': 'rbracket', '{': 'lbrace', '}': 'rbrace', '(': 'lparen', ')': 'rparen', +}; +function flowSyntheticTokens(grammar: CstGrammar): { sym: string; char: string; open: boolean }[] { + const ind = grammar.indent; + if (!ind || !(ind.flowOpen?.length || ind.flowClose?.length)) return []; + const name = (c: string) => `_flow_${FLOW_CHAR_NAMES[c] ?? `u${c.charCodeAt(0)}`}`; + return [ + ...(ind.flowOpen ?? []).map(c => ({ sym: name(c), char: c, open: true })), + ...(ind.flowClose ?? []).map(c => ({ sym: name(c), char: c, open: false })), + ]; +} + /** Determine which tokens the external scanner must provide. */ function planScannerTokens(grammar: CstGrammar): Map<string, string> { const map = new Map(); @@ -766,6 +799,10 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> { if (key) map.set(key, toSnake(key)); if (num) map.set(num, toSnake(num)); if (boolnull) map.set(boolnull, toSnake(boolnull)); + // The flow-collection delimiter tokens (`[ ] { }`) — emitted by the scanner so flow_depth persists + // (a TRUE return). The synthetic name IS the snake symbol; the matching literal in the flow rules is + // swapped for a ref to it in renderExpr. Appended last so the scalar-token positions are unchanged. + for (const { sym } of flowSyntheticTokens(grammar)) map.set(sym, sym); } // The regex token: '/' is context-sensitive (regex vs division). The scanner // resolves it. 
@@ -963,8 +1000,12 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree const isTerminalName = (n: string) => tokenNames.has(n) || scannerTokenFor.has(n); const nullableNonStart = computeNullableNonStart(grammar, entryName, isTerminalName); + const flowLiteralTokens = new Map(); + for (const { sym, char } of flowSyntheticTokens(grammar)) flowLiteralTokens.set(char, sym); + const ctx: GrammarJsContext = { grammar, tokenNames, ruleSnake, tokenSnake, prattRules, externalSnake, scannerTokenFor, + flowLiteralTokens, nullableNonStart, templatePlan, interpolationPlans, @@ -1856,6 +1897,27 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar const compactIndicators = ind.compactIndicators ?? []; const compactIndicatorCond = (v: string) => compactIndicators.map(c => `${v} == '${c}'`).join(' || ') || '0'; const hasCompact = compactIndicators.length > 0 && SCALAR; + // Flow-collection delimiters (`[ ] { }`) — DERIVED from grammar.indent.flowOpen / flowClose. Inside a + // flow collection (flow_depth > 0) indentation is SUSPENDED and `,`/`[`/`]`/`{`/`}` are item/collection + // boundaries; in block context (flow_depth == 0) those same chars are ordinary plain-scalar content + // (mirrors the flowDepth counter in src/gen-lexer.ts, the parser's lexer). tree-sitter discards an + // external scanner's struct mutations on a `false` return (it restores the pre-scan serialized state + // before lexing the internal bracket), so a counter cannot be maintained by peeking-then-returning- + // false at the bracket; instead the flow OPEN/CLOSE brackets are emitted as EXTERNAL tokens by the + // scanner (a TRUE return DOES persist), and flow_depth is bumped there. The brackets are synthesized + // external tokens (no token name in yaml.ts) — see flowSyntheticTokens / the literal substitution in + // renderExpr. flow_depth is then carried in the Scanner struct (serialize/deserialize). + const flowOpen = ind.flowOpen ?? 
[]; + const flowClose = ind.flowClose ?? []; + const charLit = (c: string) => (c === '\\' || c === "'" ? `'\\${c}'` : `'${c}'`); + // Run-boundary chars inside a flow collection: the closers/openers + the entry separator `,`. A plain + // scalar still cannot START with one of these (only contain them in block context). + const hasFlow = flowOpen.length > 0 || flowClose.length > 0; + const flowBreakCond = (v: string) => [...flowOpen, ...flowClose].map(c => `${v} == ${charLit(c)}`).concat(`${v} == ','`).join(' || '); + // The synthetic external token name + char for each flow delimiter (open then close), in the SAME + // order they were registered in ctx.scannerTokenFor (so the enum / grammar.js externals positions + // agree). Built by flowSyntheticTokens(grammar) and shared with the grammar.js side. + const flowTokens = flowSyntheticTokens(grammar); // [{ sym, char, open }] const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path). // @@ -1867,6 +1929,7 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar #include "tree_sitter/parser.h" #include "tree_sitter/alloc.h" #include +#include enum TokenType { ${enumBody} @@ -1879,7 +1942,8 @@ typedef struct { int16_t pending_col; // column of the line boundary mid-processing (-1 = none) bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col bool started; // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? ` - bool at_line_lead; // the next real content token is its line's first (compact-indicator probe)` : ''} + bool at_line_lead; // the next real content token is its line's first (compact-indicator probe)` : ''}${hasFlow ? 
` + uint16_t flow_depth; // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''} } Scanner; static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } @@ -1896,7 +1960,8 @@ void *tree_sitter_${G}_external_scanner_create(void) { s->stack = ts_malloc(s->cap * sizeof(int16_t)); s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? ` - s->at_line_lead = true;` : ''} + s->at_line_lead = true;` : ''}${hasFlow ? ` + s->flow_depth = 0;` : ''} return s; } @@ -1912,7 +1977,8 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer buffer[n++] = s->started ? 1 : 0; buffer[n++] = s->pending_newline ? 1 : 0;${hasCompact ? ` buffer[n++] = s->at_line_lead ? 1 : 0;` : ''} - memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t); + memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);${hasFlow ? ` + memcpy(&buffer[n], &s->flow_depth, sizeof(uint16_t)); n += sizeof(uint16_t);` : ''} uint32_t count = s->len; while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--; memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t); @@ -1923,13 +1989,15 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) { Scanner *s = (Scanner *)payload; s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? ` - s->at_line_lead = true;` : ''} - if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t) + sizeof(uint32_t)) return; + s->at_line_lead = true;` : ''}${hasFlow ? ` + s->flow_depth = 0;` : ''} + if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t)${hasFlow ? 
' + sizeof(uint16_t)' : ''} + sizeof(uint32_t)) return; unsigned n = 0; s->started = buffer[n++] != 0; s->pending_newline = buffer[n++] != 0;${hasCompact ? ` s->at_line_lead = buffer[n++] != 0;` : ''} - memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t); + memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);${hasFlow ? ` + memcpy(&s->flow_depth, &buffer[n], sizeof(uint16_t)); n += sizeof(uint16_t);` : ''} uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t); if (count == 0) return; // keep stack[0] = 0 while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); } @@ -2036,7 +2104,8 @@ static bool compact_content_is_structural(TSLexer *lexer) { // and emits a zero-width INDENT — the nested mapping's real indent — and the key is re-lexed on the // next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no \`:\`) // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.` : ''} -static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasCompact ? `, +static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasFlow ? `, + int flow_depth` : ''}${hasCompact ? `, Scanner *s, int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? ` bool cm = compact_col >= 0; // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)` : ''} char buf[64]; @@ -2046,14 +2115,15 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan for (;;) { int32_t c = lexer->lookahead; if (c == 0 || c == '\\n' || c == '\\r') break; // newline / EOF - if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar + ${hasFlow ? 
`if (flow_depth > 0 && (${flowBreakCond('c')})) break; // flow indicators end a scalar — ONLY inside a flow collection` : `if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar`} if (!has_content && (c == '-' || c == '?')) { // A leading \`-\`/\`?\` is a block indicator (seq entry / explicit key) when followed by space/EOL/ // flow-indicator, and scalar content otherwise (\`-1\`, \`?x\`). Peek the next char to decide. lexer->advance(lexer, false); int32_t n = lexer->lookahead; - if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' || - n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar + if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r'${hasFlow ? ` || + (flow_depth > 0 && (${flowBreakCond('n')}))` : ` || + n == ',' || n == '[' || n == ']' || n == '{' || n == '}'`}) return false; // indicator, not a scalar if (blen < sizeof(buf)) buf[blen++] = (char)c; // \`-\`/\`?\` glued to non-space is content has_content = true; ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer); @@ -2062,8 +2132,9 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan if (c == ':') { lexer->advance(lexer, false); // past the ':' to peek the next char int32_t n = lexer->lookahead; - if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' || - n == ',' || n == '[' || n == ']' || n == '{' || n == '}') { + if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r'${hasFlow ? 
` || + (flow_depth > 0 && (${flowBreakCond('n')}))` : ` || + n == ',' || n == '[' || n == ']' || n == '{' || n == '}'`}) { stopped_at_kv = true; break; // ':' is a key/value separator → end before it } if (blen < sizeof(buf)) buf[blen++] = ':'; // ':' glued to non-space is content @@ -2161,7 +2232,41 @@ bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const if (col == top && owed && want_newline && s->started) { lexer->result_symbol = ${NEWLINE}; return true; } return false; } -${hasCompact ? ` +${hasFlow ? ` + // Inside a flow collection (flow_depth > 0) a line break is INSIGNIFICANT — indentation is suspended, + // so a flow scalar / nested bracket may sit on a following line (\`[a,\\n b]\`). tree-sitter's \`/\\s/\` + // extra cannot skip the newline here: the external scanner is consulted first, and a \`false\` return + // (the only way to "decline") rolls back any advance, so the newline is never consumed and the parser + // stalls into error recovery. So when in flow, the scanner itself eats the flow-insignificant run + // (spaces, tabs, newlines, comments) as the LEADING trivia of the next token it returns — the bracket + // emission and scalar scan below both return TRUE, which makes the skip stick. (In block context this + // is skipped: a newline IS significant and drives the INDENT/DEDENT/NEWLINE boundary logic.) + if (s->flow_depth > 0) { + for (;;) { + int32_t c = lexer->lookahead; + if (c == ' ' || c == '\\t' || c == '\\n' || c == '\\r') skip(lexer); + ${cmtC ? `else if (c == ${cmtC}) { while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') skip(lexer); }` : ''} + else break; + } + } + // Flow-collection delimiters ([ ] { }). 
These are emitted as EXTERNAL tokens (not the internal DFA) so + // flow_depth — the open-bracket nesting that suspends indentation — PERSISTS: tree-sitter keeps an + // external scanner's struct mutations only across a token it RETURNS (on a \`false\` return it restores + // the pre-scan serialized state, so a peek-then-false counter is silently rolled back before the + // internal bracket is lexed). Each is gated on its own valid_symbols, so a \`[\`/\`{\` that is plain + // content (a scalar contains but never STARTS with one — handled in scan_scalar) is NOT consumed here: + // at those positions the flow token isn't valid and we fall through. Skip inline space/tab first (the + // flow newline skip above already ran when in flow; in block a newline still drives the indent logic). + { + while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); + int32_t fc = lexer->lookahead; +${flowTokens.map(t => ` if (fc == ${charLit(t.char)} && valid_symbols[${t.sym.toUpperCase()}]) { + advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = ${t.sym.toUpperCase()}; + ${t.open ? 'if (s->flow_depth < UINT16_MAX) s->flow_depth++;' : 'if (s->flow_depth > 0) s->flow_depth--;'} + s->started = true;${hasCompact ? ' s->at_line_lead = false;' : ''} return true; + }`).join('\n')} + } +` : ''}${hasCompact ? ` // Compact block notation (\`- a: 1\` / \`- - x\` / \`- &a k: v\` / \`? key\`). The line-lead indicator was // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line. @@ -2175,7 +2280,7 @@ ${hasCompact ? ` // • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact // INDENT itself when the run is a mapping KEY. (A bare scalar must NOT be sniffed-then-rewound: a // plain scalar is external-only, so a false return here would loop.) 
- if (want_indent && s->at_line_lead) { + if (want_indent && s->at_line_lead${hasFlow ? ' && s->flow_depth == 0' : ''}) { while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); int32_t c = lexer->lookahead; bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c); @@ -2196,8 +2301,9 @@ ${hasCompact ? ` } ` : ''}${BLOCK ? ` // A block scalar value (\`key: |\`): scan its body before the indent logic — its more-indented - // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first. - if (want_block) { + // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first. Block + // scalars are a block-context construct — inside a flow collection \`|\`/\`>\` are plain content. + if (want_block${hasFlow ? ' && s->flow_depth == 0' : ''}) { while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); if (${introCond}) { if (scan_block_scalar(s, lexer)) {${hasCompact ? ' s->at_line_lead = false;' : ''} return true; } } } @@ -2222,12 +2328,15 @@ ${hasCompact ? ` compact_col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) } - if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : ` - if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; }`} + if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : ` + if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? 
', s->flow_depth' : ''})) { s->started = true; return true; }`} } } ` : ''} - if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid + if (!want_indent && !want_dedent && !want_newline) return false; // no indent tokens valid${hasFlow ? ` + // Inside a flow collection a newline is INSIGNIFICANT (indentation suspended): emit NO INDENT/DEDENT/ + // NEWLINE so the line break is consumed by tree-sitter's \`/\\s/\` extra and the flow spans the line. + if (s->flow_depth > 0) return false;` : ''} // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was // crossed (only a real boundary drives the indent logic). diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js index e2cd88a..d5233e9 100644 --- a/tree-sitter/yaml/grammar.js +++ b/tree-sitter/yaml/grammar.js @@ -24,7 +24,11 @@ module.exports = grammar({ $.plain, $.key, $.num, - $.bool_null + $.bool_null, + $._flow_lbracket, + $._flow_lbrace, + $._flow_rbracket, + $._flow_rbrace ], conflicts: $ => [ @@ -128,13 +132,13 @@ module.exports = grammar({ flow_map_entry: $ => choice(seq($.flow_explicit, optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq($.flow_node, optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node)))), - flow_mapping: $ => seq("{", optional(seq(optional($.flow_map_entry), repeat(seq(",", optional($.flow_map_entry))))), optional(","), "}"), + flow_mapping: $ => seq($._flow_lbrace, optional(seq(optional($.flow_map_entry), repeat(seq(",", optional($.flow_map_entry))))), optional(","), $._flow_rbrace), flow_seq_entry: $ => choice(seq($.flow_seq_key, ":", optional(optional($.flow_node))), seq("?", optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node))), $.flow_node), flow_seq_key: $ => choice(seq(optional($.property), choice($.flow_mapping, $.flow_sequence, 
$.dquote_key, $.squote_key, $.key)), $.alias), - flow_sequence: $ => seq("[", optional(seq(optional($.flow_seq_entry), repeat(seq(",", optional($.flow_seq_entry))))), optional(","), "]"), + flow_sequence: $ => seq($._flow_lbracket, optional(seq(optional($.flow_seq_entry), repeat(seq(",", optional($.flow_seq_entry))))), optional(","), $._flow_rbracket), scalar: $ => choice($.dquote_key, $.squote_key, $.dquote, $.squote, $.block_scalar, $.key, $.num, $.bool_null, $.plain), diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index ede3d3b..8bda6c3 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -8,6 +8,7 @@ #include "tree_sitter/parser.h" #include "tree_sitter/alloc.h" #include +#include enum TokenType { INDENT, @@ -18,6 +19,10 @@ enum TokenType { KEY, NUM, BOOL_NULL, + _FLOW_LBRACKET, + _FLOW_LBRACE, + _FLOW_RBRACKET, + _FLOW_RBRACE, }; typedef struct { @@ -28,6 +33,7 @@ typedef struct { bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col bool started; // any content lexed yet (suppresses a leading NEWLINE) bool at_line_lead; // the next real content token is its line's first (compact-indicator probe) + uint16_t flow_depth; // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/} } Scanner; static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } @@ -45,6 +51,7 @@ void *tree_sitter_yaml_external_scanner_create(void) { s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; s->at_line_lead = true; + s->flow_depth = 0; return s; } @@ -61,6 +68,7 @@ unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer buffer[n++] = s->pending_newline ? 1 : 0; buffer[n++] = s->at_line_lead ? 
1 : 0; memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t); + memcpy(&buffer[n], &s->flow_depth, sizeof(uint16_t)); n += sizeof(uint16_t); uint32_t count = s->len; while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--; memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t); @@ -72,12 +80,14 @@ void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *bu Scanner *s = (Scanner *)payload; s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; s->at_line_lead = true; - if (length < 3 + sizeof(int16_t) + sizeof(uint32_t)) return; + s->flow_depth = 0; + if (length < 3 + sizeof(int16_t) + sizeof(uint16_t) + sizeof(uint32_t)) return; unsigned n = 0; s->started = buffer[n++] != 0; s->pending_newline = buffer[n++] != 0; s->at_line_lead = buffer[n++] != 0; memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t); + memcpy(&s->flow_depth, &buffer[n], sizeof(uint16_t)); n += sizeof(uint16_t); uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t); if (count == 0) return; // keep stack[0] = 0 while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); } @@ -185,6 +195,7 @@ static bool compact_content_is_structural(TSLexer *lexer) { // next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no `:`) // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm. 
static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull, + int flow_depth, Scanner *s, int16_t compact_col, int indent_sym) { bool cm = compact_col >= 0; // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY) char buf[64]; @@ -194,14 +205,14 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan for (;;) { int32_t c = lexer->lookahead; if (c == 0 || c == '\n' || c == '\r') break; // newline / EOF - if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar + if (flow_depth > 0 && (c == '[' || c == '{' || c == ']' || c == '}' || c == ',')) break; // flow indicators end a scalar — ONLY inside a flow collection if (!has_content && (c == '-' || c == '?')) { // A leading `-`/`?` is a block indicator (seq entry / explicit key) when followed by space/EOL/ // flow-indicator, and scalar content otherwise (`-1`, `?x`). Peek the next char to decide. lexer->advance(lexer, false); int32_t n = lexer->lookahead; if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' || - n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar + (flow_depth > 0 && (n == '[' || n == '{' || n == ']' || n == '}' || n == ','))) return false; // indicator, not a scalar if (blen < sizeof(buf)) buf[blen++] = (char)c; // `-`/`?` glued to non-space is content has_content = true; if (!cm) lexer->mark_end(lexer); @@ -211,7 +222,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan lexer->advance(lexer, false); // past the ':' to peek the next char int32_t n = lexer->lookahead; if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' || - n == ',' || n == '[' || n == ']' || n == '{' || n == '}') { + (flow_depth > 0 && (n == '[' || n == '{' || n == ']' || n == '}' || n == ','))) { stopped_at_kv = true; break; // ':' is a key/value separator → end before it } if (blen < sizeof(buf)) 
buf[blen++] = ':'; // ':' glued to non-space is content @@ -311,6 +322,55 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const return false; } + // Inside a flow collection (flow_depth > 0) a line break is INSIGNIFICANT — indentation is suspended, + // so a flow scalar / nested bracket may sit on a following line (`[a,\n b]`). tree-sitter's `/\s/` + // extra cannot skip the newline here: the external scanner is consulted first, and a `false` return + // (the only way to "decline") rolls back any advance, so the newline is never consumed and the parser + // stalls into error recovery. So when in flow, the scanner itself eats the flow-insignificant run + // (spaces, tabs, newlines, comments) as the LEADING trivia of the next token it returns — the bracket + // emission and scalar scan below both return TRUE, which makes the skip stick. (In block context this + // is skipped: a newline IS significant and drives the INDENT/DEDENT/NEWLINE boundary logic.) + if (s->flow_depth > 0) { + for (;;) { + int32_t c = lexer->lookahead; + if (c == ' ' || c == '\t' || c == '\n' || c == '\r') skip(lexer); + else if (c == '#') { while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') skip(lexer); } + else break; + } + } + // Flow-collection delimiters ([ ] { }). These are emitted as EXTERNAL tokens (not the internal DFA) so + // flow_depth — the open-bracket nesting that suspends indentation — PERSISTS: tree-sitter keeps an + // external scanner's struct mutations only across a token it RETURNS (on a `false` return it restores + // the pre-scan serialized state, so a peek-then-false counter is silently rolled back before the + // internal bracket is lexed). Each is gated on its own valid_symbols, so a `[`/`{` that is plain + // content (a scalar contains but never STARTS with one — handled in scan_scalar) is NOT consumed here: + // at those positions the flow token isn't valid and we fall through. 
Skip inline space/tab first (the + // flow newline skip above already ran when in flow; in block a newline still drives the indent logic). + { + while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); + int32_t fc = lexer->lookahead; + if (fc == '[' && valid_symbols[_FLOW_LBRACKET]) { + advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_LBRACKET; + if (s->flow_depth < UINT16_MAX) s->flow_depth++; + s->started = true; s->at_line_lead = false; return true; + } + if (fc == '{' && valid_symbols[_FLOW_LBRACE]) { + advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_LBRACE; + if (s->flow_depth < UINT16_MAX) s->flow_depth++; + s->started = true; s->at_line_lead = false; return true; + } + if (fc == ']' && valid_symbols[_FLOW_RBRACKET]) { + advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_RBRACKET; + if (s->flow_depth > 0) s->flow_depth--; + s->started = true; s->at_line_lead = false; return true; + } + if (fc == '}' && valid_symbols[_FLOW_RBRACE]) { + advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_RBRACE; + if (s->flow_depth > 0) s->flow_depth--; + s->started = true; s->at_line_lead = false; return true; + } + } + // Compact block notation (`- a: 1` / `- - x` / `- &a k: v` / `? key`). The line-lead indicator was // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line. @@ -324,7 +384,7 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const // • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact // INDENT itself when the run is a mapping KEY. (A bare scalar must NOT be sniffed-then-rewound: a // plain scalar is external-only, so a false return here would loop.) 
- if (want_indent && s->at_line_lead) { + if (want_indent && s->at_line_lead && s->flow_depth == 0) { while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); int32_t c = lexer->lookahead; bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c); @@ -345,8 +405,9 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const } // A block scalar value (`key: |`): scan its body before the indent logic — its more-indented - // lines are content, not nested structure. Skip the inline space after the `:`/`-` first. - if (want_block) { + // lines are content, not nested structure. Skip the inline space after the `:`/`-` first. Block + // scalars are a block-context construct — inside a flow collection `|`/`>` are plain content. + if (want_block && s->flow_depth == 0) { while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) { s->at_line_lead = false; return true; } } } @@ -371,11 +432,14 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const compact_col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) } - if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; } + if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s->flow_depth, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; } } } - if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid + if (!want_indent && !want_dedent && !want_newline) 
return false; // no indent tokens valid + // Inside a flow collection a newline is INSIGNIFICANT (indentation suspended): emit NO INDENT/DEDENT/ + // NEWLINE so the line break is consumed by tree-sitter's `/\s/` extra and the flow spans the line. + if (s->flow_depth > 0) return false; // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was // crossed (only a real boundary drives the indent logic). From 595802b7a7fe3391370d74d2f246e50414744cc2 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 08:14:06 +0800 Subject: [PATCH 08/10] =?UTF-8?q?Scan=20YAML=20document=20markers=20?= =?UTF-8?q?=E2=80=94=20`---`-led=20documents=20now=20parse=20(issue=20#3)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A document that started with `---`/`...` then a body on the next line failed: the external scalar scanner's `-`/`.` lead ran the `---` into a plain/key token before the internal `doc_start` could match (and the marker token's separator look-ahead is stripped by the token DFA). The scanner now probes for a document marker at column 0 (glyphs derived from `indent.blockScalar.documentMarkers`): a true sep-bounded marker → set a transient `marker_decline` + return false so the internal `---`/`...` token lexes it; a non-marker glyph (`---foo`) → claim it as plain content. The markers stay INTERNAL tokens (making them external perturbs the GLR tables and mis-lexes same-column block sequences). Plus: `started` is set whenever the column > 0 (so the NEWLINE after a leading marker is emitted, not suppressed), and a document-root block scalar (stack depth 1, parent indent −1) may have a column-0 body, ending only at a column-0 marker. Combined with the flow-depth fix, the bench jumps 72.4% → 94.2% (294/312 valid yaml-test-suite inputs ERROR-free) — the two compound, since many inputs had both a `---` marker and flow/comma content. 
The six other grammars stay byte-identical (all gated on grammar.indent); tsc clean; generate + build --wasm succeed; gate:treesitter 96.0%; src-coverage-yaml parser alignment 100% (yaml.ts untouched — tree-sitter target only). Refs #3 --- src/gen-treesitter.ts | 115 +++++++++++++++++++++++++++++---- tree-sitter/yaml/src/scanner.c | 72 ++++++++++++++++++++- 2 files changed, 174 insertions(+), 13 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index 9a3ba1f..33cbf59 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -1,6 +1,6 @@ import type { CstGrammar, RuleExpr, RuleDecl, TokenPattern } from './types.ts'; import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts'; -import { tokenPatternIsNever, tokenPatternSource, tokenPatternStartsWithDecimal, tokenPatternStringDelimiters, tokenPatternTrailingCharClass } from './token-pattern.ts'; +import { tokenPatternIsNever, tokenPatternLiteralPrefix, tokenPatternSource, tokenPatternStartsWithDecimal, tokenPatternStringDelimiters, tokenPatternTrailingCharClass } from './token-pattern.ts'; // ════════════════════════════════════════════════════════════════════════════ // gen-treesitter — derive a tree-sitter parser package from one CstGrammar. @@ -770,6 +770,24 @@ function flowSyntheticTokens(grammar: CstGrammar): { sym: string; char: string; ]; } +/** + * The document-marker glyphs (`---` / `...`) of an indentation grammar, matched to + * `indent.blockScalar.documentMarkers` by token literal prefix — used by the external scanner's + * scan_scalar to claim a non-marker glyph as plain and decline a true marker (the markers stay + * INTERNAL tokens; see planScannerTokens). Longest glyph first (so a 3-char glyph beats a prefix of + * it). Empty unless the grammar declares documentMarkers. 
+ */ +function documentMarkerGlyphs(grammar: CstGrammar): string[] { + const markers = grammar.indent?.blockScalar?.documentMarkers; + if (!markers || markers.length === 0) return []; + const out = new Set<string>(); + for (const tok of grammar.tokens) { + const lit = tokenPatternLiteralPrefix(tok); + if (lit && markers.includes(lit)) out.add(lit); + } + return [...out].sort((a, b) => b.length - a.length); +} + /** Determine which tokens the external scanner must provide. */ function planScannerTokens(grammar: CstGrammar): Map<string, string> { const map = new Map(); @@ -803,6 +821,14 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> { // (a TRUE return). The synthetic name IS the snake symbol; the matching literal in the flow rules is // swapped for a ref to it in renderExpr. Appended last so the scalar-token positions are unchanged. for (const { sym } of flowSyntheticTokens(grammar)) map.set(sym, sym); + // Document markers (`---` / `...`) stay INTERNAL tokens (NOT added here). Their IR is + // `literal + a sep look-ahead`; tree-sitter's token() DFA drops the look-ahead, leaving a bare + // `---`/`...`. That is fine: the external scalar scanner CLAIMS a non-marker glyph (`---foo`) as a + // plain scalar (so it never reaches the internal token) and DECLINES a true sep-bounded marker (so + // the internal token lexes it — see scan_scalar's document-marker probe). Making them external + // instead perturbs the GLR parse tables — a marker token's valid-symbol set then shifts the lexer's + // scalar/indent decisions at unrelated boundaries (a same-column block sequence after a key + // mis-lexes) — so keeping them internal leaves the tables byte-identical to a no-marker build. } // The regex token: '/' is context-sensitive (regex vs division). The scanner // resolves it. @@ -1919,6 +1945,43 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar // agree). Built by flowSyntheticTokens(grammar) and shared with the grammar.js side. 
const flowTokens = flowSyntheticTokens(grammar); // [{ sym, char, open }] + // DOCUMENT MARKERS (`---` / `...`) — INTERNAL tokens; the external scalar scanner only CLAIMS a non- + // marker glyph as plain and DECLINES a true (sep-bounded) marker so the internal token lexes it. + const markers = documentMarkerGlyphs(grammar); + const hasMarkers = markers.length > 0 && SCALAR; + const cChar = (ch: string) => (ch === '\\' || ch === "'" ? `'\\${ch}'` : `'${ch}'`); + // Advance over one glyph char (counting `matched`) — DON'T push to the run or mark the token end yet. + // The probe commits (mark_end) only on the plain-content path; a true-marker decline marks nothing, so + // the probed chars roll back cleanly and tree-sitter then lexes the internal marker token. + const eatGlyphChar = (ch: string) => `if (lexer->lookahead == ${cChar(ch)}) { lexer->advance(lexer, false); matched++; }`; + // Replay the k matched glyph chars into the run as scalar content (a non-marker glyph: `---foo`, + // `--x`). The lexer is already positioned past them; the main loop continues the run from here. + const replayGlyph = (glyph: string) => [...glyph].map((ch, k) => + `if (matched > ${k}) { if (blen < sizeof(buf)) buf[blen++] = ${cChar(ch)}; }`).join(' '); + const markerProbe = !hasMarkers ? '' : ` + // DOCUMENT-MARKER probe (column 0). A \`---\`/\`...\` glyph that is sep-bounded (ws / EOL / EOF) is a + // document marker — an INTERNAL token (its IR's sep look-ahead is beyond a tree-sitter token() DFA, + // but this external scanner decides the boundary). The glyph is matched WITHOUT marking the token end: + // • a FULL glyph + separator → a TRUE marker: set s->marker_decline and return false; nothing + // was marked, so the probed chars roll back and the internal \`---\`/\`...\` token lexes it (a non- + // marker glyph never reaches that token, so its dropped look-ahead is moot). + // • a LONE indicator char + sep → a block indicator (\`- \`/\`? 
\`); decline so the internal \`-\`/\`?\` + // token takes it. + // • anything else (\`---foo\`, \`-1\`) → plain content: replay the matched glyph chars and fall through + // to the scalar loop, which continues the run (so the marker glyph is CLAIMED as a plain scalar). + // Markers (and which lead chars are block indicators) are DERIVED from grammar.indent. + if (${hasCompact ? 'compact_col < 0 && ' : ''}lexer->get_column(lexer) == 0) {${markers.map(glyph => ` + if (lexer->lookahead == ${cChar(glyph[0])}) { + unsigned matched = 0; + ${[...glyph].map(eatGlyphChar).join('\n ')} + int32_t mn = lexer->lookahead; + bool msep = (mn == 0 || mn == ' ' || mn == '\\t' || mn == '\\n' || mn == '\\r'); + if (matched == ${glyph.length} && msep) { s->marker_decline = true; return false; }${compactIndicators.includes(glyph[0]) ? ` + if (matched == 1 && msep) return false; // lone \`${glyph[0]}\` + separator → block indicator, not content` : ''} + ${replayGlyph(glyph)} if (matched > 0) { has_content = true; lexer->mark_end(lexer); } + }`).join('')} + }`; + const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path). // // Mirrors the indent-stack state machine of src/gen-lexer.ts: INDENT / DEDENT / NEWLINE are emitted @@ -1943,7 +2006,9 @@ typedef struct { bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col bool started; // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? ` bool at_line_lead; // the next real content token is its line's first (compact-indicator probe)` : ''}${hasFlow ? ` - uint16_t flow_depth; // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''} + uint16_t flow_depth; // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''}${hasMarkers ? ` + bool marker_decline; // transient: scan_scalar saw a true \`---\`/\`...\` → external declines so the + // internal marker token lexes it. 
Set+consumed within one scan; not serialized.` : ''} } Scanner; static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } @@ -1961,7 +2026,8 @@ void *tree_sitter_${G}_external_scanner_create(void) { s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? ` s->at_line_lead = true;` : ''}${hasFlow ? ` - s->flow_depth = 0;` : ''} + s->flow_depth = 0;` : ''}${hasMarkers ? ` + s->marker_decline = false;` : ''} return s; } @@ -1990,7 +2056,8 @@ void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *bu Scanner *s = (Scanner *)payload; s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? ` s->at_line_lead = true;` : ''}${hasFlow ? ` - s->flow_depth = 0;` : ''} + s->flow_depth = 0;` : ''}${hasMarkers ? ` + s->marker_decline = false;` : ''} if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t)${hasFlow ? ' + sizeof(uint16_t)' : ''} + sizeof(uint32_t)) return; unsigned n = 0; s->started = buffer[n++] != 0; @@ -2010,8 +2077,14 @@ ${BLOCK ? `// A block scalar (\`|\` / \`>\`): the introducer + indicators + the // stack top); it ends at the first non-blank line at or below the parent, or a column-0 document // marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's // indentation is left for the normal boundary logic. +// +// A ROOT block scalar (the document's own node — stack depth 1) has an effective parent indentation of +// -1, not 0: its body may sit at column 0 (\`--- >\\nline1\`, yaml-test-suite DK3J / FP8R). So at root, +// only a column-0 DOCUMENT MARKER (\`---\` / \`...\`) — never plain column-0 text — ends it. The marker +// is matched without committing the line (no mark_end), so a non-marker column-0 line stays body.${markers.length > 0 ? 
`` : ''} static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { - int parent = s->stack[s->len - 1]; + bool root = s->len == 1; // a document-root block scalar: body may reach column 0 + int parent = root ? -1 : s->stack[s->len - 1]; advance(lexer); // the introducer (| or >) while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer); while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer); @@ -2023,7 +2096,17 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { int col = 0; while (lexer->lookahead == ' ') { advance(lexer); col++; } int32_t c = lexer->lookahead; - if (c == 0 || c == '\\n' || c == '\\r') { lexer->mark_end(lexer); continue; } // blank line → body + if (c == 0 || c == '\\n' || c == '\\r') { lexer->mark_end(lexer); continue; } // blank line → body${markers.length > 0 ? ` + if (root && col == 0) { // a column-0 document marker ends a root block scalar + bool is_marker = false;${markers.map(glyph => ` + if (!is_marker && c == ${cChar(glyph[0])}) { + unsigned m = 0; ${[...glyph].map(ch => `if (lexer->lookahead == ${cChar(ch)}) { advance(lexer); m++; }`).join(' ')} + int32_t a = lexer->lookahead; + if (m == ${glyph.length} && (a == 0 || a == ' ' || a == '\\t' || a == '\\n' || a == '\\r')) is_marker = true; + }`).join('')} + if (is_marker) break; // leave the marker line for the next token (no mark_end) + // not a marker: the chars probed above are body; fall through to consume the rest of the line. + }` : ''} if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node) while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer); lexer->mark_end(lexer); @@ -2105,13 +2188,15 @@ static bool compact_content_is_structural(TSLexer *lexer) { // next call (then compact_col is no longer deeper, so this path is skipped). 
A leaf scalar (no \`:\`) // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.` : ''} static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasFlow ? `, - int flow_depth` : ''}${hasCompact ? `, - Scanner *s, int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? ` + int flow_depth` : ''}${(hasCompact || hasMarkers) ? `, + Scanner *s` : ''}${hasCompact ? `, + int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? ` bool cm = compact_col >= 0; // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)` : ''} char buf[64]; unsigned blen = 0; // run text (capped) — for the number/bool-null shape test bool has_content = false; bool stopped_at_kv = false; // ended at a \`:\`-separator → this scalar is a mapping KEY +${markerProbe} for (;;) { int32_t c = lexer->lookahead; if (c == 0 || c == '\\n' || c == '\\r') break; // newline / EOF @@ -2216,7 +2301,13 @@ bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const Scanner *s = (Scanner *)payload; bool want_indent = valid_symbols[${INDENT}]; bool want_dedent = valid_symbols[${DEDENT}]; - bool want_newline = valid_symbols[${NEWLINE}];${BLOCK ? `\n bool want_block = valid_symbols[${BLOCK}];` : ''} + bool want_newline = valid_symbols[${NEWLINE}];${BLOCK ? `\n bool want_block = valid_symbols[${BLOCK}];` : ''}${hasMarkers ? ` + // Content lies to our left whenever we are not at column 0 — including right after an INTERNAL token + // (e.g. a \`---\`/\`...\` document marker, whose match the scanner never sees). Mark started so the line + // boundary that follows emits its NEWLINE (the leading-NEWLINE suppression is only for blank lines at + // the very start of input, which are always at column 0). Without this, the NEWLINE after a leading + // \`---\` would be dropped and the document body could not attach. 
+ if (lexer->get_column(lexer) > 0) s->started = true;` : ''} // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here. @@ -2328,8 +2419,10 @@ ${flowTokens.map(t => ` if (fc == ${charLit(t.char)} && valid_symbols[${t.sym compact_col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) } - if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : ` - if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''})) { s->started = true; return true; }`} + if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }${hasMarkers ? ` + if (s->marker_decline) { s->marker_decline = false; return false; } // a true \`---\`/\`...\` here → let the internal marker token lex it` : ''}` : ` + if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}${hasMarkers ? ', s' : ''})) { s->started = true; return true; }${hasMarkers ? 
` + if (s->marker_decline) { s->marker_decline = false; return false; }` : ''}`} } } ` : ''} diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index 8bda6c3..54115a7 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -34,6 +34,8 @@ typedef struct { bool started; // any content lexed yet (suppresses a leading NEWLINE) bool at_line_lead; // the next real content token is its line's first (compact-indicator probe) uint16_t flow_depth; // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/} + bool marker_decline; // transient: scan_scalar saw a true `---`/`...` → external declines so the + // internal marker token lexes it. Set+consumed within one scan; not serialized. } Scanner; static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); } @@ -52,6 +54,7 @@ void *tree_sitter_yaml_external_scanner_create(void) { s->pending_col = -1; s->pending_newline = false; s->started = false; s->at_line_lead = true; s->flow_depth = 0; + s->marker_decline = false; return s; } @@ -81,6 +84,7 @@ void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *bu s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; s->at_line_lead = true; s->flow_depth = 0; + s->marker_decline = false; if (length < 3 + sizeof(int16_t) + sizeof(uint16_t) + sizeof(uint32_t)) return; unsigned n = 0; s->started = buffer[n++] != 0; @@ -100,8 +104,14 @@ void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *bu // stack top); it ends at the first non-blank line at or below the parent, or a column-0 document // marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's // indentation is left for the normal boundary logic. 
+// +// A ROOT block scalar (the document's own node — stack depth 1) has an effective parent indentation of +// -1, not 0: its body may sit at column 0 (`--- >\nline1`, yaml-test-suite DK3J / FP8R). So at root, +// only a column-0 DOCUMENT MARKER (`---` / `...`) — never plain column-0 text — ends it. The marker +// is matched without committing the line (no mark_end), so a non-marker column-0 line stays body. static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { - int parent = s->stack[s->len - 1]; + bool root = s->len == 1; // a document-root block scalar: body may reach column 0 + int parent = root ? -1 : s->stack[s->len - 1]; advance(lexer); // the introducer (| or >) while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer); while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer); @@ -114,6 +124,21 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) { while (lexer->lookahead == ' ') { advance(lexer); col++; } int32_t c = lexer->lookahead; if (c == 0 || c == '\n' || c == '\r') { lexer->mark_end(lexer); continue; } // blank line → body + if (root && col == 0) { // a column-0 document marker ends a root block scalar + bool is_marker = false; + if (!is_marker && c == '-') { + unsigned m = 0; if (lexer->lookahead == '-') { advance(lexer); m++; } if (lexer->lookahead == '-') { advance(lexer); m++; } if (lexer->lookahead == '-') { advance(lexer); m++; } + int32_t a = lexer->lookahead; + if (m == 3 && (a == 0 || a == ' ' || a == '\t' || a == '\n' || a == '\r')) is_marker = true; + } + if (!is_marker && c == '.') { + unsigned m = 0; if (lexer->lookahead == '.') { advance(lexer); m++; } if (lexer->lookahead == '.') { advance(lexer); m++; } if (lexer->lookahead == '.') { advance(lexer); m++; } + int32_t a = lexer->lookahead; + if (m == 3 && (a == 0 || a == ' ' || a == '\t' || a == '\n' || a == '\r')) is_marker = true; + } + if (is_marker) 
break; // leave the marker line for the next token (no mark_end) + // not a marker: the chars probed above are body; fall through to consume the rest of the line. + } if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node) while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer); lexer->mark_end(lexer); @@ -196,12 +221,48 @@ static bool compact_content_is_structural(TSLexer *lexer) { // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm. static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull, int flow_depth, - Scanner *s, int16_t compact_col, int indent_sym) { + Scanner *s, + int16_t compact_col, int indent_sym) { bool cm = compact_col >= 0; // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY) char buf[64]; unsigned blen = 0; // run text (capped) — for the number/bool-null shape test bool has_content = false; bool stopped_at_kv = false; // ended at a `:`-separator → this scalar is a mapping KEY + + // DOCUMENT-MARKER probe (column 0). A `---`/`...` glyph that is sep-bounded (ws / EOL / EOF) is a + // document marker — an INTERNAL token (its IR's sep look-ahead is beyond a tree-sitter token() DFA, + // but this external scanner decides the boundary). The glyph is matched WITHOUT marking the token end: + // • a FULL glyph + separator → a TRUE marker: set s->marker_decline and return false; nothing + // was marked, so the probed chars roll back and the internal `---`/`...` token lexes it (a non- + // marker glyph never reaches that token, so its dropped look-ahead is moot). + // • a LONE indicator char + sep → a block indicator (`- `/`? `); decline so the internal `-`/`?` + // token takes it. 
+ // • anything else (`---foo`, `-1`) → plain content: replay the matched glyph chars and fall through + // to the scalar loop, which continues the run (so the marker glyph is CLAIMED as a plain scalar). + // Markers (and which lead chars are block indicators) are DERIVED from grammar.indent. + if (compact_col < 0 && lexer->get_column(lexer) == 0) { + if (lexer->lookahead == '-') { + unsigned matched = 0; + if (lexer->lookahead == '-') { lexer->advance(lexer, false); matched++; } + if (lexer->lookahead == '-') { lexer->advance(lexer, false); matched++; } + if (lexer->lookahead == '-') { lexer->advance(lexer, false); matched++; } + int32_t mn = lexer->lookahead; + bool msep = (mn == 0 || mn == ' ' || mn == '\t' || mn == '\n' || mn == '\r'); + if (matched == 3 && msep) { s->marker_decline = true; return false; } + if (matched == 1 && msep) return false; // lone `-` + separator → block indicator, not content + if (matched > 0) { if (blen < sizeof(buf)) buf[blen++] = '-'; } if (matched > 1) { if (blen < sizeof(buf)) buf[blen++] = '-'; } if (matched > 2) { if (blen < sizeof(buf)) buf[blen++] = '-'; } if (matched > 0) { has_content = true; lexer->mark_end(lexer); } + } + if (lexer->lookahead == '.') { + unsigned matched = 0; + if (lexer->lookahead == '.') { lexer->advance(lexer, false); matched++; } + if (lexer->lookahead == '.') { lexer->advance(lexer, false); matched++; } + if (lexer->lookahead == '.') { lexer->advance(lexer, false); matched++; } + int32_t mn = lexer->lookahead; + bool msep = (mn == 0 || mn == ' ' || mn == '\t' || mn == '\n' || mn == '\r'); + if (matched == 3 && msep) { s->marker_decline = true; return false; } + if (matched > 0) { if (blen < sizeof(buf)) buf[blen++] = '.'; } if (matched > 1) { if (blen < sizeof(buf)) buf[blen++] = '.'; } if (matched > 2) { if (blen < sizeof(buf)) buf[blen++] = '.'; } if (matched > 0) { has_content = true; lexer->mark_end(lexer); } + } + } for (;;) { int32_t c = lexer->lookahead; if (c == 0 || c == '\n' || c == '\r') 
break; // newline / EOF @@ -306,6 +367,12 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool want_dedent = valid_symbols[DEDENT]; bool want_newline = valid_symbols[NEWLINE]; bool want_block = valid_symbols[BLOCK_SCALAR]; + // Content lies to our left whenever we are not at column 0 — including right after an INTERNAL token + // (e.g. a `---`/`...` document marker, whose match the scanner never sees). Mark started so the line + // boundary that follows emits its NEWLINE (the leading-NEWLINE suppression is only for blank lines at + // the very start of input, which are always at column 0). Without this, the NEWLINE after a leading + // `---` would be dropped and the document body could not attach. + if (lexer->get_column(lexer) > 0) s->started = true; // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here. @@ -433,6 +500,7 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) } if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s->flow_depth, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; } + if (s->marker_decline) { s->marker_decline = false; return false; } // a true `---`/`...` here → let the internal marker token lex it } } From 6b2fac5f55e74ea8f5cba6446ec04ebff0327428 Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 08:35:21 +0800 Subject: [PATCH 09/10] Fold multi-line plain scalars inside flow collections (issue #3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Inside a flow collection (`[ ]` / `{ }`) a plain scalar folds across line breaks — the break + surrounding 
whitespace collapse to one space and the run continues on the next line until a flow terminator. The scanner's `scan_scalar` broke a run unconditionally at any newline, so a flow key / value / explicit-key spanning lines lexed as two scalars and the GLR parser couldn't chain them (ERROR). Now, at `flow_depth > 0` with content already scanned, a newline folds: advance past it + surrounding blank lines, stop at a flow terminator (`,`/brackets) / line-leading `#` / EOF, else append one folded space and continue (the next content char re-marks the token end). Block context is unchanged (its multi-line folding is separate indent/grammar machinery). Multi-line quoted scalars in flow already worked (the quoted token spans newlines natively). Bench: 294/312 → 299/312 (94.2% → 95.8%). Six other grammars byte-identical (yaml-only, gated on grammar.indent); tsc clean; generate + build --wasm succeed; gate:treesitter 96.0%. Refs #3 --- src/gen-treesitter.ts | 19 ++++++++++++++++++- tree-sitter/yaml/src/scanner.c | 19 ++++++++++++++++++- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index 33cbf59..36e43f5 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -2199,7 +2199,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan ${markerProbe} for (;;) { int32_t c = lexer->lookahead; - if (c == 0 || c == '\\n' || c == '\\r') break; // newline / EOF + if (c == 0) break; // EOF + if (c == '\\n' || c == '\\r') {${hasFlow ? ` + // Inside a flow collection a plain scalar FOLDS across a line break (\`{ multi\\n line: v}\` → the + // key is \`multi line\`): the break + surrounding whitespace (and blank/comment-only lines) collapse + // to one space and the run continues on the next line. 
Peek past that trivia run WITHOUT committing + // (mark_end stays at the last content char, so a decline trims it): if the next significant char + // ENDS the scalar — EOF, a flow indicator/terminator (\`, [ ] { }\`), or a line-leading \`#\` comment — + // the break is trailing trivia and the scalar stops here; otherwise fold to a space and continue. + if (flow_depth > 0 && has_content) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\\t' || + lexer->lookahead == '\\n' || lexer->lookahead == '\\r') lexer->advance(lexer, false); + int32_t nx = lexer->lookahead; + if (nx == 0 || nx == '#' || (${flowBreakCond('nx')})) break; // scalar ends — the trivia is trailing + if (blen < sizeof(buf)) buf[blen++] = ' '; // the folded break becomes one space + continue; // next content char marks the new token end + }` : ''} + break; // block context (or no content yet): the line break ends the scalar + } ${hasFlow ? `if (flow_depth > 0 && (${flowBreakCond('c')})) break; // flow indicators end a scalar — ONLY inside a flow collection` : `if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar`} if (!has_content && (c == '-' || c == '?')) { // A leading \`-\`/\`?\` is a block indicator (seq entry / explicit key) when followed by space/EOL/ diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index 54115a7..09853dc 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -265,7 +265,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan } for (;;) { int32_t c = lexer->lookahead; - if (c == 0 || c == '\n' || c == '\r') break; // newline / EOF + if (c == 0) break; // EOF + if (c == '\n' || c == '\r') { + // Inside a flow collection a plain scalar FOLDS across a line break (`{ multi\n line: v}` → the + // key is `multi line`): the break + surrounding whitespace (and blank/comment-only lines) collapse + // to one space and the run continues on the 
next line. Peek past that trivia run WITHOUT committing + // (mark_end stays at the last content char, so a decline trims it): if the next significant char + // ENDS the scalar — EOF, a flow indicator/terminator (`, [ ] { }`), or a line-leading `#` comment — + // the break is trailing trivia and the scalar stops here; otherwise fold to a space and continue. + if (flow_depth > 0 && has_content) { + while (lexer->lookahead == ' ' || lexer->lookahead == '\t' || + lexer->lookahead == '\n' || lexer->lookahead == '\r') lexer->advance(lexer, false); + int32_t nx = lexer->lookahead; + if (nx == 0 || nx == '#' || (nx == '[' || nx == '{' || nx == ']' || nx == '}' || nx == ',')) break; // scalar ends — the trivia is trailing + if (blen < sizeof(buf)) buf[blen++] = ' '; // the folded break becomes one space + continue; // next content char marks the new token end + } + break; // block context (or no content yet): the line break ends the scalar + } if (flow_depth > 0 && (c == '[' || c == '{' || c == ']' || c == '}' || c == ',')) break; // flow indicators end a scalar — ONLY inside a flow collection if (!has_content && (c == '-' || c == '?')) { // A leading `-`/`?` is a block indicator (seq entry / explicit key) when followed by space/EOL/ From 687360984812fd05740ee0b1b0f8528a3f47122a Mon Sep 17 00:00:00 2001 From: Johnson Chu Date: Mon, 8 Jun 2026 08:47:37 +0800 Subject: [PATCH 10/10] Parse block keys with node properties / tags / aliases (issue #3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A block mapping whose KEY is preceded by a node property (`&anchor` / `!tag` / `!!tag` / `!`) ERRORed: the scanner's compact-block detection keys off `at_line_lead` ("the line's first token"), but anchor/tag/alias are INTERNAL tokens tree-sitter lexes WITHOUT consulting the scanner, so after a property was lexed `at_line_lead` was still set and the following key was mis-treated as a compact-nested mapping → a spurious INDENT that corrupted the
structure. Fix: a transient `property_lead` field, latched at the genuine line lead (column == stack top, re-derived every boundary and for the first line) when the lead char is a property; the two compact-push sites skip a property-led line so its key stays at the node level. `property_lead` is NOT reset in deserialize — the one carry that must survive the property's internal lex (tree-sitter discards scanner mutations on a `false` return; only across a `true`-returned token does state persist). `yaml.ts` untouched — the grammar's BlockKey already had the production; the gap was the tree-sitter derivation. (yaml-test-suite ZH7C/74H7/E76Z/7FWL/HMQ5/2SXE.) Combined with the flow folding, the bench is 95.8% → 97.8% (305/312). Six other grammars byte-identical; tsc clean; generate + build --wasm succeed; gate:treesitter 96.0%; agnostic 9/9; test:yaml-issues 10/10; scope-gap:yaml 100%; src-coverage-yaml 100%. Refs #3 --- src/gen-treesitter.ts | 43 ++++++++++++++++++++++++++++++---- tree-sitter/yaml/src/scanner.c | 39 +++++++++++++++++++++++++++--- 2 files changed, 74 insertions(+), 8 deletions(-) diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts index 36e43f5..307cb3f 100644 --- a/src/gen-treesitter.ts +++ b/src/gen-treesitter.ts @@ -2005,7 +2005,18 @@ typedef struct { int16_t pending_col; // column of the line boundary mid-processing (-1 = none) bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col bool started; // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? ` - bool at_line_lead; // the next real content token is its line's first (compact-indicator probe)` : ''}${hasFlow ? 
` + bool at_line_lead; // the next real content token is its line's first (compact-indicator probe) + bool property_lead; // the line's FIRST token is a node property (\`&\`/\`!\`) — its inline content sits + // at the SAME node level, so it must NOT take the compact mapping-key push (\`&a + // a: b\` is the key \`a\` carrying anchor \`&a\`, not \`&a\`-then-INDENTED-\`a: b\`). + // gen-lexer clears atLineLead on the property token (it sees every token); the + // scanner does not lex the property, so it LATCHES this at the line lead and reads + // it at the push. It must survive the property's INTERNAL lex (which the scanner + // declines via a FALSE return) — tree-sitter deserializes the serialized fields on + // a false return, so a SERIALIZED flag would be rolled back; this one is therefore + // NOT serialized and NOT reset in deserialize (it keeps its in-memory value across + // the decline). It is RE-DERIVED from the lead char at every line boundary, so it + // is always correct at the only points it is read (a line lead).` : ''}${hasFlow ? ` uint16_t flow_depth; // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''}${hasMarkers ? ` bool marker_decline; // transient: scan_scalar saw a true \`---\`/\`...\` → external declines so the // internal marker token lexes it. Set+consumed within one scan; not serialized.` : ''} @@ -2025,7 +2036,7 @@ void *tree_sitter_${G}_external_scanner_create(void) { s->stack = ts_malloc(s->cap * sizeof(int16_t)); s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? ` - s->at_line_lead = true;` : ''}${hasFlow ? ` + s->at_line_lead = true; s->property_lead = false;` : ''}${hasFlow ? ` s->flow_depth = 0;` : ''}${hasMarkers ? ` s->marker_decline = false;` : ''} return s; @@ -2391,8 +2402,20 @@ ${flowTokens.map(t => ` if (fc == ${charLit(t.char)} && valid_symbols[${t.sym if (want_indent && s->at_line_lead${hasFlow ? 
' && s->flow_depth == 0' : ''}) { while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer); int32_t c = lexer->lookahead; + // GENUINE line lead: the line's first token is not yet lexed, so its column == the stack top (a + // line boundary set top to the lead column; at stream start both are the document level). Once a + // token has been lexed on this line the next content is DEEPER than top. Record whether that first + // token is a node PROPERTY (\`&\`/\`!\`): a property leads a node, so its inline content is at the + // SAME level (no compact push), whereas a compact indicator (\`-\`/\`?\`) DOES open a nested level + // for its content. This is the one fact lost when the property is lexed internally (the scanner + // never sees it), so it is latched here and checked at the two compact-push sites below. + if ((int16_t)lexer->get_column(lexer) == s->stack[s->len - 1]) s->property_lead = (c == '&' || c == '!'); bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c); - if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + // A property that LEADS the line (property_lead) does not nest — skip the compact push so its key + // stays at the node's level (\`&a a: b\` / \`!!str &a1 "foo":\`). A property that follows a compact + // indicator (\`- &a k: v\`) is NOT a line lead (property_lead was set false at the \`-\`), so it still + // pushes via compact_content_is_structural's property-skip. 
+ if (nonscalar_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { int16_t col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances if (compact_content_is_structural(lexer)) { @@ -2431,8 +2454,12 @@ ${flowTokens.map(t => ` if (fc == ${charLit(t.char)} && valid_symbols[${t.sym // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if // the run is a KEY, or emits the leaf scalar otherwise (so \`- x\` stays a plain item, no push). + // \`!s->property_lead\`: a key after a LINE-LEAD node property (\`&a a: b\`) sits at the node's level, + // not a compact-nested one — so do NOT pre-mark a compact INDENT; scan_scalar then emits the key as + // an ordinary value-position scalar (the enclosing Node carries the property). A key after a compact + // indicator (\`- a: 1\`) has property_lead == false and still nests. int16_t compact_col = -1; - if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + if (want_indent && s->at_line_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { compact_col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) } @@ -2471,7 +2498,13 @@ ${flowTokens.map(t => ` if (fc == ${charLit(t.char)} && valid_symbols[${t.sym int16_t col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column int top = s->stack[s->len - 1];${hasCompact ? 
` - s->at_line_lead = true; // a real line boundary — the next real token leads its line` : ''} + s->at_line_lead = true; // a real line boundary — the next real token leads its line + // Latch whether THIS new line is led by a node property (\`&\`/\`!\`) — the lookahead is the line's + // first content char (blanks/comments already skipped). A property leads a node, so its inline content + // is at the same level and must NOT take a compact push (the gates below check property_lead). This is + // a TRUE-return site so the latch persists through the property's internal lex; it is also re-derived + // for the FIRST line (which reaches no boundary) by the compact block's genuine-line-lead detection. + s->property_lead = (lexer->lookahead == '&' || lexer->lookahead == '!');` : ''} if (col > top) { if (want_indent) { push_indent(s, col); lexer->result_symbol = ${INDENT}; return true; } diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c index 09853dc..f9ec5d8 100644 --- a/tree-sitter/yaml/src/scanner.c +++ b/tree-sitter/yaml/src/scanner.c @@ -33,6 +33,17 @@ typedef struct { bool pending_newline; // a NEWLINE is still owed once dedents reach pending_col bool started; // any content lexed yet (suppresses a leading NEWLINE) bool at_line_lead; // the next real content token is its line's first (compact-indicator probe) + bool property_lead; // the line's FIRST token is a node property (`&`/`!`) — its inline content sits + // at the SAME node level, so it must NOT take the compact mapping-key push (`&a + // a: b` is the key `a` carrying anchor `&a`, not `&a`-then-INDENTED-`a: b`). + // gen-lexer clears atLineLead on the property token (it sees every token); the + // scanner does not lex the property, so it LATCHES this at the line lead and reads + // it at the push. 
It must survive the property's INTERNAL lex (which the scanner + // declines via a FALSE return) — tree-sitter deserializes the serialized fields on + // a false return, so a SERIALIZED flag would be rolled back; this one is therefore + // NOT serialized and NOT reset in deserialize (it keeps its in-memory value across + // the decline). It is RE-DERIVED from the lead char at every line boundary, so it + // is always correct at the only points it is read (a line lead). uint16_t flow_depth; // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/} bool marker_decline; // transient: scan_scalar saw a true `---`/`...` → external declines so the // internal marker token lexes it. Set+consumed within one scan; not serialized. @@ -52,7 +63,7 @@ void *tree_sitter_yaml_external_scanner_create(void) { s->stack = ts_malloc(s->cap * sizeof(int16_t)); s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false; - s->at_line_lead = true; + s->at_line_lead = true; s->property_lead = false; s->flow_depth = 0; s->marker_decline = false; return s; @@ -471,8 +482,20 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const if (want_indent && s->at_line_lead && s->flow_depth == 0) { while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer); int32_t c = lexer->lookahead; + // GENUINE line lead: the line's first token is not yet lexed, so its column == the stack top (a + // line boundary set top to the lead column; at stream start both are the document level). Once a + // token has been lexed on this line the next content is DEEPER than top. Record whether that first + // token is a node PROPERTY (`&`/`!`): a property leads a node, so its inline content is at the + // SAME level (no compact push), whereas a compact indicator (`-`/`?`) DOES open a nested level + // for its content. 
This is the one fact lost when the property is lexed internally (the scanner + // never sees it), so it is latched here and checked at the two compact-push sites below. + if ((int16_t)lexer->get_column(lexer) == s->stack[s->len - 1]) s->property_lead = (c == '&' || c == '!'); bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c); - if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + // A property that LEADS the line (property_lead) does not nest — skip the compact push so its key + // stays at the node's level (`&a a: b` / `!!str &a1 "foo":`). A property that follows a compact + // indicator (`- &a k: v`) is NOT a line lead (property_lead was set false at the `-`), so it still + // pushes via compact_content_is_structural's property-skip. + if (nonscalar_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { int16_t col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances if (compact_content_is_structural(lexer)) { @@ -511,8 +534,12 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if // the run is a KEY, or emits the leaf scalar otherwise (so `- x` stays a plain item, no push). + // `!s->property_lead`: a key after a LINE-LEAD node property (`&a a: b`) sits at the node's level, + // not a compact-nested one — so do NOT pre-mark a compact INDENT; scan_scalar then emits the key as + // an ordinary value-position scalar (the enclosing Node carries the property). A key after a compact + // indicator (`- a: 1`) has property_lead == false and still nests. 
int16_t compact_col = -1; - if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { + if (want_indent && s->at_line_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) { compact_col = (int16_t)lexer->get_column(lexer); lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY) } @@ -550,6 +577,12 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column int top = s->stack[s->len - 1]; s->at_line_lead = true; // a real line boundary — the next real token leads its line + // Latch whether THIS new line is led by a node property (`&`/`!`) — the lookahead is the line's + // first content char (blanks/comments already skipped). A property leads a node, so its inline content + // is at the same level and must NOT take a compact push (the gates below check property_lead). This is + // a TRUE-return site so the latch persists through the property's internal lex; it is also re-derived + // for the FIRST line (which reaches no boundary) by the compact block's genuine-line-lead detection. + s->property_lead = (lexer->lookahead == '&' || lexer->lookahead == '!'); if (col > top) { if (want_indent) { push_indent(s, col); lexer->result_symbol = INDENT; return true; }