From 368bb3bf8842c7c7cc31f8e6cf0b121cf60bb1f6 Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 03:40:57 +0800
Subject: [PATCH 01/10] Make the derived YAML tree-sitter grammar generate +
 build (issue #3, pieces 1-2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`tree-sitter/yaml/grammar.js` previously failed `tree-sitter generate`. Two of the three blockers
from the issue are now resolved, with the generator changes confined to `src/gen-treesitter.ts`
(the YAML `grammar.js` and `scanner.c` below are its regenerated output) — every other derived
grammar (TS/JS/TSX/JSX/HTML/Vue) regenerates byte-identically, and `tsc` is clean.

1. Structural indent tokens → externals. INDENT / DEDENT / NEWLINE and the block-scalar body are
   engine-emitted (their token IR is `never()`), so they serialized as never-match token rules the
   parser could never match. `planScannerTokens` now routes them to tree-sitter `externals` (keyed
   off `grammar.indent`), the way the HTML markup path handles `raw_text`: they appear in the
   `externals` block and the scanner.c `TokenType` enum, and references become `$.indent` etc.

2. Nullable-rule elimination. tree-sitter rejects a non-start rule that matches the empty string,
   and an indentation grammar has several (a YAML node/entry may be null: `key:` with no value,
   `{a: }`, an empty document) — `node`/`flow_node`/`flow_map_entry`/`flow_seq_entry`/`after_doc_end`.
   A general ε-elimination (`makeNonEmpty` + `wrapNullableRefs`) makes each such rule's body
   non-empty and wraps every reference to it in `optional(...)`; the accepted language is unchanged
   and only the tree-sitter target is touched. Gated on a grammar actually having nullable non-start
   rules, so the others are untouched.

   The resulting LR conflicts (YAML is massively ambiguous — exactly what tree-sitter's GLR is for)
   are declared: 37 tuples added to `LR_CONFLICT_CLOSURE` (the fixpoint of tree-sitter's own
   analysis, via test/collect-conflicts.ts). The closure filter also accepts TOKEN names now, not
   only rule names, so a token-vs-token conflict like YAML's `key`/`plain` (both can precede a `:`)
   is declarable. Every tuple is YAML-specific (zero rule/token-name overlap with the other
   grammars), so each is inert elsewhere.

`cd tree-sitter/yaml && npx tree-sitter generate && npx tree-sitter build --wasm .` now succeeds.
The C external scanner is still a stub (returns false), so indentation isn't parsed yet — that is
piece 3 (a real indent scanner) and is tracked separately.

Refs #3
---
 src/gen-treesitter.ts          | 153 ++++++++++++++++++++++++++++++++-
 tree-sitter/yaml/grammar.js    |  87 +++++++++++++------
 tree-sitter/yaml/src/scanner.c |   5 +-
 3 files changed, 218 insertions(+), 27 deletions(-)
diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index dda7dea..1d0ae3b 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -143,6 +143,10 @@ interface GrammarJsContext {
   externalSnake: Set<string>;
   /** original token name → external scanner token name (snake) if scanner-provided */
   scannerTokenFor: Map<string, string>;
+  /** Non-start rules whose body can derive the empty string. tree-sitter rejects these, so their
+   *  bodies are made non-empty and every reference to them is wrapped in optional() (ε-elimination,
+   *  see makeNonEmpty / wrapNullableRefs). Empty for grammars with no nullable non-start rules. */
+  nullableNonStart: Set<string>;
   /**
    * If the grammar declares an interpolated-template token, the plan for turning it
    * into a `template` RULE (delimiters + the `${ … }` hole) backed by an external
@@ -342,9 +346,116 @@ function buildPrattRule(rule: RuleDecl, ctx: GrammarJsContext): string {
   return `choice(\n      ${branches.join(',\n      ')}\n    )`;
 }
 
+// ── Nullable-rule elimination (ε-elimination) ────────────────────────────────
+// tree-sitter rejects a NON-START rule that can match the empty string. An indentation grammar like
+// YAML has several (a YAML node/entry may be NULL: `key:` with no value, `{a: }`, an empty doc), so
+// `node`/`flow_node`/`flow_map_entry`/… are nullable. We push the emptiness to the CALL SITES: make
+// each such rule's body NON-EMPTY (`makeNonEmpty`) and wrap every reference to it in `optional(...)`
+// (`wrapNullableRefs`). The accepted language is identical (rule-or-empty at each use), and ONLY the
+// tree-sitter target is touched — the parser and the other generators never see this. Computed once
+// and gated on the grammar actually having nullable non-start rules, so every grammar that already
+// `generate`s (no such rules) is byte-identical.
+
+/** Non-start rules whose body can derive ε. `isTerminal` flags tokens / external symbols (never nullable). */
+function computeNullableNonStart(grammar: CstGrammar, startName: string, isTerminal: (name: string) => boolean): Set<string> {
+  const ruleNames = new Set(grammar.rules.map(r => r.name));
+  const nullable = new Set<string>();
+  const exprNullable = (e: RuleExpr): boolean => {
+    switch (e.type) {
+      case 'literal': return e.value === '';
+      case 'ref': return ruleNames.has(e.name) && !isTerminal(e.name) && nullable.has(e.name);
+      case 'seq': return e.items.every(exprNullable);
+      case 'alt': return e.items.some(exprNullable);
+      case 'quantifier': return e.kind === '+' ? exprNullable(e.body) : true; // ?,* match empty
+      case 'group': return exprNullable(e.body);
+      case 'sep': return true;  // renders to optional(seq(...))
+      default: return true;     // not/sameLine/noCommentBefore/noMultilineFlowBefore/op/prefix/postfix → blank()
+    }
+  };
+  let changed = true;
+  while (changed) { changed = false; for (const r of grammar.rules) if (!nullable.has(r.name) && exprNullable(r.body)) { nullable.add(r.name); changed = true; } }
+  nullable.delete(startName); // the start rule MAY be nullable in tree-sitter
+  return nullable;
+}
+
+/** Wrap every reference to a made-non-empty (`nn`) rule in optional() — the may-be-empty form. */
+function wrapNullableRefs(e: RuleExpr, nn: Set<string>): RuleExpr {
+  switch (e.type) {
+    case 'ref': return nn.has(e.name) ? { type: 'quantifier', kind: '?', body: e } : e;
+    case 'seq': return { type: 'seq', items: e.items.map(i => wrapNullableRefs(i, nn)) };
+    case 'alt': return { type: 'alt', items: e.items.map(i => wrapNullableRefs(i, nn)) };
+    case 'quantifier': return { ...e, body: wrapNullableRefs(e.body, nn) };
+    case 'group': return { ...e, body: wrapNullableRefs(e.body, nn) };
+    case 'sep': return { ...e, element: wrapNullableRefs(e.element, nn) };
+    default: return e;
+  }
+}
+
+/** Whether e is nullable AFTER the transform (a ref to an `nn` rule is now wrapped optional → nullable;
+ *  every other ref is non-nullable, since `nn` is exactly the made-non-empty set). */
+function exprNullableAfter(e: RuleExpr, nn: Set<string>): boolean {
+  switch (e.type) {
+    case 'literal': return e.value === '';
+    case 'ref': return nn.has(e.name);
+    case 'seq': return e.items.every(i => exprNullableAfter(i, nn));
+    case 'alt': return e.items.some(i => exprNullableAfter(i, nn));
+    case 'quantifier': return e.kind === '+' ? exprNullableAfter(e.body, nn) : true;
+    case 'group': return exprNullableAfter(e.body, nn);
+    case 'sep': return true;
+    default: return true;
+  }
+}
+
+/** The non-empty form of a (nullable) expr — its language minus ε. `null` if that language is empty
+ *  (a purely zero-width expr). The chosen non-empty position is rendered UNWRAPPED; the rest get the
+ *  may-be-empty form `wrapNullableRefs`. */
+function makeNonEmpty(e: RuleExpr, nn: Set<string>): RuleExpr | null {
+  const T = (x: RuleExpr) => wrapNullableRefs(x, nn);
+  const NE = (x: RuleExpr) => makeNonEmpty(x, nn);
+  const nul = (x: RuleExpr) => exprNullableAfter(x, nn);
+  switch (e.type) {
+    case 'literal': return e.value === '' ? null : e;
+    case 'ref': return e;                                   // an nn rule (now non-empty) or a non-nullable rule/terminal
+    case 'group': { const b = NE(e.body); return b ? { ...e, body: b } : null; }
+    case 'alt': {
+      const parts: RuleExpr[] = [];
+      for (const m of e.items) { const r = nul(m) ? NE(m) : T(m); if (r) parts.push(r); }
+      return parts.length === 0 ? null : parts.length === 1 ? parts[0] : { type: 'alt', items: parts };
+    }
+    case 'seq': {
+      if (e.items.some(i => !nul(i))) return T(e);          // a non-nullable element already forces non-empty
+      const branches: RuleExpr[] = [];                       // all nullable → "first non-empty element is at i"
+      for (let i = 0; i < e.items.length; i++) {
+        const head = NE(e.items[i]);
+        if (!head) continue;
+        const tail = e.items.slice(i + 1).map(T);
+        branches.push(tail.length ? { type: 'seq', items: [head, ...tail] } : head);
+      }
+      return branches.length === 0 ? null : branches.length === 1 ? branches[0] : { type: 'alt', items: branches };
+    }
+    case 'quantifier': {
+      if (e.kind === '?') return nul(e.body) ? NE(e.body) : T(e.body);   // optional(x) non-empty = x non-empty
+      const head = nul(e.body) ? NE(e.body) : T(e.body);                 // *,+ non-empty = one non-empty iter, then repeat
+      return head ? { type: 'seq', items: [head, { type: 'quantifier', kind: '*', body: T(e.body) }] } : null;
+    }
+    case 'sep': {
+      const head = nul(e.element) ? NE(e.element) : T(e.element);
+      if (!head) return null;
+      const d: RuleExpr = { type: 'literal', value: e.delimiter };
+      return { type: 'seq', items: [head, { type: 'quantifier', kind: '*', body: { type: 'seq', items: [d, T(e.element)] } }, { type: 'quantifier', kind: '?', body: d }] };
+    }
+    default: return null;                                   // not/sameLine/…: zero-width, no non-empty form
+  }
+}
+
 /** Build a single rule's body string (Pratt or plain). */
 function buildRuleBody(rule: RuleDecl, ctx: GrammarJsContext): string {
   if (ctx.prattRules.has(rule.name)) return buildPrattRule(rule, ctx);
+  const nn = ctx.nullableNonStart;
+  if (nn.size > 0) {
+    const body = nn.has(rule.name) ? (makeNonEmpty(rule.body, nn) ?? rule.body) : wrapNullableRefs(rule.body, nn);
+    return renderExpr(body, ctx);
+  }
   return renderExpr(rule.body, ctx);
 }
 
@@ -423,6 +534,22 @@ const LR_CONFLICT_CLOSURE: string[][] = [
   // while completing the closure (CI builds only the typescript + html tree-sitters, so
   // tsx/jsx generate was never exercised). Each is inert for languages lacking the rule.
   ['type', 'class_heritage'], ['type_param', 'jsxtag_name'], ['expr', 'jsxcontainer'],
+  // YAML (issue #3): an indentation grammar is massively ambiguous — a newline may continue a node or
+  // start the next document, a `:` may open a value or be an empty-key map, a scalar may be a key or a
+  // leaf, a flow collection may be a value or an implicit block key. tree-sitter's GLR absorbs all of
+  // this once the states are declared. These 37 tuples are the fixpoint of its own analysis (collected
+  // via test/collect-conflicts.ts); every name is YAML-specific, so each is inert for the other
+  // languages (verified: zero rule-name overlap with the TS/JS/TSX/JSX grammars).
+  ['stream', 'node'], ['empty_key_mapping'], ['explicit_entry'], ['next_doc'], ['stream', 'next_doc'],
+  ['node'], ['key', 'plain'], ['scalar', 'doc_fold'], ['explicit_mapping'], ['block_sequence'],
+  ['map_value_scalar', 'map_value_node_scalar'], ['scalar', 'block_key_scalar'],
+  ['map_value', 'map_value_node'], ['flow_explicit'], ['flow_mapping'], ['flow_sequence'],
+  ['explicit_doc_body'], ['inline_doc_node'], ['alias_or_keyed'], ['doc_fold'], ['mapping_from_flow'],
+  ['mapping_or_scalar'], ['property', 'node'], ['seq_item'], ['property'], ['flow_node'],
+  ['node', 'explicit_doc_body'], ['node', 'after_doc_end'], ['after_doc_end'], ['map_entry'],
+  ['stream', 'explicit_doc_body'], ['map_entry_no_empty'], ['seq_value_node'],
+  ['mapping_or_scalar', 'doc_fold'], ['map_value_scalar', 'map_inline_scalar'],
+  ['content_node', 'mapping_from_flow'], ['mapping_or_scalar', 'map_value'],
 ];
 
 /**
@@ -475,9 +602,12 @@ function deriveConflicts(ctx: GrammarJsContext): string[][] {
   }
 
   // 3. The LR(1) closure tree-sitter's own analysis reports for this grammar.
-  //    Applied only for tuples whose rules all exist here (inert otherwise).
+  //    Applied only for tuples whose symbols ALL exist here (inert otherwise). A conflict symbol may
+  //    be a RULE or a TOKEN (e.g. YAML's `key`/`plain` are tokens that conflict on a trailing `:`), so
+  //    both name sets count — `$.key` is a valid conflict symbol whether key is a rule or a token.
+  const tokenSnakes = new Set(ctx.tokenSnake.values());
   for (const tuple of LR_CONFLICT_CLOSURE) {
-    if (tuple.every(r => ruleSnakes.has(r))) push(tuple);
+    if (tuple.every(r => ruleSnakes.has(r) || tokenSnakes.has(r))) push(tuple);
   }
 
   return conflicts;
@@ -587,6 +717,19 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> {
   // stateless external token (the scanner emits it at each significant line boundary). Listed
   // FIRST so it heads the enum / externals order.
   if (grammar.newline) map.set(grammar.newline.token, toSnake(grammar.newline.token));
+  // An indentation-sensitive grammar (YAML): INDENT / DEDENT / NEWLINE and the block-scalar body are
+  // engine-emitted — the lexer's indent stack (src/gen-lexer.ts) decides them, not a regex — so their
+  // token IR is `never()`. In tree-sitter they become EXTERNAL tokens the C scanner (src/scanner.c)
+  // provides; without this they would serialize as never-match token rules (`token(/[^\s\S]/)`) that
+  // the parser can never match (and the block-scalar body would orphan the scalar). Ordered
+  // indent/dedent/newline/body so grammar.js's `externals` and scanner.c's enum agree positionally.
+  if (grammar.indent) {
+    const ind = grammar.indent;
+    map.set(ind.indentToken, toSnake(ind.indentToken));
+    map.set(ind.dedentToken, toSnake(ind.dedentToken));
+    map.set(ind.newlineToken, toSnake(ind.newlineToken));
+    if (ind.blockScalar) map.set(ind.blockScalar.token, toSnake(ind.blockScalar.token));
+  }
   // The regex token: '/' is context-sensitive (regex vs division). The scanner
   // resolves it.
   const regexTok = grammar.tokens.find(t => t.flags.includes('regex'));
@@ -778,8 +921,14 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree
   // queries for them. Same shape rule gen-tm.ts uses (inferIdentScope).
   const nameFields = collectNameFields(grammar);
 
+  // ε-elimination set (see makeNonEmpty): the start rule is the entry rule, emitted FIRST below.
+  const entryName = grammar.rules[grammar.rules.length - 1].name;
+  const isTerminalName = (n: string) => tokenNames.has(n) || scannerTokenFor.has(n);
+  const nullableNonStart = computeNullableNonStart(grammar, entryName, isTerminalName);
+
   const ctx: GrammarJsContext = {
     grammar, tokenNames, ruleSnake, tokenSnake, prattRules, externalSnake, scannerTokenFor,
+    nullableNonStart,
     templatePlan,
     interpolationPlans,
     nameFieldNodes: nameFields.nodes,
diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js
index 1e852fc..5d693f9 100644
--- a/tree-sitter/yaml/grammar.js
+++ b/tree-sitter/yaml/grammar.js
@@ -16,14 +16,61 @@ module.exports = grammar({
     $.comment
   ],
 
+  externals: $ => [
+    $.indent,
+    $.dedent,
+    $.newline,
+    $.block_scalar
+  ],
+
+  conflicts: $ => [
+    [$.stream, $.node],
+    [$.empty_key_mapping],
+    [$.explicit_entry],
+    [$.next_doc],
+    [$.stream, $.next_doc],
+    [$.node],
+    [$.key, $.plain],
+    [$.scalar, $.doc_fold],
+    [$.explicit_mapping],
+    [$.block_sequence],
+    [$.map_value_scalar, $.map_value_node_scalar],
+    [$.scalar, $.block_key_scalar],
+    [$.map_value, $.map_value_node],
+    [$.flow_explicit],
+    [$.flow_mapping],
+    [$.flow_sequence],
+    [$.explicit_doc_body],
+    [$.inline_doc_node],
+    [$.alias_or_keyed],
+    [$.doc_fold],
+    [$.mapping_from_flow],
+    [$.mapping_or_scalar],
+    [$.property, $.node],
+    [$.seq_item],
+    [$.property],
+    [$.flow_node],
+    [$.node, $.explicit_doc_body],
+    [$.node, $.after_doc_end],
+    [$.after_doc_end],
+    [$.map_entry],
+    [$.stream, $.explicit_doc_body],
+    [$.map_entry_no_empty],
+    [$.seq_value_node],
+    [$.mapping_or_scalar, $.doc_fold],
+    [$.map_value_scalar, $.map_inline_scalar],
+    [$.content_node, $.mapping_from_flow],
+    [$.mapping_or_scalar, $.map_value],
+  ],
+
   rules: {
-    stream: $ => choice(seq(repeat1(seq(choice($.yaml_directive, $.directive), optional($.newline))), optional(seq($.doc_start, optional($.explicit_doc_body), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), optional($.newline)))), seq(optional($.indent), optional(choice($.doc_fold, $.node)), optional($.dedent), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), optional($.newline))),
+    stream: $ => choice(seq(repeat1(seq(choice($.yaml_directive, $.directive), optional($.newline))), optional(seq($.doc_start, optional($.explicit_doc_body), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), optional($.newline)))), seq(optional($.indent), optional(choice($.doc_fold, optional($.node))), optional($.dedent), repeat(seq(optional($.newline), $.next_doc)), optional($.newline), optional($.doc_end), optional($.newline))),
 
     property: $ => choice(seq($.anchor, optional($.tag)), seq($.tag, optional($.anchor))),
 
     content_node: $ => choice($.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.flow_mapping, $.flow_sequence, $.mapping_from_flow, $.alias_or_keyed, $.mapping_or_scalar),
 
-    node: $ => choice(seq(optional($.anchor), optional($.tag), optional(choice(seq($.indent, $.node, $.dedent), seq($.newline, $.node), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), seq($.tag, $.anchor, optional(choice(seq($.indent, $.node, $.dedent), seq($.newline, $.node), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), $.block_sequence),
+    node: $ => choice(choice(seq($.anchor, optional($.tag), optional(choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), seq($.tag, optional(choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar)), seq($.tag, $.anchor, optional(choice(seq($.indent, optional($.node), $.dedent), seq($.newline, optional($.node)), $.block_sequence, $.explicit_mapping, $.empty_key_mapping, $.mapping_from_flow, $.flow_mapping, $.flow_sequence, $.alias_or_keyed, $.mapping_or_scalar))), $.block_sequence),
 
     mapping_or_scalar: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.block_key_scalar, ":", optional($.map_value_scalar), repeat(seq($.newline, $.map_entry))), $.scalar),
 
@@ -41,11 +88,11 @@ module.exports = grammar({
 
     empty_key_mapping: $ => seq(":", optional($.map_value_scalar), repeat(seq($.newline, $.map_entry_no_empty))),
 
-    value: $ => choice(seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, $.node, $.dedent), $.seq_value_node),
+    value: $ => choice(seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, optional($.node), $.dedent), $.seq_value_node),
 
-    map_value: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, $.node, $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node),
+    map_value: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, optional($.node), $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node),
 
-    map_value_scalar: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, $.node, $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node_scalar),
+    map_value_scalar: $ => choice(seq(choice($.num, $.bool_null, $.plain), $.indent, choice($.num, $.bool_null, $.plain), repeat(seq($.newline, choice($.num, $.bool_null, $.plain))), $.dedent), seq($.indent, choice($.num, $.bool_null, $.plain), repeat1(seq($.newline, $.plain)), $.dedent), seq($.indent, optional($.node), $.dedent), seq($.indent, $.property, $.dedent, $.content_node), seq($.indent, $.property, $.dedent, $.newline, $.block_sequence), seq($.property, $.newline, $.block_sequence), seq($.newline, $.block_sequence), $.map_value_node_scalar),
 
     indented_value_node: $ => choice(seq($.property, choice(seq($.indent, $.indented_value_node, $.dedent), $.collection_content)), $.content_node),
 
@@ -71,19 +118,19 @@ module.exports = grammar({
 
     seq_item: $ => seq("-", optional($.value)),
 
-    flow_node: $ => seq(optional($.property), optional(choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar))),
+    flow_node: $ => choice(seq($.property, optional(choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar))), choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar)),
 
-    flow_explicit: $ => seq("?", optional($.flow_node)),
+    flow_explicit: $ => seq("?", optional(optional($.flow_node))),
 
-    flow_map_entry: $ => seq(optional($.flow_explicit), optional($.flow_node), optional(seq(":", optional($.flow_node)))),
+    flow_map_entry: $ => choice(seq($.flow_explicit, optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq($.flow_node, optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node)))),
 
-    flow_mapping: $ => seq("{", optional(seq($.flow_map_entry, repeat(seq(",", $.flow_map_entry)))), optional(","), "}"),
+    flow_mapping: $ => seq("{", optional(seq(optional($.flow_map_entry), repeat(seq(",", optional($.flow_map_entry))))), optional(","), "}"),
 
-    flow_seq_entry: $ => choice(seq($.flow_seq_key, ":", optional($.flow_node)), seq("?", optional($.flow_node), optional(seq(":", optional($.flow_node)))), seq(":", optional($.flow_node)), $.flow_node),
+    flow_seq_entry: $ => choice(seq($.flow_seq_key, ":", optional(optional($.flow_node))), seq("?", optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node))), $.flow_node),
 
     flow_seq_key: $ => choice(seq(optional($.property), choice($.flow_mapping, $.flow_sequence, $.dquote_key, $.squote_key, $.key)), $.alias),
 
-    flow_sequence: $ => seq("[", optional(seq($.flow_seq_entry, repeat(seq(",", $.flow_seq_entry)))), optional(","), "]"),
+    flow_sequence: $ => seq("[", optional(seq(optional($.flow_seq_entry), repeat(seq(",", optional($.flow_seq_entry))))), optional(","), "]"),
 
     scalar: $ => choice($.dquote_key, $.squote_key, $.dquote, $.squote, $.block_scalar, $.key, $.num, $.bool_null, $.plain),
 
@@ -91,13 +138,13 @@ module.exports = grammar({
 
     doc_fold: $ => seq(choice($.num, $.bool_null, $.plain), repeat1(choice(seq($.newline, choice($.plain, $.yaml_directive, $.directive)), seq($.indent, choice($.plain, $.yaml_directive, $.directive), repeat(seq($.newline, choice($.plain, $.yaml_directive, $.directive))), $.dedent)))),
 
-    inline_doc_node: $ => choice(seq($.property, optional(choice(seq($.indent, $.doc_fold, $.dedent), seq($.indent, $.node, $.dedent), seq($.newline, $.doc_fold), seq($.newline, $.node), $.flow_mapping, $.flow_sequence, $.alias, $.scalar))), $.doc_fold, choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar)),
+    inline_doc_node: $ => choice(seq($.property, optional(choice(seq($.indent, $.doc_fold, $.dedent), seq($.indent, optional($.node), $.dedent), seq($.newline, $.doc_fold), seq($.newline, optional($.node)), $.flow_mapping, $.flow_sequence, $.alias, $.scalar))), $.doc_fold, choice($.flow_mapping, $.flow_sequence, $.alias, $.scalar)),
 
-    explicit_doc_body: $ => choice(seq($.newline, optional($.indent), optional(choice($.doc_fold, $.node)), optional($.dedent)), $.inline_doc_node),
+    explicit_doc_body: $ => choice(seq($.newline, optional($.indent), optional(choice($.doc_fold, optional($.node))), optional($.dedent)), $.inline_doc_node),
 
-    after_doc_end: $ => choice(seq(repeat(seq(choice($.yaml_directive, $.directive), optional($.newline))), $.doc_start, optional($.explicit_doc_body)), seq(optional($.indent), choice($.doc_fold, $.node), optional($.dedent))),
+    after_doc_end: $ => choice(seq(repeat(seq(choice($.yaml_directive, $.directive), optional($.newline))), $.doc_start, optional($.explicit_doc_body)), choice(seq($.indent, choice($.doc_fold, optional($.node)), optional($.dedent)), seq(choice($.doc_fold, $.node), optional($.dedent)), $.dedent)),
 
-    next_doc: $ => choice(seq($.doc_start, optional($.explicit_doc_body)), seq($.doc_end, optional(seq($.newline, optional($.after_doc_end))))),
+    next_doc: $ => choice(seq($.doc_start, optional($.explicit_doc_body)), seq($.doc_end, optional(seq($.newline, optional(optional($.after_doc_end)))))),
 
     doc_start: $ => token(/---/),
 
@@ -123,21 +170,13 @@ module.exports = grammar({
 
     tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/),
 
-    block_scalar: $ => token(/[^\s\S]/),
-
     key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/),
 
     num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/),
 
     bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/),
 
-    plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/),
-
-    indent: $ => token(/[^\s\S]/),
-
-    dedent: $ => token(/[^\s\S]/),
-
-    newline: $ => token(/[^\s\S]/)
+    plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/)
   }
 });
 
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index 7653989..2167de9 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -13,7 +13,10 @@
 #include <wctype.h>
 
 enum TokenType {
-  NO_EXTERNAL_TOKENS,
+  INDENT,
+  DEDENT,
+  NEWLINE,
+  BLOCK_SCALAR,
 };
 
 // The scanner is stateless — tree-sitter's `valid_symbols` already encodes

From 49a05b12a385780304c4ad5da04dfb6600661c1e Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 04:02:19 +0800
Subject: [PATCH 02/10] =?UTF-8?q?Add=20the=20YAML=20indentation=20external?=
 =?UTF-8?q?=20scanner=20(issue=20#3,=20piece=203=20=E2=80=94=20WIP)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`buildIndentScannerC` (src/gen-treesitter.ts) generates a real C external scanner for the YAML
indent tokens, replacing the stub. It mirrors src/gen-lexer.ts's indent-stack state machine:

- An indent stack in the Scanner struct, (de)serialized for incremental re-parsing.
- At each line boundary it measures the next content line's column and emits INDENT (deeper → push),
  DEDENT (shallower → pop, one per call until the stack top is reached), or NEWLINE (same column →
  sibling separator); blank and comment-only lines are skipped; open blocks are closed at EOF.
- A block-scalar body (`|`/`>`) is scanned verbatim up to the first line at or below the parent
  indentation.
- Flow needs no special case: inside `[`/`{` the grammar never references the indent tokens, so
  valid_symbols is false and the line break falls through to `extras`.
- All language data (comment introducer, block-scalar introducers) is DERIVED from `grammar.indent`.

`buildTokenBody` now emits a token's BLOCK pattern when it has one (YAML's scalar tokens), since the
tree-sitter grammar is block-context at the top level. (YAML is the only grammar with a blockPattern,
so the other six are byte-identical.)

Verified parsing (`tree-sitter parse`): nested mappings, nested sequences, block scalars, and flow
collections parse with no ERROR — the indent stack, INDENT/DEDENT/NEWLINE, and block-scalar bodies
all work.

Known remaining issue: a flat single-line `key: value` / `- item` still mis-tokenizes. The `plain`/
`key` block patterns stop at a `: ` separator by admitting a `:` only when a non-space character
follows it (`:(?=\S)`), but tree-sitter's token DFA forbids lookahead, so `sanitizeTreeSitterRegex`
strips the assertion and `plain` greedily eats all of `a: 1`. The official tree-sitter-yaml scans
scalars in C for exactly this reason. The next step is either to rewrite the in-loop `:(?=\S)`
boundary into an extent-equivalent consuming form (`:[^\s]`) for block-token emission, or to scan
plain/key scalars in the external scanner.

Refs #3
---
 src/gen-treesitter.ts          | 207 ++++++++++++++++++++++++++++++++-
 tree-sitter/yaml/grammar.js    |   4 +-
 tree-sitter/yaml/src/scanner.c | 168 ++++++++++++++++++++++----
 3 files changed, 353 insertions(+), 26 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index 1d0ae3b..437f805 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -478,7 +478,13 @@ function buildTokenBody(name: string, ctx: GrammarJsContext): string | null {
   // rule reference — but we still emit them so highlights can capture comments.
   // tree-sitter's token() DFA rejects zero-width assertions, so strip them first.
   if (tokenPatternIsNever(tok)) return 'token(/[^\\s\\S]/)';
-  return `token(${jsRegexLiteral(sanitizeTreeSitterRegex(tokenPatternSource(tok)))})`;
+  // A token with a BLOCK-context variant (YAML's scalar tokens: a block plain/key stops at a `: `
+  // separator and a value end, where the flow variant runs through them) — emit the block pattern.
+  // The tree-sitter grammar is block-context at the top level; flow collections are their own rules.
+  // Block-only (no `pattern`) and dual tokens both resolve here; YAML is the only grammar with a
+  // blockPattern, so every other language is unaffected (byte-identical).
+  const src = tok.blockPattern ? tokenPatternSource({ pattern: tok.blockPattern }) : tokenPatternSource(tok);
+  return `token(${jsRegexLiteral(sanitizeTreeSitterRegex(src))})`;
 }
 
 // ── conflicts ────────────────────────────────────────────────────────────────
@@ -1776,11 +1782,210 @@ function cCharList(s: string): string {
   return [...s].map(c => `'${c === '\\' || c === "'" ? '\\' + c : c}'`).join(', ');
 }
 
+// ── Indentation external scanner (YAML) ──────────────────────────────────────
+// An indentation-sensitive grammar emits INDENT / DEDENT / NEWLINE from a line-leading-column state
+// machine that a regex lexer cannot express, so they become external tokens scanned here. This C
+// scanner mirrors the indent-stack logic of src/gen-lexer.ts: at each line boundary it measures the
+// next content line's column and emits INDENT (deeper → push), DEDENT (shallower → pop, one per call
+// until the stack top is reached), or NEWLINE (same column → a sibling separator). Flow context needs
+// no special handling: inside `[`/`{` the grammar never references these tokens, so valid_symbols is
+// false there and the line break falls through to `extras`. The indent stack lives in the Scanner
+// struct and is (de)serialized for incremental re-parsing. Block-scalar bodies are scanned verbatim
+// up to the first line at or below the parent indentation. All language data (the comment introducer,
+// the block-scalar introducer chars, the document markers) is DERIVED from `grammar.indent`.
+function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammarName: string): { scannerC: string; externalTokens: string[] } {
+  const ind = grammar.indent!;
+  const externalTokens = externalSymbols(ctx); // order MUST match grammar.js externals
+  const sym = (tokenName: string) => ctx.scannerTokenFor.get(tokenName)!.toUpperCase();
+  const INDENT = sym(ind.indentToken), DEDENT = sym(ind.dedentToken), NEWLINE = sym(ind.newlineToken);
+  const BLOCK = ind.blockScalar ? sym(ind.blockScalar.token) : null;
+  const cmt = ind.comment ?? '#';
+  const cmtC = cmt.length === 1 ? (cmt === '\\' || cmt === "'" ? `'\\${cmt}'` : `'${cmt}'`) : null;
+  const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0';
+  const enumBody = externalTokens.map(s => `  ${s.toUpperCase()},`).join('\n');
+  const G = grammarName;
+
+  const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path).
+//
+// Mirrors the indent-stack state machine of src/gen-lexer.ts: INDENT / DEDENT / NEWLINE are emitted
+// from the line-leading column relative to an indent stack; a block-scalar body is scanned verbatim.
+// All language data (comment introducer, block-scalar introducers, document markers) is DERIVED from
+// the grammar's \`indent\` config — nothing below is hand-tuned for a specific language.
+
+#include "tree_sitter/parser.h"
+#include "tree_sitter/alloc.h"
+#include <string.h>
+
+enum TokenType {
+${enumBody}
+};
+
+typedef struct {
+  uint32_t len;          // indent-stack depth (>= 1; stack[0] == 0, the document level)
+  uint32_t cap;
+  int16_t *stack;        // indentation columns
+  int16_t pending_col;   // column of the line boundary mid-processing (-1 = none)
+  bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
+  bool started;          // any content lexed yet (suppresses a leading NEWLINE)
+} Scanner;
+
+static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
+static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
+
+static void push_indent(Scanner *s, int16_t col) {
+  if (s->len == s->cap) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); }
+  s->stack[s->len++] = col;
+}
+
+void *tree_sitter_${G}_external_scanner_create(void) {
+  Scanner *s = ts_malloc(sizeof(Scanner));
+  s->cap = 16; s->len = 1;
+  s->stack = ts_malloc(s->cap * sizeof(int16_t));
+  s->stack[0] = 0;
+  s->pending_col = -1; s->pending_newline = false; s->started = false;
+  return s;
+}
+
+void tree_sitter_${G}_external_scanner_destroy(void *payload) {
+  Scanner *s = (Scanner *)payload;
+  ts_free(s->stack);
+  ts_free(s);
+}
+
+unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer) {
+  Scanner *s = (Scanner *)payload;
+  unsigned n = 0;
+  buffer[n++] = s->started ? 1 : 0;
+  buffer[n++] = s->pending_newline ? 1 : 0;
+  memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);
+  uint32_t count = s->len;
+  while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--;
+  memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t);
+  memcpy(&buffer[n], s->stack, count * sizeof(int16_t)); n += count * sizeof(int16_t);
+  return n;
+}
+
+void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
+  Scanner *s = (Scanner *)payload;
+  s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;
+  if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return;
+  unsigned n = 0;
+  s->started = buffer[n++] != 0;
+  s->pending_newline = buffer[n++] != 0;
+  memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);
+  uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t);
+  if (count == 0) return; // keep stack[0] = 0
+  while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); }
+  memcpy(s->stack, &buffer[n], count * sizeof(int16_t));
+  s->len = count;
+}
+
+${BLOCK ? `// A block scalar (\`|\` / \`>\`): the introducer + indicators + the verbatim more-indented body, as
+// one token. The body runs while a line is blank or indented MORE than the parent block level (the
+// stack top); it ends at the first non-blank line at or below the parent, or a column-0 document
+// marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's
+// indentation is left for the normal boundary logic.
+static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
+  int parent = s->stack[s->len - 1];
+  advance(lexer); // the introducer (| or >)
+  while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer);
+  while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer);
+  lexer->mark_end(lexer); // the header line belongs to the scalar
+  for (;;) {
+    if (lexer->lookahead == '\\r') { advance(lexer); if (lexer->lookahead == '\\n') advance(lexer); }
+    else if (lexer->lookahead == '\\n') advance(lexer);
+    else break; // EOF
+    int col = 0;
+    while (lexer->lookahead == ' ') { advance(lexer); col++; }
+    int32_t c = lexer->lookahead;
+    if (c == 0 || c == '\\n' || c == '\\r') { lexer->mark_end(lexer); continue; } // blank line → body
+    if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node)
+    while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer);
+    lexer->mark_end(lexer);
+  }
+  lexer->result_symbol = ${BLOCK};
+  return true;
+}
+` : ''}
+bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
+  Scanner *s = (Scanner *)payload;
+  bool want_indent = valid_symbols[${INDENT}];
+  bool want_dedent = valid_symbols[${DEDENT}];
+  bool want_newline = valid_symbols[${NEWLINE}];${BLOCK ? `\n  bool want_block = valid_symbols[${BLOCK}];` : ''}
+
+  // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the
+  // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here.
+  if (s->pending_col >= 0) {
+    int top = s->stack[s->len - 1];
+    if (s->pending_col < top) {
+      if (want_dedent) { s->len--; lexer->result_symbol = ${DEDENT}; return true; }
+      s->pending_col = -1; s->pending_newline = false; return false;
+    }
+    bool owed = s->pending_newline;
+    int16_t col = s->pending_col;
+    s->pending_col = -1; s->pending_newline = false;
+    if (col == top && owed && want_newline && s->started) { lexer->result_symbol = ${NEWLINE}; return true; }
+    return false;
+  }
+${BLOCK ? `
+  // A block scalar value (\`key: |\`): scan its body before the indent logic — its more-indented
+  // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first.
+  if (want_block) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
+    if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; }
+  }
+` : ''}
+  if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid
+
+  // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was
+  // crossed (only a real boundary drives the indent logic).
+  bool found_eol = false;
+  for (;;) {
+    int32_t c = lexer->lookahead;
+    if (c == '\\n') { skip(lexer); found_eol = true; }
+    else if (c == '\\r') { skip(lexer); if (lexer->lookahead == '\\n') skip(lexer); found_eol = true; }
+    else if (c == ' ' || c == '\\t') { skip(lexer); }
+    ${cmtC ? `else if (c == ${cmtC}) { while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') skip(lexer); }` : ''}
+    else break; // content or EOF
+  }
+
+  if (lexer->eof(lexer)) {
+    if (s->len > 1 && want_dedent) { s->len--; lexer->result_symbol = ${DEDENT}; return true; } // close open blocks at EOF
+    return false;
+  }
+  bool was_started = s->started;
+  s->started = true; // content lies ahead — mark started even on a mid-line return
+  if (!found_eol) return false; // not at a line boundary
+
+  int16_t col = (int16_t)lexer->get_column(lexer);
+  lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column
+  int top = s->stack[s->len - 1];
+
+  if (col > top) {
+    if (want_indent) { push_indent(s, col); lexer->result_symbol = ${INDENT}; return true; }
+    return false;
+  }
+  if (col == top) {
+    if (want_newline && was_started) { lexer->result_symbol = ${NEWLINE}; return true; }
+    return false;
+  }
+  // col < top: a dedent. Emit one DEDENT now; queue the rest + the trailing NEWLINE for re-entry.
+  if (want_dedent) {
+    s->pending_col = col; s->pending_newline = true;
+    s->len--; lexer->result_symbol = ${DEDENT}; return true;
+  }
+  return false;
+}
+`;
+  return { scannerC, externalTokens };
+}
+
 function buildScannerC(
   grammar: CstGrammar,
   ctx: GrammarJsContext,
   grammarName: string,
 ): { scannerC: string; externalTokens: string[] } {
+  if (grammar.indent) return buildIndentScannerC(grammar, ctx, grammarName);
   const regexTok = grammar.tokens.find(t => t.flags.includes('regex'));
   const tp = ctx.templatePlan;
 
diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js
index 5d693f9..78df6a8 100644
--- a/tree-sitter/yaml/grammar.js
+++ b/tree-sitter/yaml/grammar.js
@@ -170,13 +170,13 @@ module.exports = grammar({
 
     tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/),
 
-    key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/),
+    key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/),
 
     num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/),
 
     bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/),
 
-    plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n,\[\]{}]|:|#)*/)
+    plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/)
   }
 });
 
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index 2167de9..9b006ca 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -1,16 +1,13 @@
-// Tree-sitter external scanner generated by monogram.
+// Tree-sitter external scanner generated by monogram (indentation path).
 //
-// COMPLETE — the regex-literal scan and the template-literal scan are both
-// wired from the grammar's token hints (`regexContext` and `template`).
-//
-// All language-specific data below is DERIVED from the CstGrammar, not
-// hardcoded: the regex flag chars and the template delimiters all come from
-// the grammar's token hints.
+// Mirrors the indent-stack state machine of src/gen-lexer.ts: INDENT / DEDENT / NEWLINE are emitted
+// from the line-leading column relative to an indent stack; a block-scalar body is scanned verbatim.
+// All language data (comment introducer, block-scalar introducers, document markers) is DERIVED from
+// the grammar's `indent` config — nothing below is hand-tuned for a specific language.
 
 #include "tree_sitter/parser.h"
 #include "tree_sitter/alloc.h"
 #include <string.h>
-#include <wctype.h>
 
 enum TokenType {
   INDENT,
@@ -19,35 +16,160 @@ enum TokenType {
   BLOCK_SCALAR,
 };
 
-// The scanner is stateless — tree-sitter's `valid_symbols` already encodes
-// the parse context (inside a regex slot? inside a template span?), and the
-// `${ … }` brace nesting is handled by the template_substitution rule in the
-// CFG, so there is nothing to (de)serialize.
-typedef struct { char unused; } Scanner;
+typedef struct {
+  uint32_t len;          // indent-stack depth (>= 1; stack[0] == 0, the document level)
+  uint32_t cap;
+  int16_t *stack;        // indentation columns
+  int16_t pending_col;   // column of the line boundary mid-processing (-1 = none)
+  bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
+  bool started;          // any content lexed yet (suppresses a leading NEWLINE)
+} Scanner;
+
+static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
+static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
+
+static void push_indent(Scanner *s, int16_t col) {
+  if (s->len == s->cap) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); }
+  s->stack[s->len++] = col;
+}
 
 void *tree_sitter_yaml_external_scanner_create(void) {
-  return ts_calloc(1, sizeof(Scanner));
+  Scanner *s = ts_malloc(sizeof(Scanner));
+  s->cap = 16; s->len = 1;
+  s->stack = ts_malloc(s->cap * sizeof(int16_t));
+  s->stack[0] = 0;
+  s->pending_col = -1; s->pending_newline = false; s->started = false;
+  return s;
 }
 
 void tree_sitter_yaml_external_scanner_destroy(void *payload) {
-  ts_free(payload);
+  Scanner *s = (Scanner *)payload;
+  ts_free(s->stack);
+  ts_free(s);
 }
 
 unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer) {
-  (void)payload; (void)buffer;
-  return 0;
+  Scanner *s = (Scanner *)payload;
+  unsigned n = 0;
+  buffer[n++] = s->started ? 1 : 0;
+  buffer[n++] = s->pending_newline ? 1 : 0;
+  memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);
+  uint32_t count = s->len;
+  while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--;
+  memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t);
+  memcpy(&buffer[n], s->stack, count * sizeof(int16_t)); n += count * sizeof(int16_t);
+  return n;
 }
 
 void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
-  (void)payload; (void)buffer; (void)length;
+  Scanner *s = (Scanner *)payload;
+  s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;
+  if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return;
+  unsigned n = 0;
+  s->started = buffer[n++] != 0;
+  s->pending_newline = buffer[n++] != 0;
+  memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);
+  uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t);
+  if (count == 0) return; // keep stack[0] = 0
+  while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); }
+  memcpy(s->stack, &buffer[n], count * sizeof(int16_t));
+  s->len = count;
 }
 
-static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
-static inline void skip(TSLexer *lexer) { lexer->advance(lexer, true); }
+// A block scalar (`|` / `>`): the introducer + indicators + the verbatim more-indented body, as
+// one token. The body runs while a line is blank or indented MORE than the parent block level (the
+// stack top); it ends at the first non-blank line at or below the parent, or a column-0 document
+// marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's
+// indentation is left for the normal boundary logic.
+static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
+  int parent = s->stack[s->len - 1];
+  advance(lexer); // the introducer (| or >)
+  while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer);
+  while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer);
+  lexer->mark_end(lexer); // the header line belongs to the scalar
+  for (;;) {
+    if (lexer->lookahead == '\r') { advance(lexer); if (lexer->lookahead == '\n') advance(lexer); }
+    else if (lexer->lookahead == '\n') advance(lexer);
+    else break; // EOF
+    int col = 0;
+    while (lexer->lookahead == ' ') { advance(lexer); col++; }
+    int32_t c = lexer->lookahead;
+    if (c == 0 || c == '\n' || c == '\r') { lexer->mark_end(lexer); continue; } // blank line → body
+    if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node)
+    while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer);
+    lexer->mark_end(lexer);
+  }
+  lexer->result_symbol = BLOCK_SCALAR;
+  return true;
+}
+
+bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
+  Scanner *s = (Scanner *)payload;
+  bool want_indent = valid_symbols[INDENT];
+  bool want_dedent = valid_symbols[DEDENT];
+  bool want_newline = valid_symbols[NEWLINE];
+  bool want_block = valid_symbols[BLOCK_SCALAR];
+
+  // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the
+  // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here.
+  if (s->pending_col >= 0) {
+    int top = s->stack[s->len - 1];
+    if (s->pending_col < top) {
+      if (want_dedent) { s->len--; lexer->result_symbol = DEDENT; return true; }
+      s->pending_col = -1; s->pending_newline = false; return false;
+    }
+    bool owed = s->pending_newline;
+    int16_t col = s->pending_col;
+    s->pending_col = -1; s->pending_newline = false;
+    if (col == top && owed && want_newline && s->started) { lexer->result_symbol = NEWLINE; return true; }
+    return false;
+  }
+
+  // A block scalar value (`key: |`): scan its body before the indent logic — its more-indented
+  // lines are content, not nested structure. Skip the inline space after the `:`/`-` first.
+  if (want_block) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
+    if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; }
+  }
+
+  if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid
+
+  // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was
+  // crossed (only a real boundary drives the indent logic).
+  bool found_eol = false;
+  for (;;) {
+    int32_t c = lexer->lookahead;
+    if (c == '\n') { skip(lexer); found_eol = true; }
+    else if (c == '\r') { skip(lexer); if (lexer->lookahead == '\n') skip(lexer); found_eol = true; }
+    else if (c == ' ' || c == '\t') { skip(lexer); }
+    else if (c == '#') { while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') skip(lexer); }
+    else break; // content or EOF
+  }
+
+  if (lexer->eof(lexer)) {
+    if (s->len > 1 && want_dedent) { s->len--; lexer->result_symbol = DEDENT; return true; } // close open blocks at EOF
+    return false;
+  }
+  bool was_started = s->started;
+  s->started = true; // content lies ahead — mark started even on a mid-line return
+  if (!found_eol) return false; // not at a line boundary
 
-bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer,
-                                                          const bool *valid_symbols) {
-  (void)payload;
+  int16_t col = (int16_t)lexer->get_column(lexer);
+  lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column
+  int top = s->stack[s->len - 1];
 
+  if (col > top) {
+    if (want_indent) { push_indent(s, col); lexer->result_symbol = INDENT; return true; }
+    return false;
+  }
+  if (col == top) {
+    if (want_newline && was_started) { lexer->result_symbol = NEWLINE; return true; }
+    return false;
+  }
+  // col < top: a dedent. Emit one DEDENT now; queue the rest + the trailing NEWLINE for re-entry.
+  if (want_dedent) {
+    s->pending_col = col; s->pending_newline = true;
+    s->len--; lexer->result_symbol = DEDENT; return true;
+  }
   return false;
 }

From e644798f9f75b949818f9bea2ecb039e94e87fdb Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 04:53:23 +0800
Subject: [PATCH 03/10] =?UTF-8?q?Scan=20YAML=20plain/key=20scalars=20in=20?=
 =?UTF-8?q?the=20C=20scanner=20=E2=80=94=20flat=20scalars=20now=20tokenize?=
 =?UTF-8?q?=20(issue=20#3,=20piece=203)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

tree-sitter token DFAs cannot use look-around, so a YAML plain scalar's boundary (`:` is content
unless followed by space; `#` is a comment only after a space) could not be a regex token — `plain`
greedily ate `a: 1`. `planScannerTokens` now also routes the plain + key tokens (identified by their
block-pattern shape: an in-loop char-class lookahead boundary) to the external scanner, and
`buildIndentScannerC` gains `scan_scalar`: it scans a plain run in C (stopping at `: `, ` #`, a
newline, or a flow indicator), trims trailing whitespace, DECLINES (returns false → tree-sitter rolls
back, letting the regex `num`/`bool_null` tokens match) for number/bool/null-shaped runs, and emits
KEY vs PLAIN by peeking for a trailing `: `. All derived from the grammar; the six other grammars stay
byte-identical and `gate:treesitter` is unaffected (96.0%, still beats official 92.5%).

These inputs now parse with NO ERROR (verified via `tree-sitter parse`, structure checked): a single mapping
(`a: 1` → key + `num`), a flat sequence, a nested mapping (multi-entry — `b`/`c` both keyed),
a nested sequence + sibling, a block scalar, a flow mapping, a flow sequence, a plain scalar with
spaces (`hello world`; `true` → `bool_null`), a colon-in-key (`a:b: c`), and a trailing comment.

KNOWN REMAINING: a TOP-LEVEL multi-entry block mapping (`x: 1\ny: 2\nz: 3` — the most common YAML
shape) still mis-parses: the first entry's value is dropped and 3+ entries ERROR. NESTED multi-entry
mappings parse correctly, so this is specific to document-level NEWLINE-separated chaining — a
grammar/GLR-runtime issue in the `mapping_or_scalar`/`node`/`stream` rules (likely the ε-elimination
making a mapping value optional and GLR committing to the wrong split), NOT the scalar scanner. Fixing it is the next step.

Refs #3
---
 src/gen-treesitter.ts          | 181 ++++++++++++++++++++++++++++++++-
 tree-sitter/yaml/grammar.js    |  10 +-
 tree-sitter/yaml/src/scanner.c | 118 +++++++++++++++++++++
 3 files changed, 302 insertions(+), 7 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index 437f805..eee0bca 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -1,4 +1,4 @@
-import type { CstGrammar, RuleExpr, RuleDecl } from './types.ts';
+import type { CstGrammar, RuleExpr, RuleDecl, TokenPattern } from './types.ts';
 import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts';
 import { tokenPatternIsNever, tokenPatternSource, tokenPatternStartsWithDecimal, tokenPatternStringDelimiters, tokenPatternTrailingCharClass } from './token-pattern.ts';
 
@@ -716,6 +716,55 @@ function planInterpolations(grammar: CstGrammar): InterpolationPlan[] {
   return plans;
 }
 
+/**
+ * A "plain-family" scalar token (indentation grammars only): one whose boundary is a look-around
+ * INSIDE its body loop — the `:(?=\S)` colon-is-content rule of a YAML plain/key scalar. Concretely,
+ * its `blockPattern` contains, somewhere under a `repeat`, a `seq` ending in a POSITIVE char-class
+ * lookahead. That in-loop assertion is exactly what a tree-sitter `token()` DFA cannot honour (it
+ * needs look-ahead to decide where the scalar ends), so such a token must be scanned in C.
+ *
+ * A typed look-alike (num / bool-null) ALSO carries a `blockPattern`, but its boundary is a
+ * TOP-LEVEL trailing lookahead (`<body>(?=…)` — not inside a repeat), which the DFA enforces
+ * structurally; those are NOT plain-family and stay regex `token()` rules so the parser still
+ * classifies `1` as a number and `true` as a bool. The test is therefore purely STRUCTURAL — it
+ * never names a token — so any grammar without this shape is unaffected.
+ */
+function isPlainFamilyToken(tok: CstGrammar['tokens'][number]): boolean {
+  const p = tok.blockPattern;
+  if (!p || typeof p === 'string') return false;
+  let found = false;
+  const walk = (node: TokenPattern, inRepeat: boolean): void => {
+    if (typeof node === 'string') return;
+    switch (node.type) {
+      case 'repeat': walk(node.body, true); break;
+      case 'seq': {
+        if (inRepeat) {
+          const last = node.items[node.items.length - 1];
+          if (last && typeof last !== 'string' && last.type === 'lookahead' && !last.negate) found = true;
+        }
+        for (const it of node.items) walk(it, inRepeat);
+        break;
+      }
+      case 'alt': for (const it of node.items) walk(it, inRepeat); break;
+      case 'lookahead': case 'lookbehind': walk(node.body, inRepeat); break;
+      default: break;
+    }
+  };
+  walk(p, false);
+  return found;
+}
+
+/** The plain-family tokens of an indentation grammar, split into the PLAIN scalar and the KEY scalar
+ *  by their scope leaf (a cross-check on the structural detection): a plain scalar is scoped
+ *  `string.unquoted…`, a key scalar `entity.name.tag…`. Either may be absent. */
+function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: string } {
+  if (!grammar.indent) return {};
+  const fam = grammar.tokens.filter(isPlainFamilyToken);
+  const plain = fam.find(t => (t.scope ?? '').startsWith('string.unquoted'))?.name;
+  const key = fam.find(t => (t.scope ?? '').startsWith('entity.name.tag'))?.name;
+  return { plain, key };
+}
+
 /** Determine which tokens the external scanner must provide. */
 function planScannerTokens(grammar: CstGrammar): Map<string, string> {
   const map = new Map<string, string>();
@@ -735,6 +784,14 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> {
     map.set(ind.dedentToken, toSnake(ind.dedentToken));
     map.set(ind.newlineToken, toSnake(ind.newlineToken));
     if (ind.blockScalar) map.set(ind.blockScalar.token, toSnake(ind.blockScalar.token));
+    // The PLAIN and KEY scalars (a `:` is content unless followed by space/EOL; a `#` starts a
+    // comment only after a space) need look-ahead at their boundary, which a tree-sitter `token()`
+    // DFA lacks — so they too become external tokens, scanned by `scan_scalar` in C. Appended AFTER
+    // the block scalar so the enum stays INDENT,DEDENT,NEWLINE,BLOCK_SCALAR,PLAIN,KEY. (Num/BoolNull
+    // are NOT plain-family — their boundary is DFA-expressible — so they stay regex token rules.)
+    const { plain, key } = planPlainScalarTokens(grammar);
+    if (plain) map.set(plain, toSnake(plain));
+    if (key) map.set(key, toSnake(key));
   }
   // The regex token: '/' is context-sensitive (regex vs division). The scanner
   // resolves it.
@@ -1799,6 +1856,12 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar
   const sym = (tokenName: string) => ctx.scannerTokenFor.get(tokenName)!.toUpperCase();
   const INDENT = sym(ind.indentToken), DEDENT = sym(ind.dedentToken), NEWLINE = sym(ind.newlineToken);
   const BLOCK = ind.blockScalar ? sym(ind.blockScalar.token) : null;
+  // The PLAIN / KEY scalar externals (a `:` is a separator only before space/EOL/flow-indicator; a
+  // `#` a comment only after a space) — scanned by scan_scalar where look-ahead IS available.
+  const { plain: plainTok, key: keyTok } = planPlainScalarTokens(grammar);
+  const PLAIN = plainTok ? sym(plainTok) : null;
+  const KEY = keyTok ? sym(keyTok) : null;
+  const SCALAR = PLAIN || KEY; // either may be absent; scan_scalar is emitted when at least one is
   const cmt = ind.comment ?? '#';
   const cmtC = cmt.length === 1 ? (cmt === '\\' || cmt === "'" ? `'\\${cmt}'` : `'${cmt}'`) : null;
   const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0';
@@ -1906,6 +1969,105 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
   lexer->result_symbol = ${BLOCK};
   return true;
 }
+` : ''}${SCALAR ? `
+// A PLAIN / KEY scalar. A tree-sitter token() DFA can't decide a plain scalar's boundary (a \`:\` is
+// content unless followed by space/EOL/flow-indicator; a \`#\` starts a comment only after a space),
+// so we scan it here where look-ahead IS available. The run starts at the current column and ends
+// BEFORE the first key/value \`:\`-separator, comment, flow indicator, newline, or EOF; trailing
+// whitespace is trimmed. KEY vs PLAIN is decided by whether a \`:\`-separator immediately follows.
+//
+// A number- or bool/null-SHAPED run is left to the regex \`num\`/\`bool_null\` tokens (return false →
+// tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is
+// valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so
+// a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num.
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
+  char buf[64];
+  unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
+  bool has_content = false;
+  bool stopped_at_kv = false; // ended at a \`:\`-separator → this scalar is a mapping KEY
+  for (;;) {
+    int32_t c = lexer->lookahead;
+    if (c == 0 || c == '\\n' || c == '\\r') break;                 // newline / EOF
+    if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar
+    if (!has_content && (c == '-' || c == '?')) {
+      // A leading \`-\`/\`?\` is a block indicator (seq entry / explicit key) when followed by space/EOL/
+      // flow-indicator, and scalar content otherwise (\`-1\`, \`?x\`). Peek the next char to decide.
+      lexer->advance(lexer, false);
+      int32_t n = lexer->lookahead;
+      if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' ||
+          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar
+      if (blen < sizeof(buf)) buf[blen++] = (char)c;               // \`-\`/\`?\` glued to non-space is content
+      has_content = true;
+      lexer->mark_end(lexer);
+      continue;
+    }
+    if (c == ':') {
+      lexer->advance(lexer, false);                               // past the ':' to peek the next char
+      int32_t n = lexer->lookahead;
+      if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' ||
+          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') {
+        stopped_at_kv = true; break;                              // ':' is a key/value separator → end before it
+      }
+      if (blen < sizeof(buf)) buf[blen++] = ':';                  // ':' glued to non-space is content
+      has_content = true;
+      lexer->mark_end(lexer);
+      continue;
+    }
+    if (c == ' ' || c == '\\t') {
+      lexer->advance(lexer, false);                               // past the space to peek the next char
+      if (lexer->lookahead == '#') break;                        // " #" begins a comment → end before the space
+      if (blen < sizeof(buf)) buf[blen++] = ' ';                  // interior space (e.g. "hello world")
+      continue;                                                   // do NOT mark_end → trailing spaces are trimmed
+    }
+    if (blen < sizeof(buf)) buf[blen++] = (char)c;
+    has_content = true;
+    lexer->advance(lexer, false);
+    lexer->mark_end(lexer);                                       // token end follows the last content char
+  }
+  if (!has_content) return false;
+
+  // Number / bool-null SHAPE test (so the typed regex tokens still classify \`1\`/\`true\`). Decide KEY
+  // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY
+  // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a
+  // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN.
+  bool is_key = stopped_at_kv ? want_key : false;
+  // numeric: only [0-9 . + - e E x o a-f A-F _ : (already excluded)] — a loose superset is fine, the
+  // regex \`num\` makes the precise decision; if it doesn't match, tree-sitter falls back to us is NOT
+  // possible (we already returned), so keep the test TIGHT enough to never defer a real plain string.
+  bool numeric = blen > 0;
+  for (unsigned i = 0; i < blen; i++) {
+    char ch = buf[i];
+    bool ok = (ch >= '0' && ch <= '9') || ch == '.' || ch == '+' || ch == '-' || ch == 'e' || ch == 'E' ||
+              ch == 'x' || ch == 'o' || ch == 'n' /* inf/nan */ || ch == 'a' || ch == 'f' || ch == 'i' ||
+              ch == 'I' || ch == 'N' || ch == 'F' || (ch >= 'A' && ch <= 'F');
+    if (!ok) { numeric = false; break; }
+  }
+  // also require at least one digit OR a .inf/.nan/~ shape so a bare "e"/"a" word isn't called numeric
+  if (numeric) {
+    bool any_digit = false;
+    for (unsigned i = 0; i < blen; i++) if (buf[i] >= '0' && buf[i] <= '9') { any_digit = true; break; }
+    if (!any_digit) numeric = false;
+  }
+  bool boolnull = false;
+  {
+    static const char *WORDS[] = { "true","True","TRUE","false","False","FALSE","null","Null","NULL","~" };
+    for (unsigned w = 0; w < sizeof(WORDS)/sizeof(WORDS[0]); w++) {
+      const char *p = WORDS[w]; unsigned i = 0;
+      while (i < blen && p[i] && buf[i] == p[i]) i++;
+      if (i == blen && p[i] == 0) { boolnull = true; break; }
+    }
+  }
+  // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible
+  // here. \`want_key\` (a key slot) always admits num/bool_null; a value slot does too. A plain-ONLY
+  // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value
+  // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in
+  // both: a numeric fold continuation typing as a number is the documented imprecise edge.
+  if ((numeric || boolnull) && want_plain) return false;
+
+  if (is_key) { lexer->result_symbol = want_key ? ${KEY ?? PLAIN} : ${PLAIN ?? KEY}; return true; }
+  lexer->result_symbol = want_plain ? ${PLAIN ?? KEY} : ${KEY ?? PLAIN};
+  return true;
+}
 ` : ''}
 bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
   Scanner *s = (Scanner *)payload;
@@ -1934,6 +2096,23 @@ ${BLOCK ? `
     while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
     if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; }
   }
+` : ''}${SCALAR ? `
+  // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through
+  // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline spaces/tabs, then if
+  // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — \`-\`/\`?\`/\`:\`
+  // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator),
+  // scan it where look-ahead is available.
+  if (valid_symbols[${KEY ?? PLAIN}] || valid_symbols[${PLAIN ?? KEY}]) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
+    int32_t h = lexer->lookahead;
+    bool indicator = h == 0 || h == '\\n' || h == '\\r' || h == ',' || h == '[' || h == ']' || h == '{' ||
+                     h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' ||
+                     h == '\\'' || h == '"' || h == '%' || h == '@' || h == '\`';
+    if (!indicator) {
+      bool wk = valid_symbols[${KEY ?? PLAIN}] != 0, wp = valid_symbols[${PLAIN ?? KEY}] != 0;
+      if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; }
+    }
+  }
 ` : ''}
   if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid
 
diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js
index 78df6a8..1ec0754 100644
--- a/tree-sitter/yaml/grammar.js
+++ b/tree-sitter/yaml/grammar.js
@@ -20,7 +20,9 @@ module.exports = grammar({
     $.indent,
     $.dedent,
     $.newline,
-    $.block_scalar
+    $.block_scalar,
+    $.plain,
+    $.key
   ],
 
   conflicts: $ => [
@@ -170,13 +172,9 @@ module.exports = grammar({
 
     tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/),
 
-    key: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/),
-
     num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/),
 
-    bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/),
-
-    plain: $ => token(/(?:[^\t\n\f\r \-?:,\[\]{}#&*!|>'"%@`]|[\-?:])(?:[^:#\n]|:|#)*/)
+    bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/)
   }
 });
 
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index 9b006ca..7d55271 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -14,6 +14,8 @@ enum TokenType {
   DEDENT,
   NEWLINE,
   BLOCK_SCALAR,
+  PLAIN,
+  KEY,
 };
 
 typedef struct {
@@ -103,6 +105,105 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
   return true;
 }
 
+// A PLAIN / KEY scalar. A tree-sitter token() DFA can't decide a plain scalar's boundary (a `:` is
+// content unless followed by space/EOL/flow-indicator; a `#` starts a comment only after a space),
+// so we scan it here where look-ahead IS available. The run starts at the current column and ends
+// BEFORE the first key/value `:`-separator, comment, flow indicator, newline, or EOF; trailing
+// whitespace is trimmed. KEY vs PLAIN is decided by whether a `:`-separator immediately follows.
+//
+// A number- or bool/null-SHAPED run is left to the regex `num`/`bool_null` tokens (return false →
+// tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is
+// valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so
+// a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num.
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
+  char buf[64];
+  unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
+  bool has_content = false;
+  bool stopped_at_kv = false; // ended at a `:`-separator → this scalar is a mapping KEY
+  for (;;) {
+    int32_t c = lexer->lookahead;
+    if (c == 0 || c == '\n' || c == '\r') break;                 // newline / EOF
+    if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar
+    if (!has_content && (c == '-' || c == '?')) {
+      // A leading `-`/`?` is a block indicator (seq entry / explicit key) when followed by space/EOL/
+      // flow-indicator, and scalar content otherwise (`-1`, `?x`). Peek the next char to decide.
+      lexer->advance(lexer, false);
+      int32_t n = lexer->lookahead;
+      if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' ||
+          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar
+      if (blen < sizeof(buf)) buf[blen++] = (char)c;               // `-`/`?` glued to non-space is content
+      has_content = true;
+      lexer->mark_end(lexer);
+      continue;
+    }
+    if (c == ':') {
+      lexer->advance(lexer, false);                               // past the ':' to peek the next char
+      int32_t n = lexer->lookahead;
+      if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' ||
+          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') {
+        stopped_at_kv = true; break;                              // ':' is a key/value separator → end before it
+      }
+      if (blen < sizeof(buf)) buf[blen++] = ':';                  // ':' glued to non-space is content
+      has_content = true;
+      lexer->mark_end(lexer);
+      continue;
+    }
+    if (c == ' ' || c == '\t') {
+      lexer->advance(lexer, false);                               // past the space to peek the next char
+      if (lexer->lookahead == '#') break;                        // " #" begins a comment → end before the space
+      if (blen < sizeof(buf)) buf[blen++] = ' ';                  // interior space (e.g. "hello world")
+      continue;                                                   // do NOT mark_end → trailing spaces are trimmed
+    }
+    if (blen < sizeof(buf)) buf[blen++] = (char)c;
+    has_content = true;
+    lexer->advance(lexer, false);
+    lexer->mark_end(lexer);                                       // token end follows the last content char
+  }
+  if (!has_content) return false;
+
+  // Number / bool-null SHAPE test (so the typed regex tokens still classify `1`/`true`). Decide KEY
+  // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY
+  // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a
+  // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN.
+  bool is_key = stopped_at_kv ? want_key : false;
+  // numeric: only [0-9 . + - e E x o a-f A-F _ : (already excluded)] — a loose superset is fine, the
+  // regex `num` makes the precise decision; if it doesn't match, tree-sitter falls back to us is NOT
+  // possible (we already returned), so keep the test TIGHT enough to never defer a real plain string.
+  bool numeric = blen > 0;
+  for (unsigned i = 0; i < blen; i++) {
+    char ch = buf[i];
+    bool ok = (ch >= '0' && ch <= '9') || ch == '.' || ch == '+' || ch == '-' || ch == 'e' || ch == 'E' ||
+              ch == 'x' || ch == 'o' || ch == 'n' /* inf/nan */ || ch == 'a' || ch == 'f' || ch == 'i' ||
+              ch == 'I' || ch == 'N' || ch == 'F' || (ch >= 'A' && ch <= 'F');
+    if (!ok) { numeric = false; break; }
+  }
+  // also require at least one digit OR a .inf/.nan/~ shape so a bare "e"/"a" word isn't called numeric
+  if (numeric) {
+    bool any_digit = false;
+    for (unsigned i = 0; i < blen; i++) if (buf[i] >= '0' && buf[i] <= '9') { any_digit = true; break; }
+    if (!any_digit) numeric = false;
+  }
+  bool boolnull = false;
+  {
+    static const char *WORDS[] = { "true","True","TRUE","false","False","FALSE","null","Null","NULL","~" };
+    for (unsigned w = 0; w < sizeof(WORDS)/sizeof(WORDS[0]); w++) {
+      const char *p = WORDS[w]; unsigned i = 0;
+      while (i < blen && p[i] && buf[i] == p[i]) i++;
+      if (i == blen && p[i] == 0) { boolnull = true; break; }
+    }
+  }
+  // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible
+  // here. `want_key` (a key slot) always admits num/bool_null; a value slot does too. A plain-ONLY
+  // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value
+  // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in
+  // both: a numeric fold continuation typing as a number is the documented imprecise edge.
+  if ((numeric || boolnull) && want_plain) return false;
+
+  if (is_key) { lexer->result_symbol = want_key ? KEY : PLAIN; return true; }
+  lexer->result_symbol = want_plain ? PLAIN : KEY;
+  return true;
+}
+
 bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
   Scanner *s = (Scanner *)payload;
   bool want_indent = valid_symbols[INDENT];
@@ -132,6 +233,23 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
     if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; }
   }
 
+  // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through
+  // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline spaces/tabs, then if
+  // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — `-`/`?`/`:`
+  // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator),
+  // scan it where look-ahead is available.
+  if (valid_symbols[KEY] || valid_symbols[PLAIN]) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
+    int32_t h = lexer->lookahead;
+    bool indicator = h == 0 || h == '\n' || h == '\r' || h == ',' || h == '[' || h == ']' || h == '{' ||
+                     h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' ||
+                     h == '\'' || h == '"' || h == '%' || h == '@' || h == '`';
+    if (!indicator) {
+      bool wk = valid_symbols[KEY] != 0, wp = valid_symbols[PLAIN] != 0;
+      if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; }
+    }
+  }
+
   if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid
 
   // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was

From 1d9e19bae6e61723157b8fc8849195f764a0b46c Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 05:17:51 +0800
Subject: [PATCH 04/10] =?UTF-8?q?Scan=20typed=20YAML=20scalars=20in=20C=20?=
 =?UTF-8?q?too=20=E2=80=94=20top-level=20multi-entry=20mappings=20now=20pa?=
 =?UTF-8?q?rse=20(issue=20#3,=20piece=203)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The decline path (scanner returns false for a number/bool/null-shaped run so the regex `num`/
`bool_null` token matches) dropped the value-vs-key disambiguation that the external PLAIN/KEY tokens
carry, so GLR mis-chained a TOP-LEVEL multi-entry block mapping (`x: 1\ny: 2\nz: 3` — the first
value dropped, 3+ entries ERROR), even though nested multi-entry and plain-valued top-level mappings
parsed. Fix: externalize num + bool_null too (every token with a `blockPattern` is now scanned in C)
and have `scan_scalar` CLASSIFY the run and emit NUM / BOOL_NULL / KEY / PLAIN directly (no decline) —
so every scalar is an external token that resolves the key-vs-value choice for the parser. Number/
bool/null typing is preserved (verified: `1`→num, `true`/`null`→bool_null, `hello`→plain). Removed the
now-superseded `isPlainFamilyToken` / consume-rewrite dead code.

Parse with NO ERROR (verified): single + flat-multi-entry mappings, sequences, nested mappings,
nested sequences, block scalars, flow map/seq, plain-with-spaces, colon-in-key, trailing comment,
empty-value sibling, blank-line-separated, deep nesting. The 6 other grammars stay byte-identical and
gate:treesitter is unaffected (96.0%, beats official 92.5%).

KNOWN REMAINING: a list-of-maps / COMPACT block (`- a: 1\n  b: 2` — a sequence item whose value is a
multi-entry mapping, the common GitHub-Actions `- uses:\n  with:` shape) still errors — the scanner
must push the inline content column after a `-`/`?` indicator (gen-lexer's `compactIndicators`), which
it does not do yet. Also outstanding: an accuracy bench over yaml-test-suite (present at /tmp). Both
are next.

Refs #3
---
 src/gen-treesitter.ts          | 126 ++++++++++++++-------------------
 tree-sitter/yaml/grammar.js    |  10 ++-
 tree-sitter/yaml/src/scanner.c |  47 ++++++------
 3 files changed, 81 insertions(+), 102 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index eee0bca..bb84a12 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -716,53 +716,25 @@ function planInterpolations(grammar: CstGrammar): InterpolationPlan[] {
   return plans;
 }
 
-/**
- * A "plain-family" scalar token (indentation grammars only): one whose boundary is a look-around
- * INSIDE its body loop — the `:(?=\S)` colon-is-content rule of a YAML plain/key scalar. Concretely,
- * its `blockPattern` contains, somewhere under a `repeat`, a `seq` ending in a POSITIVE char-class
- * lookahead. That in-loop assertion is exactly what a tree-sitter `token()` DFA cannot honour (it
- * needs look-ahead to decide where the scalar ends), so such a token must be scanned in C.
- *
- * A typed look-alike (num / bool-null) ALSO carries a `blockPattern`, but its boundary is a
- * TOP-LEVEL trailing lookahead (`<body>(?=…)` — not inside a repeat), which the DFA enforces
- * structurally; those are NOT plain-family and stay regex `token()` rules so the parser still
- * classifies `1` as a number and `true` as a bool. The test is therefore purely STRUCTURAL — it
- * never names a token — so any grammar without this shape is unaffected.
- */
-function isPlainFamilyToken(tok: CstGrammar['tokens'][number]): boolean {
-  const p = tok.blockPattern;
-  if (!p || typeof p === 'string') return false;
-  let found = false;
-  const walk = (node: TokenPattern, inRepeat: boolean): void => {
-    if (typeof node === 'string') return;
-    switch (node.type) {
-      case 'repeat': walk(node.body, true); break;
-      case 'seq': {
-        if (inRepeat) {
-          const last = node.items[node.items.length - 1];
-          if (last && typeof last !== 'string' && last.type === 'lookahead' && !last.negate) found = true;
-        }
-        for (const it of node.items) walk(it, inRepeat);
-        break;
-      }
-      case 'alt': for (const it of node.items) walk(it, inRepeat); break;
-      case 'lookahead': case 'lookbehind': walk(node.body, inRepeat); break;
-      default: break;
-    }
-  };
-  walk(p, false);
-  return found;
-}
-
-/** The plain-family tokens of an indentation grammar, split into the PLAIN scalar and the KEY scalar
- *  by their scope leaf (a cross-check on the structural detection): a plain scalar is scoped
- *  `string.unquoted…`, a key scalar `entity.name.tag…`. Either may be absent. */
-function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: string } {
+/** The block-context SCALAR tokens of an indentation grammar (those carrying a `blockPattern`), split
+ *  by their scope leaf: PLAIN `string.unquoted…`, KEY `entity.name.tag…`, NUM `constant.numeric…`,
+ *  bool/null `constant.language…`. All are scanned in C (see scan_scalar) — a YAML plain/key boundary
+ *  (`:(?=\S)`, `#`-after-space) is a look-around a tree-sitter token DFA can't honour, and a typed
+ *  value emitted by the regex lexer would not carry the key-vs-value decision the GLR parser needs to
+ *  chain top-level mapping entries. Any field may be absent. */
+function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: string; num?: string; boolnull?: string } {
   if (!grammar.indent) return {};
-  const fam = grammar.tokens.filter(isPlainFamilyToken);
-  const plain = fam.find(t => (t.scope ?? '').startsWith('string.unquoted'))?.name;
+  // Every token carrying a `blockPattern` is a block-context scalar; emitting num/bool-null from the
+  // scanner too (classified by shape) — not via a regex token + decline — keeps every scalar an
+  // external token, so the key-vs-value decision is carried and `x: 1\ny: 2` chains correctly.
+  // Split by the scope leaf (the convention is data in the grammar): plain `string.unquoted`, key
+  // `entity.name.tag`, num `constant.numeric`, bool/null `constant.language`.
+  const fam = grammar.tokens.filter(t => t.blockPattern !== undefined && typeof t.blockPattern !== 'string');
+  const num = fam.find(t => (t.scope ?? '').includes('constant.numeric'))?.name;
+  const boolnull = fam.find(t => (t.scope ?? '').includes('constant.language'))?.name;
   const key = fam.find(t => (t.scope ?? '').startsWith('entity.name.tag'))?.name;
-  return { plain, key };
+  const plain = fam.find(t => (t.scope ?? '').startsWith('string.unquoted') && !(t.scope ?? '').includes('constant.'))?.name;
+  return { plain, key, num, boolnull };
 }
 
 /** Determine which tokens the external scanner must provide. */
@@ -789,9 +761,11 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> {
     // DFA lacks — so they too become external tokens, scanned by `scan_scalar` in C. Appended AFTER
     // the block scalar so the enum stays INDENT,DEDENT,NEWLINE,BLOCK_SCALAR,PLAIN,KEY. (Num/BoolNull
     // are NOT plain-family — their boundary is DFA-expressible — so they stay regex token rules.)
-    const { plain, key } = planPlainScalarTokens(grammar);
+    const { plain, key, num, boolnull } = planPlainScalarTokens(grammar);
     if (plain) map.set(plain, toSnake(plain));
     if (key) map.set(key, toSnake(key));
+    if (num) map.set(num, toSnake(num));
+    if (boolnull) map.set(boolnull, toSnake(boolnull));
   }
   // The regex token: '/' is context-sensitive (regex vs division). The scanner
   // resolves it.
@@ -1856,12 +1830,19 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar
   const sym = (tokenName: string) => ctx.scannerTokenFor.get(tokenName)!.toUpperCase();
   const INDENT = sym(ind.indentToken), DEDENT = sym(ind.dedentToken), NEWLINE = sym(ind.newlineToken);
   const BLOCK = ind.blockScalar ? sym(ind.blockScalar.token) : null;
-  // The PLAIN / KEY scalar externals (a `:` is a separator only before space/EOL/flow-indicator; a
-  // `#` a comment only after a space) — scanned by scan_scalar where look-ahead IS available.
-  const { plain: plainTok, key: keyTok } = planPlainScalarTokens(grammar);
+  // The block-context SCALAR externals — plain, key, and the typed num / bool-null — all scanned by
+  // scan_scalar (a `:` is a separator only before space/EOL/flow-indicator; a `#` a comment only after
+  // a space; a typed run is classified by shape). Emitting num/bool-null from the scanner (not via a
+  // regex token + decline) makes EVERY scalar an external token that carries the key-vs-value decision,
+  // which the GLR parser needs to chain top-level mapping entries.
+  const { plain: plainTok, key: keyTok, num: numTok, boolnull: boolnullTok } = planPlainScalarTokens(grammar);
   const PLAIN = plainTok ? sym(plainTok) : null;
   const KEY = keyTok ? sym(keyTok) : null;
-  const SCALAR = PLAIN || KEY; // either may be absent; scan_scalar is emitted when at least one is
+  const NUM = numTok ? sym(numTok) : null;
+  const BOOLNULL = boolnullTok ? sym(boolnullTok) : null;
+  const SCALAR = PLAIN || KEY || NUM || BOOLNULL; // scan_scalar is emitted when at least one exists
+  const scalarGate = [PLAIN, KEY, NUM, BOOLNULL].filter(Boolean).map(s => `valid_symbols[${s}]`).join(' || ') || '0';
+  const want = (s: string | null) => (s ? `valid_symbols[${s}] != 0` : 'false');
   const cmt = ind.comment ?? '#';
   const cmtC = cmt.length === 1 ? (cmt === '\\' || cmt === "'" ? `'\\${cmt}'` : `'${cmt}'`) : null;
   const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0';
@@ -1980,7 +1961,7 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
 // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is
 // valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so
 // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num.
-static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) {
   char buf[64];
   unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
   bool has_content = false;
@@ -2030,10 +2011,9 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
   // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY
   // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a
   // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN.
-  bool is_key = stopped_at_kv ? want_key : false;
-  // numeric: only [0-9 . + - e E x o a-f A-F _ : (already excluded)] — a loose superset is fine, the
-  // regex \`num\` makes the precise decision; if it doesn't match, tree-sitter falls back to us is NOT
-  // possible (we already returned), so keep the test TIGHT enough to never defer a real plain string.
+  // numeric / bool-null SHAPE test — a loose superset is fine for classification (only a typed-shaped
+  // run is emitted as NUM/BOOL_NULL; a run with any other char is PLAIN), at the cost of mis-typing a
+  // rare plain like \`1abc\` as numeric (the documented imprecise edge).
   bool numeric = blen > 0;
   for (unsigned i = 0; i < blen; i++) {
     char ch = buf[i];
@@ -2057,16 +2037,17 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
       if (i == blen && p[i] == 0) { boolnull = true; break; }
     }
   }
-  // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible
-  // here. \`want_key\` (a key slot) always admits num/bool_null; a value slot does too. A plain-ONLY
-  // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value
-  // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in
-  // both: a numeric fold continuation typing as a number is the documented imprecise edge.
-  if ((numeric || boolnull) && want_plain) return false;
-
-  if (is_key) { lexer->result_symbol = want_key ? ${KEY ?? PLAIN} : ${PLAIN ?? KEY}; return true; }
-  lexer->result_symbol = want_plain ? ${PLAIN ?? KEY} : ${KEY ?? PLAIN};
-  return true;
+  // Classify + emit. The external scalar token CARRIES the key-vs-value decision (a trailing \`: \`
+  // means KEY), which the GLR parser needs to chain mapping entries — so a typed value is emitted as
+  // NUM/BOOL_NULL here, NOT deferred to a regex token (deferring drops the disambiguation and
+  // mis-parses a top-level \`x: 1\\ny: 2\`). A key wins first; then the typed shapes; then PLAIN. Each
+  // is gated on its token being admissible here (valid_symbols), falling through otherwise.
+  if (stopped_at_kv && want_key) { lexer->result_symbol = ${KEY ?? PLAIN}; return true; }
+  if (numeric && want_num) { lexer->result_symbol = ${NUM ?? PLAIN}; return true; }
+  if (boolnull && want_boolnull) { lexer->result_symbol = ${BOOLNULL ?? PLAIN}; return true; }
+  if (want_plain) { lexer->result_symbol = ${PLAIN ?? KEY}; return true; }
+  if (want_key) { lexer->result_symbol = ${KEY ?? PLAIN}; return true; }
+  return false;
 }
 ` : ''}
 bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
@@ -2097,20 +2078,19 @@ ${BLOCK ? `
     if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; }
   }
 ` : ''}${SCALAR ? `
-  // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through
-  // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline spaces/tabs, then if
-  // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — \`-\`/\`?\`/\`:\`
-  // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator),
-  // scan it where look-ahead is available.
-  if (valid_symbols[${KEY ?? PLAIN}] || valid_symbols[${PLAIN ?? KEY}]) {
+  // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading
+  // newline falls through to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline
+  // spaces/tabs, then if the next char could begin a plain scalar (not a newline/EOF and not a YAML
+  // indicator — a leading \`-\`/\`?\`/\`:\` is resolved inside scan_scalar), scan it where look-ahead is
+  // available. scan_scalar classifies the run and emits the admissible token.
+  if (${scalarGate}) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
     int32_t h = lexer->lookahead;
     bool indicator = h == 0 || h == '\\n' || h == '\\r' || h == ',' || h == '[' || h == ']' || h == '{' ||
                      h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' ||
                      h == '\\'' || h == '"' || h == '%' || h == '@' || h == '\`';
     if (!indicator) {
-      bool wk = valid_symbols[${KEY ?? PLAIN}] != 0, wp = valid_symbols[${PLAIN ?? KEY}] != 0;
-      if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; }
+      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; }
     }
   }
 ` : ''}
diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js
index 1ec0754..e2cd88a 100644
--- a/tree-sitter/yaml/grammar.js
+++ b/tree-sitter/yaml/grammar.js
@@ -22,7 +22,9 @@ module.exports = grammar({
     $.newline,
     $.block_scalar,
     $.plain,
-    $.key
+    $.key,
+    $.num,
+    $.bool_null
   ],
 
   conflicts: $ => [
@@ -170,11 +172,7 @@ module.exports = grammar({
 
     alias: $ => token(/\*[^\t\n\f\r \[\]{},]+/),
 
-    tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/),
-
-    num: $ => token(/(?:[+\-]?\.(?:inf|Inf|INF)|\.(?:nan|NaN|NAN)|0x[0-9A-Fa-f]+|0o[0-7]+|[+\-]?(?:\.[0-9]+|[0-9]+(?:\.[0-9]*)?)(?:[eE][+\-]?[0-9]+)?)/),
-
-    bool_null: $ => token(/(?:true|True|TRUE|false|False|FALSE|null|Null|NULL|~)/)
+    tag: $ => token(/!(?:<[^>]*>|[^\t\n\f\r \[\]{},]*)/)
   }
 });
 
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index 7d55271..e792844 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -16,6 +16,8 @@ enum TokenType {
   BLOCK_SCALAR,
   PLAIN,
   KEY,
+  NUM,
+  BOOL_NULL,
 };
 
 typedef struct {
@@ -115,7 +117,7 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
 // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is
 // valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so
 // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num.
-static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) {
   char buf[64];
   unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
   bool has_content = false;
@@ -165,10 +167,9 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
   // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY
   // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a
   // plain-ONLY fold continuation (neither KEY here, and the run is not a key) must stay PLAIN.
-  bool is_key = stopped_at_kv ? want_key : false;
-  // numeric: only [0-9 . + - e E x o a-f A-F _ : (already excluded)] — a loose superset is fine, the
-  // regex `num` makes the precise decision; if it doesn't match, tree-sitter falls back to us is NOT
-  // possible (we already returned), so keep the test TIGHT enough to never defer a real plain string.
+  // numeric / bool-null SHAPE test — a loose superset is fine for classification (only a typed-shaped
+  // run is emitted as NUM/BOOL_NULL; a run with any other char is PLAIN), at the cost of mis-typing a
+  // rare plain like `1abc` as numeric (the documented imprecise edge).
   bool numeric = blen > 0;
   for (unsigned i = 0; i < blen; i++) {
     char ch = buf[i];
@@ -192,16 +193,17 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain) {
       if (i == blen && p[i] == 0) { boolnull = true; break; }
     }
   }
-  // Defer to the typed regex token ONLY when the run is typed-shaped AND a typed token is admissible
-  // here. `want_key` (a key slot) always admits num/bool_null; a value slot does too. A plain-ONLY
-  // fold continuation has want_key == false AND is not a key (stopped_at_kv == false) — but a value
-  // slot ALSO has want_key == false. We can't tell them apart from valid_symbols, so we defer in
-  // both: a numeric fold continuation typing as a number is the documented imprecise edge.
-  if ((numeric || boolnull) && want_plain) return false;
-
-  if (is_key) { lexer->result_symbol = want_key ? KEY : PLAIN; return true; }
-  lexer->result_symbol = want_plain ? PLAIN : KEY;
-  return true;
+  // Classify + emit. The external scalar token CARRIES the key-vs-value decision (a trailing `: `
+  // means KEY), which the GLR parser needs to chain mapping entries — so a typed value is emitted as
+  // NUM/BOOL_NULL here, NOT deferred to a regex token (deferring drops the disambiguation and
+  // mis-parses a top-level `x: 1\ny: 2`). A key wins first; then the typed shapes; then PLAIN. Each
+  // is gated on its token being admissible here (valid_symbols), falling through otherwise.
+  if (stopped_at_kv && want_key) { lexer->result_symbol = KEY; return true; }
+  if (numeric && want_num) { lexer->result_symbol = NUM; return true; }
+  if (boolnull && want_boolnull) { lexer->result_symbol = BOOL_NULL; return true; }
+  if (want_plain) { lexer->result_symbol = PLAIN; return true; }
+  if (want_key) { lexer->result_symbol = KEY; return true; }
+  return false;
 }
 
 bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const bool *valid_symbols) {
@@ -233,20 +235,19 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
     if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; }
   }
 
-  // A PLAIN / KEY scalar on the CURRENT line (not at a line boundary — a leading newline falls through
-  // to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline spaces/tabs, then if
-  // the next char could begin a plain scalar (not a newline/EOF and not a YAML indicator — `-`/`?`/`:`
-  // are handled inside scan_scalar, which declines when they are followed by space/EOL/flow-indicator),
-  // scan it where look-ahead is available.
-  if (valid_symbols[KEY] || valid_symbols[PLAIN]) {
+  // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading
+  // newline falls through to the indent logic so INDENT/DEDENT/NEWLINE are emitted first). Skip inline
+  // spaces/tabs, then if the next char could begin a plain scalar (not a newline/EOF and not a YAML
+  // indicator — a leading `-`/`?`/`:` is resolved inside scan_scalar), scan it where look-ahead is
+  // available. scan_scalar classifies the run and emits the admissible token.
+  if (valid_symbols[PLAIN] || valid_symbols[KEY] || valid_symbols[NUM] || valid_symbols[BOOL_NULL]) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
     int32_t h = lexer->lookahead;
     bool indicator = h == 0 || h == '\n' || h == '\r' || h == ',' || h == '[' || h == ']' || h == '{' ||
                      h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' ||
                      h == '\'' || h == '"' || h == '%' || h == '@' || h == '`';
     if (!indicator) {
-      bool wk = valid_symbols[KEY] != 0, wp = valid_symbols[PLAIN] != 0;
-      if (scan_scalar(lexer, wk, wp)) { s->started = true; return true; }
+      if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0)) { s->started = true; return true; }
     }
   }
 

From 67a947ce04d2abf664d465f8293edb1b09ecdbc7 Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 06:26:02 +0800
Subject: [PATCH 05/10] Scan YAML compact block notation (list-of-maps) in C
 (issue #3, piece 3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A sequence item whose value is a mapping is written compactly — the mapping starts inline on the dash
line and its continuation aligns with the inline content, not the dash (`- a: 1\n  b: 2`, the
GitHub-Actions `- uses: x\n  with:\n    k: v` shape). The scanner now mirrors gen-lexer's
`compactIndicators`: at a line-lead `-`/`?` indicator whose inline content begins a block node (a
nested `-`/`?`, or a scalar followed by an unquoted `: ` key separator — sniffed quote-aware, looking
through a `&`/`!` property prefix), it pushes the inline content column as one extra INDENT.

tree-sitter reverts all external-scanner state on a `false` return, so the natural two-call approach
("probe at the indicator, remember the column, push on the next call") loses the remembered column.
The working design emits the compact INDENT in a single `true`-returning zero-width call at the
post-indicator content (mark_end at the content start; the sniff's advances are discarded as
tree-sitter restarts from mark_end). A new serialized `at_line_lead` flag drives the detection: the
indicator is lexed by tree-sitter's internal lexer, so the scanner emits no token that would clear the
flag, and it is still set when the inline content is reached. A bare-scalar / flow / alias lead does
NOT push (`- x`, `- [a]` stay leaf items). All gated on `grammar.indent.compactIndicators` — the six
other grammars and yaml's own grammar.js/tmLanguage/monarch are byte-identical (the change is purely
in the C scanner).

Parse NO-ERROR (verified): list-of-maps, single-entry list-maps, the GH-Actions steps shape, nested
seq `- - x`, property+compact `- &a k: v`, map-of-seq — plus every earlier case (mappings, sequences,
block scalars, flow, typed values) still passes. Real files: ci.yml 19→4 ERROR nodes, readme-bench
13→2. tsc clean; generate + build --wasm succeed; gate:treesitter 96.0% (beats official 92.5%).

Remaining failures (pre-existing, NOT caused by compact notation): a block-context plain scalar
containing `,` (the scanner treats `,` as a flow indicator), `${{ }}` GH-Actions expressions (`{` is
treated as flow), and an alias as a sequence value (`- *a`, a grammar-level gap). Also still to do: an
accuracy bench over yaml-test-suite.

Refs #3
---
 src/gen-treesitter.ts          | 166 +++++++++++++++++++++++++++++----
 tree-sitter/yaml/src/scanner.c | 139 +++++++++++++++++++++++++--
 2 files changed, 282 insertions(+), 23 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index bb84a12..fd95dcc 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -1848,6 +1848,14 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar
   const introCond = (ind.blockScalar?.introducers ?? []).map(c => `lexer->lookahead == '${c}'`).join(' || ') || '0';
   const enumBody = externalTokens.map(s => `  ${s.toUpperCase()},`).join('\n');
   const G = grammarName;
+  // Compact-notation entry indicators (YAML `-` / `?`) — DERIVED from grammar.indent.compactIndicators
+  // (nothing hardcoded). A `lexer->lookahead == 'c'` disjunction reused by the scanner's compact logic.
+  // (The other inline-content leads — node-property `&`/`!`, flow `[`/`{`, alias `*` — are mirrored as
+  //  literals from gen-lexer's startsBlockStructuralNode, which itself treats them as fixed YAML
+  //  syntax; only the entry indicators are config-driven, matching IndentConfig.compactIndicators.)
+  const compactIndicators = ind.compactIndicators ?? [];
+  const compactIndicatorCond = (v: string) => compactIndicators.map(c => `${v} == '${c}'`).join(' || ') || '0';
+  const hasCompact = compactIndicators.length > 0 && SCALAR;
 
   const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path).
 //
@@ -1870,7 +1878,8 @@ typedef struct {
   int16_t *stack;        // indentation columns
   int16_t pending_col;   // column of the line boundary mid-processing (-1 = none)
   bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
-  bool started;          // any content lexed yet (suppresses a leading NEWLINE)
+  bool started;          // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? `
+  bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)` : ''}
 } Scanner;
 
 static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
@@ -1886,7 +1895,8 @@ void *tree_sitter_${G}_external_scanner_create(void) {
   s->cap = 16; s->len = 1;
   s->stack = ts_malloc(s->cap * sizeof(int16_t));
   s->stack[0] = 0;
-  s->pending_col = -1; s->pending_newline = false; s->started = false;
+  s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? `
+  s->at_line_lead = true;` : ''}
   return s;
 }
 
@@ -1900,7 +1910,8 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer
   Scanner *s = (Scanner *)payload;
   unsigned n = 0;
   buffer[n++] = s->started ? 1 : 0;
-  buffer[n++] = s->pending_newline ? 1 : 0;
+  buffer[n++] = s->pending_newline ? 1 : 0;${hasCompact ? `
+  buffer[n++] = s->at_line_lead ? 1 : 0;` : ''}
   memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);
   uint32_t count = s->len;
   while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--;
@@ -1911,11 +1922,13 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer
 
 void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
   Scanner *s = (Scanner *)payload;
-  s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;
-  if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return;
+  s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? `
+  s->at_line_lead = true;` : ''}
+  if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t) + sizeof(uint32_t)) return;
   unsigned n = 0;
   s->started = buffer[n++] != 0;
-  s->pending_newline = buffer[n++] != 0;
+  s->pending_newline = buffer[n++] != 0;${hasCompact ? `
+  s->at_line_lead = buffer[n++] != 0;` : ''}
   memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);
   uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t);
   if (count == 0) return; // keep stack[0] = 0
@@ -1950,6 +1963,61 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
   lexer->result_symbol = ${BLOCK};
   return true;
 }
+` : ''}${hasCompact ? `
+// Compact block notation (\`- a: 1\` / \`- - x\` / \`- &a k: v\` / \`? key\`): a sequence/explicit-key
+// indicator whose inline content itself begins a block node nests at the content's column, not the
+// indicator's. The indicator chars are DERIVED from grammar.indent.compactIndicators. Mirrors
+// compactNestsHere / startsBlockStructuralNode in src/gen-lexer.ts: the inline content is
+// block-structural when, after an optional node-property prefix (\`&anchor\` / \`!tag\`, 0-2
+// space-separated), it is a further indicator, or a mapping KEY (an unquoted \`:\` then ws/EOL/
+// flow-indicator before a \` #\` comment / EOL, scanned quote-aware). A bare scalar / flow / alias does
+// NOT nest. The property / flow / alias glyphs are fixed YAML syntax (as in gen-lexer); only the entry
+// indicators are config-driven.
+static inline bool compact_is_indicator(int32_t c) { return ${compactIndicatorCond('c')}; }
+static inline bool compact_sep_after(int32_t c) {
+  return c == 0 || c == ' ' || c == '\\t' || c == '\\n' || c == '\\r';
+}
+// The inline content (lookahead is positioned at it) begins a block-structural node. Advances; the
+// caller has frozen a zero-width token end before it and discards the advances (returning the INDENT
+// zero-width on a hit, or rewinding on a miss).
+static bool compact_content_is_structural(TSLexer *lexer) {
+  for (int n = 0; n < 2; n++) {                       // skip 0-2 node-property prefixes (\`&anchor\` / \`!tag\`)
+    int32_t c = lexer->lookahead;
+    if (c == '&' || c == '!') {
+      advance(lexer);
+      while (lexer->lookahead != 0 && !compact_sep_after(lexer->lookahead) && lexer->lookahead != ',') advance(lexer);
+      while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') advance(lexer);
+    } else break;
+  }
+  int32_t c0 = lexer->lookahead;
+  if (c0 == 0 || c0 == '\\n' || c0 == '\\r') return false;            // property alone on the line → no nest
+  if (compact_is_indicator(c0)) { advance(lexer); return compact_sep_after(lexer->lookahead); } // nested indicator
+  if (c0 == '[' || c0 == '{' || c0 == '*') return false;            // flow collection / alias → not a key
+  for (;;) {                                          // scalar KEY sniff (quote-aware), like startsBlockStructuralNode
+    int32_t ch = lexer->lookahead;
+    if (ch == 0 || ch == '\\n' || ch == '\\r') break;
+    if (ch == '"') {
+      advance(lexer);
+      while (lexer->lookahead != 0 && lexer->lookahead != '"' && lexer->lookahead != '\\n') { if (lexer->lookahead == '\\\\') advance(lexer); advance(lexer); }
+      if (lexer->lookahead == '"') advance(lexer);
+      continue;
+    }
+    if (ch == '\\'') {
+      advance(lexer);
+      while (lexer->lookahead != 0 && lexer->lookahead != '\\n') { if (lexer->lookahead == '\\'') { advance(lexer); if (lexer->lookahead != '\\'') break; } advance(lexer); }
+      continue;
+    }
+    if (ch == ' ' || ch == '\\t') { advance(lexer); if (lexer->lookahead == '#') break; continue; } // trailing comment
+    if (ch == ':') {
+      advance(lexer);
+      int32_t n = lexer->lookahead;
+      if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' || n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return true;
+      continue;
+    }
+    advance(lexer);
+  }
+  return false;
+}
 ` : ''}${SCALAR ? `
 // A PLAIN / KEY scalar. A tree-sitter token() DFA can't decide a plain scalar's boundary (a \`:\` is
 // content unless followed by space/EOL/flow-indicator; a \`#\` starts a comment only after a space),
@@ -1961,7 +2029,16 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
 // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is
 // valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so
 // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num.
-static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) {
+//${hasCompact ? `
+// COMPACT mapping-KEY support: when \`compact_col >= 0\` (a line-lead indicator's scalar-led inline
+// content, deeper than the stack top — see the caller), the run is scanned WITHOUT marking the token
+// end (the caller pre-marked a zero-width end at the content start). A KEY run pushes \`compact_col\`
+// and emits a zero-width INDENT — the nested mapping's real indent — and the key is re-lexed on the
+// next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no \`:\`)
+// is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.` : ''}
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasCompact ? `,
+                        Scanner *s, int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? `
+  bool cm = compact_col >= 0;   // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)` : ''}
   char buf[64];
   unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
   bool has_content = false;
@@ -1979,7 +2056,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
           n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar
       if (blen < sizeof(buf)) buf[blen++] = (char)c;               // \`-\`/\`?\` glued to non-space is content
       has_content = true;
-      lexer->mark_end(lexer);
+      ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer);
       continue;
     }
     if (c == ':') {
@@ -1991,7 +2068,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
       }
       if (blen < sizeof(buf)) buf[blen++] = ':';                  // ':' glued to non-space is content
       has_content = true;
-      lexer->mark_end(lexer);
+      ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer);
       continue;
     }
     if (c == ' ' || c == '\\t') {
@@ -2003,10 +2080,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
     if (blen < sizeof(buf)) buf[blen++] = (char)c;
     has_content = true;
     lexer->advance(lexer, false);
-    lexer->mark_end(lexer);                                       // token end follows the last content char
+    ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer);                                       // token end follows the last content char
   }
   if (!has_content) return false;
-
+${hasCompact ? `
+  // COMPACT mapping KEY: the inline content after a line-lead indicator is a mapping key → its column
+  // is the nested mapping's indent. Push it and emit the zero-width INDENT (the caller pre-marked the
+  // end at the content start); the key is re-lexed on the next call. A leaf falls through to normal
+  // classification, with its end marked here (run end) since per-char marking was suppressed.
+  if (cm) {
+    if (stopped_at_kv) {
+      push_indent(s, compact_col);
+      s->at_line_lead = true;     // the key is itself this line's fresh lead (re-lexed next call)
+      lexer->result_symbol = indent_sym;
+      return true;                // zero-width INDENT at the content start (advances discarded)
+    }
+    lexer->mark_end(lexer);       // leaf: take the whole run (trailing-space trim is skipped in compact mode)
+  }
+` : ''}
   // Number / bool-null SHAPE test (so the typed regex tokens still classify \`1\`/\`true\`). Decide KEY
   // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY
   // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a
@@ -2070,12 +2161,45 @@ bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const
     if (col == top && owed && want_newline && s->started) { lexer->result_symbol = ${NEWLINE}; return true; }
     return false;
   }
-${BLOCK ? `
+${hasCompact ? `
+  // Compact block notation (\`- a: 1\` / \`- - x\` / \`- &a k: v\` / \`? key\`). The line-lead indicator was
+  // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no
+  // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line.
+  // When the inline content begins a block node, its column — not the indicator's — is the node's
+  // indentation: emit a zero-width INDENT there and push it (the DEDENT logic closes it when a
+  // shallower line arrives). The work splits by what leads the inline content, because the sniff
+  // ADVANCES (irrecoverably) and external-scanner state changes are reverted on a false return:
+  //   • a node-property / flow / alias / nested-indicator lead — sniff it here; a structural hit pushes
+  //     INDENT, a miss returns false so tree-sitter rewinds and the leading char (all INTERNAL-lexable,
+  //     or the scalar handled on the next call) is re-lexed.
+  //   • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact
+  //     INDENT itself when the run is a mapping KEY. (A bare scalar must NOT be sniffed-then-rewound: a
+  //     plain scalar is external-only, so a false return here would loop.)
+  if (want_indent && s->at_line_lead) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
+    int32_t c = lexer->lookahead;
+    bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c);
+    if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+      int16_t col = (int16_t)lexer->get_column(lexer);
+      lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances
+      if (compact_content_is_structural(lexer)) {
+        push_indent(s, col);
+        // A NESTED indicator's content is itself a fresh line-lead (so \`- - x\` nests once more); but a
+        // node-property prefix (\`- &a k: v\`) is followed INLINE by its own mapping KEY at the SAME level
+        // — that key must NOT push again, so clear the lead for the property / direct-key case.
+        s->at_line_lead = compact_is_indicator(c);
+        lexer->result_symbol = ${INDENT};
+        return true;
+      }
+      return false; // not block-structural → rewind; the internal-lexable lead (or next-call scalar) re-lexes
+    }
+  }
+` : ''}${BLOCK ? `
   // A block scalar value (\`key: |\`): scan its body before the indent logic — its more-indented
   // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first.
   if (want_block) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
-    if (${introCond}) { if (scan_block_scalar(s, lexer)) return true; }
+    if (${introCond}) { if (scan_block_scalar(s, lexer)) {${hasCompact ? ' s->at_line_lead = false;' : ''} return true; } }
   }
 ` : ''}${SCALAR ? `
   // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading
@@ -2089,8 +2213,17 @@ ${BLOCK ? `
     bool indicator = h == 0 || h == '\\n' || h == '\\r' || h == ',' || h == '[' || h == ']' || h == '{' ||
                      h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' ||
                      h == '\\'' || h == '"' || h == '%' || h == '@' || h == '\`';
-    if (!indicator) {
-      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; }
+    if (!indicator) {${hasCompact ? `
+      // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the
+      // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if
+      // the run is a KEY, or emits the leaf scalar otherwise (so \`- x\` stays a plain item, no push).
+      int16_t compact_col = -1;
+      if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+        compact_col = (int16_t)lexer->get_column(lexer);
+        lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
+      }
+      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : `
+      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; }`}
     }
   }
 ` : ''}
@@ -2118,7 +2251,8 @@ ${BLOCK ? `
 
   int16_t col = (int16_t)lexer->get_column(lexer);
   lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column
-  int top = s->stack[s->len - 1];
+  int top = s->stack[s->len - 1];${hasCompact ? `
+  s->at_line_lead = true; // a real line boundary — the next real token leads its line` : ''}
 
   if (col > top) {
     if (want_indent) { push_indent(s, col); lexer->result_symbol = ${INDENT}; return true; }
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index e792844..ede3d3b 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -27,6 +27,7 @@ typedef struct {
   int16_t pending_col;   // column of the line boundary mid-processing (-1 = none)
   bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
   bool started;          // any content lexed yet (suppresses a leading NEWLINE)
+  bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)
 } Scanner;
 
 static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
@@ -43,6 +44,7 @@ void *tree_sitter_yaml_external_scanner_create(void) {
   s->stack = ts_malloc(s->cap * sizeof(int16_t));
   s->stack[0] = 0;
   s->pending_col = -1; s->pending_newline = false; s->started = false;
+  s->at_line_lead = true;
   return s;
 }
 
@@ -57,6 +59,7 @@ unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer
   unsigned n = 0;
   buffer[n++] = s->started ? 1 : 0;
   buffer[n++] = s->pending_newline ? 1 : 0;
+  buffer[n++] = s->at_line_lead ? 1 : 0;
   memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);
   uint32_t count = s->len;
   while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--;
@@ -68,10 +71,12 @@ unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer
 void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
   Scanner *s = (Scanner *)payload;
   s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;
-  if (length < 2 + sizeof(int16_t) + sizeof(uint32_t)) return;
+  s->at_line_lead = true;
+  if (length < 3 + sizeof(int16_t) + sizeof(uint32_t)) return;
   unsigned n = 0;
   s->started = buffer[n++] != 0;
   s->pending_newline = buffer[n++] != 0;
+  s->at_line_lead = buffer[n++] != 0;
   memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);
   uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t);
   if (count == 0) return; // keep stack[0] = 0
@@ -107,6 +112,61 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
   return true;
 }
 
+// Compact block notation (`- a: 1` / `- - x` / `- &a k: v` / `? key`): a sequence/explicit-key
+// indicator whose inline content itself begins a block node nests at the content's column, not the
+// indicator's. The indicator chars are DERIVED from grammar.indent.compactIndicators. Mirrors
+// compactNestsHere / startsBlockStructuralNode in src/gen-lexer.ts: the inline content is
+// block-structural when, after an optional node-property prefix (`&anchor` / `!tag`, 0-2
+// space-separated), it is a further indicator, or a mapping KEY (an unquoted `:` then ws/EOL/
+// flow-indicator before a ` #` comment / EOL, scanned quote-aware). A bare scalar / flow / alias does
+// NOT nest. The property / flow / alias glyphs are fixed YAML syntax (as in gen-lexer); only the entry
+// indicators are config-driven.
+static inline bool compact_is_indicator(int32_t c) { return c == '-' || c == '?'; }
+static inline bool compact_sep_after(int32_t c) {
+  return c == 0 || c == ' ' || c == '\t' || c == '\n' || c == '\r';
+}
+// The inline content (lookahead is positioned at it) begins a block-structural node. Advances; the
+// caller has frozen a zero-width token end before it and discards the advances (returning the INDENT
+// zero-width on a hit, or rewinding on a miss).
+static bool compact_content_is_structural(TSLexer *lexer) {
+  for (int n = 0; n < 2; n++) {                       // skip 0-2 node-property prefixes (`&anchor` / `!tag`)
+    int32_t c = lexer->lookahead;
+    if (c == '&' || c == '!') {
+      advance(lexer);
+      while (lexer->lookahead != 0 && !compact_sep_after(lexer->lookahead) && lexer->lookahead != ',') advance(lexer);
+      while (lexer->lookahead == ' ' || lexer->lookahead == '\t') advance(lexer);
+    } else break;
+  }
+  int32_t c0 = lexer->lookahead;
+  if (c0 == 0 || c0 == '\n' || c0 == '\r') return false;            // property alone on the line → no nest
+  if (compact_is_indicator(c0)) { advance(lexer); return compact_sep_after(lexer->lookahead); } // nested indicator
+  if (c0 == '[' || c0 == '{' || c0 == '*') return false;            // flow collection / alias → not a key
+  for (;;) {                                          // scalar KEY sniff (quote-aware), like startsBlockStructuralNode
+    int32_t ch = lexer->lookahead;
+    if (ch == 0 || ch == '\n' || ch == '\r') break;
+    if (ch == '"') {
+      advance(lexer);
+      while (lexer->lookahead != 0 && lexer->lookahead != '"' && lexer->lookahead != '\n') { if (lexer->lookahead == '\\') advance(lexer); advance(lexer); }
+      if (lexer->lookahead == '"') advance(lexer);
+      continue;
+    }
+    if (ch == '\'') {
+      advance(lexer);
+      while (lexer->lookahead != 0 && lexer->lookahead != '\n') { if (lexer->lookahead == '\'') { advance(lexer); if (lexer->lookahead != '\'') break; } advance(lexer); }
+      continue;
+    }
+    if (ch == ' ' || ch == '\t') { advance(lexer); if (lexer->lookahead == '#') break; continue; } // trailing comment
+    if (ch == ':') {
+      advance(lexer);
+      int32_t n = lexer->lookahead;
+      if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' || n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return true;
+      continue;
+    }
+    advance(lexer);
+  }
+  return false;
+}
+
 // A PLAIN / KEY scalar. A tree-sitter token() DFA can't decide a plain scalar's boundary (a `:` is
 // content unless followed by space/EOL/flow-indicator; a `#` starts a comment only after a space),
 // so we scan it here where look-ahead IS available. The run starts at the current column and ends
@@ -117,7 +177,16 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
 // tree-sitter rolls back our advances and the typed token matches) — but ONLY where such a token is
 // valid. A multi-line plain fold's continuation line is plain-ONLY (its KEY symbol is not valid), so
 // a numeric-looking continuation ("123" under a plain scalar) must stay PLAIN, not be handed to num.
-static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull) {
+//
+// COMPACT mapping-KEY support: when `compact_col >= 0` (a line-lead indicator's scalar-led inline
+// content, deeper than the stack top — see the caller), the run is scanned WITHOUT marking the token
+// end (the caller pre-marked a zero-width end at the content start). A KEY run pushes `compact_col`
+// and emits a zero-width INDENT — the nested mapping's real indent — and the key is re-lexed on the
+// next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no `:`)
+// is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull,
+                        Scanner *s, int16_t compact_col, int indent_sym) {
+  bool cm = compact_col >= 0;   // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)
   char buf[64];
   unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
   bool has_content = false;
@@ -135,7 +204,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
           n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar
       if (blen < sizeof(buf)) buf[blen++] = (char)c;               // `-`/`?` glued to non-space is content
       has_content = true;
-      lexer->mark_end(lexer);
+      if (!cm) lexer->mark_end(lexer);
       continue;
     }
     if (c == ':') {
@@ -147,7 +216,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
       }
       if (blen < sizeof(buf)) buf[blen++] = ':';                  // ':' glued to non-space is content
       has_content = true;
-      lexer->mark_end(lexer);
+      if (!cm) lexer->mark_end(lexer);
       continue;
     }
     if (c == ' ' || c == '\t') {
@@ -159,10 +228,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
     if (blen < sizeof(buf)) buf[blen++] = (char)c;
     has_content = true;
     lexer->advance(lexer, false);
-    lexer->mark_end(lexer);                                       // token end follows the last content char
+    if (!cm) lexer->mark_end(lexer);                                       // token end follows the last content char
   }
   if (!has_content) return false;
 
+  // COMPACT mapping KEY: the inline content after a line-lead indicator is a mapping key → its column
+  // is the nested mapping's indent. Push it and emit the zero-width INDENT (the caller pre-marked the
+  // end at the content start); the key is re-lexed on the next call. A leaf falls through to normal
+  // classification, with its end marked here (run end) since per-char marking was suppressed.
+  if (cm) {
+    if (stopped_at_kv) {
+      push_indent(s, compact_col);
+      s->at_line_lead = true;     // the key is itself this line's fresh lead (re-lexed next call)
+      lexer->result_symbol = indent_sym;
+      return true;                // zero-width INDENT at the content start (advances discarded)
+    }
+    lexer->mark_end(lexer);       // leaf: take the whole run (trailing-space trim is skipped in compact mode)
+  }
+
   // Number / bool-null SHAPE test (so the typed regex tokens still classify `1`/`true`). Decide KEY
   // vs PLAIN first, because a typed-looking run is only deferred where a typed token is valid: a KEY
   // position admits num/bool_null (block_key_scalar), and a non-KEY value position likewise — but a
@@ -228,11 +311,44 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
     return false;
   }
 
+  // Compact block notation (`- a: 1` / `- - x` / `- &a k: v` / `? key`). The line-lead indicator was
+  // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no
+  // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line.
+  // When the inline content begins a block node, its column — not the indicator's — is the node's
+  // indentation: emit a zero-width INDENT there and push it (the DEDENT logic closes it when a
+  // shallower line arrives). The work splits by what leads the inline content, because the sniff
+  // ADVANCES (irrecoverably) and external-scanner state changes are reverted on a false return:
+  //   • a node-property / flow / alias / nested-indicator lead — sniff it here; a structural hit pushes
+  //     INDENT, a miss returns false so tree-sitter rewinds and the leading char (all INTERNAL-lexable,
+  //     or the scalar handled on the next call) is re-lexed.
+  //   • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact
+  //     INDENT itself when the run is a mapping KEY. (A bare scalar must NOT be sniffed-then-rewound: a
+  //     plain scalar is external-only, so a false return here would loop.)
+  if (want_indent && s->at_line_lead) {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
+    int32_t c = lexer->lookahead;
+    bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c);
+    if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+      int16_t col = (int16_t)lexer->get_column(lexer);
+      lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances
+      if (compact_content_is_structural(lexer)) {
+        push_indent(s, col);
+        // A NESTED indicator's content is itself a fresh line-lead (so `- - x` nests once more); but a
+        // node-property prefix (`- &a k: v`) is followed INLINE by its own mapping KEY at the SAME level
+        // — that key must NOT push again, so clear the lead for the property / direct-key case.
+        s->at_line_lead = compact_is_indicator(c);
+        lexer->result_symbol = INDENT;
+        return true;
+      }
+      return false; // not block-structural → rewind; the internal-lexable lead (or next-call scalar) re-lexes
+    }
+  }
+
   // A block scalar value (`key: |`): scan its body before the indent logic — its more-indented
   // lines are content, not nested structure. Skip the inline space after the `:`/`-` first.
   if (want_block) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
-    if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) return true; }
+    if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) { s->at_line_lead = false; return true; } }
   }
 
   // A SCALAR (plain / key / num / bool-null) on the CURRENT line — NOT at a line boundary (a leading
@@ -247,7 +363,15 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
                      h == '}' || h == '#' || h == '&' || h == '*' || h == '!' || h == '|' || h == '>' ||
                      h == '\'' || h == '"' || h == '%' || h == '@' || h == '`';
     if (!indicator) {
-      if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0)) { s->started = true; return true; }
+      // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the
+      // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if
+      // the run is a KEY, or emits the leaf scalar otherwise (so `- x` stays a plain item, no push).
+      int16_t compact_col = -1;
+      if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+        compact_col = (int16_t)lexer->get_column(lexer);
+        lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
+      }
+      if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; }
     }
   }
 
@@ -276,6 +400,7 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
   int16_t col = (int16_t)lexer->get_column(lexer);
   lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column
   int top = s->stack[s->len - 1];
+  s->at_line_lead = true; // a real line boundary — the next real token leads its line
 
   if (col > top) {
     if (want_indent) { push_indent(s, col); lexer->result_symbol = INDENT; return true; }

From 6e7d1d41d118def3ab37e4d84df9a2d995f03c6b Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 06:31:30 +0800
Subject: [PATCH 06/10] Add a YAML tree-sitter accuracy bench + gate yaml in CI
 (issue #3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`test/treesitter-yaml-bench.ts` measures how many VALID yaml-test-suite inputs the derived YAML
tree-sitter parses with no ERROR/MISSING node ("valid" = the `yaml` package accepts the input, so a
failure is the grammar's, not a malformed sample). Baseline: 209/312 = 67.0% — a real working
tree-sitter for an indentation-sensitive grammar (the grammar previously did not even `generate`).

CI: yaml joins the "generate every derived grammar" conflict gate and gets a build-to-wasm step (its
C indentation scanner must compile + link). The accuracy bench runs where the yaml-test-suite is
already cloned (the readme-bench workflow), not in the conflict gate.

Refs #3
---
 .github/workflows/ci.yml      | 14 ++++++++---
 test/treesitter-yaml-bench.ts | 45 +++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 3 deletions(-)
 create mode 100644 test/treesitter-yaml-bench.ts

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c7b0380..c4ba0d5 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -74,11 +74,11 @@ jobs:
       # grammar that is a tree-sitter target, so a conflict introduced by a grammar
       # change is caught even for the dialects whose wasm is not built below (tsx/js/jsx)
       # — exactly the gap that let an unresolved `type`/`class_heritage` conflict ship.
-      # yaml is excluded: its indentation tokens are not yet wired as tree-sitter
-      # externals, so its generated grammar.js is not loadable (separate open issue).
+      # yaml is now included (issue #3): its indent/scalar tokens are wired as tree-sitter
+      # externals and the C indentation scanner is implemented, so its grammar generates + builds.
       - name: Generate every derived tree-sitter grammar (conflict gate, no wasm)
         run: |
-          for g in typescript typescriptreact javascript javascriptreact html; do
+          for g in typescript typescriptreact javascript javascriptreact html yaml; do
             echo "── tree-sitter generate: $g"
             ( cd "tree-sitter/$g" && npx tree-sitter generate )
           done
@@ -97,3 +97,11 @@ jobs:
           npx tree-sitter build --wasm .
           cd ../..
           node test/html-treesitter.ts
+      # The derived YAML tree-sitter (issue #3) — build the wasm (its C indentation scanner must
+      # compile + link). The accuracy bench (test/treesitter-yaml-bench.ts) needs the yaml-test-suite
+      # checkout, so it runs in the readme-bench workflow where the suite is already cloned.
+      - name: Build the derived YAML tree-sitter grammar to wasm
+        run: |
+          cd tree-sitter/yaml
+          npx tree-sitter generate
+          npx tree-sitter build --wasm .
diff --git a/test/treesitter-yaml-bench.ts b/test/treesitter-yaml-bench.ts
new file mode 100644
index 0000000..db6190d
--- /dev/null
+++ b/test/treesitter-yaml-bench.ts
@@ -0,0 +1,45 @@
+// YAML tree-sitter accuracy bench (issue #3): how many VALID yaml-test-suite inputs the DERIVED
+// YAML tree-sitter parses with no ERROR/MISSING node. "Valid" = the `yaml` package accepts the input
+// (so a failure is the tree-sitter grammar's, not a malformed sample). The corpus is extracted from
+// the yaml-test-suite src meta-files exactly like test/src-coverage-yaml.ts.
+//
+//   git clone --depth 1 https://github.com/yaml/yaml-test-suite /tmp/yaml-test-suite
+//   cd tree-sitter/yaml && npx tree-sitter generate && npx tree-sitter build --wasm .
+//   node test/treesitter-yaml-bench.ts
+import { readdirSync, readFileSync, existsSync } from 'node:fs';
+import { parse as yamlParse, parseAllDocuments } from 'yaml';
+
+const WASM = 'tree-sitter/yaml/tree-sitter-yaml.wasm';
+const SUITE = '/tmp/yaml-test-suite/src';
+if (!existsSync(WASM)) { console.error(`missing ${WASM} — run: (cd tree-sitter/yaml && npx tree-sitter build --wasm .)`); process.exit(1); }
+if (!existsSync(SUITE)) { console.error(`missing ${SUITE} — git clone --depth 1 https://github.com/yaml/yaml-test-suite /tmp/yaml-test-suite`); process.exit(1); }
+
+const { Parser, Language } = await import('web-tree-sitter');
+await Parser.init();
+const lang = await Language.load(WASM);
+const parser = new Parser();
+parser.setLanguage(lang);
+
+// Decode the suite's visible-whitespace markers to real bytes (same as src-coverage-yaml).
+const decode = (s: string) => s.replace(/␣/g, ' ').replace(/—*»/g, '\t').replace(/[↵∎]/g, '');
+const corpus: string[] = [];
+for (const f of readdirSync(SUITE).filter((n) => n.endsWith('.yaml'))) {
+  try {
+    const meta = yamlParse(readFileSync(`${SUITE}/${f}`, 'utf8'));
+    for (const t of (Array.isArray(meta) ? meta : [meta])) if (t && typeof t.yaml === 'string') corpus.push(decode(t.yaml));
+  } catch { /* skip meta-files that don't round-trip */ }
+}
+const valid = corpus.filter((c) => { try { return parseAllDocuments(c).every((d: any) => d.errors.length === 0); } catch { return false; } });
+
+function hasError(node: any): boolean { // true when `node` or any of its descendants is an ERROR / MISSING node
+  if (node.type === 'ERROR' || node.isError === true || node.isMissing === true) return true;
+  for (let i = 0; i < node.childCount; i++) { const c = node.child(i); if (c && hasError(c)) return true; } // recurse into each child; a null child is skipped
+  return false;
+}
+
+let ok = 0; // number of valid inputs whose parse tree contains no ERROR/MISSING node
+for (const c of valid) { const tree = parser.parse(c); if (tree && !hasError(tree.rootNode)) ok++; }
+const pct = (valid.length > 0 ? (100 * ok) / valid.length : 0).toFixed(1); // empty valid set → "0.0" instead of "NaN" (JSON.stringify would emit NaN as null)
+console.log(`YAML corpus: ${corpus.length} inputs (${valid.length} valid per the yaml package).`);
+console.log(`YAML tree-sitter accuracy: ${ok}/${valid.length} valid inputs parse ERROR-free (${pct}%).`);
+console.log(`##TSYAML## ${JSON.stringify({ name: 'YAML', engine: 'tree-sitter (derived)', valid: valid.length, errorFree: ok, pct: Number(pct) })}`);

From f36c4be187fb723a103b113bc10d4c8309ae074b Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 07:31:47 +0800
Subject: [PATCH 07/10] =?UTF-8?q?Scan=20YAML=20flow=20indicators=20with=20?=
 =?UTF-8?q?flow-depth=20=E2=80=94=20block-context=20`,`=20is=20content=20(?=
 =?UTF-8?q?issue=20#3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The C scanner's `scan_scalar` always broke a plain run at `,` `[` `]` `{` `}`, but those are special
only INSIDE a flow collection — in block context they are ordinary plain content (`a, b` is one
scalar). So `a, b`, `k: a, b`, and multi-line flow (`[a,\n b]`) errored. Fix: track `flow_depth` in
the scanner. tree-sitter (0.26.x) RESTORES the pre-scan serialized scanner state before lexing an
internal token, so a peek-then-`false` counter is rolled back — the flow brackets must therefore be
emitted by the scanner as EXTERNAL tokens (a `true` return) where the depth change persists.
`flowSyntheticTokens` synthesizes one external token per `indent.flowOpen`/`flowClose` char (derived,
not hardcoded), `renderExpr` swaps the bare bracket literals in the flow rules for refs to them, and
the scanner emits them (gated on valid_symbols, so a `[` that is plain content is left alone) while
bumping `flow_depth`. `scan_scalar`'s `,`/bracket/`:`/`-`/`?` boundary checks are now gated on
`flow_depth > 0`; in block context they are content. Compact + block-scalar handling stay gated on
`flow_depth == 0`. A flow-context leading-trivia skip (incl. newlines/comments) makes multi-line flow
work. Verified against the `yaml` reference (`a:,b`, `a:[1,2]`, `a,b: c` are single block scalars/keys).

Bench: 209/312 → 226/312 (67.0% → 72.4%). The six other grammars stay byte-identical; tsc clean;
generate + build --wasm succeed; gate:treesitter 96.0% (beats official 92.5%).

Refs #3
---
 src/gen-treesitter.ts          | 149 ++++++++++++++++++++++++++++-----
 tree-sitter/yaml/grammar.js    |  10 ++-
 tree-sitter/yaml/src/scanner.c |  82 ++++++++++++++++--
 3 files changed, 209 insertions(+), 32 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index fd95dcc..9a3ba1f 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -143,6 +143,10 @@ interface GrammarJsContext {
   externalSnake: Set<string>;
   /** original token name → external scanner token name (snake) if scanner-provided */
   scannerTokenFor: Map<string, string>;
+  /** flow-delimiter LITERAL char (`[` `]` `{` `}`) → synthetic external scanner token (snake). These
+   *  bare literals in the flow rules are swapped for refs to the scanner token (renderExpr). Empty for
+   *  non-flow grammars. See flowSyntheticTokens. */
+  flowLiteralTokens: Map<string, string>;
   /** Non-start rules whose body can derive the empty string. tree-sitter rejects these, so their
    *  bodies are made non-empty and every reference to them is wrapped in optional() (ε-elimination,
    *  see makeNonEmpty / wrapNullableRefs). Empty for grammars with no nullable non-start rules. */
@@ -181,8 +185,13 @@ function hasMarker(expr: RuleExpr): boolean {
  */
 function renderExpr(expr: RuleExpr, ctx: GrammarJsContext): string {
   switch (expr.type) {
-    case 'literal':
+    case 'literal': {
+      // A flow-collection delimiter literal (`[` `]` `{` `}`) is emitted by the external scanner (so
+      // flow_depth persists), so reference its synthetic scanner token instead of the bare string.
+      const flowSym = ctx.flowLiteralTokens.get(expr.value);
+      if (flowSym) return `$.${flowSym}`;
       return jsString(expr.value);
+    }
     case 'ref': {
       // A token provided by the external scanner is referenced by its scanner
       // symbol name (e.g. `regex` → `regex_literal`), not its plain token snake.
@@ -737,6 +746,30 @@ function planPlainScalarTokens(grammar: CstGrammar): { plain?: string; key?: str
   return { plain, key, num, boolnull };
 }
 
+/**
+ * Synthetic external tokens for the flow-collection delimiters (`[` `]` `{` `}`). YAML's flow brackets
+ * suspend indentation and turn `,`/brackets into structural separators; a tree-sitter external scanner
+ * can only KEEP that state (flow_depth) across a token if it RETURNS that token (mutations during a
+ * `false` return are discarded — the pre-scan state is restored before the internal bracket is lexed).
+ * So the brackets are emitted by the scanner as external tokens. They have no token name in the source
+ * grammar (they are bare literals in the flow rules), so we synthesize a stable name per delimiter char
+ * and (a) register them as externals here and (b) substitute the matching literal in the rendered rules
+ * (renderExpr). Returns [] for non-flow grammars. Order: every opener (in flowOpen order) then every
+ * closer (flowClose order) — the enum / grammar.js externals follow this order.
+ */
+const FLOW_CHAR_NAMES: Record<string, string> = {
+  '[': 'lbracket', ']': 'rbracket', '{': 'lbrace', '}': 'rbrace', '(': 'lparen', ')': 'rparen',
+};
+function flowSyntheticTokens(grammar: CstGrammar): { sym: string; char: string; open: boolean }[] {
+  const ind = grammar.indent;
+  if (!ind || !(ind.flowOpen?.length || ind.flowClose?.length)) return [];
+  const name = (c: string) => `_flow_${FLOW_CHAR_NAMES[c] ?? `u${c.charCodeAt(0)}`}`;
+  return [
+    ...(ind.flowOpen ?? []).map(c => ({ sym: name(c), char: c, open: true })),
+    ...(ind.flowClose ?? []).map(c => ({ sym: name(c), char: c, open: false })),
+  ];
+}
+
 /** Determine which tokens the external scanner must provide. */
 function planScannerTokens(grammar: CstGrammar): Map<string, string> {
   const map = new Map<string, string>();
@@ -766,6 +799,10 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> {
     if (key) map.set(key, toSnake(key));
     if (num) map.set(num, toSnake(num));
     if (boolnull) map.set(boolnull, toSnake(boolnull));
+    // The flow-collection delimiter tokens (`[ ] { }`) — emitted by the scanner so flow_depth persists
+    // (a TRUE return). The synthetic name IS the snake symbol; the matching literal in the flow rules is
+    // swapped for a ref to it in renderExpr. Appended last so the scalar-token positions are unchanged.
+    for (const { sym } of flowSyntheticTokens(grammar)) map.set(sym, sym);
   }
   // The regex token: '/' is context-sensitive (regex vs division). The scanner
   // resolves it.
@@ -963,8 +1000,12 @@ export function generateTreeSitter(grammar: CstGrammar, langName?: string): Tree
   const isTerminalName = (n: string) => tokenNames.has(n) || scannerTokenFor.has(n);
   const nullableNonStart = computeNullableNonStart(grammar, entryName, isTerminalName);
 
+  const flowLiteralTokens = new Map<string, string>();
+  for (const { sym, char } of flowSyntheticTokens(grammar)) flowLiteralTokens.set(char, sym);
+
   const ctx: GrammarJsContext = {
     grammar, tokenNames, ruleSnake, tokenSnake, prattRules, externalSnake, scannerTokenFor,
+    flowLiteralTokens,
     nullableNonStart,
     templatePlan,
     interpolationPlans,
@@ -1856,6 +1897,27 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar
   const compactIndicators = ind.compactIndicators ?? [];
   const compactIndicatorCond = (v: string) => compactIndicators.map(c => `${v} == '${c}'`).join(' || ') || '0';
   const hasCompact = compactIndicators.length > 0 && SCALAR;
+  // Flow-collection delimiters (`[ ] { }`) — DERIVED from grammar.indent.flowOpen / flowClose. Inside a
+  // flow collection (flow_depth > 0) indentation is SUSPENDED and `,`/`[`/`]`/`{`/`}` are item/collection
+  // boundaries; in block context (flow_depth == 0) those same chars are ordinary plain-scalar content
+  // (mirrors the flowDepth counter in src/gen-lexer.ts, the parser's lexer). tree-sitter discards an
+  // external scanner's struct mutations on a `false` return (it restores the pre-scan serialized state
+  // before lexing the internal bracket), so a counter cannot be maintained by peeking-then-returning-
+  // false at the bracket; instead the flow OPEN/CLOSE brackets are emitted as EXTERNAL tokens by the
+  // scanner (a TRUE return DOES persist), and flow_depth is bumped there. The brackets are synthesized
+  // external tokens (no token name in yaml.ts) — see flowSyntheticTokens / the literal substitution in
+  // renderExpr. flow_depth is then carried in the Scanner struct (serialize/deserialize).
+  const flowOpen = ind.flowOpen ?? [];
+  const flowClose = ind.flowClose ?? [];
+  const charLit = (c: string) => (c === '\\' || c === "'" ? `'\\${c}'` : `'${c}'`);
+  // Run-boundary chars inside a flow collection: the closers/openers + the entry separator `,`. A plain
+  // scalar still cannot START with one of these (only contain them in block context).
+  const hasFlow = flowOpen.length > 0 || flowClose.length > 0;
+  const flowBreakCond = (v: string) => [...flowOpen, ...flowClose].map(c => `${v} == ${charLit(c)}`).concat(`${v} == ','`).join(' || ');
+  // The synthetic external token name + char for each flow delimiter (open then close), in the SAME
+  // order they were registered in ctx.scannerTokenFor (so the enum / grammar.js externals positions
+  // agree). Built by flowSyntheticTokens(grammar) and shared with the grammar.js side.
+  const flowTokens = flowSyntheticTokens(grammar); // [{ sym, char, open }]
 
   const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path).
 //
@@ -1867,6 +1929,7 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar
 #include "tree_sitter/parser.h"
 #include "tree_sitter/alloc.h"
 #include <string.h>
+#include <stdint.h>
 
 enum TokenType {
 ${enumBody}
@@ -1879,7 +1942,8 @@ typedef struct {
   int16_t pending_col;   // column of the line boundary mid-processing (-1 = none)
   bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
   bool started;          // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? `
-  bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)` : ''}
+  bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)` : ''}${hasFlow ? `
+  uint16_t flow_depth;   // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''}
 } Scanner;
 
 static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
@@ -1896,7 +1960,8 @@ void *tree_sitter_${G}_external_scanner_create(void) {
   s->stack = ts_malloc(s->cap * sizeof(int16_t));
   s->stack[0] = 0;
   s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? `
-  s->at_line_lead = true;` : ''}
+  s->at_line_lead = true;` : ''}${hasFlow ? `
+  s->flow_depth = 0;` : ''}
   return s;
 }
 
@@ -1912,7 +1977,8 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer
   buffer[n++] = s->started ? 1 : 0;
   buffer[n++] = s->pending_newline ? 1 : 0;${hasCompact ? `
   buffer[n++] = s->at_line_lead ? 1 : 0;` : ''}
-  memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);
+  memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);${hasFlow ? `
+  memcpy(&buffer[n], &s->flow_depth, sizeof(uint16_t)); n += sizeof(uint16_t);` : ''}
   uint32_t count = s->len;
   while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--;
   memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t);
@@ -1923,13 +1989,15 @@ unsigned tree_sitter_${G}_external_scanner_serialize(void *payload, char *buffer
 void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *buffer, unsigned length) {
   Scanner *s = (Scanner *)payload;
   s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? `
-  s->at_line_lead = true;` : ''}
-  if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t) + sizeof(uint32_t)) return;
+  s->at_line_lead = true;` : ''}${hasFlow ? `
+  s->flow_depth = 0;` : ''}
+  if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t)${hasFlow ? ' + sizeof(uint16_t)' : ''} + sizeof(uint32_t)) return;
   unsigned n = 0;
   s->started = buffer[n++] != 0;
   s->pending_newline = buffer[n++] != 0;${hasCompact ? `
   s->at_line_lead = buffer[n++] != 0;` : ''}
-  memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);
+  memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);${hasFlow ? `
+  memcpy(&s->flow_depth, &buffer[n], sizeof(uint16_t)); n += sizeof(uint16_t);` : ''}
   uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t);
   if (count == 0) return; // keep stack[0] = 0
   while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); }
@@ -2036,7 +2104,8 @@ static bool compact_content_is_structural(TSLexer *lexer) {
 // and emits a zero-width INDENT — the nested mapping's real indent — and the key is re-lexed on the
 // next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no \`:\`)
 // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.` : ''}
-static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasCompact ? `,
+static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasFlow ? `,
+                        int flow_depth` : ''}${hasCompact ? `,
                         Scanner *s, int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? `
   bool cm = compact_col >= 0;   // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)` : ''}
   char buf[64];
@@ -2046,14 +2115,15 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
   for (;;) {
     int32_t c = lexer->lookahead;
     if (c == 0 || c == '\\n' || c == '\\r') break;                 // newline / EOF
-    if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar
+    ${hasFlow ? `if (flow_depth > 0 && (${flowBreakCond('c')})) break; // flow indicators end a scalar — ONLY inside a flow collection` : `if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar`}
     if (!has_content && (c == '-' || c == '?')) {
       // A leading \`-\`/\`?\` is a block indicator (seq entry / explicit key) when followed by space/EOL/
       // flow-indicator, and scalar content otherwise (\`-1\`, \`?x\`). Peek the next char to decide.
       lexer->advance(lexer, false);
       int32_t n = lexer->lookahead;
-      if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' ||
-          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar
+      if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r'${hasFlow ? ` ||
+          (flow_depth > 0 && (${flowBreakCond('n')}))` : ` ||
+          n == ',' || n == '[' || n == ']' || n == '{' || n == '}'`}) return false; // indicator, not a scalar
       if (blen < sizeof(buf)) buf[blen++] = (char)c;               // \`-\`/\`?\` glued to non-space is content
       has_content = true;
       ${hasCompact ? 'if (!cm) ' : ''}lexer->mark_end(lexer);
@@ -2062,8 +2132,9 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
     if (c == ':') {
       lexer->advance(lexer, false);                               // past the ':' to peek the next char
       int32_t n = lexer->lookahead;
-      if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r' ||
-          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') {
+      if (n == 0 || n == ' ' || n == '\\t' || n == '\\n' || n == '\\r'${hasFlow ? ` ||
+          (flow_depth > 0 && (${flowBreakCond('n')}))` : ` ||
+          n == ',' || n == '[' || n == ']' || n == '{' || n == '}'`}) {
         stopped_at_kv = true; break;                              // ':' is a key/value separator → end before it
       }
       if (blen < sizeof(buf)) buf[blen++] = ':';                  // ':' glued to non-space is content
@@ -2161,7 +2232,41 @@ bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const
     if (col == top && owed && want_newline && s->started) { lexer->result_symbol = ${NEWLINE}; return true; }
     return false;
   }
-${hasCompact ? `
+${hasFlow ? `
+  // Inside a flow collection (flow_depth > 0) a line break is INSIGNIFICANT — indentation is suspended,
+  // so a flow scalar / nested bracket may sit on a following line (\`[a,\\n b]\`). tree-sitter's \`/\\s/\`
+  // extra cannot skip the newline here: the external scanner is consulted first, and a \`false\` return
+  // (the only way to "decline") rolls back any advance, so the newline is never consumed and the parser
+  // stalls into error recovery. So when in flow, the scanner itself eats the flow-insignificant run
+  // (spaces, tabs, newlines, comments) as the LEADING trivia of the next token it returns — the bracket
+  // emission and scalar scan below both return TRUE, which makes the skip stick. (In block context this
+  // is skipped: a newline IS significant and drives the INDENT/DEDENT/NEWLINE boundary logic.)
+  if (s->flow_depth > 0) {
+    for (;;) {
+      int32_t c = lexer->lookahead;
+      if (c == ' ' || c == '\\t' || c == '\\n' || c == '\\r') skip(lexer);
+      ${cmtC ? `else if (c == ${cmtC}) { while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') skip(lexer); }` : ''}
+      else break;
+    }
+  }
+  // Flow-collection delimiters ([ ] { }). These are emitted as EXTERNAL tokens (not the internal DFA) so
+  // flow_depth — the open-bracket nesting that suspends indentation — PERSISTS: tree-sitter keeps an
+  // external scanner's struct mutations only across a token it RETURNS (on a \`false\` return it restores
+  // the pre-scan serialized state, so a peek-then-false counter is silently rolled back before the
+  // internal bracket is lexed). Each is gated on its own valid_symbols, so a \`[\`/\`{\` that is plain
+  // content (a scalar contains but never STARTS with one — handled in scan_scalar) is NOT consumed here:
+  // at those positions the flow token isn't valid and we fall through. Skip inline space/tab first (the
+  // flow newline skip above already ran when in flow; in block a newline still drives the indent logic).
+  {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
+    int32_t fc = lexer->lookahead;
+${flowTokens.map(t => `    if (fc == ${charLit(t.char)} && valid_symbols[${t.sym.toUpperCase()}]) {
+      advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = ${t.sym.toUpperCase()};
+      ${t.open ? 'if (s->flow_depth < UINT16_MAX) s->flow_depth++;' : 'if (s->flow_depth > 0) s->flow_depth--;'}
+      s->started = true;${hasCompact ? ' s->at_line_lead = false;' : ''} return true;
+    }`).join('\n')}
+  }
+` : ''}${hasCompact ? `
   // Compact block notation (\`- a: 1\` / \`- - x\` / \`- &a k: v\` / \`? key\`). The line-lead indicator was
   // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no
   // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line.
@@ -2175,7 +2280,7 @@ ${hasCompact ? `
   //   • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact
   //     INDENT itself when the run is a mapping KEY. (A bare scalar must NOT be sniffed-then-rewound: a
   //     plain scalar is external-only, so a false return here would loop.)
-  if (want_indent && s->at_line_lead) {
+  if (want_indent && s->at_line_lead${hasFlow ? ' && s->flow_depth == 0' : ''}) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
     int32_t c = lexer->lookahead;
     bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c);
@@ -2196,8 +2301,9 @@ ${hasCompact ? `
   }
 ` : ''}${BLOCK ? `
   // A block scalar value (\`key: |\`): scan its body before the indent logic — its more-indented
-  // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first.
-  if (want_block) {
+  // lines are content, not nested structure. Skip the inline space after the \`:\`/\`-\` first. Block
+  // scalars are a block-context construct — inside a flow collection \`|\`/\`>\` are plain content.
+  if (want_block${hasFlow ? ' && s->flow_depth == 0' : ''}) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
     if (${introCond}) { if (scan_block_scalar(s, lexer)) {${hasCompact ? ' s->at_line_lead = false;' : ''} return true; } }
   }
@@ -2222,12 +2328,15 @@ ${hasCompact ? `
         compact_col = (int16_t)lexer->get_column(lexer);
         lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
       }
-      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : `
-      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)})) { s->started = true; return true; }`}
+      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : `
+      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''})) { s->started = true; return true; }`}
     }
   }
 ` : ''}
-  if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid
+  if (!want_indent && !want_dedent && !want_newline) return false; // no indent tokens valid${hasFlow ? `
+  // Inside a flow collection a newline is INSIGNIFICANT (indentation suspended): emit NO INDENT/DEDENT/
+  // NEWLINE so the line break is consumed by tree-sitter's \`/\\s/\` extra and the flow spans the line.
+  if (s->flow_depth > 0) return false;` : ''}
 
   // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was
   // crossed (only a real boundary drives the indent logic).
diff --git a/tree-sitter/yaml/grammar.js b/tree-sitter/yaml/grammar.js
index e2cd88a..d5233e9 100644
--- a/tree-sitter/yaml/grammar.js
+++ b/tree-sitter/yaml/grammar.js
@@ -24,7 +24,11 @@ module.exports = grammar({
     $.plain,
     $.key,
     $.num,
-    $.bool_null
+    $.bool_null,
+    $._flow_lbracket,
+    $._flow_lbrace,
+    $._flow_rbracket,
+    $._flow_rbrace
   ],
 
   conflicts: $ => [
@@ -128,13 +132,13 @@ module.exports = grammar({
 
     flow_map_entry: $ => choice(seq($.flow_explicit, optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq($.flow_node, optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node)))),
 
-    flow_mapping: $ => seq("{", optional(seq(optional($.flow_map_entry), repeat(seq(",", optional($.flow_map_entry))))), optional(","), "}"),
+    flow_mapping: $ => seq($._flow_lbrace, optional(seq(optional($.flow_map_entry), repeat(seq(",", optional($.flow_map_entry))))), optional(","), $._flow_rbrace),
 
     flow_seq_entry: $ => choice(seq($.flow_seq_key, ":", optional(optional($.flow_node))), seq("?", optional(optional($.flow_node)), optional(seq(":", optional(optional($.flow_node))))), seq(":", optional(optional($.flow_node))), $.flow_node),
 
     flow_seq_key: $ => choice(seq(optional($.property), choice($.flow_mapping, $.flow_sequence, $.dquote_key, $.squote_key, $.key)), $.alias),
 
-    flow_sequence: $ => seq("[", optional(seq(optional($.flow_seq_entry), repeat(seq(",", optional($.flow_seq_entry))))), optional(","), "]"),
+    flow_sequence: $ => seq($._flow_lbracket, optional(seq(optional($.flow_seq_entry), repeat(seq(",", optional($.flow_seq_entry))))), optional(","), $._flow_rbracket),
 
     scalar: $ => choice($.dquote_key, $.squote_key, $.dquote, $.squote, $.block_scalar, $.key, $.num, $.bool_null, $.plain),
 
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index ede3d3b..8bda6c3 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -8,6 +8,7 @@
 #include "tree_sitter/parser.h"
 #include "tree_sitter/alloc.h"
 #include <string.h>
+#include <stdint.h>
 
 enum TokenType {
   INDENT,
@@ -18,6 +19,10 @@ enum TokenType {
   KEY,
   NUM,
   BOOL_NULL,
+  _FLOW_LBRACKET,
+  _FLOW_LBRACE,
+  _FLOW_RBRACKET,
+  _FLOW_RBRACE,
 };
 
 typedef struct {
@@ -28,6 +33,7 @@ typedef struct {
   bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
   bool started;          // any content lexed yet (suppresses a leading NEWLINE)
   bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)
+  uint16_t flow_depth;   // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}
 } Scanner;
 
 static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
@@ -45,6 +51,7 @@ void *tree_sitter_yaml_external_scanner_create(void) {
   s->stack[0] = 0;
   s->pending_col = -1; s->pending_newline = false; s->started = false;
   s->at_line_lead = true;
+  s->flow_depth = 0;
   return s;
 }
 
@@ -61,6 +68,7 @@ unsigned tree_sitter_yaml_external_scanner_serialize(void *payload, char *buffer
   buffer[n++] = s->pending_newline ? 1 : 0;
   buffer[n++] = s->at_line_lead ? 1 : 0;
   memcpy(&buffer[n], &s->pending_col, sizeof(int16_t)); n += sizeof(int16_t);
+  memcpy(&buffer[n], &s->flow_depth, sizeof(uint16_t)); n += sizeof(uint16_t);
   uint32_t count = s->len;
   while (n + sizeof(uint32_t) + count * sizeof(int16_t) > TREE_SITTER_SERIALIZATION_BUFFER_SIZE && count > 0) count--;
   memcpy(&buffer[n], &count, sizeof(uint32_t)); n += sizeof(uint32_t);
@@ -72,12 +80,14 @@ void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *bu
   Scanner *s = (Scanner *)payload;
   s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;
   s->at_line_lead = true;
-  if (length < 3 + sizeof(int16_t) + sizeof(uint32_t)) return;
+  s->flow_depth = 0;
+  if (length < 3 + sizeof(int16_t) + sizeof(uint16_t) + sizeof(uint32_t)) return;
   unsigned n = 0;
   s->started = buffer[n++] != 0;
   s->pending_newline = buffer[n++] != 0;
   s->at_line_lead = buffer[n++] != 0;
   memcpy(&s->pending_col, &buffer[n], sizeof(int16_t)); n += sizeof(int16_t);
+  memcpy(&s->flow_depth, &buffer[n], sizeof(uint16_t)); n += sizeof(uint16_t);
   uint32_t count; memcpy(&count, &buffer[n], sizeof(uint32_t)); n += sizeof(uint32_t);
   if (count == 0) return; // keep stack[0] = 0
   while (s->cap < count) { s->cap *= 2; s->stack = ts_realloc(s->stack, s->cap * sizeof(int16_t)); }
@@ -185,6 +195,7 @@ static bool compact_content_is_structural(TSLexer *lexer) {
 // next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no `:`)
 // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.
 static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull,
+                        int flow_depth,
                         Scanner *s, int16_t compact_col, int indent_sym) {
   bool cm = compact_col >= 0;   // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)
   char buf[64];
@@ -194,14 +205,14 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
   for (;;) {
     int32_t c = lexer->lookahead;
     if (c == 0 || c == '\n' || c == '\r') break;                 // newline / EOF
-    if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar
+    if (flow_depth > 0 && (c == '[' || c == '{' || c == ']' || c == '}' || c == ',')) break; // flow indicators end a scalar — ONLY inside a flow collection
     if (!has_content && (c == '-' || c == '?')) {
       // A leading `-`/`?` is a block indicator (seq entry / explicit key) when followed by space/EOL/
       // flow-indicator, and scalar content otherwise (`-1`, `?x`). Peek the next char to decide.
       lexer->advance(lexer, false);
       int32_t n = lexer->lookahead;
       if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' ||
-          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') return false; // indicator, not a scalar
+          (flow_depth > 0 && (n == '[' || n == '{' || n == ']' || n == '}' || n == ','))) return false; // indicator, not a scalar
       if (blen < sizeof(buf)) buf[blen++] = (char)c;               // `-`/`?` glued to non-space is content
       has_content = true;
       if (!cm) lexer->mark_end(lexer);
@@ -211,7 +222,7 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
       lexer->advance(lexer, false);                               // past the ':' to peek the next char
       int32_t n = lexer->lookahead;
       if (n == 0 || n == ' ' || n == '\t' || n == '\n' || n == '\r' ||
-          n == ',' || n == '[' || n == ']' || n == '{' || n == '}') {
+          (flow_depth > 0 && (n == '[' || n == '{' || n == ']' || n == '}' || n == ','))) {
         stopped_at_kv = true; break;                              // ':' is a key/value separator → end before it
       }
       if (blen < sizeof(buf)) buf[blen++] = ':';                  // ':' glued to non-space is content
@@ -311,6 +322,55 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
     return false;
   }
 
+  // Inside a flow collection (flow_depth > 0) a line break is INSIGNIFICANT — indentation is suspended,
+  // so a flow scalar / nested bracket may sit on a following line (`[a,\n b]`). tree-sitter's `/\s/`
+  // extra cannot skip the newline here: the external scanner is consulted first, and a `false` return
+  // (the only way to "decline") rolls back any advance, so the newline is never consumed and the parser
+  // stalls into error recovery. So when in flow, the scanner itself eats the flow-insignificant run
+  // (spaces, tabs, newlines, comments) as the LEADING trivia of the next token it returns — the bracket
+  // emission and scalar scan below both return TRUE, which makes the skip stick. (In block context this
+  // is skipped: a newline IS significant and drives the INDENT/DEDENT/NEWLINE boundary logic.)
+  if (s->flow_depth > 0) {
+    for (;;) {
+      int32_t c = lexer->lookahead;
+      if (c == ' ' || c == '\t' || c == '\n' || c == '\r') skip(lexer);
+      else if (c == '#') { while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') skip(lexer); }
+      else break;
+    }
+  }
+  // Flow-collection delimiters ([ ] { }). These are emitted as EXTERNAL tokens (not the internal DFA) so
+  // flow_depth — the open-bracket nesting that suspends indentation — PERSISTS: tree-sitter keeps an
+  // external scanner's struct mutations only across a token it RETURNS (on a `false` return it restores
+  // the pre-scan serialized state, so a peek-then-false counter is silently rolled back before the
+  // internal bracket is lexed). Each is gated on its own valid_symbols, so a `[`/`{` that is plain
+  // content (a scalar contains but never STARTS with one — handled in scan_scalar) is NOT consumed here:
+  // at those positions the flow token isn't valid and we fall through. Skip inline space/tab first (the
+  // flow newline skip above already ran when in flow; in block a newline still drives the indent logic).
+  {
+    while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
+    int32_t fc = lexer->lookahead;
+    if (fc == '[' && valid_symbols[_FLOW_LBRACKET]) {
+      advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_LBRACKET;
+      if (s->flow_depth < UINT16_MAX) s->flow_depth++;
+      s->started = true; s->at_line_lead = false; return true;
+    }
+    if (fc == '{' && valid_symbols[_FLOW_LBRACE]) {
+      advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_LBRACE;
+      if (s->flow_depth < UINT16_MAX) s->flow_depth++;
+      s->started = true; s->at_line_lead = false; return true;
+    }
+    if (fc == ']' && valid_symbols[_FLOW_RBRACKET]) {
+      advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_RBRACKET;
+      if (s->flow_depth > 0) s->flow_depth--;
+      s->started = true; s->at_line_lead = false; return true;
+    }
+    if (fc == '}' && valid_symbols[_FLOW_RBRACE]) {
+      advance(lexer); lexer->mark_end(lexer); lexer->result_symbol = _FLOW_RBRACE;
+      if (s->flow_depth > 0) s->flow_depth--;
+      s->started = true; s->at_line_lead = false; return true;
+    }
+  }
+
   // Compact block notation (`- a: 1` / `- - x` / `- &a k: v` / `? key`). The line-lead indicator was
   // just lexed by tree-sitter's internal DFA, so at_line_lead is still set (the scanner emits no
   // indicator token to clear it) and tree-sitter now wants the nested node's INDENT on the SAME line.
@@ -324,7 +384,7 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
   //   • a plain/quoted SCALAR lead — fall through to scan_scalar (below), which pushes the compact
   //     INDENT itself when the run is a mapping KEY. (A bare scalar must NOT be sniffed-then-rewound: a
   //     plain scalar is external-only, so a false return here would loop.)
-  if (want_indent && s->at_line_lead) {
+  if (want_indent && s->at_line_lead && s->flow_depth == 0) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
     int32_t c = lexer->lookahead;
     bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c);
@@ -345,8 +405,9 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
   }
 
   // A block scalar value (`key: |`): scan its body before the indent logic — its more-indented
-  // lines are content, not nested structure. Skip the inline space after the `:`/`-` first.
-  if (want_block) {
+  // lines are content, not nested structure. Skip the inline space after the `:`/`-` first. Block
+  // scalars are a block-context construct — inside a flow collection `|`/`>` are plain content.
+  if (want_block && s->flow_depth == 0) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
     if (lexer->lookahead == '|' || lexer->lookahead == '>') { if (scan_block_scalar(s, lexer)) { s->at_line_lead = false; return true; } }
   }
@@ -371,11 +432,14 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
         compact_col = (int16_t)lexer->get_column(lexer);
         lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
       }
-      if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; }
+      if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s->flow_depth, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; }
     }
   }
 
-  if (!want_indent && !want_dedent && !want_newline) return false; // flow context — no indent tokens valid
+  if (!want_indent && !want_dedent && !want_newline) return false; // no indent tokens valid
+  // Inside a flow collection a newline is INSIGNIFICANT (indentation suspended): emit NO INDENT/DEDENT/
+  // NEWLINE so the line break is consumed by tree-sitter's `/\s/` extra and the flow spans the line.
+  if (s->flow_depth > 0) return false;
 
   // Skip blank lines, comment-only lines, and leading whitespace, noting whether a line break was
   // crossed (only a real boundary drives the indent logic).

From 595802b7a7fe3391370d74d2f246e50414744cc2 Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 08:14:06 +0800
Subject: [PATCH 08/10] =?UTF-8?q?Scan=20YAML=20document=20markers=20?=
 =?UTF-8?q?=E2=80=94=20`---`-led=20documents=20now=20parse=20(issue=20#3)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A document that started with `---`/`...` followed by a body on the next line failed to parse: the
external scalar scanner, seeing the leading `-`/`.`, consumed the `---` into a plain/key token before
the internal `doc_start` token could match (and the marker token's separator look-ahead is stripped by
the token DFA). The scanner now probes for a document marker at column 0 (glyphs derived from
`indent.blockScalar.documentMarkers`): for a true sep-bounded marker it sets a transient
`marker_decline` and returns false so the internal `---`/`...` token lexes it; a non-marker glyph
(`---foo`) it claims as plain content. The markers stay INTERNAL tokens (making them external perturbs
the GLR tables and mis-lexes same-column block sequences). In addition, `started` is set whenever the
column is > 0 (so the NEWLINE after a leading marker is emitted, not suppressed), and a document-root
block scalar (stack depth 1, parent indent −1) may have a column-0 body, ending only at a column-0
marker.

Combined with the flow-depth fix, the bench jumps 72.4% → 94.2% (294/312 valid yaml-test-suite
inputs ERROR-free) — the two fixes compound, since many inputs had both a `---` marker and flow/comma
content. The six other grammars stay byte-identical (all changes are gated on grammar.indent); tsc is
clean; generate + build --wasm succeed; gate:treesitter 96.0%; src-coverage-yaml parser alignment 100%
(yaml.ts untouched — tree-sitter target only).

Refs #3
---
 src/gen-treesitter.ts          | 115 +++++++++++++++++++++++++++++----
 tree-sitter/yaml/src/scanner.c |  72 ++++++++++++++++++++-
 2 files changed, 174 insertions(+), 13 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index 9a3ba1f..33cbf59 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -1,6 +1,6 @@
 import type { CstGrammar, RuleExpr, RuleDecl, TokenPattern } from './types.ts';
 import { collectLiterals, isKeywordLiteral } from './grammar-utils.ts';
-import { tokenPatternIsNever, tokenPatternSource, tokenPatternStartsWithDecimal, tokenPatternStringDelimiters, tokenPatternTrailingCharClass } from './token-pattern.ts';
+import { tokenPatternIsNever, tokenPatternLiteralPrefix, tokenPatternSource, tokenPatternStartsWithDecimal, tokenPatternStringDelimiters, tokenPatternTrailingCharClass } from './token-pattern.ts';
 
 // ════════════════════════════════════════════════════════════════════════════
 // gen-treesitter — derive a tree-sitter parser package from one CstGrammar.
@@ -770,6 +770,24 @@ function flowSyntheticTokens(grammar: CstGrammar): { sym: string; char: string;
   ];
 }
 
+/**
+ * The document-marker glyphs (`---` / `...`) of an indentation grammar: each token literal prefix
+ * listed in `indent.blockScalar.documentMarkers`. The external scanner uses them in scan_scalar (claim
+ * a non-marker glyph as plain, decline a true marker) and in scan_block_scalar (a column-0 marker ends
+ * a root block scalar); the markers stay INTERNAL tokens (see planScannerTokens). Longest glyph first
+ * (so a 3-char glyph beats a prefix of it). Empty if no documentMarkers are declared or none matches.
+ */
+function documentMarkerGlyphs(grammar: CstGrammar): string[] {
+  const markers = grammar.indent?.blockScalar?.documentMarkers;
+  if (!markers || markers.length === 0) return []; // the grammar declares no document markers
+  const out = new Set<string>(); // a Set, so a glyph shared by several tokens is listed once
+  for (const tok of grammar.tokens) {
+    const lit = tokenPatternLiteralPrefix(tok);
+    if (lit && markers.includes(lit)) out.add(lit); // keep only prefixes that are declared markers
+  }
+  return [...out].sort((a, b) => b.length - a.length); // descending length: longest glyph first
+}
+
 /** Determine which tokens the external scanner must provide. */
 function planScannerTokens(grammar: CstGrammar): Map<string, string> {
   const map = new Map<string, string>();
@@ -803,6 +821,14 @@ function planScannerTokens(grammar: CstGrammar): Map<string, string> {
     // (a TRUE return). The synthetic name IS the snake symbol; the matching literal in the flow rules is
     // swapped for a ref to it in renderExpr. Appended last so the scalar-token positions are unchanged.
     for (const { sym } of flowSyntheticTokens(grammar)) map.set(sym, sym);
+    // Document markers (`---` / `...`) stay INTERNAL tokens (NOT added here). Their IR is
+    // `literal + a sep look-ahead`; tree-sitter's token() DFA drops the look-ahead, leaving a bare
+    // `---`/`...`. That is fine: the external scalar scanner CLAIMS a non-marker glyph (`---foo`) as a
+    // plain scalar (so it never reaches the internal token) and DECLINES a true sep-bounded marker (so
+    // the internal token lexes it — see scan_scalar's document-marker probe). Making them external
+    // instead perturbs the GLR parse tables — a marker token's valid-symbol set then shifts the lexer's
+    // scalar/indent decisions at unrelated boundaries (a same-column block sequence after a key
+    // mis-lexes) — so keeping them internal leaves the tables byte-identical to a no-marker build.
   }
   // The regex token: '/' is context-sensitive (regex vs division). The scanner
   // resolves it.
@@ -1919,6 +1945,43 @@ function buildIndentScannerC(grammar: CstGrammar, ctx: GrammarJsContext, grammar
   // agree). Built by flowSyntheticTokens(grammar) and shared with the grammar.js side.
   const flowTokens = flowSyntheticTokens(grammar); // [{ sym, char, open }]
 
+  // DOCUMENT MARKERS (`---` / `...`) — INTERNAL tokens; the external scalar scanner only CLAIMS a non-
+  // marker glyph as plain and DECLINES a true (sep-bounded) marker so the internal token lexes it.
+  const markers = documentMarkerGlyphs(grammar);
+  const hasMarkers = markers.length > 0 && SCALAR;
+  const cChar = (ch: string) => (ch === '\\' || ch === "'" ? `'\\${ch}'` : `'${ch}'`);
+  // Advance over one glyph char (counting `matched`) — DON'T push to the run or mark the token end yet.
+  // The probe commits (mark_end) only on the plain-content path; a true-marker decline marks nothing, so
+  // the probed chars roll back cleanly and tree-sitter then lexes the internal marker token.
+  const eatGlyphChar = (ch: string) => `if (lexer->lookahead == ${cChar(ch)}) { lexer->advance(lexer, false); matched++; }`;
+  // Replay the k matched glyph chars into the run as scalar content (a non-marker glyph: `---foo`,
+  // `--x`). The lexer is already positioned past them; the main loop continues the run from here.
+  const replayGlyph = (glyph: string) => [...glyph].map((ch, k) =>
+    `if (matched > ${k}) { if (blen < sizeof(buf)) buf[blen++] = ${cChar(ch)}; }`).join(' ');
+  const markerProbe = !hasMarkers ? '' : `
+  // DOCUMENT-MARKER probe (column 0). A \`---\`/\`...\` glyph that is sep-bounded (ws / EOL / EOF) is a
+  // document marker — an INTERNAL token (its IR's sep look-ahead is beyond a tree-sitter token() DFA,
+  // but this external scanner decides the boundary). The glyph is matched WITHOUT marking the token end:
+  //   • a FULL glyph + separator      → a TRUE marker: set s->marker_decline and return false; nothing
+  //     was marked, so the probed chars roll back and the internal \`---\`/\`...\` token lexes it (a non-
+  //     marker glyph never reaches that token, so its dropped look-ahead is moot).
+  //   • a LONE indicator char + sep   → a block indicator (\`- \`/\`? \`); decline so the internal \`-\`/\`?\`
+  //     token takes it.
+  //   • anything else (\`---foo\`, \`-1\`) → plain content: replay the matched glyph chars and fall through
+  //     to the scalar loop, which continues the run (so the marker glyph is CLAIMED as a plain scalar).
+  // Markers (and which lead chars are block indicators) are DERIVED from grammar.indent.
+  if (${hasCompact ? 'compact_col < 0 && ' : ''}lexer->get_column(lexer) == 0) {${markers.map(glyph => `
+    if (lexer->lookahead == ${cChar(glyph[0])}) {
+      unsigned matched = 0;
+      ${[...glyph].map(eatGlyphChar).join('\n      ')}
+      int32_t mn = lexer->lookahead;
+      bool msep = (mn == 0 || mn == ' ' || mn == '\\t' || mn == '\\n' || mn == '\\r');
+      if (matched == ${glyph.length} && msep) { s->marker_decline = true; return false; }${compactIndicators.includes(glyph[0]) ? `
+      if (matched == 1 && msep) return false; // lone \`${glyph[0]}\` + separator → block indicator, not content` : ''}
+      ${replayGlyph(glyph)} if (matched > 0) { has_content = true; lexer->mark_end(lexer); }
+    }`).join('')}
+  }`;
+
   const scannerC = `// Tree-sitter external scanner generated by monogram (indentation path).
 //
 // Mirrors the indent-stack state machine of src/gen-lexer.ts: INDENT / DEDENT / NEWLINE are emitted
@@ -1943,7 +2006,9 @@ typedef struct {
   bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
   bool started;          // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? `
   bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)` : ''}${hasFlow ? `
-  uint16_t flow_depth;   // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''}
+  uint16_t flow_depth;   // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''}${hasMarkers ? `
+  bool marker_decline;   // transient: scan_scalar saw a true \`---\`/\`...\` → external declines so the
+                         // internal marker token lexes it. Set+consumed within one scan; not serialized.` : ''}
 } Scanner;
 
 static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
@@ -1961,7 +2026,8 @@ void *tree_sitter_${G}_external_scanner_create(void) {
   s->stack[0] = 0;
   s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? `
   s->at_line_lead = true;` : ''}${hasFlow ? `
-  s->flow_depth = 0;` : ''}
+  s->flow_depth = 0;` : ''}${hasMarkers ? `
+  s->marker_decline = false;` : ''}
   return s;
 }
 
@@ -1990,7 +2056,8 @@ void tree_sitter_${G}_external_scanner_deserialize(void *payload, const char *bu
   Scanner *s = (Scanner *)payload;
   s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? `
   s->at_line_lead = true;` : ''}${hasFlow ? `
-  s->flow_depth = 0;` : ''}
+  s->flow_depth = 0;` : ''}${hasMarkers ? `
+  s->marker_decline = false;` : ''}
   if (length < ${hasCompact ? 3 : 2} + sizeof(int16_t)${hasFlow ? ' + sizeof(uint16_t)' : ''} + sizeof(uint32_t)) return;
   unsigned n = 0;
   s->started = buffer[n++] != 0;
@@ -2010,8 +2077,14 @@ ${BLOCK ? `// A block scalar (\`|\` / \`>\`): the introducer + indicators + the
 // stack top); it ends at the first non-blank line at or below the parent, or a column-0 document
 // marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's
 // indentation is left for the normal boundary logic.
+//
+// A ROOT block scalar (the document's own node — stack depth 1) has an effective parent indentation of
+// -1, not 0: its body may sit at column 0 (\`--- >\\nline1\`, yaml-test-suite DK3J / FP8R). So at root,
+// only a column-0 DOCUMENT MARKER (\`---\` / \`...\`) — never plain column-0 text — ends it. The marker
+// is matched without committing the line (no mark_end), so a non-marker column-0 line stays body.${markers.length > 0 ? `` : ''}
 static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
-  int parent = s->stack[s->len - 1];
+  bool root = s->len == 1;           // a document-root block scalar: body may reach column 0
+  int parent = root ? -1 : s->stack[s->len - 1];
   advance(lexer); // the introducer (| or >)
   while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer);
   while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer);
@@ -2023,7 +2096,17 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
     int col = 0;
     while (lexer->lookahead == ' ') { advance(lexer); col++; }
     int32_t c = lexer->lookahead;
-    if (c == 0 || c == '\\n' || c == '\\r') { lexer->mark_end(lexer); continue; } // blank line → body
+    if (c == 0 || c == '\\n' || c == '\\r') { lexer->mark_end(lexer); continue; } // blank line → body${markers.length > 0 ? `
+    if (root && col == 0) {                 // a column-0 document marker ends a root block scalar
+      bool is_marker = false;${markers.map(glyph => `
+      if (!is_marker && c == ${cChar(glyph[0])}) {
+        unsigned m = 0; ${[...glyph].map(ch => `if (lexer->lookahead == ${cChar(ch)}) { advance(lexer); m++; }`).join(' ')}
+        int32_t a = lexer->lookahead;
+        if (m == ${glyph.length} && (a == 0 || a == ' ' || a == '\\t' || a == '\\n' || a == '\\r')) is_marker = true;
+      }`).join('')}
+      if (is_marker) break;                 // leave the marker line for the next token (no mark_end)
+      // not a marker: the chars probed above are body; fall through to consume the rest of the line.
+    }` : ''}
     if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node)
     while (lexer->lookahead != 0 && lexer->lookahead != '\\n' && lexer->lookahead != '\\r') advance(lexer);
     lexer->mark_end(lexer);
@@ -2105,13 +2188,15 @@ static bool compact_content_is_structural(TSLexer *lexer) {
 // next call (then compact_col is no longer deeper, so this path is skipped). A leaf scalar (no \`:\`)
 // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.` : ''}
 static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull${hasFlow ? `,
-                        int flow_depth` : ''}${hasCompact ? `,
-                        Scanner *s, int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? `
+                        int flow_depth` : ''}${(hasCompact || hasMarkers) ? `,
+                        Scanner *s` : ''}${hasCompact ? `,
+                        int16_t compact_col, int indent_sym` : ''}) {${hasCompact ? `
   bool cm = compact_col >= 0;   // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)` : ''}
   char buf[64];
   unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
   bool has_content = false;
   bool stopped_at_kv = false; // ended at a \`:\`-separator → this scalar is a mapping KEY
+${markerProbe}
   for (;;) {
     int32_t c = lexer->lookahead;
     if (c == 0 || c == '\\n' || c == '\\r') break;                 // newline / EOF
@@ -2216,7 +2301,13 @@ bool tree_sitter_${G}_external_scanner_scan(void *payload, TSLexer *lexer, const
   Scanner *s = (Scanner *)payload;
   bool want_indent = valid_symbols[${INDENT}];
   bool want_dedent = valid_symbols[${DEDENT}];
-  bool want_newline = valid_symbols[${NEWLINE}];${BLOCK ? `\n  bool want_block = valid_symbols[${BLOCK}];` : ''}
+  bool want_newline = valid_symbols[${NEWLINE}];${BLOCK ? `\n  bool want_block = valid_symbols[${BLOCK}];` : ''}${hasMarkers ? `
+  // Content lies to our left whenever we are not at column 0 — including right after an INTERNAL token
+  // (e.g. a \`---\`/\`...\` document marker, whose match the scanner never sees). Mark started so the line
+  // boundary that follows emits its NEWLINE (the leading-NEWLINE suppression is only for blank lines at
+  // the very start of input, which are always at column 0). Without this, the NEWLINE after a leading
+  // \`---\` would be dropped and the document body could not attach.
+  if (lexer->get_column(lexer) > 0) s->started = true;` : ''}
 
   // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the
   // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here.
@@ -2328,8 +2419,10 @@ ${flowTokens.map(t => `    if (fc == ${charLit(t.char)} && valid_symbols[${t.sym
         compact_col = (int16_t)lexer->get_column(lexer);
         lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
       }
-      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }` : `
-      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''})) { s->started = true; return true; }`}
+      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}, s, compact_col, ${INDENT})) { s->started = true; s->at_line_lead = (lexer->result_symbol == ${INDENT}); return true; }${hasMarkers ? `
+      if (s->marker_decline) { s->marker_decline = false; return false; } // a true \`---\`/\`...\` here → let the internal marker token lex it` : ''}` : `
+      if (scan_scalar(lexer, ${want(KEY)}, ${want(PLAIN)}, ${want(NUM)}, ${want(BOOLNULL)}${hasFlow ? ', s->flow_depth' : ''}${hasMarkers ? ', s' : ''})) { s->started = true; return true; }${hasMarkers ? `
+      if (s->marker_decline) { s->marker_decline = false; return false; }` : ''}`}
     }
   }
 ` : ''}
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index 8bda6c3..54115a7 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -34,6 +34,8 @@ typedef struct {
   bool started;          // any content lexed yet (suppresses a leading NEWLINE)
   bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)
   uint16_t flow_depth;   // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}
+  bool marker_decline;   // transient: scan_scalar saw a true `---`/`...` → external declines so the
+                         // internal marker token lexes it. Set+consumed within one scan; not serialized.
 } Scanner;
 
 static inline void advance(TSLexer *lexer) { lexer->advance(lexer, false); }
@@ -52,6 +54,7 @@ void *tree_sitter_yaml_external_scanner_create(void) {
   s->pending_col = -1; s->pending_newline = false; s->started = false;
   s->at_line_lead = true;
   s->flow_depth = 0;
+  s->marker_decline = false;
   return s;
 }
 
@@ -81,6 +84,7 @@ void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *bu
   s->len = 1; s->stack[0] = 0; s->pending_col = -1; s->pending_newline = false; s->started = false;
   s->at_line_lead = true;
   s->flow_depth = 0;
+  s->marker_decline = false;
   if (length < 3 + sizeof(int16_t) + sizeof(uint16_t) + sizeof(uint32_t)) return;
   unsigned n = 0;
   s->started = buffer[n++] != 0;
@@ -100,8 +104,14 @@ void tree_sitter_yaml_external_scanner_deserialize(void *payload, const char *bu
 // stack top); it ends at the first non-blank line at or below the parent, or a column-0 document
 // marker, or EOF. mark_end is advanced only over lines that belong to the scalar, so the next node's
 // indentation is left for the normal boundary logic.
+//
+// A ROOT block scalar (the document's own node — stack depth 1) has an effective parent indentation of
+// -1, not 0: its body may sit at column 0 (`--- >\nline1`, yaml-test-suite DK3J / FP8R). So at root,
+// only a column-0 DOCUMENT MARKER (`---` / `...`) — never plain column-0 text — ends it. The marker
+// is matched without committing the line (no mark_end), so a non-marker column-0 line stays body.
 static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
-  int parent = s->stack[s->len - 1];
+  bool root = s->len == 1;           // a document-root block scalar: body may reach column 0
+  int parent = root ? -1 : s->stack[s->len - 1];
   advance(lexer); // the introducer (| or >)
   while (lexer->lookahead == '+' || lexer->lookahead == '-' || (lexer->lookahead >= '0' && lexer->lookahead <= '9')) advance(lexer);
   while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer);
@@ -114,6 +124,21 @@ static bool scan_block_scalar(Scanner *s, TSLexer *lexer) {
     while (lexer->lookahead == ' ') { advance(lexer); col++; }
     int32_t c = lexer->lookahead;
     if (c == 0 || c == '\n' || c == '\r') { lexer->mark_end(lexer); continue; } // blank line → body
+    if (root && col == 0) {                 // a column-0 document marker ends a root block scalar
+      bool is_marker = false;
+      if (!is_marker && c == '-') {
+        unsigned m = 0; if (lexer->lookahead == '-') { advance(lexer); m++; } if (lexer->lookahead == '-') { advance(lexer); m++; } if (lexer->lookahead == '-') { advance(lexer); m++; }
+        int32_t a = lexer->lookahead;
+        if (m == 3 && (a == 0 || a == ' ' || a == '\t' || a == '\n' || a == '\r')) is_marker = true;
+      }
+      if (!is_marker && c == '.') {
+        unsigned m = 0; if (lexer->lookahead == '.') { advance(lexer); m++; } if (lexer->lookahead == '.') { advance(lexer); m++; } if (lexer->lookahead == '.') { advance(lexer); m++; }
+        int32_t a = lexer->lookahead;
+        if (m == 3 && (a == 0 || a == ' ' || a == '\t' || a == '\n' || a == '\r')) is_marker = true;
+      }
+      if (is_marker) break;                 // leave the marker line for the next token (no mark_end)
+      // not a marker: the chars probed above are body; fall through to consume the rest of the line.
+    }
     if (col <= parent) break; // dedent to/below parent ends the scalar (the line is the next node)
     while (lexer->lookahead != 0 && lexer->lookahead != '\n' && lexer->lookahead != '\r') advance(lexer);
     lexer->mark_end(lexer);
@@ -196,12 +221,48 @@ static bool compact_content_is_structural(TSLexer *lexer) {
 // is emitted as usual, its end marked at the run end. Mirrors compactNestsHere's mapping-key arm.
 static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool want_num, bool want_boolnull,
                         int flow_depth,
-                        Scanner *s, int16_t compact_col, int indent_sym) {
+                        Scanner *s,
+                        int16_t compact_col, int indent_sym) {
   bool cm = compact_col >= 0;   // compact-eligible: suppress per-char mark_end (zero-width INDENT on KEY)
   char buf[64];
   unsigned blen = 0;        // run text (capped) — for the number/bool-null shape test
   bool has_content = false;
   bool stopped_at_kv = false; // ended at a `:`-separator → this scalar is a mapping KEY
+
+  // DOCUMENT-MARKER probe (column 0). A `---`/`...` glyph that is sep-bounded (ws / EOL / EOF) is a
+  // document marker — an INTERNAL token (its IR's sep look-ahead is beyond a tree-sitter token() DFA,
+  // but this external scanner decides the boundary). The glyph is matched WITHOUT marking the token end:
+  //   • a FULL glyph + separator      → a TRUE marker: set s->marker_decline and return false; nothing
+  //     was marked, so the probed chars roll back and the internal `---`/`...` token lexes it (a non-
+  //     marker glyph never reaches that token, so its dropped look-ahead is moot).
+  //   • a LONE indicator char + sep   → a block indicator (`- `/`? `); decline so the internal `-`/`?`
+  //     token takes it.
+  //   • anything else (`---foo`, `-1`) → plain content: replay the matched glyph chars and fall through
+  //     to the scalar loop, which continues the run (so the marker glyph is CLAIMED as a plain scalar).
+  // Markers (and which lead chars are block indicators) are DERIVED from grammar.indent.
+  if (compact_col < 0 && lexer->get_column(lexer) == 0) {
+    if (lexer->lookahead == '-') {
+      unsigned matched = 0;
+      if (lexer->lookahead == '-') { lexer->advance(lexer, false); matched++; }
+      if (lexer->lookahead == '-') { lexer->advance(lexer, false); matched++; }
+      if (lexer->lookahead == '-') { lexer->advance(lexer, false); matched++; }
+      int32_t mn = lexer->lookahead;
+      bool msep = (mn == 0 || mn == ' ' || mn == '\t' || mn == '\n' || mn == '\r');
+      if (matched == 3 && msep) { s->marker_decline = true; return false; }
+      if (matched == 1 && msep) return false; // lone `-` + separator → block indicator, not content
+      if (matched > 0) { if (blen < sizeof(buf)) buf[blen++] = '-'; } if (matched > 1) { if (blen < sizeof(buf)) buf[blen++] = '-'; } if (matched > 2) { if (blen < sizeof(buf)) buf[blen++] = '-'; } if (matched > 0) { has_content = true; lexer->mark_end(lexer); }
+    }
+    if (lexer->lookahead == '.') {
+      unsigned matched = 0;
+      if (lexer->lookahead == '.') { lexer->advance(lexer, false); matched++; }
+      if (lexer->lookahead == '.') { lexer->advance(lexer, false); matched++; }
+      if (lexer->lookahead == '.') { lexer->advance(lexer, false); matched++; }
+      int32_t mn = lexer->lookahead;
+      bool msep = (mn == 0 || mn == ' ' || mn == '\t' || mn == '\n' || mn == '\r');
+      if (matched == 3 && msep) { s->marker_decline = true; return false; }
+      if (matched > 0) { if (blen < sizeof(buf)) buf[blen++] = '.'; } if (matched > 1) { if (blen < sizeof(buf)) buf[blen++] = '.'; } if (matched > 2) { if (blen < sizeof(buf)) buf[blen++] = '.'; } if (matched > 0) { has_content = true; lexer->mark_end(lexer); }
+    }
+  }
   for (;;) {
     int32_t c = lexer->lookahead;
     if (c == 0 || c == '\n' || c == '\r') break;                 // newline / EOF
@@ -306,6 +367,12 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
   bool want_dedent = valid_symbols[DEDENT];
   bool want_newline = valid_symbols[NEWLINE];
   bool want_block = valid_symbols[BLOCK_SCALAR];
+  // Content lies to our left whenever we are not at column 0 — including right after an INTERNAL token
+  // (e.g. a `---`/`...` document marker, whose match the scanner never sees). Mark started so the line
+  // boundary that follows emits its NEWLINE (the leading-NEWLINE suppression is only for blank lines at
+  // the very start of input, which are always at column 0). Without this, the NEWLINE after a leading
+  // `---` would be dropped and the document body could not attach.
+  if (lexer->get_column(lexer) > 0) s->started = true;
 
   // Finish a line boundary already in progress: emit the remaining DEDENTs (one per call), then the
   // owed NEWLINE when the stack top reaches the boundary column. No input is consumed here.
@@ -433,6 +500,7 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
         lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
       }
       if (scan_scalar(lexer, valid_symbols[KEY] != 0, valid_symbols[PLAIN] != 0, valid_symbols[NUM] != 0, valid_symbols[BOOL_NULL] != 0, s->flow_depth, s, compact_col, INDENT)) { s->started = true; s->at_line_lead = (lexer->result_symbol == INDENT); return true; }
+      if (s->marker_decline) { s->marker_decline = false; return false; } // a true `---`/`...` here → let the internal marker token lex it
     }
   }
 

From 6b2fac5f55e74ea8f5cba6446ec04ebff0327428 Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 08:35:21 +0800
Subject: [PATCH 09/10] Fold multi-line plain scalars inside flow collections
 (issue #3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Inside a flow collection (`[ ]` / `{ }`) a plain scalar folds across line breaks — the break +
surrounding whitespace collapse to one space and the run continues on the next line until a flow
terminator. The scanner's `scan_scalar` broke a run unconditionally at any newline, so a flow key /
value / explicit-key spanning lines lexed as two scalars and the GLR parser couldn't chain them
(ERROR). Now, at `flow_depth > 0` with content already scanned, a newline folds: the scanner advances
past the break and any surrounding whitespace / blank lines, then either stops the scalar if the next
character is a flow terminator (`,` / a bracket), a line-leading `#`, or EOF, or else appends one folded
space and continues (the next content char re-marks the token end). Block context is
unchanged (its multi-line folding is separate indent/grammar machinery). Multi-line quoted scalars in
flow already worked (the quoted token spans newlines natively).

Bench: 294/312 → 299/312 (94.2% → 95.8%). Six other grammars byte-identical (yaml-only, gated on
grammar.indent); tsc clean; generate + build --wasm succeed; gate:treesitter 96.0%.

Refs #3
---
 src/gen-treesitter.ts          | 19 ++++++++++++++++++-
 tree-sitter/yaml/src/scanner.c | 19 ++++++++++++++++++-
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index 33cbf59..36e43f5 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -2199,7 +2199,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
 ${markerProbe}
   for (;;) {
     int32_t c = lexer->lookahead;
-    if (c == 0 || c == '\\n' || c == '\\r') break;                 // newline / EOF
+    if (c == 0) break;                                             // EOF
+    if (c == '\\n' || c == '\\r') {${hasFlow ? `
+      // Inside a flow collection a plain scalar FOLDS across a line break (\`{ multi\\n line: v}\` → the
+      // key is \`multi line\`): the break + surrounding whitespace (and blank/comment-only lines) collapse
+      // to one space and the run continues on the next line. Peek past that trivia run WITHOUT committing
+      // (mark_end stays at the last content char, so a decline trims it): if the next significant char
+      // ENDS the scalar — EOF, a flow indicator/terminator (\`, [ ] { }\`), or a line-leading \`#\` comment —
+      // the break is trailing trivia and the scalar stops here; otherwise fold to a space and continue.
+      if (flow_depth > 0 && has_content) {
+        while (lexer->lookahead == ' ' || lexer->lookahead == '\\t' ||
+               lexer->lookahead == '\\n' || lexer->lookahead == '\\r') lexer->advance(lexer, false);
+        int32_t nx = lexer->lookahead;
+        if (nx == 0 || nx == '#' || (${flowBreakCond('nx')})) break; // scalar ends — the trivia is trailing
+        if (blen < sizeof(buf)) buf[blen++] = ' ';                  // the folded break becomes one space
+        continue;                                                   // next content char marks the new token end
+      }` : ''}
+      break;                                                       // block context (or no content yet): the line break ends the scalar
+    }
     ${hasFlow ? `if (flow_depth > 0 && (${flowBreakCond('c')})) break; // flow indicators end a scalar — ONLY inside a flow collection` : `if (c == ',' || c == '[' || c == ']' || c == '{' || c == '}') break; // flow indicators end a scalar`}
     if (!has_content && (c == '-' || c == '?')) {
       // A leading \`-\`/\`?\` is a block indicator (seq entry / explicit key) when followed by space/EOL/
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index 54115a7..09853dc 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -265,7 +265,24 @@ static bool scan_scalar(TSLexer *lexer, bool want_key, bool want_plain, bool wan
   }
   for (;;) {
     int32_t c = lexer->lookahead;
-    if (c == 0 || c == '\n' || c == '\r') break;                 // newline / EOF
+    if (c == 0) break;                                             // EOF
+    if (c == '\n' || c == '\r') {
+      // Inside a flow collection a plain scalar FOLDS across a line break (`{ multi\n line: v}` → the
+      // key is `multi line`): the break + surrounding whitespace (and blank/comment-only lines) collapse
+      // to one space and the run continues on the next line. Peek past that trivia run WITHOUT committing
+      // (mark_end stays at the last content char, so a decline trims it): if the next significant char
+      // ENDS the scalar — EOF, a flow indicator/terminator (`, [ ] { }`), or a line-leading `#` comment —
+      // the break is trailing trivia and the scalar stops here; otherwise fold to a space and continue.
+      if (flow_depth > 0 && has_content) {
+        while (lexer->lookahead == ' ' || lexer->lookahead == '\t' ||
+               lexer->lookahead == '\n' || lexer->lookahead == '\r') lexer->advance(lexer, false);
+        int32_t nx = lexer->lookahead;
+        if (nx == 0 || nx == '#' || (nx == '[' || nx == '{' || nx == ']' || nx == '}' || nx == ',')) break; // scalar ends — the trivia is trailing
+        if (blen < sizeof(buf)) buf[blen++] = ' ';                  // the folded break becomes one space
+        continue;                                                   // next content char marks the new token end
+      }
+      break;                                                       // block context (or no content yet): the line break ends the scalar
+    }
     if (flow_depth > 0 && (c == '[' || c == '{' || c == ']' || c == '}' || c == ',')) break; // flow indicators end a scalar — ONLY inside a flow collection
     if (!has_content && (c == '-' || c == '?')) {
       // A leading `-`/`?` is a block indicator (seq entry / explicit key) when followed by space/EOL/

From 687360984812fd05740ee0b1b0f8528a3f47122a Mon Sep 17 00:00:00 2001
From: Johnson Chu <johnsoncodehk@gmail.com>
Date: Mon, 8 Jun 2026 08:47:37 +0800
Subject: [PATCH 10/10] Parse block keys with node properties / tags / aliases
 (issue #3)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A block mapping whose KEY is preceded by a node property (`&anchor` / `!tag` / `!!tag` / `!<verbatim>`)
ERRORed: the scanner's compact-block detection keys off `at_line_lead` ("the line's first token"), but
anchor/tag/alias are INTERNAL tokens tree-sitter lexes WITHOUT consulting the scanner, so after a
property was lexed `at_line_lead` was still set and the following key was mis-treated as a compact-
nested mapping → a spurious INDENT that corrupted the structure. Fix: a transient `property_lead`
field, latched at the genuine line lead (column == stack top, re-derived every boundary and for the
first line) when the lead char is a property; the two compact-push sites skip a property-led line so
its key stays at the node level. `property_lead` is neither serialized nor reset in deserialize — it is
the one carry that must survive the property's internal lex: on a `false` return tree-sitter restores
the scanner from its last serialized state, so a serialized flag would be rolled back, whereas a field
that deserialize never touches keeps its in-memory value. `yaml.ts` untouched — the grammar's BlockKey
already had the production; the gap was the tree-sitter derivation. (yaml-test-suite ZH7C/74H7/E76Z/
7FWL/HMQ5/2SXE.)

Combined with the flow folding, the bench is 95.8% → 97.8% (305/312). Six other grammars byte-
identical; tsc clean; generate + build --wasm succeed; gate:treesitter 96.0%; agnostic 9/9;
test:yaml-issues 10/10; scope-gap:yaml 100%; src-coverage-yaml 100%.

Refs #3
---
 src/gen-treesitter.ts          | 43 ++++++++++++++++++++++++++++++----
 tree-sitter/yaml/src/scanner.c | 39 +++++++++++++++++++++++++++---
 2 files changed, 74 insertions(+), 8 deletions(-)

diff --git a/src/gen-treesitter.ts b/src/gen-treesitter.ts
index 36e43f5..307cb3f 100644
--- a/src/gen-treesitter.ts
+++ b/src/gen-treesitter.ts
@@ -2005,7 +2005,18 @@ typedef struct {
   int16_t pending_col;   // column of the line boundary mid-processing (-1 = none)
   bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
   bool started;          // any content lexed yet (suppresses a leading NEWLINE)${hasCompact ? `
-  bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)` : ''}${hasFlow ? `
+  bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)
+  bool property_lead;    // the line's FIRST token is a node property (\`&\`/\`!\`) — its inline content sits
+                         // at the SAME node level, so it must NOT take the compact mapping-key push (\`&a
+                         // a: b\` is the key \`a\` carrying anchor \`&a\`, not \`&a\`-then-INDENTED-\`a: b\`).
+                         // gen-lexer clears atLineLead on the property token (it sees every token); the
+                         // scanner does not lex the property, so it LATCHES this at the line lead and reads
+                         // it at the push. It must survive the property's INTERNAL lex (which the scanner
+                         // declines via a FALSE return) — tree-sitter deserializes the serialized fields on
+                         // a false return, so a SERIALIZED flag would be rolled back; this one is therefore
+                         // NOT serialized and NOT reset in deserialize (it keeps its in-memory value across
+                         // the decline). It is RE-DERIVED from the lead char at every line boundary, so it
+                         // is always correct at the only points it is read (a line lead).` : ''}${hasFlow ? `
   uint16_t flow_depth;   // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}` : ''}${hasMarkers ? `
   bool marker_decline;   // transient: scan_scalar saw a true \`---\`/\`...\` → external declines so the
                          // internal marker token lexes it. Set+consumed within one scan; not serialized.` : ''}
@@ -2025,7 +2036,7 @@ void *tree_sitter_${G}_external_scanner_create(void) {
   s->stack = ts_malloc(s->cap * sizeof(int16_t));
   s->stack[0] = 0;
   s->pending_col = -1; s->pending_newline = false; s->started = false;${hasCompact ? `
-  s->at_line_lead = true;` : ''}${hasFlow ? `
+  s->at_line_lead = true; s->property_lead = false;` : ''}${hasFlow ? `
   s->flow_depth = 0;` : ''}${hasMarkers ? `
   s->marker_decline = false;` : ''}
   return s;
@@ -2391,8 +2402,20 @@ ${flowTokens.map(t => `    if (fc == ${charLit(t.char)} && valid_symbols[${t.sym
   if (want_indent && s->at_line_lead${hasFlow ? ' && s->flow_depth == 0' : ''}) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\\t') skip(lexer);
     int32_t c = lexer->lookahead;
+    // GENUINE line lead: the line's first token is not yet lexed, so its column == the stack top (a
+    // line boundary set top to the lead column; at stream start both are the document level). Once a
+    // token has been lexed on this line the next content is DEEPER than top. Record whether that first
+    // token is a node PROPERTY (\`&\`/\`!\`): a property leads a node, so its inline content is at the
+    // SAME level (no compact push), whereas a compact indicator (\`-\`/\`?\`) DOES open a nested level
+    // for its content. This is the one fact lost when the property is lexed internally (the scanner
+    // never sees it), so it is latched here and checked at the two compact-push sites below.
+    if ((int16_t)lexer->get_column(lexer) == s->stack[s->len - 1]) s->property_lead = (c == '&' || c == '!');
     bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c);
-    if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+    // A property that LEADS the line (property_lead) does not nest — skip the compact push so its key
+    // stays at the node's level (\`&a a: b\` / \`!!str &a1 "foo":\`). A property that follows a compact
+    // indicator (\`- &a k: v\`) is NOT a line lead (property_lead was set false at the \`-\`), so it still
+    // pushes via compact_content_is_structural's property-skip.
+    if (nonscalar_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
       int16_t col = (int16_t)lexer->get_column(lexer);
       lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances
       if (compact_content_is_structural(lexer)) {
@@ -2431,8 +2454,12 @@ ${flowTokens.map(t => `    if (fc == ${charLit(t.char)} && valid_symbols[${t.sym
       // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the
       // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if
       // the run is a KEY, or emits the leaf scalar otherwise (so \`- x\` stays a plain item, no push).
+      // \`!s->property_lead\`: a key after a LINE-LEAD node property (\`&a a: b\`) sits at the node's level,
+      // not a compact-nested one — so do NOT pre-mark a compact INDENT; scan_scalar then emits the key as
+      // an ordinary value-position scalar (the enclosing Node carries the property). A key after a compact
+      // indicator (\`- a: 1\`) has property_lead == false and still nests.
       int16_t compact_col = -1;
-      if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+      if (want_indent && s->at_line_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
         compact_col = (int16_t)lexer->get_column(lexer);
         lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
       }
@@ -2471,7 +2498,13 @@ ${flowTokens.map(t => `    if (fc == ${charLit(t.char)} && valid_symbols[${t.sym
   int16_t col = (int16_t)lexer->get_column(lexer);
   lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column
   int top = s->stack[s->len - 1];${hasCompact ? `
-  s->at_line_lead = true; // a real line boundary — the next real token leads its line` : ''}
+  s->at_line_lead = true; // a real line boundary — the next real token leads its line
+  // Latch whether THIS new line is led by a node property (\`&\`/\`!\`) — the lookahead is the line's
+  // first content char (blanks/comments already skipped). A property leads a node, so its inline content
+  // is at the same level and must NOT take a compact push (the gates below check property_lead). This is
+  // a TRUE-return site so the latch persists through the property's internal lex; it is also re-derived
+  // for the FIRST line (which reaches no boundary) by the compact block's genuine-line-lead detection.
+  s->property_lead = (lexer->lookahead == '&' || lexer->lookahead == '!');` : ''}
 
   if (col > top) {
     if (want_indent) { push_indent(s, col); lexer->result_symbol = ${INDENT}; return true; }
diff --git a/tree-sitter/yaml/src/scanner.c b/tree-sitter/yaml/src/scanner.c
index 09853dc..f9ec5d8 100644
--- a/tree-sitter/yaml/src/scanner.c
+++ b/tree-sitter/yaml/src/scanner.c
@@ -33,6 +33,17 @@ typedef struct {
   bool pending_newline;  // a NEWLINE is still owed once dedents reach pending_col
   bool started;          // any content lexed yet (suppresses a leading NEWLINE)
   bool at_line_lead;     // the next real content token is its line's first (compact-indicator probe)
+  bool property_lead;    // the line's FIRST token is a node property (`&`/`!`) — its inline content sits
+                         // at the SAME node level, so it must NOT take the compact mapping-key push (`&a
+                         // a: b` is the key `a` carrying anchor `&a`, not `&a`-then-INDENTED-`a: b`).
+                         // gen-lexer clears atLineLead on the property token (it sees every token); the
+                         // scanner does not lex the property, so it LATCHES this at the line lead and reads
+                         // it at the push. It must survive the property's INTERNAL lex (which the scanner
+                         // declines via a FALSE return) — tree-sitter deserializes the serialized fields on
+                         // a false return, so a SERIALIZED flag would be rolled back; this one is therefore
+                         // NOT serialized and NOT reset in deserialize (it keeps its in-memory value across
+                         // the decline). It is RE-DERIVED from the lead char at every line boundary, so it
+                         // is always correct at the only points it is read (a line lead).
   uint16_t flow_depth;   // > 0 inside flow collections ([ ] { }) → indentation suspended, ,/[/]/{/}
   bool marker_decline;   // transient: scan_scalar saw a true `---`/`...` → external declines so the
                          // internal marker token lexes it. Set+consumed within one scan; not serialized.
@@ -52,7 +63,7 @@ void *tree_sitter_yaml_external_scanner_create(void) {
   s->stack = ts_malloc(s->cap * sizeof(int16_t));
   s->stack[0] = 0;
   s->pending_col = -1; s->pending_newline = false; s->started = false;
-  s->at_line_lead = true;
+  s->at_line_lead = true; s->property_lead = false;
   s->flow_depth = 0;
   s->marker_decline = false;
   return s;
@@ -471,8 +482,20 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
   if (want_indent && s->at_line_lead && s->flow_depth == 0) {
     while (lexer->lookahead == ' ' || lexer->lookahead == '\t') skip(lexer);
     int32_t c = lexer->lookahead;
+    // GENUINE line lead: the line's first token is not yet lexed, so its column == the stack top (a
+    // line boundary set top to the lead column; at stream start both are the document level). Once a
+    // token has been lexed on this line the next content is DEEPER than top. Record whether that first
+    // token is a node PROPERTY (`&`/`!`): a property leads a node, so its inline content is at the
+    // SAME level (no compact push), whereas a compact indicator (`-`/`?`) DOES open a nested level
+    // for its content. This is the one fact lost when the property is lexed internally (the scanner
+    // never sees it), so it is latched here and checked at the two compact-push sites below.
+    if ((int16_t)lexer->get_column(lexer) == s->stack[s->len - 1]) s->property_lead = (c == '&' || c == '!');
     bool nonscalar_lead = c == '&' || c == '!' || c == '[' || c == '{' || c == '*' || compact_is_indicator(c);
-    if (nonscalar_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+    // A property that LEADS the line (property_lead) does not nest — skip the compact push so its key
+    // stays at the node's level (`&a a: b` / `!!str &a1 "foo":`). A property that follows a compact
+    // indicator (`- &a k: v`) is NOT a line lead (property_lead was set false at the `-`), so it still
+    // pushes via compact_content_is_structural's property-skip.
+    if (nonscalar_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
       int16_t col = (int16_t)lexer->get_column(lexer);
       lexer->mark_end(lexer); // freeze the zero-width INDENT end at the content column before the sniff advances
       if (compact_content_is_structural(lexer)) {
@@ -511,8 +534,12 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
       // Compact mapping-KEY (part B): a line-lead indicator's scalar-led inline content, deeper than the
       // stack top. scan_scalar then pushes the nested mapping's INDENT (zero-width, pre-marked here) if
       // the run is a KEY, or emits the leaf scalar otherwise (so `- x` stays a plain item, no push).
+      // `!s->property_lead`: a key after a LINE-LEAD node property (`&a a: b`) sits at the node's level,
+      // not a compact-nested one — so do NOT pre-mark a compact INDENT; scan_scalar then emits the key as
+      // an ordinary value-position scalar (the enclosing Node carries the property). A key after a compact
+      // indicator (`- a: 1`) has property_lead == false and still nests.
       int16_t compact_col = -1;
-      if (want_indent && s->at_line_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
+      if (want_indent && s->at_line_lead && !s->property_lead && (int16_t)lexer->get_column(lexer) > s->stack[s->len - 1]) {
         compact_col = (int16_t)lexer->get_column(lexer);
         lexer->mark_end(lexer); // zero-width INDENT end at the content start (used iff the run is a KEY)
       }
@@ -550,6 +577,12 @@ bool tree_sitter_yaml_external_scanner_scan(void *payload, TSLexer *lexer, const
   lexer->mark_end(lexer); // INDENT/DEDENT/NEWLINE are zero-width at the content column
   int top = s->stack[s->len - 1];
   s->at_line_lead = true; // a real line boundary — the next real token leads its line
+  // Latch whether THIS new line is led by a node property (`&`/`!`) — the lookahead is the line's
+  // first content char (blanks/comments already skipped). A property leads a node, so its inline content
+  // is at the same level and must NOT take a compact push (the gates below check property_lead). This is
+  // a TRUE-return site so the latch persists through the property's internal lex; it is also re-derived
+  // for the FIRST line (which reaches no boundary) by the compact block's genuine-line-lead detection.
+  s->property_lead = (lexer->lookahead == '&' || lexer->lookahead == '!');
 
   if (col > top) {
     if (want_indent) { push_indent(s, col); lexer->result_symbol = INDENT; return true; }