From e4fa7d64445f2137233546a3d8f8612944fc6f16 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 07:23:39 -0300 Subject: [PATCH 01/10] fix(tree-sitter): correct --visibility and --signatures extraction across languages Begins v0.9.0 "Trustworthy Output". Fixes six verified signature/visibility extraction bugs that made advertised flags lie, each with a regression test: - Java --visibility was a total no-op: get_visibility returned All, so --visibility public dropped every symbol and private leaked all (B12) - C/C++ pointer/reference-return functions were silently dropped (B13) - C++ qualified return types (std::string) misread as the function name (B14) - Rust bodiless trait methods (function_signature_item) dropped from signatures and structure counts (B15) - Python class base lists double-parenthesized: class User((Base)) (B16) - Rust pub(crate)/pub(super) reported as fully public (B18) Also modernize two sort_by calls to satisfy newer clippy under -D warnings, and add docs/research/next-release-roadmap.md (the full v0.9.0 plan). Verified: 318 lib tests (all features) + full default suite pass; clippy --all-targets --all-features -D warnings clean; fmt clean. --- CHANGELOG.md | 17 +++ docs/research/next-release-roadmap.md | 152 ++++++++++++++++++++++++++ src/lib.rs | 4 +- src/tree_sitter/languages/c.rs | 64 ++++++++++- src/tree_sitter/languages/cpp.rs | 89 +++++++++++++-- src/tree_sitter/languages/java.rs | 77 +++++++++++-- src/tree_sitter/languages/python.rs | 31 +++++- src/tree_sitter/languages/rust.rs | 124 +++++++++++++++++---- 8 files changed, 513 insertions(+), 45 deletions(-) create mode 100644 docs/research/next-release-roadmap.md diff --git a/CHANGELOG.md b/CHANGELOG.md index ca95f8c..bae9788 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,23 @@ All notable changes to this project will be documented in this file. +## v0.9.0 (in progress) — "Trustworthy Output" + +> Planned per `docs/research/next-release-roadmap.md`. This section tracks work as it lands. + +- **Tree-Sitter correctness — honest `--signatures` / `--visibility`** + - Fixed Java `--visibility` filter being completely non-functional — `get_visibility` returned `Visibility::All` unconditionally, so `--visibility public` dropped *every* Java symbol and `--visibility private` leaked *all* of them. It now inspects the declaration's `modifiers` node (B12) + - Fixed C/C++ functions returning a pointer or reference being silently dropped — `find_function_name` now descends through `pointer_declarator` / `reference_declarator` / `parenthesized_declarator` to locate the `function_declarator` (B13) + - Fixed C++ qualified return types (e.g. `std::string`) being misread as the function name — the name is now resolved strictly inside `function_declarator`, with no sibling-identifier fallback (B14) + - Fixed Rust bodiless trait methods (`function_signature_item`, e.g. `fn draw(&self);`) being dropped from both signature extraction and structure counts (B15) + - Fixed Python class base lists being double-parenthesized (`class User((Base))`) — `argument_list` already includes the surrounding parentheses (B16) + - Fixed Rust restricted visibility (`pub(crate)` / `pub(super)` / `pub(in ...)`) being reported as fully public; restricted forms are no longer matched by `--visibility public` (B18) + - Added regression tests covering each of the above + +- **Maintenance** + - Modernized two `sort_by` comparisons to `sort_by_key(Reverse(..))` to satisfy a newer clippy under `-D warnings` (surfaced after the toolchain advanced during the release gap) + - Added `docs/research/next-release-roadmap.md` — the full prioritized v0.9.0 plan (verified bug backlog, competitive refresh, feature/dependency/DX roadmap) + ## v0.8.3 - **Bug Fixes** (identified by Gemini Deep Think v6 — clean benchmark prompt, zero historical bias) diff --git a/docs/research/next-release-roadmap.md b/docs/research/next-release-roadmap.md new file mode 100644 index 0000000..00f8cd7 --- /dev/null +++ b/docs/research/next-release-roadmap.md @@ -0,0 +1,152 @@ +# context-builder — Next Release Roadmap + +> **Generated:** 2026-06-04 via multi-agent codebase analysis (subsystem mapping, prior-research mining, competitive refresh to mid-2026, adversarially-verified bug hunt, dependency + DX audit). Bug findings were adversarially verified (20/22 confirmed); the four headline "broken feature" claims were independently re-verified against live source. + +_Current: v0.8.3 (Feb 2026), ~4 months dormant. All file:line references verified against live source unless marked "per corpus."_ + +--- + +## 1. Executive summary + +context-builder is a healthy, conservatively-engineered Rust CLI with a **genuinely differentiated core**: relevance-ordered traversal, deterministic content-hash output for prompt caching, snapshot auto-diff/`--diff-only`, and feature-gated tree-sitter signature/structure extraction across 8 languages. The dependency stack is clean (no open advisories; crossbeam-channel already on the patched 0.5.15), the test surface is large (~232 unit + ~51 integration tests), and CI runs a 3-OS matrix. This is not a project in trouble. + +It is, however, a project that has **stopped moving while the category accelerated**. Repomix shipped a release nearly every month of 2026 (v1.14.1 on 2026-05-27), and the Rust rival yek shipped v0.25.3 on 2026-06-02 — the most recent release of any tool reviewed — while context-builder's last release was Feb 2026. More importantly, the analysis surfaced a cluster of issues that are individually small but collectively corrosive to the project's *one core promise — accurate, trustworthy, LLM-optimized output*: + +- **Two flagship features are partially or wholly broken.** `--truncate smart` is dead code (`find_smart_truncation_point` has zero production callers — verified). `--visibility public` is completely non-functional for Java (and C++): `get_visibility` returns `Visibility::All` unconditionally (java.rs:151 — verified), so `--visibility public` drops *every* Java symbol and `--visibility private` leaks *all* of them. +- **The token budget — the headline value prop — is inaccurate and non-deterministic.** `--max-tokens` is gated on a `buf.len()/4` byte heuristic (markdown.rs:156), the first file always bypasses the budget (`tokens_used > 0` guard, markdown.rs:160 — verified), parallel vs serial paths estimate differently (breaking the deterministic-hash guarantee across builds), and the tokenizer is hardcoded to `cl100k_base` (token_count.rs:10 — verified), which now *under-counts* every modern OpenAI model. The `--token-count` preview compounds this: it reads files via `read_to_string` and ignores tree-sitter enrichment and `--max-tokens` entirely, so even the *previewed* number won't match the produced document. +- **Output is Markdown-only** while every serious competitor offers XML (Anthropic's recommended format for Claude) and stdout piping — the single most conspicuous competitive gap. + +### Recommended theme: **"Trustworthy output: accurate tokens, honest features, and pipe-friendly formats."** + +This theme is *focused* (not a grab-bag), plays directly to the project's existing identity as the precision/correctness tool, and is **shippable**. It bundles (a) the highest-consensus verified bugs that make advertised features lie, (b) the lowest-effort/highest-credibility table-stakes wins (o200k_base, XML, stdout), and (c) a tokenizer/budget unification that retires a whole family of bugs at once. It deliberately **defers** the big strategic bets (MCP, remote repos) to the *following* release, because shipping a credible, momentum-restoring 0.9.0 in weeks beats shipping a sprawling one in months — and because MCP/remote should be built on top of a *trustworthy* core and a clean library API, not on top of the current split-brain pipeline. + +A note on leverage: XML+stdout are the **table-stakes/competitive** king — they unblock the dominant 2026 pipe-to-LLM workflow and match every rival. They are not the *per-token-reasoning* king; by the corpus's own cross-model consensus, the dependency/module graph and doc-comment extraction are the highest output-quality wins. Those are deliberately deferred (graph to P2/v0.10, doc-comments to P2) so v0.9.0 stays focused and shippable. + +--- + +## 2. Recommended next release — **v0.9.0 "Trustworthy Output"** (concrete, shippable scope) + +Effort: S (≤½ day), M (1–3 days), L (~1 week). + +### Core feature set +| # | Item | Effort | Rationale | +|---|------|--------|-----------| +| F1 | **Selectable tokenizer `--encoding {cl100k_base\|o200k_base}`, default `o200k_base`** | S | `o200k_base` already exists in the pinned tiktoken-rs 0.9.1, so this needs **no dep bump to function**; cl100k_base now under-counts every current OpenAI model. Lowest-effort, highest-credibility fix; directly upgrades `--token-count`/`--max-tokens`. (token_count.rs:10) | +| F2 | **`--stdout` / `-o -` output target** | M | The dominant 2026 usage pattern is `context-builder \| llm`; impossible today (always writes a file). The renderer already streams to `impl Write` — route it to stdout and gate chatter to stderr. | +| F3 | **`--format {markdown\|xml}` with an output-renderer trait** | L | XML (``/``) is Anthropic's recommended Claude structure and Repomix's default. Introduce an `OutputFormat`/renderer trait so the streaming pipeline is parameterized. **Scope note:** format logic is currently hardwired in `process_file`/`write_text_content` AND there is a near-duplicate language map at markdown.rs:332 and :607 — the trait must dedup that map too, or formats will drift. | +| F4 | **Unify token budgeting on the real tokenizer + fix first-file bypass + debit header/tree + mirror counter** | M | Replace both `buf.len()/4` gates (markdown.rs:156; serial path) with `estimate_tokens()`, apply the budget to file index 0, **debit the document header + file-tree tokens before the per-file loop**, and **make `--token-count` share one rendering fn with the renderer** so the preview stops lying (folds B9). | +| F5 | **Wire `--truncate smart` into the budget path** | M | `find_smart_truncation_point` is fully built and tested but has **zero callers**. **Hard prerequisite: B19** (fallback returns non-char-boundary `max_bytes` → panic on slice). | +| F6 | **Implement Java + C++ visibility filtering (+ Java field-kind reclassification)** | M | `--visibility public` is a no-op (worse: actively wrong) for Java/C++. While in java.rs, also reclassify Java `field` (currently mis-kinded as `SignatureKind::Constant`). (B12) | +| F7 | **Constrain `--truncate`/`--visibility`/`--encoding` to clap `ValueEnum`** | S | Free validation + auto-generated `[possible values]` in `--help` + makes `--help` the source of truth, killing the doc-drift bug class (B3, B4). Does *not* cover B5 (`encoding_strategy` is config-only) — that's a separate config-load validation fix. | +| F8 | **Tree-sitter integration test through `run_with_args`** | M | The tree-sitter signature path has **zero integration-level coverage** today, yet F5/F6/B12–B18 rewrite it heavily. An end-to-end test driving `signatures`/`--visibility`/`--truncate smart` through `run_with_args` is a *dependency* of trusting those fixes. | + +### Sequencing (strict chains) +- **F1 → F4 → B19 → F5.** F1 must land before F4 rewrites the budget gate onto `estimate_tokens()`. F4 establishes the per-file budget; F5 consumes it. B19 (char-boundary clamp) is a **blocking gate** on F5. +- **B13 → B14.** Both touch `find_function_name` in c.rs/cpp.rs. B14 only works *after* B13 makes the walker descend through `pointer_declarator`. +- **B6+B7 are one work item** (unified config-hash), not two. + +--- + +## 3. Verified bug backlog (ordered by severity — quick wins) + +### HIGH +| ID | Bug | File:line | Fix | +|----|-----|-----------|-----| +| B1 | **Token budget bypass: oversized first file always emitted in full** — `tokens_used > 0` guard short-circuits on index 0; parallel uses `buf.len()/4`, serial uses raw `metadata().len()/4`. | markdown.rs:158-178 (parallel), :257-268 (serial) | Apply budget to first file too; unify both paths on `estimate_tokens()`. (= F4) | +| B12 | **Java visibility filter non-functional both directions** — `get_visibility` returns `Visibility::All` unconditionally. Same no-op in cpp.rs. Bundle the Java `field`→`Constant` mis-kinding fix. | java.rs:147-152 (applied :162,213,248,283,318); cpp.rs:211 | Scan the `modifiers` node for public/private/protected. (= F6) | +| B13 | **C/C++ pointer/reference-return functions silently dropped** — `find_function_name` only checks direct `function_declarator` children. **Fix before B14.** | c.rs:302-315; cpp.rs:384-400 | Recurse through `pointer_declarator`/`reference_declarator`/`parenthesized_declarator`. | + +### MEDIUM +| ID | Bug | File:line | Fix | +|----|-----|-----------|-----| +| B2 | **Content hash covers files that truncation omits — and the auto-diff path emits no content hash at all** (uses volatile `**Generated:**` timestamp). | markdown.rs:88-104; auto-diff path in lib.rs | Hash bytes actually written; give the auto-diff path a real content hash. (pairs with F4) | +| B6+B7 | **Cache/auto-diff config-hash omits output-affecting fields — fix as ONE unified-hash pass** — both omit `encoding_strategy`, `diff_only`, `output_folder`, `timestamped_output`; separately `effective_config` (lib.rs:338-346) stops after line_numbers so resolved `--signatures/--structure/--truncate/--visibility` never reach `final_config`. | cache.rs:94-119; state.rs:225-250; lib.rs:338-346, 862-866 | Collapse both hashers into one shared canonical fingerprint fn; add the four omitted fields; propagate resolved fields into `final_config`. | +| B8 | **`--diff-only` silently ignored when `auto_diff` off** — full content emitted, no warning. | lib.rs:333, 544-575 | Warn/error when `diff_only && !auto_diff`. | +| B14 | **C++ qualified return type misread as function name** — `std::string s(...)` → name="std::string". **Sequence after B13.** | cpp.rs:384-400 | Resolve name strictly from inside `function_declarator`. | +| B15 | **Rust bodiless trait methods (`function_signature_item`) dropped** — trait views, where signatures matter most, are incomplete. | rust.rs:138-184, :200 | Add `"function_signature_item"` arm; add to structure counter and `walk_for_boundary`. | + +### LOW (cheap, bundle opportunistically) +| ID | Bug | File:line | Fix | +|----|-----|-----------|-----| +| B3 | Invalid `--visibility` values silently coerced to `all`. | cli.rs:67-69 | clap `ValueEnum`. (= F7) | +| B4 | Invalid `--truncate` values silently accepted; flag inert. | cli.rs:63-65 | `ValueEnum {Smart,Byte}`. (= F5 + F7) | +| B5 | Invalid `encoding_strategy` in config silently falls back to `detect`. **Config-only — NOT covered by F7.** | config.rs:71; markdown.rs:438-450 | Validate on config load; warn on others. | +| B9 | Token-count estimate diverges from output for non-UTF-8/binary; ignores tree-sitter enrichment and `--max-tokens`. **Folded into F4.** | token_count.rs:34-46 | Share one rendering fn between counter and renderer. | +| B16 | Python class base list double-parenthesized: `class User((Base))`. | python.rs:227, 236-240 | Strip parens before re-wrapping. | +| B17 | C/C++ struct/enum/alias drop inheritance/base types/aliased targets (`format!`-based). TS class inheritance loss is the same bug. | cpp.rs:316-366; c.rs:230-284; typescript.rs | Byte-slice up to body node / trailing `;`. | +| B18 | Rust `pub(crate)`/`pub(super)` reported as fully public. | rust.rs:221-237 | Inspect modifier text; treat restricted as a non-public tier. | +| B19 | `find_smart_truncation_point` fallback can return a non-char-boundary offset → panic when wired up. **Blocking gate on F5.** | truncation.rs:9-19 + per-lang fallback | Clamp with `ensure_utf8_boundary` before returning. | +| B20 | fs2 lock not atomic on crash (`set_len(0)` then write); truncated cache → silently dropped baseline. | cache.rs:167-204 | Write temp file + `fs::rename`. (pairs with fs2→fs4) | + +--- + +## 4. Feature roadmap (prioritized) + +| Priority | Feature | Effort | Impact | Rationale | +|----------|---------|--------|--------|-----------| +| **P0** | Model-accurate tokenizers (`--encoding`, o200k_base default) | S | High | cl100k_base under-counts every current OpenAI model; o200k_base already in pinned 0.9.1 | +| **P0** | stdout/pipe output | M | High | Unblocks `context-builder \| llm`; renderer already writes to `impl Write` | +| **P0** | Multi-format output (XML for Claude; later JSON) | L | High | Repomix *defaults* to XML; Anthropic recommends XML tags. Most-cited competitive gap | +| **P0** | Wire up `--truncate smart` (verified dead code) | M | High | Headline feature that does nothing today | +| **P1** | MCP server (`--mcp`, rmcp, stdio) | M-L | High | Both top Rust competitors ship it; agents consume context via MCP. Build on a library API first. Defer to v0.10 | +| **P1** | Secret scanning/redaction (`--redact`) | M | Med-High | Piping a repo to an LLM leaks keys; pure-Rust fits single-binary ethos | +| **P1** | Glob include patterns (`--include 'src/**/*.rs'`) | M | Medium | We only filter by extension; `file_utils.rs` already uses `OverrideBuilder` | +| **P1** | Remote-repo ingestion (`--remote `, shallow clone) | M | Medium | Repomix/gitingest/codefetch all do it; removes first-use friction | +| **P1** | Git-aware diff baselines (`--diff-against `) | L | High | Turns auto-diff into "vs main" — highest-value evolution of the diff moat | +| **P2** | Doc-comment extraction into signatures | M | High (reasoning) | High cross-model consensus as a per-token-reasoning win | +| **P2** | Dependency/module graph + symbol map | L | High (reasoning) | Highest-consensus unshipped feature (9/10 models) | +| **P2** | More tree-sitter languages (C#, Ruby, PHP, Kotlin, Swift, Bash) | L | Medium | 8 langs misses popular ecosystems; pair with query-migration refactor | +| **P2** | Make tree-sitter a default feature | S | Medium | `cargo install` silently degrades AST features today | +| **P2** | Library/crate API surface | M | Medium | Foundation an MCP server should build on | +| **P3** | Watch mode, template engine, web playground/VSCode ext | M-L | Low-Med | Nice-to-haves / heavy distribution lifts; later | + +--- + +## 5. Dependency & maintenance + +Stack health is **good** — no open advisory affects pinned versions; crossbeam-channel already on patched 0.5.15. + +**Bump (feature-relevant):** `tiktoken-rs` 0.9.1 → 0.12.0 (adds o-series/gpt-5/o200k_harmony mappings — an *enhancement* to F1, not a hard blocker since o200k_base already exists in 0.9.1). + +**Dependency diet (one mechanical pass, edition-2024 MSRV makes std swaps free):** +- **Drop `walkdir`** — declared direct dep but *never imported* (traversal is all via `ignore`). +- **`fs2` → `fs4`** — fs2 unreleased since 2017 (RustSec unmaintained profile). Single lock site (cache.rs:7). Do alongside B20. +- **`num_cpus` → `std::thread::available_parallelism()`** — one site (markdown.rs:125). +- **`once_cell::Lazy` → `std::sync::LazyLock`** — one site (token_count.rs:10). + +**Routine patch bumps** (manifest is source of truth): clap, chrono, tempfile (and de-dupe — listed in both `[dependencies]` and `[dev-dependencies]`), serde_json, toml, rayon floor, env_logger; **tree-sitter core + grammars together** (ABI pairing). + +**CI/tooling gap:** No `cargo audit`/`cargo deny` and no dependabot/renovate. Add both (parallel infra track, not on the v0.9.0 critical path). + +--- + +## 6. DX, docs & distribution + +**Critical-path doc fixes (ship in v0.9.0):** +- **README.md:270** — `--truncate` documents `none (default) or smart`. Both wrong: clap default is `smart` (cli.rs:64), and `none` is fabricated. Fix to `smart (AST-aware, default) or byte`. +- **SKILL.md:160** — documents `smart … or simple`. `simple` isn't a real mode. Standardize on `smart`/`byte` everywhere. +- **README/SKILL** advertise "smart AST-boundary truncation" as working (README:80) — it's dead code. Either ship F5 or soften the claim. +- **README.md:257-272** — omits `--visibility` entirely (a real shipping flag) and never documents the `private` value. +- **README.md:95** still has `curl … | bash` despite commit 87657bf removing it from SKILL.md for a VirusTotal flag. Make this a release-gate checklist item. + +**Critical-path distribution (ship in v0.9.0):** +1. **`[package.metadata.binstall]`** (S) — prebuilt archives already exist but `cargo binstall context-builder` can't find them. +2. **aarch64-unknown-linux-gnu release target** (M, **HIGH**) — install.sh advertises ARM Linux but release.yml never builds it → `curl|install.sh` 404s on Raspberry Pi/ARM cloud/WSL-ARM today. +3. **`examples/` dir + sample output.md** (S) — no before/after sample showcases the differentiators. + +**Parallel infra track (does NOT gate the feature release):** +- AGENTS.md release process covers only crates.io; omits the tag-triggered binary release and winget bump. AGENTS.md/DEVELOPMENT.md layout omits `src/tree_sitter/`. +- CI MSRV job is fake — pins `stable`, no `rust-version` in Cargo.toml. Add `rust-version = "1.85"`. +- Homebrew tap (L), Winget automation (manifest stuck at 0.8.2), CHANGELOG-driven release notes (S), `cargo audit` + dependabot. + +--- + +## 7. Deferred / stretch (later releases) + +- **v0.10 strategic bets:** MCP server, remote-repo ingestion, git-aware `--diff-against `, library/crate API. Ship the clean **library API first**, then build MCP on top of it. +- **Architectural refactor (high internal leverage):** unify on `ResolvedConfig` end-to-end + stream auto-diff through a `Writer` + store hashes-not-content in the cache — retires an entire cluster of bugs (B6/B7, BTreeMap-ordering family, auto-diff OOM, encoding-bypass). Worth doing **before** building MCP/library API. Pairs with extracting `auto_diff.rs` out of the 2505-line lib.rs. (F4's diff-path point-patches are knowingly throwaway once this lands.) +- **Output-quality bets:** dependency/module graph + symbol map, doc-comment extraction, per-file token statistics, smart lockfile summarization, configurable relevance rules in TOML. +- **tree-sitter depth:** migrate hand-rolled walkers to `.scm` queries (de-risks every future language), add C#/Ruby/PHP/Kotlin/Swift/Bash, TS class inheritance/generics preservation (B17's TS parallel). +- **Distribution/ecosystem (heavy):** web playground, VSCode/browser extensions, Docker image + GHCR, GitHub Action. + +**Bottom line:** Ship v0.9.0 "Trustworthy Output" soon — accurate tokenizers, honest features (truncate/visibility actually work, verified by a new tree-sitter integration test), and XML+stdout — bundled with the verified bug fixes and the dependency diet. Keep the infra overhaul on a parallel track. It restores momentum, plays to the project's precision identity, and lays clean groundwork (library API, unified config, streamed diff) for the MCP/remote-repo bets in v0.10. diff --git a/src/lib.rs b/src/lib.rs index 25677bd..6b32c22 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -246,7 +246,7 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io } if !large_files.is_empty() { - large_files.sort_by(|a, b| b.1.cmp(&a.1)); // Sort by size descending + large_files.sort_by_key(|b| std::cmp::Reverse(b.1)); // Sort by size descending eprintln!( "\n⚠ {} large file(s) detected (>{} KB):", large_files.len(), @@ -895,7 +895,7 @@ fn detect_major_file_types() -> io::Result> { // Convert to vector of (extension, count) pairs and sort by count let mut extensions: Vec<(String, usize)> = extension_counts.into_iter().collect(); - extensions.sort_by(|a, b| b.1.cmp(&a.1)); + extensions.sort_by_key(|b| std::cmp::Reverse(b.1)); // Take the top 5 extensions or all if less than 5 let top_extensions: Vec = extensions.into_iter().take(5).map(|(ext, _)| ext).collect(); diff --git a/src/tree_sitter/languages/c.rs b/src/tree_sitter/languages/c.rs index a0f4d3a..efee2e2 100644 --- a/src/tree_sitter/languages/c.rs +++ b/src/tree_sitter/languages/c.rs @@ -300,15 +300,49 @@ impl CSupport { } fn find_function_name(&self, node: &tree_sitter::Node, source: &str) -> Option { + // The `function_declarator` may be nested inside a `pointer_declarator` + // (functions returning a pointer, e.g. `char *dup(const char *)`) or a + // `parenthesized_declarator`, so descend through those wrappers instead of + // only inspecting direct children — otherwise pointer-returning functions + // are silently dropped. + let decl = self.find_function_declarator(node)?; + self.declarator_name(&decl, source) + } + + fn find_function_declarator<'a>( + &self, + node: &tree_sitter::Node<'a>, + ) -> Option> { + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + match child.kind() { + "function_declarator" => return Some(child), + "pointer_declarator" | "parenthesized_declarator" => { + if let Some(found) = self.find_function_declarator(&child) { + return Some(found); + } + } + _ => {} + } + } + None + } + + /// Resolve the declared name inside a `function_declarator`, skipping the + /// parameter list and descending through nested declarators. + fn declarator_name(&self, node: &tree_sitter::Node, source: &str) -> Option { let mut cursor = node.walk(); for child in node.children(&mut cursor) { - if child.kind() == "function_declarator" { - let mut inner_cursor = child.walk(); - for inner in child.children(&mut inner_cursor) { - if inner.kind() == "identifier" { - return Some(source[inner.start_byte()..inner.end_byte()].to_string()); + match child.kind() { + "identifier" | "field_identifier" => { + return Some(source[child.start_byte()..child.end_byte()].to_string()); + } + "pointer_declarator" | "parenthesized_declarator" | "function_declarator" => { + if let Some(name) = self.declarator_name(&child, source) { + return Some(name); } } + _ => {} } } None @@ -400,6 +434,26 @@ void hello(const char* name) { assert!(funcs.len() >= 2); } + #[test] + fn test_pointer_return_function_extracted() { + // Regression: functions returning a pointer were silently dropped + // because find_function_name only inspected direct children. + let source = r#" +char *dup(const char *s) { + return 0; +} +"#; + + let signatures = CSupport.extract_signatures(source, Visibility::All); + assert!( + signatures + .iter() + .any(|s| s.kind == SignatureKind::Function && s.name == "dup"), + "pointer-returning function `dup` should be extracted, got: {:?}", + signatures.iter().map(|s| &s.name).collect::>() + ); + } + #[test] fn test_extract_struct_signature() { let source = r#" diff --git a/src/tree_sitter/languages/cpp.rs b/src/tree_sitter/languages/cpp.rs index fcfdfc8..0556a76 100644 --- a/src/tree_sitter/languages/cpp.rs +++ b/src/tree_sitter/languages/cpp.rs @@ -382,18 +382,56 @@ impl CppSupport { } fn find_function_name(&self, node: &tree_sitter::Node, source: &str) -> Option { + // Resolve the name strictly from inside the `function_declarator`, + // descending through pointer/reference/parenthesized wrappers so that + // pointer- and reference-returning functions are not dropped. We do NOT + // fall back to a sibling identifier — doing so would misread a qualified + // return type like `std::string` as the function name. + let decl = self.find_function_declarator(node)?; + self.declarator_name(&decl, source) + } + + fn find_function_declarator<'a>( + &self, + node: &tree_sitter::Node<'a>, + ) -> Option> { let mut cursor = node.walk(); for child in node.children(&mut cursor) { - if child.kind() == "function_declarator" || child.kind() == "reference_declarator" { - let mut inner_cursor = child.walk(); - for inner in child.children(&mut inner_cursor) { - if inner.kind() == "identifier" || inner.kind() == "qualified_identifier" { - return Some(source[inner.start_byte()..inner.end_byte()].to_string()); + match child.kind() { + "function_declarator" => return Some(child), + "pointer_declarator" | "reference_declarator" | "parenthesized_declarator" => { + if let Some(found) = self.find_function_declarator(&child) { + return Some(found); } } + _ => {} } - if child.kind() == "identifier" || child.kind() == "qualified_identifier" { - return Some(source[child.start_byte()..child.end_byte()].to_string()); + } + None + } + + /// Resolve the declared name inside a `function_declarator`, skipping the + /// parameter list and descending through nested declarators. + fn declarator_name(&self, node: &tree_sitter::Node, source: &str) -> Option { + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + match child.kind() { + "identifier" + | "qualified_identifier" + | "field_identifier" + | "destructor_name" + | "operator_name" => { + return Some(source[child.start_byte()..child.end_byte()].to_string()); + } + "pointer_declarator" + | "reference_declarator" + | "parenthesized_declarator" + | "function_declarator" => { + if let Some(name) = self.declarator_name(&child, source) { + return Some(name); + } + } + _ => {} } } None @@ -508,6 +546,43 @@ void greet(const std::string& name) { assert!(funcs.len() >= 2); } + #[test] + fn test_pointer_and_qualified_return_functions() { + // Regression: pointer-returning functions were dropped, and a qualified + // return type (`std::string`) was misread as the function name. + let source = r#" +int* makeArray(int n) { + return nullptr; +} + +std::string greet(const std::string& who) { + return who; +} +"#; + + let signatures = CppSupport.extract_signatures(source, Visibility::All); + let names: Vec<&str> = signatures + .iter() + .filter(|s| s.kind == SignatureKind::Function) + .map(|s| s.name.as_str()) + .collect(); + assert!( + names.contains(&"makeArray"), + "pointer-returning function dropped; got {:?}", + names + ); + assert!( + names.contains(&"greet"), + "function with qualified return type dropped; got {:?}", + names + ); + assert!( + !names.contains(&"std::string"), + "qualified return type misread as function name; got {:?}", + names + ); + } + #[test] fn test_extract_struct_signature() { let source = r#" diff --git a/src/tree_sitter/languages/java.rs b/src/tree_sitter/languages/java.rs index d1eddb9..07b4112 100644 --- a/src/tree_sitter/languages/java.rs +++ b/src/tree_sitter/languages/java.rs @@ -144,11 +144,23 @@ impl JavaSupport { } } - #[allow(dead_code)] - fn get_visibility(&self, _node: &tree_sitter::Node) -> Visibility { - // Java visibility is determined by modifiers - // Simplified: check for public/private/protected keywords in AST modifiers - Visibility::All + /// Determine a Java declaration's visibility from its `modifiers` child. + /// `public` → Public; `private`/`protected` and package-private (no modifier) + /// → Private. This keeps `--visibility public` scoped to the true public API + /// surface and `--visibility private` to everything else. + fn get_visibility(&self, node: &tree_sitter::Node, source: &str) -> Visibility { + let mut cursor = node.walk(); + for child in node.children(&mut cursor) { + if child.kind() == "modifiers" { + let text = &source[child.start_byte()..child.end_byte()]; + if text.split_whitespace().any(|t| t == "public") { + return Visibility::Public; + } + return Visibility::Private; + } + } + // No modifiers node → package-private, treated as non-public. + Visibility::Private } fn extract_method_signature( @@ -157,7 +169,7 @@ impl JavaSupport { node: &tree_sitter::Node, visibility: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if visibility == Visibility::Public && vis != Visibility::Public { return None; @@ -208,7 +220,7 @@ impl JavaSupport { node: &tree_sitter::Node, visibility: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if visibility == Visibility::Public && vis != Visibility::Public { return None; @@ -243,7 +255,7 @@ impl JavaSupport { node: &tree_sitter::Node, visibility: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if visibility == Visibility::Public && vis != Visibility::Public { return None; @@ -278,7 +290,7 @@ impl JavaSupport { node: &tree_sitter::Node, visibility: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if visibility == Visibility::Public && vis != Visibility::Public { return None; @@ -313,7 +325,7 @@ impl JavaSupport { node: &tree_sitter::Node, visibility: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if visibility == Visibility::Public && vis != Visibility::Public { return None; @@ -497,6 +509,51 @@ public class Dog extends Animal implements Runnable { assert_eq!(classes[0].name, "Dog"); } + #[test] + fn test_visibility_public_filter() { + // Regression: get_visibility returned All unconditionally, so the + // `public` filter dropped every symbol. + let source = r#" +public class Api { + public int publicMethod() { return 1; } + private int privateMethod() { return 2; } + int packageMethod() { return 3; } +} +"#; + + let public_only = JavaSupport.extract_signatures(source, Visibility::Public); + assert!(public_only.iter().any(|s| s.name == "publicMethod")); + assert!( + !public_only.iter().any(|s| s.name == "privateMethod"), + "private method leaked into `public` filter" + ); + assert!( + !public_only.iter().any(|s| s.name == "packageMethod"), + "package-private method leaked into `public` filter" + ); + } + + #[test] + fn test_visibility_private_filter_excludes_public() { + let source = r#" +public class Api { + public int publicMethod() { return 1; } + private int privateMethod() { return 2; } +} +"#; + + let private_only = JavaSupport.extract_signatures(source, Visibility::Private); + assert!(private_only.iter().any(|s| s.name == "privateMethod")); + assert!( + !private_only.iter().any(|s| s.name == "publicMethod"), + "public method leaked into `private` filter" + ); + assert!( + !private_only.iter().any(|s| s.name == "Api"), + "public class leaked into `private` filter" + ); + } + #[test] fn test_extract_structure() { let source = r#" diff --git a/src/tree_sitter/languages/python.rs b/src/tree_sitter/languages/python.rs index be20c3b..c469479 100644 --- a/src/tree_sitter/languages/python.rs +++ b/src/tree_sitter/languages/python.rs @@ -234,9 +234,10 @@ impl PythonSupport { full_sig.push_str("class "); full_sig.push_str(&name); if let Some(b) = &bases { - full_sig.push('('); + // `argument_list` already includes the surrounding parentheses, + // e.g. "(Base)", so append it directly to avoid double-wrapping + // into `class User((Base))`. full_sig.push_str(b); - full_sig.push(')'); } Some(Signature { @@ -360,6 +361,32 @@ class User: assert_eq!(classes[0].name, "User"); } + #[test] + fn test_class_base_list_not_double_wrapped() { + // Regression: `argument_list` already includes parentheses, so the base + // list was rendered as `class User((Base))`. + let source = r#" +class User(Base): + pass +"#; + + let signatures = PythonSupport.extract_signatures(source, Visibility::All); + let class = signatures + .iter() + .find(|s| s.kind == SignatureKind::Class) + .expect("class extracted"); + assert!( + class.full_signature.contains("class User(Base)"), + "expected single-wrapped base list, got: {}", + class.full_signature + ); + assert!( + !class.full_signature.contains("((Base))"), + "base list double-wrapped: {}", + class.full_signature + ); + } + #[test] fn test_file_extensions() { assert!(PythonSupport.supports_extension("py")); diff --git a/src/tree_sitter/languages/rust.rs b/src/tree_sitter/languages/rust.rs index ea1f466..744d756 100644 --- a/src/tree_sitter/languages/rust.rs +++ b/src/tree_sitter/languages/rust.rs @@ -102,6 +102,7 @@ impl RustSupport { let is_item = matches!( node.kind(), "function_item" + | "function_signature_item" | "struct_item" | "enum_item" | "trait_item" @@ -181,6 +182,11 @@ impl RustSupport { signatures.push(sig); } } + "function_signature_item" => { + if let Some(sig) = self.extract_function_sig_item(source, node, visibility) { + signatures.push(sig); + } + } _ => {} } @@ -204,6 +210,7 @@ impl RustSupport { "const_item" => structure.constants += 1, "type_item" => structure.type_aliases += 1, "macro_definition" => structure.macros += 1, + "function_signature_item" => structure.functions += 1, "use_declaration" => { structure .imports @@ -218,22 +225,23 @@ impl RustSupport { } } - fn is_public(&self, node: &tree_sitter::Node) -> bool { + /// Determine a Rust item's visibility. Only a bare `pub` counts as Public; + /// restricted forms (`pub(crate)`, `pub(super)`, `pub(in path)`) are not part + /// of the crate's public API, so they are treated as Private for filtering. + fn get_visibility(&self, node: &tree_sitter::Node, source: &str) -> Visibility { let mut cursor = node.walk(); for child in node.children(&mut cursor) { if child.kind() == "visibility_modifier" { - return true; + let text = self.node_text(source, &child).trim(); + return if text == "pub" { + Visibility::Public + } else { + // pub(crate) / pub(super) / pub(self) / pub(in ...) → restricted + Visibility::Private + }; } } - false - } - - fn get_visibility(&self, node: &tree_sitter::Node) -> Visibility { - if self.is_public(node) { - Visibility::Public - } else { - Visibility::Private - } + Visibility::Private } fn node_text<'a>(&self, source: &'a str, node: &tree_sitter::Node) -> &'a str { @@ -246,7 +254,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -285,13 +293,51 @@ impl RustSupport { }) } + /// Extract a bodiless function declaration (`function_signature_item`), e.g. a + /// required trait method `fn draw(&self);` or an `extern` block declaration. + /// These have no `block`, so the slice-before-body path finds no body; we + /// slice the whole node and drop the trailing `;` to preserve generics, + /// return types, and where-clauses. + fn extract_function_sig_item( + &self, + source: &str, + node: &tree_sitter::Node, + visibility_filter: Visibility, + ) -> Option { + let vis = self.get_visibility(node, source); + if !vis.matches_filter(visibility_filter) { + return None; + } + + let name = self.find_child_text(node, "identifier", source)?; + let params = self.find_child_text(node, "parameters", source); + let return_type = self.find_child_text(node, "return_type", source); + + let text = self.node_text(source, node).trim_end(); + let full_signature = text + .strip_suffix(';') + .unwrap_or(text) + .trim_end() + .to_string(); + + Some(Signature { + kind: SignatureKind::Function, + name, + params, + return_type, + visibility: vis, + line_number: node.start_position().row + 1, + full_signature, + }) + } + fn extract_struct_signature( &self, source: &str, node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -333,7 +379,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -369,7 +415,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -428,7 +474,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -459,7 +505,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -490,7 +536,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -521,7 +567,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node); + let vis = self.get_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -607,6 +653,46 @@ fn private_fn() {} assert_eq!(signatures[0].name, "public_fn"); } + #[test] + fn test_extract_trait_method_signatures() { + // Regression: bodiless trait methods (`function_signature_item`) were + // dropped entirely, leaving trait views incomplete. + let source = r#" +pub trait Drawable { + fn draw(&self); + fn area(&self) -> f64; +} +"#; + + let signatures = RustSupport.extract_signatures(source, Visibility::All); + assert!( + signatures.iter().any(|s| s.name == "draw"), + "bodiless trait method `draw` should be extracted" + ); + let area = signatures + .iter() + .find(|s| s.name == "area") + .expect("bodiless trait method `area` should be extracted"); + assert!(area.full_signature.contains("-> f64")); + assert!(!area.full_signature.ends_with(';')); + } + + #[test] + fn test_pub_crate_is_not_public() { + // Regression: pub(crate)/pub(super) were reported as fully public. + let source = r#" +pub fn fully_public() {} +pub(crate) fn crate_only() {} +"#; + + let public_only = RustSupport.extract_signatures(source, Visibility::Public); + assert!(public_only.iter().any(|s| s.name == "fully_public")); + assert!( + !public_only.iter().any(|s| s.name == "crate_only"), + "pub(crate) must not pass the `public` visibility filter" + ); + } + #[test] fn test_extract_struct_signature() { let source = r#" From e8b07edb6d96fb64d5b317bf63a77ddd303467bb Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 07:38:26 -0300 Subject: [PATCH 02/10] feat(tokenizer): add --encoding (default o200k_base) and validate enum flags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.9.0 "Trustworthy Output" — F1 + F7. F1 — token counting is now model-accurate and selectable: - New `--encoding {o200k_base|cl100k_base}` flag + `encoding` TOML option, defaulting to o200k_base (GPT-4o / o-series). The old hardcoded cl100k_base under-counts every modern OpenAI model. - Encoding threads through estimate_tokens / count_file_tokens / count_tree_tokens and the `--token-count` report. F7 — invalid flag values are now rejected instead of silently coerced: - `--truncate`, `--visibility`, `--encoding` use clap value_parser, so bad values error with `[possible values: ...]` and `--help` lists them (fixes B3 --visibility and B4 --truncate silent acceptance). Docs: document `--encoding`/`--visibility`, fix the `--truncate` default in README (was wrongly "none (default)"; clap default is "smart", modes smart|byte). Verified: 321 lib tests (all features) + full integration suite pass; clippy --all-targets --all-features -D warnings clean; fmt clean. Manual run confirms o200k vs cl100k produce different counts and invalid values are rejected. --- CHANGELOG.md | 7 ++ README.md | 10 +- benches/context_bench.rs | 2 + src/cli.rs | 29 +++++- src/config.rs | 6 ++ src/config_resolver.rs | 16 +++ src/lib.rs | 78 +++++++++++---- src/token_count.rs | 130 ++++++++++++++++++------- tests/cli_integration.rs | 8 ++ tests/test_auto_diff.rs | 7 ++ tests/test_binary_file_autodiff.rs | 3 + tests/test_comprehensive_edge_cases.rs | 10 ++ tests/test_config_resolution.rs | 6 ++ tests/test_cwd_independence.rs | 2 + tests/test_determinism.rs | 9 ++ tests/test_parallel_memory.rs | 3 + tests/test_phase4_integration.rs | 2 + 17 files changed, 270 insertions(+), 58 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bae9788..86fdf36 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,13 @@ All notable changes to this project will be documented in this file. > Planned per `docs/research/next-release-roadmap.md`. This section tracks work as it lands. +- **Accurate token counting — selectable tokenizer (F1)** + - New `--encoding {o200k_base|cl100k_base}` flag (and `encoding` config option), defaulting to **`o200k_base`** (GPT-4o / o-series). The previous hardcoded `cl100k_base` under-counts every modern OpenAI model. The selected encoding flows through `--token-count` (and, once F4 lands, `--max-tokens`) + - `--token-count` now reports counts using the chosen encoding + +- **CLI validation — invalid values are now rejected (F7)** + - `--truncate`, `--visibility`, and `--encoding` are constrained to their allowed sets via clap `value_parser`, so invalid values produce a clear `error: ... [possible values: ...]` instead of being silently coerced (fixes B3 `--visibility` and B4 `--truncate` silent-acceptance) and `--help` now lists the valid values + - **Tree-Sitter correctness — honest `--signatures` / `--visibility`** - Fixed Java `--visibility` filter being completely non-functional — `get_visibility` returned `Visibility::All` unconditionally, so `--visibility public` dropped *every* Java symbol and `--visibility private` leaked *all* of them. It now inspects the declaration's `modifiers` node (B12) - Fixed C/C++ functions returning a pointer or reference being silently dropped — `find_function_name` now descends through `pointer_declarator` / `reference_declarator` / `parenthesized_declarator` to locate the `function_declarator` (B13) diff --git a/README.md b/README.md index d9d799d..c3483c0 100644 --- a/README.md +++ b/README.md @@ -218,6 +218,10 @@ preview = false # Token counting mode token_count = false +# Tokenizer encoding for --token-count / --max-tokens +# Options: "o200k_base" (GPT-4o / o-series, default) or "cl100k_base" (GPT-4 / GPT-3.5) +encoding = "o200k_base" + # Automatically answer yes to all prompts @@ -267,14 +271,16 @@ If you also set `diff_only = true` (or pass `--diff-only`), the full “## Files - `--clear-cache` - Remove stored state used for auto-diff; next run becomes a fresh baseline. - `--signatures` - Replace full file content with extracted function/class signatures *(requires tree-sitter)*. - `--structure` - Append structural summary (function/class counts) to each file *(requires tree-sitter)*. -- `--truncate ` - Truncation strategy: `none` (default) or `smart` (AST-boundary aware) *(requires tree-sitter)*. +- `--truncate ` - Truncation strategy for `--max-tokens`: `smart` (AST-boundary aware, default) or `byte` *(requires tree-sitter)*. +- `--visibility ` - Filter extracted signatures by visibility: `all` (default), `public`, or `private` *(requires tree-sitter)*. +- `--encoding ` - Tokenizer used for `--token-count` and `--max-tokens`: `o200k_base` (GPT-4o / o-series, default) or `cl100k_base` (GPT-4 / GPT-3.5). - `--init` - Initialize a new `context-builder.toml` config file. - `-h, --help` - Show help information. --- ## Token Counting -Context Builder uses the `tiktoken-rs` library to provide accurate token counts for OpenAI models. This ensures that the token count is as close as possible to the actual number of tokens that will be used by the model. +Context Builder uses the `tiktoken-rs` library to provide accurate token counts. By default it uses the **`o200k_base`** encoding, which matches GPT-4o and the o-series (and is a close approximation for current frontier models). Use `--encoding cl100k_base` for GPT-4 / GPT-3.5. The selected encoding applies to both `--token-count` and `--max-tokens` budgeting. --- diff --git a/benches/context_bench.rs b/benches/context_bench.rs index 5f7785e..83cc3c3 100644 --- a/benches/context_bench.rs +++ b/benches/context_bench.rs @@ -209,6 +209,7 @@ fn bench_scenario(c: &mut Criterion, spec: DatasetSpec, line_numbers: bool) { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -257,6 +258,7 @@ fn bench_scenario(c: &mut Criterion, spec: DatasetSpec, line_numbers: bool) { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/src/cli.rs b/src/cli.rs index 1ca0310..58c13c5 100644 --- a/src/cli.rs +++ b/src/cli.rs @@ -61,12 +61,17 @@ pub struct Args { pub structure: bool, /// Truncation mode for max-tokens: "smart" (AST boundaries) or "byte" - #[clap(long, value_name = "MODE", default_value = "smart")] + #[clap(long, value_name = "MODE", value_parser = ["smart", "byte"], default_value = "smart")] pub truncate: String, /// Filter signatures by visibility: "all", "public", or "private" - #[clap(long, default_value = "all")] + #[clap(long, value_parser = ["all", "public", "private"], default_value = "all")] pub visibility: String, + + /// Tokenizer encoding used for `--token-count` and `--max-tokens` budgeting. + /// "o200k_base" matches GPT-4o/o-series (default); "cl100k_base" matches GPT-4/3.5. + #[clap(long, value_parser = ["o200k_base", "cl100k_base"], default_value = "o200k_base")] + pub encoding: String, } #[cfg(test)] @@ -211,4 +216,24 @@ mod tests { .expect("should parse with default visibility"); assert_eq!(args_default.visibility, "all"); } + + #[test] + fn parses_encoding_flag_with_default() { + let args = Args::try_parse_from(["context-builder", "--encoding", "cl100k_base"]) + .expect("should parse encoding flag"); + assert_eq!(args.encoding, "cl100k_base"); + + let args_default = + Args::try_parse_from(["context-builder"]).expect("should parse with default encoding"); + assert_eq!(args_default.encoding, "o200k_base"); + } + + #[test] + fn rejects_invalid_enum_values() { + // value_parser restricts these flags to their allowed sets, so invalid + // values now error at parse time instead of being silently coerced. + assert!(Args::try_parse_from(["context-builder", "--truncate", "bogus"]).is_err()); + assert!(Args::try_parse_from(["context-builder", "--visibility", "bogus"]).is_err()); + assert!(Args::try_parse_from(["context-builder", "--encoding", "bogus"]).is_err()); + } } diff --git a/src/config.rs b/src/config.rs index 5ce891a..6f932c8 100644 --- a/src/config.rs +++ b/src/config.rs @@ -84,6 +84,11 @@ pub struct Config { /// Filter signatures by visibility: "all", "public", or "private" pub visibility: Option, + + /// Tokenizer encoding for token counting/budgeting. + /// - "o200k_base": GPT-4o / o-series (default) + /// - "cl100k_base": GPT-4 / GPT-3.5 + pub encoding: Option, } /// Load configuration from `context-builder.toml` in the current working directory. @@ -269,6 +274,7 @@ invalid_toml [ assert!(config.structure.is_none()); assert!(config.truncate.is_none()); assert!(config.visibility.is_none()); + assert!(config.encoding.is_none()); } #[test] diff --git a/src/config_resolver.rs b/src/config_resolver.rs index ab5339d..5af1bcd 100644 --- a/src/config_resolver.rs +++ b/src/config_resolver.rs @@ -31,6 +31,7 @@ pub struct ResolvedConfig { pub structure: bool, pub truncate: String, pub visibility: String, + pub encoding: String, } /// Result of configuration resolution including the final config and any warnings @@ -96,6 +97,14 @@ pub fn resolve_final_config(mut args: Args, config: Option) -> ConfigRes .clone() .unwrap_or_else(|| args.visibility.clone()) }, + encoding: if args.encoding != "o200k_base" { + args.encoding.clone() + } else { + final_config + .encoding + .clone() + .unwrap_or_else(|| args.encoding.clone()) + }, }; ConfigResolution { @@ -239,6 +248,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -276,6 +286,7 @@ mod tests { yes: false, // Default value diff_only: false, // Default value clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -324,6 +335,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -358,6 +370,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -390,6 +403,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -424,6 +438,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -458,6 +473,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/src/lib.rs b/src/lib.rs index 6b32c22..b9dc07b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -26,7 +26,7 @@ use diff::render_per_file_diffs; use file_utils::{collect_files, confirm_overwrite, confirm_processing}; use markdown::generate_markdown; use state::{ProjectState, StateComparison}; -use token_count::{count_file_tokens, count_tree_tokens, estimate_tokens}; +use token_count::{Encoding, count_file_tokens, count_tree_tokens, estimate_tokens}; use tree::{build_file_tree, print_tree}; /// Configuration for diff operations @@ -279,34 +279,44 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io if final_args.token_count { if !silent { + let encoding = final_args.encoding.parse::().unwrap_or_default(); println!("\n# Token Count Estimation\n"); let mut total_tokens = 0; - total_tokens += estimate_tokens("# Directory Structure Report\n\n"); + total_tokens += estimate_tokens(encoding, "# Directory Structure Report\n\n"); if !final_args.filter.is_empty() { - total_tokens += estimate_tokens(&format!( - "This document contains files from the `{}` directory with extensions: {} \n", - final_args.input, - final_args.filter.join(", ") - )); + total_tokens += estimate_tokens( + encoding, + &format!( + "This document contains files from the `{}` directory with extensions: {} \n", + final_args.input, + final_args.filter.join(", ") + ), + ); } else { - total_tokens += estimate_tokens(&format!( - "This document contains all files from the `{}` directory, optimized for LLM consumption.\n", - final_args.input - )); + total_tokens += estimate_tokens( + encoding, + &format!( + "This document contains all files from the `{}` directory, optimized for LLM consumption.\n", + final_args.input + ), + ); } if !final_args.ignore.is_empty() { - total_tokens += estimate_tokens(&format!( - "Custom ignored patterns: {} \n", - final_args.ignore.join(", ") - )); + total_tokens += estimate_tokens( + encoding, + &format!( + "Custom ignored patterns: {} \n", + final_args.ignore.join(", ") + ), + ); } - total_tokens += estimate_tokens("Content hash: 0000000000000000\n\n"); - total_tokens += estimate_tokens("## File Tree Structure\n\n"); - let tree_tokens = count_tree_tokens(&file_tree, 0); + total_tokens += estimate_tokens(encoding, "Content hash: 0000000000000000\n\n"); + total_tokens += estimate_tokens(encoding, "## File Tree Structure\n\n"); + let tree_tokens = count_tree_tokens(&file_tree, 0, encoding); total_tokens += tree_tokens; let file_tokens: usize = files .iter() - .map(|entry| count_file_tokens(base_path, entry, final_args.line_numbers)) + .map(|entry| count_file_tokens(base_path, entry, final_args.line_numbers, encoding)) .sum(); total_tokens += file_tokens; println!("Estimated total tokens: {}", total_tokens); @@ -856,6 +866,7 @@ pub fn run() -> io::Result<()> { structure: resolution.config.structure, truncate: resolution.config.truncate, visibility: resolution.config.visibility, + encoding: resolution.config.encoding, }; // Create final Config with resolved values @@ -1041,6 +1052,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1077,6 +1089,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1118,6 +1131,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1157,6 +1171,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1199,6 +1214,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1242,6 +1258,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1284,6 +1301,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1332,6 +1350,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1379,6 +1398,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1425,6 +1445,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1474,6 +1495,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1520,6 +1542,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1623,6 +1646,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1663,6 +1687,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1703,6 +1728,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1747,6 +1773,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1779,6 +1806,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1816,6 +1844,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: Some(100), signatures: false, @@ -1861,6 +1890,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1892,6 +1922,7 @@ mod tests { yes: true, diff_only: true, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -1937,6 +1968,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2015,6 +2047,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2184,6 +2217,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2225,6 +2259,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2267,6 +2302,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2307,6 +2343,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2336,6 +2373,7 @@ mod tests { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2384,6 +2422,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -2457,6 +2496,7 @@ mod tests { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/src/token_count.rs b/src/token_count.rs index 8dcf985..ea97a3c 100644 --- a/src/token_count.rs +++ b/src/token_count.rs @@ -3,32 +3,74 @@ use once_cell::sync::Lazy; use std::collections::BTreeMap; use std::fs; use std::path::Path; +use std::str::FromStr; /// Token counting utilities for estimating LLM token usage -use tiktoken_rs::{CoreBPE, cl100k_base}; +use tiktoken_rs::{CoreBPE, cl100k_base, o200k_base}; + +/// Tokenizer encoding used for token estimation and budgeting. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub enum Encoding { + /// `o200k_base` — GPT-4o / o-series, and the closest match for current + /// frontier models. Default, because `cl100k_base` under-counts these. + #[default] + O200kBase, + /// `cl100k_base` — GPT-4 / GPT-3.5-turbo. + Cl100kBase, +} -// Initialize the tokenizer once and reuse it -static TOKENIZER: Lazy = Lazy::new(|| cl100k_base().unwrap()); +impl FromStr for Encoding { + type Err = (); -/// Estimates the number of tokens in a text string using a real tokenizer -pub fn estimate_tokens(text: &str) -> usize { - TOKENIZER.encode_with_special_tokens(text).len() + fn from_str(s: &str) -> Result { + match s.trim().to_lowercase().as_str() { + "o200k_base" | "o200k" => Ok(Encoding::O200kBase), + "cl100k_base" | "cl100k" => Ok(Encoding::Cl100kBase), + _ => Err(()), + } + } +} + +// Initialize each tokenizer once and reuse it. +static O200K_BASE: Lazy = Lazy::new(|| o200k_base().unwrap()); +static CL100K_BASE: Lazy = Lazy::new(|| cl100k_base().unwrap()); + +impl Encoding { + fn bpe(self) -> &'static CoreBPE { + match self { + Encoding::O200kBase => &O200K_BASE, + Encoding::Cl100kBase => &CL100K_BASE, + } + } +} + +/// Estimates the number of tokens in a text string using the given encoding. +pub fn estimate_tokens(encoding: Encoding, text: &str) -> usize { + encoding.bpe().encode_with_special_tokens(text).len() } /// Counts the tokens that would be generated for a file -pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) -> usize { +pub fn count_file_tokens( + base_path: &Path, + entry: &DirEntry, + line_numbers: bool, + encoding: Encoding, +) -> usize { let file_path = entry.path(); let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path); // Start with tokens for the file header (path, size, modified time) - let mut token_count = estimate_tokens(&format!( - "\n### File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n", - relative_path.display(), - entry.metadata().map(|m| m.len()).unwrap_or(0), - "Unknown" - )); // Using "Unknown" as placeholder for modified time in estimation + let mut token_count = estimate_tokens( + encoding, + &format!( + "\n### File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n", + relative_path.display(), + entry.metadata().map(|m| m.len()).unwrap_or(0), + "Unknown" + ), + ); // Using "Unknown" as placeholder for modified time in estimation // Add tokens for the code fences - token_count += estimate_tokens("```\n```"); + token_count += estimate_tokens(encoding, "```\n```"); // Try to read file content if let Ok(content) = fs::read_to_string(file_path) { @@ -39,9 +81,9 @@ pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) .enumerate() .map(|(i, line)| format!("{:>4} | {}\n", i + 1, line)) .collect(); - token_count += estimate_tokens(&lines_with_numbers); + token_count += estimate_tokens(encoding, &lines_with_numbers); } else { - token_count += estimate_tokens(&content); + token_count += estimate_tokens(encoding, &content); } } @@ -49,7 +91,11 @@ pub fn count_file_tokens(base_path: &Path, entry: &DirEntry, line_numbers: bool) } /// Counts the tokens that would be generated for the entire file tree section -pub fn count_tree_tokens(tree: &BTreeMap, depth: usize) -> usize { +pub fn count_tree_tokens( + tree: &BTreeMap, + depth: usize, + encoding: Encoding, +) -> usize { let mut token_count = 0; // Add tokens for indentation @@ -58,11 +104,11 @@ pub fn count_tree_tokens(tree: &BTreeMap, depth: for (name, node) in tree { match node { crate::tree::FileNode::File => { - token_count += estimate_tokens(&format!("{}- 📄 {}\n", indent, name)); + token_count += estimate_tokens(encoding, &format!("{}- 📄 {}\n", indent, name)); } crate::tree::FileNode::Directory(children) => { - token_count += estimate_tokens(&format!("{}- 📁 {}\n", indent, name)); - token_count += count_tree_tokens(children, depth + 1); + token_count += estimate_tokens(encoding, &format!("{}- 📁 {}\n", indent, name)); + token_count += count_tree_tokens(children, depth + 1, encoding); } } } @@ -79,17 +125,29 @@ mod tests { fn test_estimate_tokens() { // Test with a simple string let text = "Hello, world!"; - let tokens = estimate_tokens(text); + let tokens = estimate_tokens(Encoding::Cl100kBase, text); // "Hello, world!" is 4 tokens with cl100k_base assert_eq!(tokens, 4); // Test with code-like content let code_text = "fn main() {\n println!(\"Hello, world!\");\n}"; - let tokens = estimate_tokens(code_text); + let tokens = estimate_tokens(Encoding::Cl100kBase, code_text); // This specific code snippet is 12 tokens with cl100k_base assert_eq!(tokens, 12); } + #[test] + fn test_encoding_default_and_parse() { + assert_eq!(Encoding::default(), Encoding::O200kBase); + assert_eq!("o200k_base".parse::(), Ok(Encoding::O200kBase)); + assert_eq!("cl100k_base".parse::(), Ok(Encoding::Cl100kBase)); + assert_eq!("O200K".parse::(), Ok(Encoding::O200kBase)); + assert!("bogus".parse::().is_err()); + // Both encodings tokenize non-empty text to a non-zero count. + assert!(estimate_tokens(Encoding::O200kBase, "hello world") > 0); + assert!(estimate_tokens(Encoding::Cl100kBase, "hello world") > 0); + } + #[test] fn test_count_tree_tokens() { // Create a simple tree structure @@ -100,7 +158,7 @@ mod tests { subdir.insert("file2.md".to_string(), crate::tree::FileNode::File); tree.insert("src".to_string(), crate::tree::FileNode::Directory(subdir)); - let tokens = count_tree_tokens(&tree, 0); + let tokens = count_tree_tokens(&tree, 0, Encoding::Cl100kBase); // "- 📄 file1.rs\n" -> 8 tokens // "- 📁 src\n" -> 6 tokens // " - 📄 file2.md\n" -> 9 tokens @@ -123,7 +181,7 @@ mod tests { .unwrap(); // Estimate tokens for the file - let estimated_tokens = count_file_tokens(dir.path(), &entry, false); + let estimated_tokens = count_file_tokens(dir.path(), &entry, false, Encoding::Cl100kBase); // Generate actual markdown content let mut actual_content = Vec::new(); @@ -139,7 +197,7 @@ mod tests { let actual_content_str = String::from_utf8(actual_content).unwrap(); // Count actual tokens - let actual_tokens = estimate_tokens(&actual_content_str); + let actual_tokens = estimate_tokens(Encoding::Cl100kBase, &actual_content_str); // The estimation should be close to actual (within a reasonable margin) // Allow for some variance due to timestamp differences and minor formatting @@ -159,19 +217,19 @@ mod tests { #[test] fn test_estimate_tokens_empty_string() { - let tokens = estimate_tokens(""); + let tokens = estimate_tokens(Encoding::Cl100kBase, ""); assert_eq!(tokens, 0); } #[test] fn test_estimate_tokens_whitespace_only() { - let tokens = estimate_tokens(" \n\t "); + let tokens = estimate_tokens(Encoding::Cl100kBase, " \n\t "); assert!(tokens > 0); // Whitespace still counts as tokens } #[test] fn test_estimate_tokens_unicode() { - let tokens = estimate_tokens("Hello 世界! 🌍"); + let tokens = estimate_tokens(Encoding::Cl100kBase, "Hello 世界! 🌍"); assert!(tokens > 0); // Unicode characters may be encoded as multiple tokens assert!(tokens >= 4); @@ -191,8 +249,10 @@ mod tests { .unwrap() .unwrap(); - let tokens_without_line_numbers = count_file_tokens(dir.path(), &entry, false); - let tokens_with_line_numbers = count_file_tokens(dir.path(), &entry, true); + let tokens_without_line_numbers = + count_file_tokens(dir.path(), &entry, false, Encoding::Cl100kBase); + let tokens_with_line_numbers = + count_file_tokens(dir.path(), &entry, true, Encoding::Cl100kBase); // With line numbers should have more tokens due to line number prefixes assert!(tokens_with_line_numbers > tokens_without_line_numbers); @@ -225,7 +285,7 @@ mod tests { std::fs::remove_file(&test_file).unwrap(); if let Some(entry) = found_entry { - let tokens = count_file_tokens(dir.path(), &entry, false); + let tokens = count_file_tokens(dir.path(), &entry, false, Encoding::Cl100kBase); // Should still return some tokens for the file header even if content can't be read assert!(tokens > 0); } @@ -234,7 +294,7 @@ mod tests { #[test] fn test_count_tree_tokens_empty_tree() { let tree = BTreeMap::new(); - let tokens = count_tree_tokens(&tree, 0); + let tokens = count_tree_tokens(&tree, 0, Encoding::Cl100kBase); assert_eq!(tokens, 0); } @@ -263,11 +323,11 @@ mod tests { crate::tree::FileNode::Directory(level1), ); - let tokens = count_tree_tokens(&tree, 0); + let tokens = count_tree_tokens(&tree, 0, Encoding::Cl100kBase); assert!(tokens > 0); // Should account for indentation at different levels - let tokens_with_depth = count_tree_tokens(&tree, 2); + let tokens_with_depth = count_tree_tokens(&tree, 2, Encoding::Cl100kBase); assert!(tokens_with_depth > tokens); // More indentation = more tokens } @@ -290,7 +350,7 @@ mod tests { crate::tree::FileNode::Directory(subdir), ); - let tokens = count_tree_tokens(&tree, 0); + let tokens = count_tree_tokens(&tree, 0, Encoding::Cl100kBase); assert!(tokens > 0); // Verify it handles unicode filenames without crashing diff --git a/tests/cli_integration.rs b/tests/cli_integration.rs index 465d88e..afcbf30 100644 --- a/tests/cli_integration.rs +++ b/tests/cli_integration.rs @@ -65,6 +65,7 @@ fn preview_mode_does_not_create_output_file() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -110,6 +111,7 @@ fn preview_mode_skips_overwrite_confirmation() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -160,6 +162,7 @@ fn token_count_mode_skips_overwrite_confirmation() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -207,6 +210,7 @@ fn both_preview_and_token_count_modes_work_together() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -267,6 +271,7 @@ fn end_to_end_generates_output_with_filters_ignores_and_line_numbers() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -359,6 +364,7 @@ fn overwrite_prompt_is_respected() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -403,6 +409,7 @@ fn confirm_processing_receives_large_count() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -444,6 +451,7 @@ fn token_count_mode_does_not_create_output_file() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_auto_diff.rs b/tests/test_auto_diff.rs index 08a4d74..8b8a140 100644 --- a/tests/test_auto_diff.rs +++ b/tests/test_auto_diff.rs @@ -83,6 +83,7 @@ fn test_auto_diff_workflow_basic() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -242,6 +243,7 @@ fn test_auto_diff_added_and_removed_files() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -406,6 +408,7 @@ diff_only = true yes: true, diff_only: false, // Config file should override this clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -554,6 +557,7 @@ fn test_cache_invalidation_on_config_change() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -709,6 +713,7 @@ fn test_concurrent_cache_access() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -764,6 +769,7 @@ fn test_corrupted_cache_recovery() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -935,6 +941,7 @@ diff_only = true yes: true, diff_only: false, // Will be overridden by config clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_binary_file_autodiff.rs b/tests/test_binary_file_autodiff.rs index 1f4cbf9..7576140 100644 --- a/tests/test_binary_file_autodiff.rs +++ b/tests/test_binary_file_autodiff.rs @@ -109,6 +109,7 @@ fn test_binary_files_dont_crash_autodiff() { yes: true, // Auto-confirm to avoid prompts diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -192,6 +193,7 @@ fn test_mixed_text_and_binary_files_autodiff() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -259,6 +261,7 @@ fn test_large_binary_file_autodiff() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_comprehensive_edge_cases.rs b/tests/test_comprehensive_edge_cases.rs index 9d32ec7..1aed9b4 100644 --- a/tests/test_comprehensive_edge_cases.rs +++ b/tests/test_comprehensive_edge_cases.rs @@ -117,6 +117,7 @@ fn test_comprehensive_binary_file_edge_cases() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -206,6 +207,7 @@ fn test_configuration_precedence_edge_cases() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -246,6 +248,7 @@ fn test_configuration_precedence_edge_cases() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -306,6 +309,7 @@ timestamped_output = true yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -412,6 +416,7 @@ fn test_error_conditions_and_exit_codes() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -442,6 +447,7 @@ fn test_error_conditions_and_exit_codes() { yes: false, // Don't auto-confirm diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -469,6 +475,7 @@ fn test_error_conditions_and_exit_codes() { yes: false, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -518,6 +525,7 @@ fn test_memory_usage_under_parallel_processing() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -609,6 +617,7 @@ line_numbers = true yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -711,6 +720,7 @@ fn test_edge_case_filenames_and_paths() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_config_resolution.rs b/tests/test_config_resolution.rs index e904169..8cdad2f 100644 --- a/tests/test_config_resolution.rs +++ b/tests/test_config_resolution.rs @@ -68,6 +68,7 @@ fn run_with_resolved_config( structure: resolution.config.structure, truncate: resolution.config.truncate, visibility: resolution.config.visibility, + encoding: resolution.config.encoding, }; // Create final Config with resolved values @@ -122,6 +123,7 @@ output = "from_config.md" yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -204,6 +206,7 @@ ignore = ["target"] yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -290,6 +293,7 @@ timestamped_output = true yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -371,6 +375,7 @@ yes = true yes: false, // Default - should use config diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -447,6 +452,7 @@ timestamped_output = false yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_cwd_independence.rs b/tests/test_cwd_independence.rs index 7088660..2651ee6 100644 --- a/tests/test_cwd_independence.rs +++ b/tests/test_cwd_independence.rs @@ -94,6 +94,7 @@ filter = ["txt"] yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -186,6 +187,7 @@ timestamped_output = true yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_determinism.rs b/tests/test_determinism.rs index f430494..66bf8a7 100644 --- a/tests/test_determinism.rs +++ b/tests/test_determinism.rs @@ -90,6 +90,7 @@ fn test_deterministic_output_multiple_runs() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -117,6 +118,7 @@ fn test_deterministic_output_multiple_runs() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -268,6 +270,7 @@ fn test_deterministic_file_tree_order() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -336,6 +339,7 @@ fn test_cache_collision_prevention() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -363,6 +367,7 @@ fn test_cache_collision_prevention() { diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, @@ -433,6 +438,7 @@ fn test_custom_ignores_performance() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -492,6 +498,7 @@ fn test_configuration_affects_cache_key() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -511,6 +518,7 @@ fn test_configuration_affects_cache_key() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -580,6 +588,7 @@ auto_diff = true yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_parallel_memory.rs b/tests/test_parallel_memory.rs index c779479..c9194d7 100644 --- a/tests/test_parallel_memory.rs +++ b/tests/test_parallel_memory.rs @@ -64,6 +64,7 @@ fn test_streaming_parallel_processing() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -182,6 +183,7 @@ fn test_parallel_error_handling() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -244,6 +246,7 @@ fn test_memory_efficiency_with_large_files() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, diff --git a/tests/test_phase4_integration.rs b/tests/test_phase4_integration.rs index 20bd553..49f84cd 100644 --- a/tests/test_phase4_integration.rs +++ b/tests/test_phase4_integration.rs @@ -120,6 +120,7 @@ filter = ["rs", "txt"] yes: true, diff_only: false, // Will be overridden by config clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, @@ -298,6 +299,7 @@ fn test_encoding_strategy_configuration() { yes: true, diff_only: false, clear_cache: false, + encoding: "o200k_base".to_string(), init: false, max_tokens: None, signatures: false, From 9054ebbbf06491eb189dae6a5f267bd55679ed98 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:18:57 -0300 Subject: [PATCH 03/10] feat(budget): unify --max-tokens on the real tokenizer; fix first-file bypass & hash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.9.0 "Trustworthy Output" — F4 + B1 + B2. - --max-tokens now counts the real tokenizer (per --encoding) on each file's rendered output, replacing the buf.len()/4 (parallel) and metadata().len()/4 (serial) byte heuristics. Both paths estimate identically, restoring byte-for-byte determinism between parallel and non-parallel builds. - Fix B1: the first file no longer bypasses the budget (dropped the `tokens_used > 0` guard); a single oversized file is now omitted with a notice. - Debit the header + file tree from the budget so --max-tokens covers the whole document. Tokenization is skipped entirely when no budget is set (no overhead on normal runs — caught a 50x test-suite slowdown from unconditional counting). - Fix B2: the content hash now folds in every output-affecting option (line_numbers, max_tokens, encoding, tree-sitter flags, encoding_strategy), so toggling e.g. --line-numbers changes the hash. Still hashes raw file content (not rendered output, which embeds volatile mtimes) to stay checkout-stable. Adds regression tests for first-file truncation and hash sensitivity/determinism. Verified: 323 lib (all-features) + 209 lib (serial) + full integration pass; clippy --all-targets --all-features -D warnings clean; fmt clean. --- CHANGELOG.md | 7 ++ src/lib.rs | 1 + src/markdown.rs | 249 +++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 211 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86fdf36..16cc637 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,13 @@ All notable changes to this project will be documented in this file. - **CLI validation — invalid values are now rejected (F7)** - `--truncate`, `--visibility`, and `--encoding` are constrained to their allowed sets via clap `value_parser`, so invalid values produce a clear `error: ... [possible values: ...]` instead of being silently coerced (fixes B3 `--visibility` and B4 `--truncate` silent-acceptance) and `--help` now lists the valid values +- **Trustworthy token budget — `--max-tokens` uses the real tokenizer (F4, B1, B2)** + - `--max-tokens` now counts the real tokenizer (per `--encoding`) on each file's rendered output, replacing the crude `buf.len()/4` (parallel) and `metadata().len()/4` (serial) byte heuristics. Both code paths now estimate identically, restoring byte-for-byte determinism between parallel and non-parallel builds + - Fixed the budget bypass where the first file was always emitted in full regardless of `--max-tokens` (the `tokens_used > 0` guard). The budget now applies to every file, including the first (B1) + - The document header + file tree are debited from the budget, so `--max-tokens` accounts for the whole document, not just file bodies + - Tokenization is skipped entirely when no budget is set — no overhead on normal runs + - The deterministic content hash now folds in every output-affecting option (`line_numbers`, `max_tokens`, `encoding`, tree-sitter flags, `encoding_strategy`). Previously, toggling e.g. `--line-numbers` produced a different document under the **same** hash, silently breaking LLM prompt caching. The hash still fingerprints raw file content (not the rendered output, which embeds volatile mtimes), so it stays stable across checkouts (B2) + - **Tree-Sitter correctness — honest `--signatures` / `--visibility`** - Fixed Java `--visibility` filter being completely non-functional — `get_visibility` returned `Visibility::All` unconditionally, so `--visibility public` dropped *every* Java symbol and `--visibility private` leaked *all* of them. It now inspects the declaration's `modifiers` node (B12) - Fixed C/C++ functions returning a pointer or reference being silently dropped — `find_function_name` now descends through `pointer_declarator` / `reference_declarator` / `parenthesized_declarator` to locate the `function_declarator` (B13) diff --git a/src/lib.rs b/src/lib.rs index b9dc07b..5b8261b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -581,6 +581,7 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io final_args.line_numbers, config.encoding_strategy.as_deref(), final_args.max_tokens, + final_args.encoding.parse::().unwrap_or_default(), &ts_config, )?; diff --git a/src/markdown.rs b/src/markdown.rs index f3d655f..6e9c8c5 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -5,6 +5,7 @@ use std::fs; use std::io::{self, Read, Seek, SeekFrom, Write}; use std::path::Path; +use crate::token_count::{Encoding as TokenEncoding, estimate_tokens}; use crate::tree::{FileTree, write_tree_to_file}; use encoding_rs::{Encoding, UTF_8}; @@ -39,6 +40,7 @@ pub fn generate_markdown( line_numbers: bool, encoding_strategy: Option<&str>, max_tokens: Option, + token_encoding: TokenEncoding, ts_config: &TreeSitterConfig, ) -> io::Result<()> { if let Some(parent) = Path::new(output_path).parent() @@ -60,32 +62,55 @@ pub fn generate_markdown( input_dir.to_string() }; - // --- Header --- // - writeln!(output, "# Directory Structure Report\n")?; + // --- Header + file tree (buffered) --- // + // Build the header and tree into a buffer first so we can (a) write them in + // one shot and (b) debit their token cost from the `--max-tokens` budget. + let mut head_buf: Vec = Vec::new(); + writeln!(head_buf, "# Directory Structure Report\n")?; if !filters.is_empty() { writeln!( - output, + head_buf, "This document contains files from the `{}` directory with extensions: {}", input_dir_name, filters.join(", ") )?; } else { writeln!( - output, + head_buf, "This document contains all files from the `{}` directory, optimized for LLM consumption.", input_dir_name )?; } if !ignores.is_empty() { - writeln!(output, "Custom ignored patterns: {}", ignores.join(", "))?; + writeln!(head_buf, "Custom ignored patterns: {}", ignores.join(", "))?; } - // Deterministic content hash (enables LLM prompt caching across runs) - // Uses xxh3 over file content bytes — stable across Rust versions and machines. - // Previous implementation hashed mtime (broken by git checkout, cp, etc.) + // Deterministic content hash (enables LLM prompt caching across runs). + // Hashes raw file content (NOT mtime — see v0.7.0; the rendered output embeds + // each file's mtime, so hashing emitted bytes would be volatile) PLUS every + // option that changes the rendered output. The hash is therefore a complete + // fingerprint: two runs share a hash iff they produce identical output. + // Folding in line_numbers / max_tokens / encoding / tree-sitter flags fixes + // the bug where toggling those yielded a different document under the same + // hash, and keeps the hash honest when `--max-tokens` truncates the file set. let mut content_hasher = xxhash_rust::xxh3::Xxh3::new(); + content_hasher.update(b"line_numbers\0"); + content_hasher.update(&[line_numbers as u8]); + // u64::MAX sentinel distinguishes "no budget" from any real value. + content_hasher.update(b"max_tokens\0"); + content_hasher.update(&max_tokens.map_or(u64::MAX, |v| v as u64).to_le_bytes()); + content_hasher.update(b"encoding\0"); + content_hasher.update(format!("{token_encoding:?}").as_bytes()); + content_hasher.update(b"\0ts\0"); + content_hasher.update(&[ts_config.signatures as u8, ts_config.structure as u8]); + content_hasher.update(ts_config.truncate.as_bytes()); + content_hasher.update(b"\0"); + content_hasher.update(ts_config.visibility.as_bytes()); + content_hasher.update(b"\0encoding_strategy\0"); + content_hasher.update(encoding_strategy.unwrap_or("").as_bytes()); + content_hasher.update(b"\0files\0"); for entry in files { // Hash relative unix-style path for cross-OS determinism. // Using absolute or OS-native paths would produce different hashes @@ -101,16 +126,22 @@ pub fn generate_markdown( } content_hasher.update(b"\0"); } - writeln!(output, "Content hash: {:016x}", content_hasher.digest())?; - writeln!(output)?; - - // --- File Tree --- // - - writeln!(output, "## File Tree Structure\n")?; - - write_tree_to_file(&mut output, file_tree, 0)?; + writeln!(head_buf, "Content hash: {:016x}", content_hasher.digest())?; + writeln!(head_buf)?; + + writeln!(head_buf, "## File Tree Structure\n")?; + write_tree_to_file(&mut head_buf, file_tree, 0)?; + writeln!(head_buf)?; + + // Debit the header + tree token cost so `--max-tokens` budgets the whole + // document, not just file bodies. Only computed when a budget is set + // (the tokenizer call is not free). + let header_tokens = match max_tokens { + Some(_) => estimate_tokens(token_encoding, &String::from_utf8_lossy(&head_buf)), + None => 0, + }; - writeln!(output)?; + output.write_all(&head_buf)?; // (No '## Files' heading here; it will be injected later only once during final composition) // (Diff section will be conditionally inserted later by the auto_diff logic in lib.rs) @@ -133,7 +164,7 @@ pub fn generate_markdown( let mut completed_chunks = std::collections::BTreeMap::new(); let mut next_index = 0; let mut errors = Vec::new(); - let mut tokens_used: usize = 0; + let mut tokens_used: usize = header_tokens; let mut budget_exceeded = false; // Receive chunks and write them in order @@ -152,12 +183,24 @@ pub fn generate_markdown( match chunk_result { Ok(buf) => { - // Estimate tokens for this chunk (~4 bytes per token) - let chunk_tokens = buf.len() / 4; - + // Count tokens of the rendered chunk with the real + // tokenizer (same as the serial path, so both builds + // truncate identically and the hash stays stable). + // Only pay for tokenization when a budget is set. + let chunk_tokens = if budget.is_some() { + estimate_tokens( + token_encoding, + &String::from_utf8_lossy(&buf), + ) + } else { + 0 + }; + + // Budget applies to every file, including the first + // (no `tokens_used > 0` bypass): a single oversized + // file no longer slips through in full. if let Some(max) = budget && tokens_used + chunk_tokens > max - && tokens_used > 0 { let remaining = total_files - next_index; let notice = format!( @@ -245,37 +288,43 @@ pub fn generate_markdown( #[cfg(not(feature = "parallel"))] { - let mut tokens_used: usize = 0; + let mut tokens_used: usize = header_tokens; for (idx, entry) in files.iter().enumerate() { - // Estimate tokens for this file (~4 bytes per token) - let file_size = std::fs::metadata(entry.path()) - .map(|m| m.len()) - .unwrap_or(0); - let estimated_file_tokens = (file_size as usize) / 4; - - if let Some(budget) = max_tokens { - if tokens_used + estimated_file_tokens > budget && tokens_used > 0 { - let remaining = files.len() - idx; - writeln!(output, "---\n")?; - writeln!( - output, - "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n", - budget, remaining - )?; - break; - } - } - - tokens_used += estimated_file_tokens; + // Render the file into a buffer first, then count the rendered chunk + // with the real tokenizer — identical to the parallel path. + let mut buf: Vec = Vec::new(); process_file( base_path, entry.path(), - &mut output, + &mut buf, line_numbers, encoding_strategy, ts_config, )?; + // Only pay for tokenization when a budget is set. + let chunk_tokens = if max_tokens.is_some() { + estimate_tokens(token_encoding, &String::from_utf8_lossy(&buf)) + } else { + 0 + }; + + // Budget applies to every file, including the first (no bypass). + if let Some(budget) = max_tokens + && tokens_used + chunk_tokens > budget + { + let remaining = files.len() - idx; + writeln!(output, "---\n")?; + writeln!( + output, + "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n", + budget, remaining + )?; + break; + } + + tokens_used += chunk_tokens; + output.write_all(&buf)?; } } @@ -1040,6 +1089,7 @@ mod tests { false, None, None, // max_tokens + TokenEncoding::default(), &TreeSitterConfig::default(), ); @@ -1074,6 +1124,7 @@ mod tests { false, None, None, // max_tokens + TokenEncoding::default(), &TreeSitterConfig::default(), ); @@ -1106,6 +1157,7 @@ mod tests { true, Some("strict"), None, // max_tokens + TokenEncoding::default(), &TreeSitterConfig::default(), ); @@ -1474,8 +1526,18 @@ mod tests { let base_path = dir.path(); let output_path = base_path.join("output.md"); - fs::write(base_path.join("file1.txt"), "x".repeat(50000)).unwrap(); - fs::write(base_path.join("file2.txt"), "y".repeat(50000)).unwrap(); + // Whitespace-separated words so the tokenizer pre-splits cheaply (a long + // run of a single char with no whitespace is a pathological BPE case). + fs::write( + base_path.join("file1.txt"), + "alpha beta gamma delta ".repeat(1000), + ) + .unwrap(); + fs::write( + base_path.join("file2.txt"), + "epsilon zeta eta theta ".repeat(1000), + ) + .unwrap(); let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap(); let file_tree = crate::tree::build_file_tree(&files, base_path); @@ -1491,6 +1553,7 @@ mod tests { false, None, Some(100), + TokenEncoding::default(), &TreeSitterConfig::default(), ); @@ -1499,6 +1562,99 @@ mod tests { assert!(content.contains("Token budget") || content.len() < 1000); } + #[test] + fn test_max_tokens_truncates_single_oversized_first_file() { + // Regression (B1): the first file used to bypass the budget and was always + // emitted in full. A single oversized file is now omitted with a notice. + let dir = tempdir().unwrap(); + let base_path = dir.path(); + let output_path = base_path.join("output.md"); + + // ~17 KB of whitespace-separated text (well over a 100-token budget) with a + // unique marker we can assert is absent. Avoid a single-char run (slow BPE). + let body = "UNIQUE_BODY_MARKER alpha beta gamma ".repeat(500); + fs::write(base_path.join("huge.txt"), &body).unwrap(); + + let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap(); + let file_tree = crate::tree::build_file_tree(&files, base_path); + + let result = generate_markdown( + &output_path.to_string_lossy(), + "project", + &[], + &[], + &file_tree, + &files, + base_path, + false, + None, + Some(100), // tiny budget — smaller than the single file + TokenEncoding::default(), + &TreeSitterConfig::default(), + ); + + assert!(result.is_ok()); + let content = fs::read_to_string(&output_path).unwrap(); + assert!( + content.contains("Token budget"), + "expected the budget notice" + ); + assert!( + !content.contains("UNIQUE_BODY_MARKER"), + "oversized first file body leaked despite the budget" + ); + } + + #[test] + fn test_content_hash_reflects_output_options() { + // Regression (B2): the hash must change when an output-affecting option + // changes (else prompt caching reuses a stale document), and must be + // identical for identical inputs (deterministic / cacheable). + let dir = tempdir().unwrap(); + let base_path = dir.path(); + fs::write(base_path.join("a.txt"), "hello world").unwrap(); + let files = crate::file_utils::collect_files(base_path, &[], &[], &[]).unwrap(); + let file_tree = crate::tree::build_file_tree(&files, base_path); + + let render = |line_numbers: bool, out: &std::path::Path| { + generate_markdown( + &out.to_string_lossy(), + "p", + &[], + &[], + &file_tree, + &files, + base_path, + line_numbers, + None, + None, + TokenEncoding::default(), + &TreeSitterConfig::default(), + ) + .unwrap(); + }; + let hash_of = |p: &std::path::Path| { + fs::read_to_string(p) + .unwrap() + .lines() + .find(|l| l.starts_with("Content hash:")) + .unwrap() + .to_string() + }; + + let o1 = base_path.join("o1.md"); + let o2 = base_path.join("o2.md"); + let o3 = base_path.join("o3.md"); + render(false, &o1); + render(true, &o2); + render(false, &o3); + + // Toggling line_numbers changes the rendered output → hash must change. + assert_ne!(hash_of(&o1), hash_of(&o2)); + // Same options → identical hash. + assert_eq!(hash_of(&o1), hash_of(&o3)); + } + #[test] fn test_process_file_empty_file() { let dir = tempdir().unwrap(); @@ -1572,6 +1728,7 @@ mod tests { false, None, None, + TokenEncoding::default(), &TreeSitterConfig::default(), ); From 5ad5005b01690f6c80d30304585ec59ee8ebb6b1 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:40:33 -0300 Subject: [PATCH 04/10] fix(cache): unify config fingerprint; reflect resolved flags; warn on stray --diff-only MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.9.0 "Trustworthy Output" — B6 + B7 + B8. - B6: collapse the two byte-identical, "must stay in sync" config-hash functions (cache.rs::hash_config and state.rs::compute_config_hash) into one shared config::config_fingerprint. No more drift hazard. - B7: the auto-diff cache hash now reflects the *resolved* CLI values for --signatures/--structure/--truncate/--visibility (effective_config previously only carried filter/ignore/line_numbers, so the hash saw the config-file values — usually None — and toggling these flags reused a stale baseline). Also fold in encoding_strategy (it transcodes non-UTF-8 content). Pure output-rendering options (--diff-only, --encoding tokenizer) are deliberately EXCLUDED so toggling them against an existing baseline does not discard it. - B8: --diff-only now warns when used without auto_diff instead of silently emitting full file contents. Note: the roadmap suggested also hashing diff_only/timestamped_output/ output_folder, but test_diff_only_mode_includes_added_files proves --diff-only must stay toggleable against a baseline; new config_fingerprint_sensitivity test documents the state-vs-output-only distinction. Verified: 324 lib (all-features) + 209 lib (serial) + full integration pass; clippy --all-targets --all-features -D warnings clean; fmt clean. --- CHANGELOG.md | 5 ++++ src/cache.rs | 29 +++---------------- src/config.rs | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++ src/lib.rs | 20 +++++++++++++ src/state.rs | 29 +++---------------- 5 files changed, 113 insertions(+), 50 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 16cc637..df05ea6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -20,6 +20,11 @@ All notable changes to this project will be documented in this file. - Tokenization is skipped entirely when no budget is set — no overhead on normal runs - The deterministic content hash now folds in every output-affecting option (`line_numbers`, `max_tokens`, `encoding`, tree-sitter flags, `encoding_strategy`). Previously, toggling e.g. `--line-numbers` produced a different document under the **same** hash, silently breaking LLM prompt caching. The hash still fingerprints raw file content (not the rendered output, which embeds volatile mtimes), so it stays stable across checkouts (B2) +- **Reliable auto-diff cache invalidation (B6, B7, B8)** + - Unified the two duplicated config-hash functions (`cache.rs::hash_config` and `state.rs::compute_config_hash`) into one shared `config::config_fingerprint`, removing the "must stay in sync" drift hazard (B6) + - The auto-diff cache hash now reflects the **resolved CLI values** for `--signatures` / `--structure` / `--truncate` / `--visibility` — previously these arrived from CLI args while the hash only saw the (often empty) config-file values, so toggling them reused a stale diff baseline. `encoding_strategy` is now included too (it transcodes non-UTF-8 content). Pure output-rendering options (`--diff-only`, `--encoding` tokenizer) are deliberately **excluded** so toggling them against an existing baseline doesn't discard it (B7) + - `--diff-only` now warns when used without `auto_diff` instead of silently emitting full file contents (B8) + - **Tree-Sitter correctness — honest `--signatures` / `--visibility`** - Fixed Java `--visibility` filter being completely non-functional — `get_visibility` returned `Visibility::All` unconditionally, so `--visibility public` dropped *every* Java symbol and `--visibility private` leaked *all* of them. It now inspects the declaration's `modifiers` node (B12) - Fixed C/C++ functions returning a pointer or reference being silently dropped — `find_function_name` now descends through `pointer_declarator` / `reference_declarator` / `parenthesized_declarator` to locate the `function_declarator` (B13) diff --git a/src/cache.rs b/src/cache.rs index db2c59c..ee90b49 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -90,32 +90,11 @@ impl CacheManager { } } - /// Generate a hash from the configuration + /// Generate a hash from the configuration. + /// Delegates to the single shared `config_fingerprint` so the cache key and + /// `state.rs`'s config hash can never drift apart. fn hash_config(config: &Config) -> String { - // Build a stable string representation of config for hashing. - // IMPORTANT: Must stay in sync with state.rs::compute_config_hash - let mut config_str = String::new(); - if let Some(ref filters) = config.filter { - config_str.push_str(&filters.join(",")); - } - config_str.push('|'); - if let Some(ref ignores) = config.ignore { - config_str.push_str(&ignores.join(",")); - } - config_str.push('|'); - config_str.push_str(&format!( - "{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}", - config.line_numbers, - config.auto_diff, - config.diff_context_lines, - config.signatures, - config.structure, - config.truncate, - config.visibility, - config.max_tokens, - )); - let hash = xxhash_rust::xxh3::xxh3_64(config_str.as_bytes()); - format!("{:x}", hash) + crate::config::config_fingerprint(config) } /// Get the cache file path for this specific project and configuration diff --git a/src/config.rs b/src/config.rs index 6f932c8..c6cc089 100644 --- a/src/config.rs +++ b/src/config.rs @@ -91,6 +91,44 @@ pub struct Config { pub encoding: Option, } +/// Stable fingerprint of all configuration that affects auto-diff output and the +/// cache baseline. Shared by the cache key (`cache.rs`) and the project-state +/// config hash (`state.rs`) so the two can never drift out of sync. Every +/// output-affecting option must be included here, otherwise toggling it reuses a +/// stale cached diff baseline. +pub(crate) fn config_fingerprint(config: &Config) -> String { + let mut s = String::new(); + if let Some(ref filters) = config.filter { + s.push_str(&filters.join(",")); + } + s.push('|'); + if let Some(ref ignores) = config.ignore { + s.push_str(&ignores.join(",")); + } + s.push('|'); + // Only include options that change WHICH files are captured or HOW their + // content is read into the diff baseline. `encoding_strategy` is new here + // (it transcodes non-UTF-8 content, so it genuinely affects the captured + // state). Pure output-rendering options (diff_only, encoding/tokenizer, + // timestamped_output, output_folder) are deliberately EXCLUDED: users toggle + // them — especially `--diff-only` — against an existing baseline, and they do + // not change the diffed content, so they must not invalidate the cache. + s.push_str(&format!( + "{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}", + config.line_numbers, + config.auto_diff, + config.diff_context_lines, + config.signatures, + config.structure, + config.truncate, + config.visibility, + config.max_tokens, + config.encoding_strategy, + )); + let hash = xxhash_rust::xxh3::xxh3_64(s.as_bytes()); + format!("{:x}", hash) +} + /// Load configuration from `context-builder.toml` in the current working directory. /// Returns `None` if the file does not exist or cannot be parsed. pub fn load_config() -> Option { @@ -277,6 +315,48 @@ invalid_toml [ assert!(config.encoding.is_none()); } + #[test] + fn config_fingerprint_sensitivity() { + // The cache/diff fingerprint must change for options that affect which + // files are captured or how their content is read, but must stay stable + // for pure output-rendering options that users toggle against an existing + // baseline (notably --diff-only). + let base = Config::default(); + let base_h = config_fingerprint(&base); + + let mut c = base.clone(); + c.signatures = Some(true); + assert_ne!( + config_fingerprint(&c), + base_h, + "signatures should affect the fingerprint" + ); + + let mut c = base.clone(); + c.encoding_strategy = Some("strict".to_string()); + assert_ne!( + config_fingerprint(&c), + base_h, + "encoding_strategy should affect the fingerprint" + ); + + let mut c = base.clone(); + c.diff_only = Some(true); + assert_eq!( + config_fingerprint(&c), + base_h, + "diff_only must NOT invalidate the diff baseline" + ); + + let mut c = base.clone(); + c.encoding = Some("cl100k_base".to_string()); + assert_eq!( + config_fingerprint(&c), + base_h, + "tokenizer encoding must NOT invalidate the diff baseline" + ); + } + #[test] #[serial] fn load_config_invalid_toml_in_cwd() { diff --git a/src/lib.rs b/src/lib.rs index 5b8261b..054b07f 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -340,6 +340,15 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io // by config_resolver.rs with proper CLI-takes-precedence semantics. // Do NOT re-apply them here as that would silently overwrite CLI flags. + // B8: --diff-only only takes effect together with auto_diff (+ timestamped + // output). Warn instead of silently emitting full file contents. + if final_args.diff_only && !config.auto_diff.unwrap_or(false) && !silent { + eprintln!( + "⚠️ --diff-only has no effect without auto_diff (it also needs timestamped_output). \ + Full file contents will be emitted. Enable auto_diff = true + timestamped_output = true to use diff-only mode." + ); + } + if config.auto_diff.unwrap_or(false) { // Build an effective config that mirrors the *actual* operational settings coming // from resolved CLI args (filters/ignores/line_numbers). This ensures the @@ -354,6 +363,17 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io effective_config.ignore = Some(final_args.ignore.clone()); } effective_config.line_numbers = Some(final_args.line_numbers); + // Propagate the *resolved* CLI values so the cache/diff config hash sees + // them too (B7) — these arrive from CLI args, not the config file, so + // without this, toggling e.g. --signatures/--visibility would reuse a + // stale diff baseline. Only fields that are part of the fingerprint are + // set here; pure output-rendering flags (diff_only, encoding) are + // intentionally NOT propagated so they don't invalidate the baseline. + effective_config.signatures = Some(final_args.signatures); + effective_config.structure = Some(final_args.structure); + effective_config.truncate = Some(final_args.truncate.clone()); + effective_config.visibility = Some(final_args.visibility.clone()); + effective_config.max_tokens = final_args.max_tokens; // 1. Create current project state let current_state = ProjectState::from_files( diff --git a/src/state.rs b/src/state.rs index 8d4d514..7b45978 100644 --- a/src/state.rs +++ b/src/state.rs @@ -221,32 +221,11 @@ impl ProjectState { false } - /// Generate a configuration hash for cache validation + /// Generate a configuration hash for cache validation. + /// Delegates to the single shared `config_fingerprint` (see `config.rs`) so + /// this hash and the cache key in `cache.rs` always agree. fn compute_config_hash(config: &Config) -> String { - // Build a stable string representation for hashing - let mut config_str = String::new(); - if let Some(ref filters) = config.filter { - config_str.push_str(&filters.join(",")); - } - config_str.push('|'); - if let Some(ref ignores) = config.ignore { - config_str.push_str(&ignores.join(",")); - } - config_str.push('|'); - config_str.push_str(&format!( - "{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}", - config.line_numbers, - config.auto_diff, - config.diff_context_lines, - config.signatures, - config.structure, - config.truncate, - config.visibility, - config.max_tokens, - )); - - let hash = xxhash_rust::xxh3::xxh3_64(config_str.as_bytes()); - format!("{:x}", hash) + crate::config::config_fingerprint(config) } } From e139eccdf5bb68c6ee30d457b1266690809350f7 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:47:47 -0300 Subject: [PATCH 05/10] feat(output): stream to stdout with `-o -` (pipe mode) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.9.0 "Trustworthy Output" — F2. - `-o -` writes the generated document to stdout instead of a file, enabling `context-builder -f rs -o - | llm`. generate_markdown writes through a Box that is either the file or io::stdout(). - The auto-diff path also honors stdout (its in-memory composed document is written to stdout), so diffs pipe too. - Human-facing chatter ("Documentation created", timings, context-size warning) is suppressed in stdout mode so it never corrupts the pipe; genuine warnings stay on stderr. - `-` is never folded into an output folder or timestamped name (resolve_output_path returns early), and no file literally named `-` is created. Uses the `-o -` convention (no new Args field, so no churn across the ~90 test literals). Adds a config-resolver regression test; manually verified the pipe is clean, chatter is absent from stdout, and normal file output is unaffected. Verified: 325 lib (all-features) + 211 (serial) + full integration pass; clippy --all-targets --all-features -D warnings clean; fmt clean. --- CHANGELOG.md | 3 +++ README.md | 5 ++++- src/config_resolver.rs | 38 ++++++++++++++++++++++++++++++++++++++ src/lib.rs | 37 ++++++++++++++++++++++--------------- src/markdown.rs | 19 ++++++++++++------- 5 files changed, 79 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df05ea6..cb7c419 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,9 @@ All notable changes to this project will be documented in this file. - **CLI validation — invalid values are now rejected (F7)** - `--truncate`, `--visibility`, and `--encoding` are constrained to their allowed sets via clap `value_parser`, so invalid values produce a clear `error: ... [possible values: ...]` instead of being silently coerced (fixes B3 `--visibility` and B4 `--truncate` silent-acceptance) and `--help` now lists the valid values +- **Pipe-friendly output — stream to stdout (F2)** + - `-o -` streams the generated document to **stdout** instead of a file, enabling `context-builder -f rs -o - | llm`. All progress/status chatter is suppressed (or already on stderr) in this mode so the pipe stays clean, and `-` is never folded into an output folder or timestamped name. Works for the auto-diff path too (the composed diff document is written to stdout) + - **Trustworthy token budget — `--max-tokens` uses the real tokenizer (F4, B1, B2)** - `--max-tokens` now counts the real tokenizer (per `--encoding`) on each file's rendered output, replacing the crude `buf.len()/4` (parallel) and `metadata().len()/4` (serial) byte heuristics. Both code paths now estimate identically, restoring byte-for-byte determinism between parallel and non-parallel builds - Fixed the budget bypass where the first file was always emitted in full regardless of `--max-tokens` (the `tokens_used > 0` guard). The budget now applies to every file, including the first (B1) diff --git a/README.md b/README.md index c3483c0..aa728bc 100644 --- a/README.md +++ b/README.md @@ -160,6 +160,9 @@ context-builder --token-count # Add line numbers to all code blocks context-builder --line-numbers +# Stream the document to stdout and pipe it straight into an LLM tool +context-builder -f rs -o - | llm + # Skip all confirmation prompts (auto-answer yes) context-builder --yes @@ -259,7 +262,7 @@ If you also set `diff_only = true` (or pass `--diff-only`), the full “## Files ### Command Line Options - `-d, --input ` - Directory path to process (default: current directory). -- `-o, --output ` - Output file path (default: `output.md`). +- `-o, --output ` - Output file path (default: `output.md`). Use `-` to stream the document to **stdout** (e.g. `context-builder -o - | llm`); progress messages then go to stderr so the pipe stays clean. - `-f, --filter ` - File extensions to include (can be used multiple times). - `-i, --ignore ` - Folder or file names to ignore (can be used multiple times). - `--max-tokens ` - Maximum token budget for the output. Files are truncated/skipped when exceeded. diff --git a/src/config_resolver.rs b/src/config_resolver.rs index 5af1bcd..aa6b508 100644 --- a/src/config_resolver.rs +++ b/src/config_resolver.rs @@ -184,6 +184,11 @@ fn apply_config_to_args(args: &mut Args, config: &Config, warnings: &mut Vec) { + // `-` means stdout: never fold in an output folder or a timestamp. + if args.output == "-" { + return; + } + let mut output_folder_path: Option = None; // Apply output folder first @@ -425,6 +430,39 @@ mod tests { assert!(resolution.config.output.ends_with(".md")); } + #[test] + fn stdout_output_bypasses_folder_and_timestamp() { + let args = Args { + input: "src".to_string(), + output: "-".to_string(), // stdout + filter: vec![], + ignore: vec![], + line_numbers: false, + preview: false, + token_count: false, + yes: false, + diff_only: false, + clear_cache: false, + encoding: "o200k_base".to_string(), + init: false, + max_tokens: None, + signatures: false, + structure: false, + truncate: "smart".to_string(), + visibility: "all".to_string(), + }; + + let config = Config { + output_folder: Some("docs".to_string()), + timestamped_output: Some(true), + ..Default::default() + }; + + let resolution = resolve_final_config(args, Some(config)); + // `-` must survive untouched — not folded into docs/ nor timestamped. + assert_eq!(resolution.config.output, "-"); + } + #[test] fn test_auto_diff_without_timestamping_warning() { let args = Args { diff --git a/src/lib.rs b/src/lib.rs index 054b07f..1339c9b 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -72,6 +72,9 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io // Use the finalized args passed in from run() let final_args = args; + // `-o -` streams the document to stdout (pipe mode). In this mode all + // human-facing chatter must go to stderr so it doesn't corrupt the pipe. + let to_stdout = final_args.output == "-"; // Resolve base path. If input is '.' but current working directory lost the project context // (no context-builder.toml), attempt to infer project root from output path (parent of 'output' dir). let mut resolved_base = PathBuf::from(&final_args.input); @@ -520,20 +523,24 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io } } - // 5. Write output - let output_path = Path::new(&final_args.output); - if let Some(parent) = output_path.parent() - && !parent.exists() - && let Err(e) = fs::create_dir_all(parent) - { - return Err(io::Error::other(format!( - "Failed to create output directory {}: {}", - parent.display(), - e - ))); + // 5. Write output — to stdout in pipe mode, otherwise to the file. + if to_stdout { + io::stdout().write_all(final_doc.as_bytes())?; + } else { + let output_path = Path::new(&final_args.output); + if let Some(parent) = output_path.parent() + && !parent.exists() + && let Err(e) = fs::create_dir_all(parent) + { + return Err(io::Error::other(format!( + "Failed to create output directory {}: {}", + parent.display(), + e + ))); + } + let mut final_output = fs::File::create(output_path)?; + final_output.write_all(final_doc.as_bytes())?; } - let mut final_output = fs::File::create(output_path)?; - final_output.write_all(final_doc.as_bytes())?; // 6. Update cache with current state if let Err(e) = cache_manager.write_cache(¤t_state) @@ -543,7 +550,7 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io } let duration = start_time.elapsed(); - if !silent { + if !silent && !to_stdout { if let Some(comp) = &comparison { if comp.summary.has_changes() { println!( @@ -606,7 +613,7 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io )?; let duration = start_time.elapsed(); - if !silent { + if !silent && !to_stdout { println!("Documentation created successfully: {}", final_args.output); println!("Processing time: {:.2?}", duration); diff --git a/src/markdown.rs b/src/markdown.rs index 6e9c8c5..10856d2 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -43,13 +43,18 @@ pub fn generate_markdown( token_encoding: TokenEncoding, ts_config: &TreeSitterConfig, ) -> io::Result<()> { - if let Some(parent) = Path::new(output_path).parent() - && !parent.exists() - { - fs::create_dir_all(parent)?; - } - - let mut output = fs::File::create(output_path)?; + // `-` selects stdout (pipe mode, e.g. `context-builder -o - | llm`); + // otherwise create/truncate the file path (creating parent dirs as needed). + let mut output: Box = if output_path == "-" { + Box::new(io::stdout()) + } else { + if let Some(parent) = Path::new(output_path).parent() + && !parent.exists() + { + fs::create_dir_all(parent)?; + } + Box::new(fs::File::create(output_path)?) + }; let input_dir_name = if input_dir == "." { let current_dir = std::env::current_dir()?; From d9b22636fe6ae397986d5297f156b0bc41f693f5 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 08:58:59 -0300 Subject: [PATCH 06/10] fix: char-boundary truncation clamp, atomic cache write, encoding_strategy warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.9.0 "Trustworthy Output" — low-severity bug mop-up (B19, B20, B5). - B19: clamp find_truncation_point's result to a UTF-8 char boundary in the shared dispatcher. The per-language `max_bytes` fallback could land mid-char, panicking once smart truncation slices on it. Node boundaries are already char-safe, so the clamp is a no-op for them. (regression test added) - B20: write the auto-diff cache atomically (NamedTempFile + persist/rename) instead of truncate-then-write under a lock. A crash mid-write no longer leaves a truncated, baseline-dropping cache; readers always see a complete file. - Drop the `fs2` dependency: std::fs::File provides advisory locking (lock_shared/unlock) since Rust 1.89, which already shadowed fs2's trait. Set rust-version = "1.89" to document the resulting MSRV. (roadmap §5 dep-diet) - B5: warn on an unrecognized `encoding_strategy` in config instead of silently falling back to "detect". Verified: 326 lib (all-features) + 211 (serial) + full integration pass; clippy --all-targets --all-features -D warnings clean; fmt clean. --- Cargo.lock | 11 ----------- Cargo.toml | 4 +++- src/cache.rs | 32 ++++++++++++++------------------ src/lib.rs | 11 +++++++++++ src/tree_sitter/truncation.rs | 26 +++++++++++++++++++++++++- 5 files changed, 53 insertions(+), 31 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5f4bbf..deacaa8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -260,7 +260,6 @@ dependencies = [ "crossbeam-channel", "encoding_rs", "env_logger", - "fs2", "ignore", "log", "num_cpus", @@ -457,16 +456,6 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" -[[package]] -name = "fs2" -version = "0.4.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9564fc758e15025b46aa6643b1b77d047d1a56a1aea6e01002ac0c7026876213" -dependencies = [ - "libc", - "winapi", -] - [[package]] name = "futures-core" version = "0.3.31" diff --git a/Cargo.toml b/Cargo.toml index b732d32..1a4861d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -3,6 +3,8 @@ name = "context-builder" version = "0.8.3" default-run = "context-builder" edition = "2024" +# std::fs::File advisory locking (used by the cache) stabilized in Rust 1.89. +rust-version = "1.89" authors = ["Igor Lins e Silva"] description = "CLI tool to aggregate directory contents into a single markdown file optimized for LLM consumption" readme = "README.md" @@ -25,9 +27,9 @@ similar = "2.7.0" tempfile = "3.25.0" tiktoken-rs = "0.9.1" once_cell = "1.21.3" -fs2 = "0.4.3" serde_json = "1.0.143" crossbeam-channel = "0.5.15" +# (fs2 removed in v0.9.0 — std::fs::File now provides advisory locking) num_cpus = "1.17.0" encoding_rs = "0.8.35" walkdir = "2.5.0" diff --git a/src/cache.rs b/src/cache.rs index ee90b49..7ff96ca 100644 --- a/src/cache.rs +++ b/src/cache.rs @@ -4,8 +4,8 @@ //! It uses a hash of the project path and configuration to avoid cache collisions //! between different projects or configurations. -use fs2::FileExt; - +// File advisory locking (`lock_shared`/`unlock`) is provided by std::fs::File +// (stabilized in Rust 1.89), so the former `fs2` dependency is no longer needed. use std::fs; use std::fs::File; @@ -158,26 +158,22 @@ impl CacheManager { Ok(Some(state)) } - /// Write the project state to cache with file locking + /// Write the project state to cache atomically. + /// + /// Serializes into a temp file in the cache directory, then renames it over + /// the target (B20). The previous implementation truncated the cache under an + /// exclusive lock and then wrote — a crash in that window left a truncated + /// (corrupt) cache, silently dropping the auto-diff baseline. With an atomic + /// rename, a crash leaves the old cache intact and readers always see a + /// complete file (old or new), never a partial write. pub fn write_cache(&self, state: &ProjectState) -> Result<(), Box> { let cache_path = self.get_cache_path(); - - let file = std::fs::OpenOptions::new() - .write(true) - .create(true) - .truncate(false) - .open(&cache_path)?; - // Acquire exclusive lock BEFORE truncating to prevent TOCTOU races - file.lock_exclusive()?; - file.set_len(0)?; - let json = serde_json::to_string_pretty(state)?; - let mut file = std::io::BufWriter::new(file); - file.write_all(json.as_bytes())?; - file.flush()?; - // Release lock - file.get_ref().unlock()?; + let mut tmp = tempfile::NamedTempFile::new_in(&self.cache_dir)?; + tmp.write_all(json.as_bytes())?; + tmp.flush()?; + tmp.persist(&cache_path).map_err(|e| e.error)?; Ok(()) } diff --git a/src/lib.rs b/src/lib.rs index 1339c9b..cc852b6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -75,6 +75,17 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io // `-o -` streams the document to stdout (pipe mode). In this mode all // human-facing chatter must go to stderr so it doesn't corrupt the pipe. let to_stdout = final_args.output == "-"; + + // B5: warn on an unrecognized encoding_strategy instead of silently using + // "detect". Validates the config value against the supported set. + if let Some(ref strat) = config.encoding_strategy + && !matches!(strat.as_str(), "detect" | "strict" | "skip") + && !silent + { + eprintln!( + "⚠️ Unknown encoding_strategy '{strat}' in config; expected one of: detect, strict, skip. Falling back to 'detect'." + ); + } // Resolve base path. If input is '.' but current working directory lost the project context // (no context-builder.toml), attempt to infer project root from output path (parent of 'output' dir). let mut resolved_base = PathBuf::from(&final_args.input); diff --git a/src/tree_sitter/truncation.rs b/src/tree_sitter/truncation.rs index 7593c11..ab49164 100644 --- a/src/tree_sitter/truncation.rs +++ b/src/tree_sitter/truncation.rs @@ -15,7 +15,11 @@ pub fn find_truncation_point( return source.len(); } - support.find_truncation_point(source, max_bytes) + // Per-language impls return either a node boundary (already char-safe) or the + // raw `max_bytes` fallback (which may split a multi-byte char). Clamp to a + // UTF-8 boundary so callers can slice the result without panicking (B19). + let point = support.find_truncation_point(source, max_bytes); + ensure_utf8_boundary(source, point) } /// Check if truncation is needed at a UTF-8 boundary. @@ -94,6 +98,26 @@ mod tests { assert_eq!(point, source.len()); } + #[test] + #[cfg(feature = "tree-sitter-rust")] + fn test_find_truncation_point_clamps_to_char_boundary() { + // Regression (B19): a comment-only source has no AST item boundary, so the + // per-language walker falls back to `max_bytes`. With `max_bytes` landing + // inside a multi-byte char, the returned point must be clamped to a UTF-8 + // boundary (otherwise slicing it panics once smart truncation is wired up). + let lang = super::super::languages::get_language_support("rs").unwrap(); + let source = "// 世界世界世界"; // '世'/'界' are 3 bytes each + let max_bytes = 4; // byte 4 is inside the first '世' (bytes 3..6) + assert!(!source.is_char_boundary(max_bytes)); + let point = find_truncation_point(source, max_bytes, lang); + assert!( + source.is_char_boundary(point), + "truncation point {point} is not a char boundary" + ); + // Slicing must not panic. + let _ = &source[..point]; + } + #[test] #[cfg(feature = "tree-sitter-rust")] fn test_find_truncation_point_source_exceeds_limit() { From 5b1f0f5ab1d1219030d6f6593bc0ba3c98a57e46 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 09:04:50 -0300 Subject: [PATCH 07/10] fix(tree-sitter,tokens): preserve C/C++ signature details; accurate token-count preview MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.9.0 "Trustworthy Output" — B17 + B9 (completes the low-severity mop-up). - B17: C++ struct inheritance, enum underlying types, and using/typedef alias targets — plus C typedef aliased types — are now byte-sliced/preserved instead of collapsed to a bare `struct X`/`enum X`/`typedef X`/`using/typedef X`. - B9: `--token-count` now renders each file through the same process_file path the document uses, then tokenizes that — so the preview matches the produced output (encoding transcoding + tree-sitter enrichment) instead of re-reading raw bytes via a divergent path. A file that fails to render contributes 0 tokens (matching the actual output); the unreadable-file test was updated. Also brings the CHANGELOG up to date for the whole mop-up (B5/B9/B17/B19/B20). Adds a C++ struct/enum/alias regression test. Verified: 327 lib (all-features) + 211 (serial) + full integration pass; clippy --all-targets --all-features -D warnings clean; fmt clean. --- CHANGELOG.md | 7 +++ src/lib.rs | 20 ++++++- src/token_count.rs | 99 ++++++++++++++++++-------------- src/tree_sitter/languages/c.rs | 5 +- src/tree_sitter/languages/cpp.rs | 60 ++++++++++++++++++- 5 files changed, 144 insertions(+), 47 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cb7c419..ab1823d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,13 @@ All notable changes to this project will be documented in this file. - Tokenization is skipped entirely when no budget is set — no overhead on normal runs - The deterministic content hash now folds in every output-affecting option (`line_numbers`, `max_tokens`, `encoding`, tree-sitter flags, `encoding_strategy`). Previously, toggling e.g. `--line-numbers` produced a different document under the **same** hash, silently breaking LLM prompt caching. The hash still fingerprints raw file content (not the rendered output, which embeds volatile mtimes), so it stays stable across checkouts (B2) +- **Robustness & accuracy — low-severity mop-up (B5, B9, B17, B19, B20)** + - Auto-diff cache is now written atomically (temp file + rename) instead of truncate-then-write, so a crash mid-write can't leave a corrupt, baseline-dropping cache (B20). The `fs2` dependency was dropped — `std::fs::File` provides advisory locking natively since Rust 1.89 (MSRV is now 1.89) + - `find_smart_truncation_point` clamps its result to a UTF-8 char boundary, preventing a panic when the byte-budget fallback lands mid-character (B19) + - `--token-count` renders each file through the **same path as the document**, so the preview matches the produced output (including encoding transcoding and tree-sitter signature/structure enrichment) instead of diverging via a separate raw-byte read (B9) + - C++ struct inheritance, `enum` underlying types, and `using`/`typedef` alias targets — plus C `typedef` aliased types — are preserved in extracted signatures instead of being reduced to a bare name (B17) + - An unrecognized `encoding_strategy` in config now warns instead of silently falling back to `detect` (B5) + - **Reliable auto-diff cache invalidation (B6, B7, B8)** - Unified the two duplicated config-hash functions (`cache.rs::hash_config` and `state.rs::compute_config_hash`) into one shared `config::config_fingerprint`, removing the "must stay in sync" drift hazard (B6) - The auto-diff cache hash now reflects the **resolved CLI values** for `--signatures` / `--structure` / `--truncate` / `--visibility` — previously these arrived from CLI args while the hash only saw the (often empty) config-file values, so toggling them reused a stale diff baseline. `encoding_strategy` is now included too (it transcodes non-UTF-8 content). Pure output-rendering options (`--diff-only`, `--encoding` tokenizer) are deliberately **excluded** so toggling them against an existing baseline doesn't discard it (B7) diff --git a/src/lib.rs b/src/lib.rs index cc852b6..f766327 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -294,6 +294,15 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io if final_args.token_count { if !silent { let encoding = final_args.encoding.parse::().unwrap_or_default(); + // Render each file through the same path as the document so the + // preview matches what would actually be produced (B9). + let ts_config = markdown::TreeSitterConfig { + signatures: final_args.signatures, + structure: final_args.structure, + truncate: final_args.truncate.clone(), + visibility: final_args.visibility.clone(), + }; + let enc_strategy = config.encoding_strategy.as_deref(); println!("\n# Token Count Estimation\n"); let mut total_tokens = 0; total_tokens += estimate_tokens(encoding, "# Directory Structure Report\n\n"); @@ -330,7 +339,16 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io total_tokens += tree_tokens; let file_tokens: usize = files .iter() - .map(|entry| count_file_tokens(base_path, entry, final_args.line_numbers, encoding)) + .map(|entry| { + count_file_tokens( + base_path, + entry, + final_args.line_numbers, + encoding, + enc_strategy, + &ts_config, + ) + }) .sum(); total_tokens += file_tokens; println!("Estimated total tokens: {}", total_tokens); diff --git a/src/token_count.rs b/src/token_count.rs index ea97a3c..f1cf682 100644 --- a/src/token_count.rs +++ b/src/token_count.rs @@ -1,7 +1,6 @@ use ignore::DirEntry; use once_cell::sync::Lazy; use std::collections::BTreeMap; -use std::fs; use std::path::Path; use std::str::FromStr; /// Token counting utilities for estimating LLM token usage @@ -48,46 +47,35 @@ pub fn estimate_tokens(encoding: Encoding, text: &str) -> usize { encoding.bpe().encode_with_special_tokens(text).len() } -/// Counts the tokens that would be generated for a file +/// Counts the tokens that would be generated for a file. +/// +/// Renders the file through the same `process_file` path the document uses, then +/// tokenizes the result — so the `--token-count` preview matches the produced +/// document, including encoding transcoding and tree-sitter signature/structure +/// enrichment, instead of re-reading raw bytes through a divergent code path (B9). pub fn count_file_tokens( base_path: &Path, entry: &DirEntry, line_numbers: bool, encoding: Encoding, + encoding_strategy: Option<&str>, + ts_config: &crate::markdown::TreeSitterConfig, ) -> usize { - let file_path = entry.path(); - let relative_path = file_path.strip_prefix(base_path).unwrap_or(file_path); - - // Start with tokens for the file header (path, size, modified time) - let mut token_count = estimate_tokens( - encoding, - &format!( - "\n### File: `{}`\n\n- Size: {} bytes\n- Modified: {}\n\n", - relative_path.display(), - entry.metadata().map(|m| m.len()).unwrap_or(0), - "Unknown" - ), - ); // Using "Unknown" as placeholder for modified time in estimation - - // Add tokens for the code fences - token_count += estimate_tokens(encoding, "```\n```"); - - // Try to read file content - if let Ok(content) = fs::read_to_string(file_path) { - if line_numbers { - // When line numbers are enabled, we add the line number prefix to each line - let lines_with_numbers: String = content - .lines() - .enumerate() - .map(|(i, line)| format!("{:>4} | {}\n", i + 1, line)) - .collect(); - token_count += estimate_tokens(encoding, &lines_with_numbers); - } else { - token_count += estimate_tokens(encoding, &content); - } + let mut buf: Vec = Vec::new(); + if crate::markdown::process_file( + base_path, + entry.path(), + &mut buf, + line_numbers, + encoding_strategy, + ts_config, + ) + .is_err() + { + // A file that fails to render contributes nothing to the document. + return 0; } - - token_count + estimate_tokens(encoding, &String::from_utf8_lossy(&buf)) } /// Counts the tokens that would be generated for the entire file tree section @@ -181,7 +169,14 @@ mod tests { .unwrap(); // Estimate tokens for the file - let estimated_tokens = count_file_tokens(dir.path(), &entry, false, Encoding::Cl100kBase); + let estimated_tokens = count_file_tokens( + dir.path(), + &entry, + false, + Encoding::Cl100kBase, + None, + &crate::markdown::TreeSitterConfig::default(), + ); // Generate actual markdown content let mut actual_content = Vec::new(); @@ -249,10 +244,22 @@ mod tests { .unwrap() .unwrap(); - let tokens_without_line_numbers = - count_file_tokens(dir.path(), &entry, false, Encoding::Cl100kBase); - let tokens_with_line_numbers = - count_file_tokens(dir.path(), &entry, true, Encoding::Cl100kBase); + let tokens_without_line_numbers = count_file_tokens( + dir.path(), + &entry, + false, + Encoding::Cl100kBase, + None, + &crate::markdown::TreeSitterConfig::default(), + ); + let tokens_with_line_numbers = count_file_tokens( + dir.path(), + &entry, + true, + Encoding::Cl100kBase, + None, + &crate::markdown::TreeSitterConfig::default(), + ); // With line numbers should have more tokens due to line number prefixes assert!(tokens_with_line_numbers > tokens_without_line_numbers); @@ -285,9 +292,17 @@ mod tests { std::fs::remove_file(&test_file).unwrap(); if let Some(entry) = found_entry { - let tokens = count_file_tokens(dir.path(), &entry, false, Encoding::Cl100kBase); - // Should still return some tokens for the file header even if content can't be read - assert!(tokens > 0); + let tokens = count_file_tokens( + dir.path(), + &entry, + false, + Encoding::Cl100kBase, + None, + &crate::markdown::TreeSitterConfig::default(), + ); + // A file that cannot be read renders to nothing, so it contributes 0 + // tokens — matching what the produced document would actually contain. + assert_eq!(tokens, 0); } } diff --git a/src/tree_sitter/languages/c.rs b/src/tree_sitter/languages/c.rs index efee2e2..4f322ac 100644 --- a/src/tree_sitter/languages/c.rs +++ b/src/tree_sitter/languages/c.rs @@ -270,7 +270,10 @@ impl CSupport { ) -> Option { let name = self.find_child_text(node, "type_identifier", source)?; - let full_sig = format!("typedef {}", name); + // Preserve the aliased type, e.g. `typedef unsigned int uint` instead of + // dropping it to a bare `typedef uint`. + let text = source[node.start_byte()..node.end_byte()].trim_end(); + let full_sig = text.trim_end_matches(';').trim_end().to_string(); Some(Signature { kind: SignatureKind::TypeAlias, diff --git a/src/tree_sitter/languages/cpp.rs b/src/tree_sitter/languages/cpp.rs index 0556a76..8aba782 100644 --- a/src/tree_sitter/languages/cpp.rs +++ b/src/tree_sitter/languages/cpp.rs @@ -320,7 +320,9 @@ impl CppSupport { ) -> Option { let name = self.find_child_text(node, "type_identifier", source)?; - let full_sig = format!("struct {}", name); + // Byte-slice to preserve inheritance/templates (e.g. `struct Foo : public Base`). + let full_sig = slice_signature_before_body(source, node, &["field_declaration_list"]) + .unwrap_or_else(|| format!("struct {}", name)); Some(Signature { kind: SignatureKind::Struct, @@ -336,7 +338,9 @@ impl CppSupport { fn extract_enum_signature(&self, source: &str, node: &tree_sitter::Node) -> Option { let name = self.find_child_text(node, "type_identifier", source)?; - let full_sig = format!("enum {}", name); + // Byte-slice to preserve `enum class` and the underlying type (e.g. `: int`). + let full_sig = slice_signature_before_body(source, node, &["enumerator_list"]) + .unwrap_or_else(|| format!("enum {}", name)); Some(Signature { kind: SignatureKind::Enum, @@ -352,7 +356,10 @@ impl CppSupport { fn extract_alias_signature(&self, source: &str, node: &tree_sitter::Node) -> Option { let name = self.find_child_text(node, "type_identifier", source)?; - let full_sig = format!("using/typedef {}", name); + // Preserve the full alias target, e.g. `using StringVec = std::vector` + // or `typedef unsigned int uint`, instead of a bare `using/typedef X`. + let text = source[node.start_byte()..node.end_byte()].trim_end(); + let full_sig = text.trim_end_matches(';').trim_end().to_string(); Some(Signature { kind: SignatureKind::TypeAlias, @@ -620,6 +627,53 @@ enum class Direction { assert_eq!(enums[0].name, "Direction"); } + #[test] + fn test_struct_enum_alias_preserve_details() { + // Regression (B17): struct inheritance, enum underlying type, and alias + // targets were dropped by `format!`-based signatures. + let source = r#" +struct Derived : public Base { + int x; +}; + +enum class Color : int { + Red, + Green +}; + +using StringVec = std::vector; +"#; + + let signatures = CppSupport.extract_signatures(source, Visibility::All); + let s = signatures + .iter() + .find(|s| s.kind == SignatureKind::Struct && s.name == "Derived") + .expect("struct extracted"); + assert!( + s.full_signature.contains(": public Base"), + "struct inheritance dropped: {}", + s.full_signature + ); + let e = signatures + .iter() + .find(|s| s.kind == SignatureKind::Enum) + .expect("enum extracted"); + assert!( + e.full_signature.contains(": int"), + "enum underlying type dropped: {}", + e.full_signature + ); + let a = signatures + .iter() + .find(|s| s.kind == SignatureKind::TypeAlias) + .expect("alias extracted"); + assert!( + a.full_signature.contains("std::vector"), + "alias target dropped: {}", + a.full_signature + ); + } + #[test] fn test_extract_header_prototype() { let source = r#" From b0229f5f3bec64f022c39a24c4b9914e24f095b0 Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 18:53:26 -0300 Subject: [PATCH 08/10] fix: address PR #1 review feedback (pipe prompt, trait visibility, serial streaming) Three valid findings from the automated PR review (Copilot + Codex), each verified against the source before fixing: - Pipe mode (`-o -`) no longer corrupts the piped document: the >100-file confirmation prompt (`confirm_processing`, which `print!`s to stdout and blocks on stdin) is now skipped when output is stdout, alongside the existing chatter guards. Pipe mode is non-interactive, so it proceeds as `--yes` would. (Codex P2) - Public trait methods are no longer dropped under `--visibility public`: a `function_signature_item` carries no visibility modifier (Rust forbids one on trait items), so `extract_function_sig_item` now inherits the enclosing trait's visibility via a new `sig_item_visibility` helper instead of defaulting to private. A public trait's required methods are kept; a private trait's are still filtered out. (Codex P2) - The non-parallel build streams each file straight to output when no `--max-tokens` budget is set, instead of always rendering into a Vec first (buffering is only needed to tokenize the chunk for the budget). Keeps peak memory bounded on large files. Affects `--no-default-features` builds only (default is parallel). (Copilot) Adds regression tests: pipe_mode_skips_processing_confirmation (cli_integration) and test_public_trait_methods_survive_public_filter (rust.rs). CHANGELOG updated. Verified: fmt, clippy (default + all-features, -D warnings), and full tests pass in both parallel and serial builds. --- CHANGELOG.md | 4 +- src/lib.rs | 6 ++- src/markdown.rs | 83 ++++++++++++++++++------------- src/tree_sitter/languages/rust.rs | 62 ++++++++++++++++++++++- tests/cli_integration.rs | 49 ++++++++++++++++++ 5 files changed, 167 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ab1823d..59294a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,12 +15,13 @@ All notable changes to this project will be documented in this file. - **Pipe-friendly output — stream to stdout (F2)** - `-o -` streams the generated document to **stdout** instead of a file, enabling `context-builder -f rs -o - | llm`. All progress/status chatter is suppressed (or already on stderr) in this mode so the pipe stays clean, and `-` is never folded into an output folder or timestamped name. Works for the auto-diff path too (the composed diff document is written to stdout) + - In pipe mode the large-project (>100 files) confirmation prompt is skipped as well — it `print!`s to stdout and would otherwise prepend prompt text to the piped document (and block on stdin); pipe mode is non-interactive, so it proceeds as if `--yes` were given - **Trustworthy token budget — `--max-tokens` uses the real tokenizer (F4, B1, B2)** - `--max-tokens` now counts the real tokenizer (per `--encoding`) on each file's rendered output, replacing the crude `buf.len()/4` (parallel) and `metadata().len()/4` (serial) byte heuristics. Both code paths now estimate identically, restoring byte-for-byte determinism between parallel and non-parallel builds - Fixed the budget bypass where the first file was always emitted in full regardless of `--max-tokens` (the `tokens_used > 0` guard). The budget now applies to every file, including the first (B1) - The document header + file tree are debited from the budget, so `--max-tokens` accounts for the whole document, not just file bodies - - Tokenization is skipped entirely when no budget is set — no overhead on normal runs + - Tokenization is skipped entirely when no budget is set — no overhead on normal runs. In that case the non-parallel build also streams each file straight to the output instead of rendering it into a buffer first (buffering exists only to tokenize the chunk for the budget), keeping peak memory bounded on large files - The deterministic content hash now folds in every output-affecting option (`line_numbers`, `max_tokens`, `encoding`, tree-sitter flags, `encoding_strategy`). Previously, toggling e.g. `--line-numbers` produced a different document under the **same** hash, silently breaking LLM prompt caching. The hash still fingerprints raw file content (not the rendered output, which embeds volatile mtimes), so it stays stable across checkouts (B2) - **Robustness & accuracy — low-severity mop-up (B5, B9, B17, B19, B20)** @@ -42,6 +43,7 @@ All notable changes to this project will be documented in this file. - Fixed Rust bodiless trait methods (`function_signature_item`, e.g. `fn draw(&self);`) being dropped from both signature extraction and structure counts (B15) - Fixed Python class base lists being double-parenthesized (`class User((Base))`) — `argument_list` already includes the surrounding parentheses (B16) - Fixed Rust restricted visibility (`pub(crate)` / `pub(super)` / `pub(in ...)`) being reported as fully public; restricted forms are no longer matched by `--visibility public` (B18) + - Fixed a public trait's required methods being dropped under `--visibility public` — a `function_signature_item` carries no visibility modifier (Rust forbids one on trait items), so it now inherits the enclosing trait's visibility instead of defaulting to private; a public trait's required methods are part of its public API and are kept, while a private trait's are still filtered out - Added regression tests covering each of the above - **Maintenance** diff --git a/src/lib.rs b/src/lib.rs index f766327..1021053 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -358,7 +358,11 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io return Ok(()); } - if !final_args.yes && !prompter.confirm_processing(files.len())? { + // In pipe mode (`-o -`) there is no interactive terminal to answer a prompt, + // and `confirm_processing` would `print!` to stdout — corrupting the piped + // document and blocking on stdin. Skip the >100-file confirmation and proceed, + // exactly as `--yes` would. + if !final_args.yes && !to_stdout && !prompter.confirm_processing(files.len())? { if !silent { println!("Operation cancelled."); } diff --git a/src/markdown.rs b/src/markdown.rs index 10856d2..6891a3c 100644 --- a/src/markdown.rs +++ b/src/markdown.rs @@ -293,43 +293,58 @@ pub fn generate_markdown( #[cfg(not(feature = "parallel"))] { - let mut tokens_used: usize = header_tokens; + match max_tokens { + // No budget: stream each file straight to the output (as the serial + // path did before v0.9.0). Buffering would only be needed to count + // tokens, so without a budget it just inflates peak memory by the + // size of the largest file for no benefit. + None => { + for entry in files.iter() { + process_file( + base_path, + entry.path(), + &mut output, + line_numbers, + encoding_strategy, + ts_config, + )?; + } + } + // Budget set: render each file into a buffer first so the rendered + // chunk can be tokenized before we decide to emit it — identical to + // the parallel path, so both builds truncate at the same boundary. + Some(budget) => { + let mut tokens_used: usize = header_tokens; - for (idx, entry) in files.iter().enumerate() { - // Render the file into a buffer first, then count the rendered chunk - // with the real tokenizer — identical to the parallel path. - let mut buf: Vec = Vec::new(); - process_file( - base_path, - entry.path(), - &mut buf, - line_numbers, - encoding_strategy, - ts_config, - )?; - // Only pay for tokenization when a budget is set. - let chunk_tokens = if max_tokens.is_some() { - estimate_tokens(token_encoding, &String::from_utf8_lossy(&buf)) - } else { - 0 - }; + for (idx, entry) in files.iter().enumerate() { + let mut buf: Vec = Vec::new(); + process_file( + base_path, + entry.path(), + &mut buf, + line_numbers, + encoding_strategy, + ts_config, + )?; + let chunk_tokens = + estimate_tokens(token_encoding, &String::from_utf8_lossy(&buf)); + + // Budget applies to every file, including the first (no bypass). + if tokens_used + chunk_tokens > budget { + let remaining = files.len() - idx; + writeln!(output, "---\n")?; + writeln!( + output, + "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n", + budget, remaining + )?; + break; + } - // Budget applies to every file, including the first (no bypass). - if let Some(budget) = max_tokens - && tokens_used + chunk_tokens > budget - { - let remaining = files.len() - idx; - writeln!(output, "---\n")?; - writeln!( - output, - "_⚠️ Token budget ({}) reached. {} remaining files omitted._\n", - budget, remaining - )?; - break; + tokens_used += chunk_tokens; + output.write_all(&buf)?; + } } - - tokens_used += chunk_tokens; - output.write_all(&buf)?; } } diff --git a/src/tree_sitter/languages/rust.rs b/src/tree_sitter/languages/rust.rs index 744d756..3dae496 100644 --- a/src/tree_sitter/languages/rust.rs +++ b/src/tree_sitter/languages/rust.rs @@ -244,6 +244,34 @@ impl RustSupport { Visibility::Private } + /// Effective visibility of a bodiless `fn` declaration (`function_signature_item`). + /// Rust forbids a `visibility_modifier` on a required trait method, so such a + /// method's real visibility is the enclosing trait's — without this, a public + /// trait's required methods would be classified `Private` and dropped under + /// `--visibility public`. A declaration that carries its own modifier (e.g. + /// `extern "C" { pub fn .. }`) keeps it; `extern` declarations with no modifier + /// stay module-private, as they should. + fn sig_item_visibility(&self, node: &tree_sitter::Node, source: &str) -> Visibility { + // An explicit `pub` on the declaration itself always wins. + let own = self.get_visibility(node, source); + if own == Visibility::Public { + return own; + } + // Otherwise inherit from the nearest enclosing trait, if any. + let mut current = node.parent(); + while let Some(n) = current { + match n.kind() { + "trait_item" => return self.get_visibility(&n, source), + // Stop once we leave the trait body without finding a trait + // (e.g. an `extern` block, an impl, or the file root). + "impl_item" | "function_item" | "mod_item" | "source_file" => break, + _ => {} + } + current = n.parent(); + } + own + } + fn node_text<'a>(&self, source: &'a str, node: &tree_sitter::Node) -> &'a str { &source[node.start_byte()..node.end_byte()] } @@ -304,7 +332,7 @@ impl RustSupport { node: &tree_sitter::Node, visibility_filter: Visibility, ) -> Option { - let vis = self.get_visibility(node, source); + let vis = self.sig_item_visibility(node, source); if !vis.matches_filter(visibility_filter) { return None; } @@ -677,6 +705,38 @@ pub trait Drawable { assert!(!area.full_signature.ends_with(';')); } + #[test] + fn test_public_trait_methods_survive_public_filter() { + // Regression: required trait methods carry no `visibility_modifier` + // (Rust forbids one), so they were classified Private and dropped under + // `--visibility public` — hiding a public trait's API. They must inherit + // the trait's visibility instead. + let source = r#" +pub trait Drawable { + fn draw(&self); + fn area(&self) -> f64; +} + +trait Hidden { + fn secret(&self); +} +"#; + + let public_only = RustSupport.extract_signatures(source, Visibility::Public); + assert!( + public_only.iter().any(|s| s.name == "draw"), + "a public trait's required methods must pass the `public` filter" + ); + assert!( + public_only.iter().any(|s| s.name == "area"), + "a public trait's required methods must pass the `public` filter" + ); + assert!( + !public_only.iter().any(|s| s.name == "secret"), + "a private trait's methods must NOT pass the `public` filter" + ); + } + #[test] fn test_pub_crate_is_not_public() { // Regression: pub(crate)/pub(super) were reported as fully public. diff --git a/tests/cli_integration.rs b/tests/cli_integration.rs index afcbf30..66174db 100644 --- a/tests/cli_integration.rs +++ b/tests/cli_integration.rs @@ -431,6 +431,55 @@ fn confirm_processing_receives_large_count() { ); } +#[test] +fn pipe_mode_skips_processing_confirmation() { + // In `-o -` (stdout pipe) mode the confirmation prompt would `print!` to + // stdout and corrupt the piped document — so it must be skipped entirely, + // even with >100 files and without `--yes`. A prompter that would CANCEL + // proves the prompt is never consulted: the run still succeeds. + let dir = tempdir().unwrap(); + let root = dir.path(); + + fs::create_dir_all(root.join("data")).unwrap(); + for i in 0..150 { + write_file(&root.join("data").join(format!("f{}.txt", i)), "x"); + } + + let args = Args { + input: root.to_string_lossy().into_owned(), + output: "-".to_string(), + filter: vec!["txt".into()], + ignore: vec![], + preview: false, + token_count: false, + line_numbers: false, + yes: false, + diff_only: false, + clear_cache: false, + encoding: "o200k_base".to_string(), + init: false, + max_tokens: None, + signatures: false, + structure: false, + truncate: "smart".to_string(), + visibility: "all".to_string(), + }; + + // processing_response = false → would cancel if the prompt were consulted. + let prompter = TestPrompter::new(true, false); + + let res = run_with_args(args, Config::default(), &prompter); + assert!( + res.is_ok(), + "pipe mode must proceed without consulting the confirmation prompt" + ); + assert_eq!( + prompter.last_count(), + 0, + "confirm_processing must NOT be called in pipe mode" + ); +} + #[test] fn token_count_mode_does_not_create_output_file() { let dir = tempdir().unwrap(); From 7285e2047fff9ddc74365f93048113c98bb83fea Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:08:06 -0300 Subject: [PATCH 09/10] fix: keep implicit-public Java interface methods under --visibility public MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to the new Codex PR #1 review comment — the Java analog of the Rust trait-method fix. A Java interface method (`public interface Foo { void bar(); }`) is implicitly public but carries no `modifiers` node, so the B12 fallback classified it package-private and `extract_method_signature` dropped it under `--visibility public`, hiding the interface's API. Java's `get_visibility` now resolves a modifier-less declaration by context: implicitly public as an interface/annotation member, package-private in a class/enum/record. An explicit `public` still wins, and an explicit `private`/`protected` (including Java 9+ private interface methods) is still respected. Added `is_interface_member` (nearest-enclosing-type walk) and a `modifiers_text` helper. Verified the bug is unique to Java + Rust among supported languages: TypeScript extracts interfaces as one signature with no per-member filter; C++ `get_visibility` is dead code and never filters (signatures are stamped with the filter value, never dropped); C/Go/Python/JS have no per-member visibility filtering. Adds regression test test_interface_methods_survive_public_filter. Verified: fmt, clippy --all-features -D warnings, all-features lib tests (329 passed). --- CHANGELOG.md | 1 + src/tree_sitter/languages/java.rs | 88 +++++++++++++++++++++++++++---- 2 files changed, 78 insertions(+), 11 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 59294a8..d37e3a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -44,6 +44,7 @@ All notable changes to this project will be documented in this file. - Fixed Python class base lists being double-parenthesized (`class User((Base))`) — `argument_list` already includes the surrounding parentheses (B16) - Fixed Rust restricted visibility (`pub(crate)` / `pub(super)` / `pub(in ...)`) being reported as fully public; restricted forms are no longer matched by `--visibility public` (B18) - Fixed a public trait's required methods being dropped under `--visibility public` — a `function_signature_item` carries no visibility modifier (Rust forbids one on trait items), so it now inherits the enclosing trait's visibility instead of defaulting to private; a public trait's required methods are part of its public API and are kept, while a private trait's are still filtered out + - Fixed the Java analog: a `public interface`'s methods are implicitly public but carry no `modifiers` node, so they were classified package-private and dropped under `--visibility public`. Java members with no access modifier are now resolved by context — implicitly public as interface/annotation members, package-private in a class/enum — while an explicit `private`/`protected` (incl. Java 9+ private interface methods) is still respected - Added regression tests covering each of the above - **Maintenance** diff --git a/src/tree_sitter/languages/java.rs b/src/tree_sitter/languages/java.rs index 07b4112..243b0df 100644 --- a/src/tree_sitter/languages/java.rs +++ b/src/tree_sitter/languages/java.rs @@ -144,23 +144,60 @@ impl JavaSupport { } } - /// Determine a Java declaration's visibility from its `modifiers` child. - /// `public` → Public; `private`/`protected` and package-private (no modifier) - /// → Private. This keeps `--visibility public` scoped to the true public API - /// surface and `--visibility private` to everything else. - fn get_visibility(&self, node: &tree_sitter::Node, source: &str) -> Visibility { + /// Text of the declaration's `modifiers` child, if present. + fn modifiers_text<'a>(&self, node: &tree_sitter::Node, source: &'a str) -> Option<&'a str> { let mut cursor = node.walk(); for child in node.children(&mut cursor) { if child.kind() == "modifiers" { - let text = &source[child.start_byte()..child.end_byte()]; - if text.split_whitespace().any(|t| t == "public") { - return Visibility::Public; - } + return Some(&source[child.start_byte()..child.end_byte()]); + } + } + None + } + + /// True when the declaration's nearest enclosing type is an interface or + /// annotation type — whose members are implicitly `public`. + fn is_interface_member(&self, node: &tree_sitter::Node) -> bool { + let mut current = node.parent(); + while let Some(n) = current { + match n.kind() { + "interface_declaration" | "annotation_type_declaration" => return true, + // The nearest enclosing type is a class/enum/record, whose + // members are package-private by default — stop here. + "class_declaration" | "enum_declaration" | "record_declaration" => return false, + _ => {} + } + current = n.parent(); + } + false + } + + /// Determine a Java declaration's *effective* visibility, honoring Java's + /// implicit rules. An explicit `public` always wins, and an explicit + /// `private`/`protected` is respected even inside an interface. Otherwise a + /// declaration with no access modifier is **package-private** in a class/enum + /// but **implicitly public** as an interface/annotation member — without that + /// distinction, a public interface's methods (which carry no `modifiers` + /// node) would be classified non-public and dropped under `--visibility public`. + fn get_visibility(&self, node: &tree_sitter::Node, source: &str) -> Visibility { + if let Some(text) = self.modifiers_text(node, source) { + if text.split_whitespace().any(|t| t == "public") { + return Visibility::Public; + } + if text + .split_whitespace() + .any(|t| t == "private" || t == "protected") + { return Visibility::Private; } + // Only non-access modifiers (e.g. `default`, `static`, `final`, + // `abstract`) — fall through to the implicit rule below. + } + if self.is_interface_member(node) { + Visibility::Public + } else { + Visibility::Private } - // No modifiers node → package-private, treated as non-public. - Visibility::Private } fn extract_method_signature( @@ -554,6 +591,35 @@ public class Api { ); } + #[test] + fn test_interface_methods_survive_public_filter() { + // Regression: interface methods are implicitly public but carry no + // `modifiers` node, so they were classified package-private and dropped + // under `--visibility public`, hiding a public interface's API. An + // explicitly `private` interface method (Java 9+) must still be filtered. + let source = r#" +public interface Service { + void start(); + String name(); + private void internalHelper() {} +} +"#; + + let public_only = JavaSupport.extract_signatures(source, Visibility::Public); + assert!( + public_only.iter().any(|s| s.name == "start"), + "implicitly-public interface method must pass the `public` filter" + ); + assert!( + public_only.iter().any(|s| s.name == "name"), + "implicitly-public interface method must pass the `public` filter" + ); + assert!( + !public_only.iter().any(|s| s.name == "internalHelper"), + "explicitly-private interface method must NOT pass the `public` filter" + ); + } + #[test] fn test_extract_structure() { let source = r#" From 29e37d2519cee380257411cd48a322c74a5cc2aa Mon Sep 17 00:00:00 2001 From: Igor Lins e Silva <4753812+igorls@users.noreply.github.com> Date: Thu, 4 Jun 2026 19:36:01 -0300 Subject: [PATCH 10/10] fix: address PR #1 Codex review (cache fingerprint scope, encoding override) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two more valid Codex P2 findings, both verified against the source by tracing the auto-diff capture/compare flow and CLI resolution: - config_fingerprint no longer includes rendering options. The auto-diff baseline is each selected file's RAW captured content (ProjectState stores bytes via read_to_string; the diff compares those), which no rendering option changes. Keying the cache on line_numbers/signatures/structure/ truncate/visibility/max_tokens/encoding_strategy therefore caused spurious baseline resets: toggling e.g. --max-tokens or --signatures between runs discarded the diff and hid real content changes. The fingerprint now keys on filter+ignore only (the inputs that decide which files form the comparable baseline); the project path is already keyed separately in cache.rs. This supersedes the earlier B7 approach and fixes the same-class bug Codex flagged for max_tokens, for all rendering flags. (Codex P2) - Explicit CLI flags override config even at their default value. --encoding, --truncate, and --visibility carry a default value, so "value == default" could not be distinguished from "flag omitted" — an explicit `--encoding o200k_base` was ignored when the config set cl100k_base. run() now reads clap's ValueSource to detect explicitly-passed flags and threads that into the resolver (new ExplicitCli), so CLI-explicit > config > default holds even when the explicit value equals the default. (Codex P2) Regression tests: config_fingerprint_sensitivity rewritten to assert only filter/ignore change the key while every rendering option leaves it stable; explicit_cli_default_value_overrides_config covers the precedence fix. Verified end-to-end via the binary that `--encoding o200k_base` overrides a cl100k_base config. fmt, clippy (default + all-features, -D warnings), and full tests pass in both parallel (330 lib) and serial (212 lib) builds. --- CHANGELOG.md | 7 ++- src/config.rs | 99 ++++++++++++++++--------------- src/config_resolver.rs | 102 ++++++++++++++++++++++++++++---- src/lib.rs | 43 +++++++------- tests/test_config_resolution.rs | 6 +- 5 files changed, 173 insertions(+), 84 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d37e3a5..1a3047c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,12 +7,15 @@ All notable changes to this project will be documented in this file. > Planned per `docs/research/next-release-roadmap.md`. This section tracks work as it lands. - **Accurate token counting — selectable tokenizer (F1)** - - New `--encoding {o200k_base|cl100k_base}` flag (and `encoding` config option), defaulting to **`o200k_base`** (GPT-4o / o-series). The previous hardcoded `cl100k_base` under-counts every modern OpenAI model. The selected encoding flows through `--token-count` (and, once F4 lands, `--max-tokens`) + - New `--encoding {o200k_base|cl100k_base}` flag (and `encoding` config option), defaulting to **`o200k_base`** (GPT-4o / o-series). The previous hardcoded `cl100k_base` under-counts every modern OpenAI model. The selected encoding flows through both `--token-count` and `--max-tokens` budgeting - `--token-count` now reports counts using the chosen encoding - **CLI validation — invalid values are now rejected (F7)** - `--truncate`, `--visibility`, and `--encoding` are constrained to their allowed sets via clap `value_parser`, so invalid values produce a clear `error: ... [possible values: ...]` instead of being silently coerced (fixes B3 `--visibility` and B4 `--truncate` silent-acceptance) and `--help` now lists the valid values +- **CLI precedence — explicit flags override config even at their default value** + - `--encoding`, `--truncate`, and `--visibility` are now honored when passed explicitly even if the value equals the clap default — e.g. `--encoding o200k_base` overriding a config that sets `cl100k_base`. Resolution uses clap's `ValueSource` to tell an explicitly-passed flag from an omitted one, instead of treating "value == default" as "not provided", which previously left the default value un-selectable from the CLI whenever the config set a non-default + - **Pipe-friendly output — stream to stdout (F2)** - `-o -` streams the generated document to **stdout** instead of a file, enabling `context-builder -f rs -o - | llm`. All progress/status chatter is suppressed (or already on stderr) in this mode so the pipe stays clean, and `-` is never folded into an output folder or timestamped name. Works for the auto-diff path too (the composed diff document is written to stdout) - In pipe mode the large-project (>100 files) confirmation prompt is skipped as well — it `print!`s to stdout and would otherwise prepend prompt text to the piped document (and block on stdin); pipe mode is non-interactive, so it proceeds as if `--yes` were given @@ -33,7 +36,7 @@ All notable changes to this project will be documented in this file. - **Reliable auto-diff cache invalidation (B6, B7, B8)** - Unified the two duplicated config-hash functions (`cache.rs::hash_config` and `state.rs::compute_config_hash`) into one shared `config::config_fingerprint`, removing the "must stay in sync" drift hazard (B6) - - The auto-diff cache hash now reflects the **resolved CLI values** for `--signatures` / `--structure` / `--truncate` / `--visibility` — previously these arrived from CLI args while the hash only saw the (often empty) config-file values, so toggling them reused a stale diff baseline. `encoding_strategy` is now included too (it transcodes non-UTF-8 content). Pure output-rendering options (`--diff-only`, `--encoding` tokenizer) are deliberately **excluded** so toggling them against an existing baseline doesn't discard it (B7) + - The auto-diff cache fingerprint now keys on **only the file-selection inputs** (`filter` / `ignore`) — the options that decide which files form the comparable baseline. The diff compares each file's **raw captured content** (`ProjectState` stores the bytes), which no rendering option changes, so `--signatures` / `--structure` / `--truncate` / `--visibility` / `--max-tokens` / `--line-numbers` / `--encoding` / `encoding_strategy` are all **excluded**: toggling one against an existing baseline no longer silently discards the diff (which would hide real content changes on that run). The resolved CLI `filter`/`ignore` are folded in so the key reflects real behavior even when they come from flags rather than the config file (B7). _(This supersedes an earlier B7 approach that included the rendering options; per PR review, those caused exactly the spurious baseline resets described above.)_ - `--diff-only` now warns when used without `auto_diff` instead of silently emitting full file contents (B8) - **Tree-Sitter correctness — honest `--signatures` / `--visibility`** diff --git a/src/config.rs b/src/config.rs index c6cc089..8dd9d8b 100644 --- a/src/config.rs +++ b/src/config.rs @@ -91,11 +91,22 @@ pub struct Config { pub encoding: Option, } -/// Stable fingerprint of all configuration that affects auto-diff output and the -/// cache baseline. Shared by the cache key (`cache.rs`) and the project-state -/// config hash (`state.rs`) so the two can never drift out of sync. Every -/// output-affecting option must be included here, otherwise toggling it reuses a -/// stale cached diff baseline. +/// Stable fingerprint of the configuration that determines the auto-diff +/// *baseline* — i.e. which files are captured and compared between runs. Shared +/// by the cache key (`cache.rs`) and the project-state config hash (`state.rs`) +/// so the two can never drift out of sync. +/// +/// The baseline is the **raw content** of the selected files (`ProjectState` +/// stores each file's bytes via `read_to_string`; the diff compares those). The +/// only inputs that change that baseline are the file-selection options: +/// `filter` and `ignore`. Everything else is pure *rendering* — `line_numbers`, +/// `signatures`, `structure`, `truncate`, `visibility`, `max_tokens`, +/// `encoding`/`encoding_strategy`, `diff_context_lines`, `diff_only`, +/// `timestamped_output`, `output_folder` — and does **not** affect the captured +/// content. Such options are deliberately EXCLUDED: including them would reset +/// the diff baseline whenever a user toggles one (e.g. adding `--signatures`), +/// silently hiding real content changes on that run. (The project *path* is +/// keyed separately in `cache.rs`, so it isn't part of this fingerprint.) pub(crate) fn config_fingerprint(config: &Config) -> String { let mut s = String::new(); if let Some(ref filters) = config.filter { @@ -105,26 +116,6 @@ pub(crate) fn config_fingerprint(config: &Config) -> String { if let Some(ref ignores) = config.ignore { s.push_str(&ignores.join(",")); } - s.push('|'); - // Only include options that change WHICH files are captured or HOW their - // content is read into the diff baseline. `encoding_strategy` is new here - // (it transcodes non-UTF-8 content, so it genuinely affects the captured - // state). Pure output-rendering options (diff_only, encoding/tokenizer, - // timestamped_output, output_folder) are deliberately EXCLUDED: users toggle - // them — especially `--diff-only` — against an existing baseline, and they do - // not change the diffed content, so they must not invalidate the cache. - s.push_str(&format!( - "{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}|{:?}", - config.line_numbers, - config.auto_diff, - config.diff_context_lines, - config.signatures, - config.structure, - config.truncate, - config.visibility, - config.max_tokens, - config.encoding_strategy, - )); let hash = xxhash_rust::xxh3::xxh3_64(s.as_bytes()); format!("{:x}", hash) } @@ -317,44 +308,56 @@ invalid_toml [ #[test] fn config_fingerprint_sensitivity() { - // The cache/diff fingerprint must change for options that affect which - // files are captured or how their content is read, but must stay stable - // for pure output-rendering options that users toggle against an existing - // baseline (notably --diff-only). + // The cache/diff fingerprint must change ONLY for the file-selection + // options (filter, ignore) that determine which files form the comparable + // baseline. Every pure output-rendering option must leave it untouched, so + // toggling one against an existing baseline never discards the diff. let base = Config::default(); let base_h = config_fingerprint(&base); + // --- File selection: MUST change the fingerprint --- let mut c = base.clone(); - c.signatures = Some(true); + c.filter = Some(vec!["rs".to_string()]); assert_ne!( config_fingerprint(&c), base_h, - "signatures should affect the fingerprint" + "filter changes which files are captured, so it must change the fingerprint" ); let mut c = base.clone(); - c.encoding_strategy = Some("strict".to_string()); + c.ignore = Some(vec!["target".to_string()]); assert_ne!( config_fingerprint(&c), base_h, - "encoding_strategy should affect the fingerprint" - ); - - let mut c = base.clone(); - c.diff_only = Some(true); - assert_eq!( - config_fingerprint(&c), - base_h, - "diff_only must NOT invalidate the diff baseline" + "ignore changes which files are captured, so it must change the fingerprint" ); - let mut c = base.clone(); - c.encoding = Some("cl100k_base".to_string()); - assert_eq!( - config_fingerprint(&c), - base_h, - "tokenizer encoding must NOT invalidate the diff baseline" - ); + // --- Rendering options: MUST NOT change the fingerprint --- + // (none of these affect the raw content captured into the diff baseline) + type Mutate = fn(&mut Config); + let render_only: Vec<(&str, Mutate)> = vec![ + ("line_numbers", |c| c.line_numbers = Some(true)), + ("signatures", |c| c.signatures = Some(true)), + ("structure", |c| c.structure = Some(true)), + ("truncate", |c| c.truncate = Some("byte".to_string())), + ("visibility", |c| c.visibility = Some("public".to_string())), + ("max_tokens", |c| c.max_tokens = Some(1000)), + ("encoding", |c| c.encoding = Some("cl100k_base".to_string())), + ("encoding_strategy", |c| { + c.encoding_strategy = Some("strict".to_string()) + }), + ("diff_only", |c| c.diff_only = Some(true)), + ("diff_context_lines", |c| c.diff_context_lines = Some(9)), + ]; + for (name, mutate) in render_only { + let mut c = base.clone(); + mutate(&mut c); + assert_eq!( + config_fingerprint(&c), + base_h, + "{name} is a pure rendering option and must NOT invalidate the diff baseline" + ); + } } #[test] diff --git a/src/config_resolver.rs b/src/config_resolver.rs index aa6b508..267699f 100644 --- a/src/config_resolver.rs +++ b/src/config_resolver.rs @@ -41,10 +41,25 @@ pub struct ConfigResolution { pub warnings: Vec, } +/// Which value-bearing CLI flags the user explicitly passed (as opposed to +/// leaving at their clap default). These flags carry a default *value*, so the +/// value alone can't tell us whether the user typed e.g. `--encoding o200k_base` +/// to override a non-default config or simply omitted the flag. `run()` fills +/// this from clap's `ValueSource`; `Default` (all `false`) means "treat the +/// value as a default", which preserves the value-based precedence for callers +/// (e.g. tests) that build `Args` directly. +#[derive(Debug, Clone, Copy, Default)] +pub struct ExplicitCli { + pub truncate: bool, + pub visibility: bool, + pub encoding: bool, +} + /// Resolves final configuration by merging CLI arguments with config file values. /// /// Precedence rules (highest to lowest): -/// 1. Explicit CLI arguments (non-default values) +/// 1. Explicit CLI arguments — a non-default value, or (for the value-bearing +/// flags in `ExplicitCli`) a flag the user passed even at its default value /// 2. Configuration file values /// 3. CLI default values /// @@ -52,7 +67,11 @@ pub struct ConfigResolution { /// - `output` field supports timestamping and output folder resolution /// - Boolean flags respect explicit CLI usage vs defaults /// - Arrays (filter, ignore) use CLI if non-empty, otherwise config file -pub fn resolve_final_config(mut args: Args, config: Option) -> ConfigResolution { +pub fn resolve_final_config( + mut args: Args, + config: Option, + explicit: ExplicitCli, +) -> ConfigResolution { let mut warnings = Vec::new(); // Start with CLI defaults, then apply config file, then explicit CLI overrides @@ -81,7 +100,10 @@ pub fn resolve_final_config(mut args: Args, config: Option) -> ConfigRes init: args.init, signatures: args.signatures || final_config.signatures.unwrap_or(false), structure: args.structure || final_config.structure.unwrap_or(false), - truncate: if args.truncate != "smart" { + // CLI explicit (even when the value equals the default) > config > default. + // The `|| value != default` keeps value-based precedence for callers that + // build Args directly without an explicitness signal (e.g. tests). + truncate: if explicit.truncate || args.truncate != "smart" { args.truncate.clone() } else { final_config @@ -89,7 +111,7 @@ pub fn resolve_final_config(mut args: Args, config: Option) -> ConfigRes .clone() .unwrap_or_else(|| args.truncate.clone()) }, - visibility: if args.visibility != "all" { + visibility: if explicit.visibility || args.visibility != "all" { args.visibility.clone() } else { final_config @@ -97,7 +119,7 @@ pub fn resolve_final_config(mut args: Args, config: Option) -> ConfigRes .clone() .unwrap_or_else(|| args.visibility.clone()) }, - encoding: if args.encoding != "o200k_base" { + encoding: if explicit.encoding || args.encoding != "o200k_base" { args.encoding.clone() } else { final_config @@ -270,7 +292,7 @@ mod tests { ..Default::default() }; - let resolution = resolve_final_config(args.clone(), Some(config)); + let resolution = resolve_final_config(args.clone(), Some(config), ExplicitCli::default()); assert_eq!(resolution.config.output, "custom.md"); // CLI wins assert_eq!(resolution.config.filter, vec!["rs"]); // CLI wins @@ -312,7 +334,7 @@ mod tests { ..Default::default() }; - let resolution = resolve_final_config(args, Some(config)); + let resolution = resolve_final_config(args, Some(config), ExplicitCli::default()); assert_eq!(resolution.config.output, "from_config.md"); assert_eq!( @@ -354,7 +376,7 @@ mod tests { ..Default::default() }; - let resolution = resolve_final_config(args, Some(config)); + let resolution = resolve_final_config(args, Some(config), ExplicitCli::default()); // Output should have timestamp format: test_YYYYMMDDHHMMSS.md assert!(resolution.config.output.starts_with("test_")); @@ -389,7 +411,7 @@ mod tests { ..Default::default() }; - let resolution = resolve_final_config(args, Some(config)); + let resolution = resolve_final_config(args, Some(config), ExplicitCli::default()); assert!(resolution.config.output.contains("docs")); assert!(resolution.config.output.ends_with("test.md")); @@ -423,7 +445,7 @@ mod tests { ..Default::default() }; - let resolution = resolve_final_config(args, Some(config)); + let resolution = resolve_final_config(args, Some(config), ExplicitCli::default()); assert!(resolution.config.output.contains("docs")); assert!(resolution.config.output.contains("test_")); @@ -458,7 +480,7 @@ mod tests { ..Default::default() }; - let resolution = resolve_final_config(args, Some(config)); + let resolution = resolve_final_config(args, Some(config), ExplicitCli::default()); // `-` must survive untouched — not folded into docs/ nor timestamped. assert_eq!(resolution.config.output, "-"); } @@ -491,7 +513,7 @@ mod tests { ..Default::default() }; - let resolution = resolve_final_config(args, Some(config)); + let resolution = resolve_final_config(args, Some(config), ExplicitCli::default()); assert!(!resolution.warnings.is_empty()); assert!(resolution.warnings[0].contains("auto_diff")); @@ -520,7 +542,7 @@ mod tests { visibility: "all".to_string(), }; - let resolution = resolve_final_config(args.clone(), None); + let resolution = resolve_final_config(args.clone(), None, ExplicitCli::default()); assert_eq!(resolution.config.input, args.input); assert_eq!(resolution.config.output, args.output); @@ -535,4 +557,58 @@ mod tests { assert_eq!(resolution.config.diff_context_lines, 3); assert!(resolution.warnings.is_empty()); } + + #[test] + fn explicit_cli_default_value_overrides_config() { + // Regression: `--encoding o200k_base` (the clap default value) must still + // override a non-default config encoding. The value looks like the default, + // so resolution must rely on the explicit-flag signal, not the value. + // Same for `--truncate smart` and `--visibility all`. + let make_args = || Args { + input: ".".to_string(), + output: "output.md".to_string(), + filter: vec![], + ignore: vec![], + line_numbers: false, + preview: false, + token_count: false, + yes: false, + diff_only: false, + clear_cache: false, + encoding: "o200k_base".to_string(), + init: false, + max_tokens: None, + signatures: false, + structure: false, + truncate: "smart".to_string(), + visibility: "all".to_string(), + }; + let config = Config { + encoding: Some("cl100k_base".to_string()), + truncate: Some("byte".to_string()), + visibility: Some("public".to_string()), + ..Default::default() + }; + + // Flags NOT explicitly passed → config wins (the value equals the default). + let implicit = + resolve_final_config(make_args(), Some(config.clone()), ExplicitCli::default()); + assert_eq!(implicit.config.encoding, "cl100k_base"); + assert_eq!(implicit.config.truncate, "byte"); + assert_eq!(implicit.config.visibility, "public"); + + // Flags explicitly passed at their default value → CLI wins over config. + let explicit = resolve_final_config( + make_args(), + Some(config), + ExplicitCli { + truncate: true, + visibility: true, + encoding: true, + }, + ); + assert_eq!(explicit.config.encoding, "o200k_base"); + assert_eq!(explicit.config.truncate, "smart"); + assert_eq!(explicit.config.visibility, "all"); + } } diff --git a/src/lib.rs b/src/lib.rs index 1021053..dc3d397 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,4 +1,4 @@ -use clap::{CommandFactory, Parser}; +use clap::{CommandFactory, FromArgMatches}; use std::fs; use std::io::{self, Write}; @@ -386,30 +386,21 @@ pub fn run_with_args(args: Args, config: Config, prompter: &impl Prompter) -> io } if config.auto_diff.unwrap_or(false) { - // Build an effective config that mirrors the *actual* operational settings coming - // from resolved CLI args (filters/ignores/line_numbers). This ensures the - // configuration hash used for cache invalidation reflects real behavior and - // stays consistent across runs even when values originate from CLI not file. + // Build an effective config that mirrors the *actual* file selection coming + // from resolved CLI args, so the cache/diff fingerprint reflects real + // behavior even when filter/ignore originate from the CLI, not the config + // file. Only `filter`/`ignore` matter: they decide which files form the + // diff baseline. Rendering options (signatures/structure/truncate/ + // visibility/max_tokens/line_numbers/encoding) deliberately do NOT feed the + // fingerprint — they don't change the captured raw content — so propagating + // them here would only risk spurious baseline resets (see `config_fingerprint`). let mut effective_config = config.clone(); - // Normalize filter/ignore/line_numbers into config so hashing sees them if !final_args.filter.is_empty() { effective_config.filter = Some(final_args.filter.clone()); } if !final_args.ignore.is_empty() { effective_config.ignore = Some(final_args.ignore.clone()); } - effective_config.line_numbers = Some(final_args.line_numbers); - // Propagate the *resolved* CLI values so the cache/diff config hash sees - // them too (B7) — these arrive from CLI args, not the config file, so - // without this, toggling e.g. --signatures/--visibility would reuse a - // stale diff baseline. Only fields that are part of the fingerprint are - // set here; pure output-rendering flags (diff_only, encoding) are - // intentionally NOT propagated so they don't invalidate the baseline. - effective_config.signatures = Some(final_args.signatures); - effective_config.structure = Some(final_args.structure); - effective_config.truncate = Some(final_args.truncate.clone()); - effective_config.visibility = Some(final_args.visibility.clone()); - effective_config.max_tokens = final_args.max_tokens; // 1. Create current project state let current_state = ProjectState::from_files( @@ -865,7 +856,19 @@ fn generate_markdown_with_diff( pub fn run() -> io::Result<()> { env_logger::init(); - let args = Args::parse(); + // Parse via `ArgMatches` (not `Args::parse`) so we can tell whether the + // value-bearing flags were *explicitly* passed or left at their clap default. + // `--encoding o200k_base` carries the same value as the default, so the value + // alone can't reveal an intent to override a non-default config (see resolver). + let matches = Args::command().get_matches(); + let explicit = crate::config_resolver::ExplicitCli { + truncate: matches.value_source("truncate") == Some(clap::parser::ValueSource::CommandLine), + visibility: matches.value_source("visibility") + == Some(clap::parser::ValueSource::CommandLine), + encoding: matches.value_source("encoding") == Some(clap::parser::ValueSource::CommandLine), + }; + let args = Args::from_arg_matches(&matches) + .expect("arguments were already validated by get_matches()"); // Handle init command first if args.init { @@ -896,7 +899,7 @@ pub fn run() -> io::Result<()> { } // Resolve final configuration using the new config resolver - let resolution = crate::config_resolver::resolve_final_config(args, config.clone()); + let resolution = crate::config_resolver::resolve_final_config(args, config.clone(), explicit); // Print warnings if any let silent = std::env::var("CB_SILENT") diff --git a/tests/test_config_resolution.rs b/tests/test_config_resolution.rs index 8cdad2f..25ee2ac 100644 --- a/tests/test_config_resolution.rs +++ b/tests/test_config_resolution.rs @@ -48,7 +48,11 @@ fn run_with_resolved_config( prompter: &impl Prompter, ) -> std::io::Result<()> { // Resolve final configuration using the new config resolver - let resolution = resolve_final_config(args, config.clone()); + let resolution = resolve_final_config( + args, + config.clone(), + context_builder::config_resolver::ExplicitCli::default(), + ); // Convert resolved config back to Args for run_with_args let final_args = Args {