From a39647064f96b2380f95f00580a3c3b3612afad3 Mon Sep 17 00:00:00 2001 From: realtonyyoung <6655045+realtonyyoung@users.noreply.github.com> Date: Fri, 3 Jul 2026 12:51:28 -0400 Subject: [PATCH 1/3] perf(import): skip live PR/MR provider detection during bulk import MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to #229. ImportCommand's bulk loop calls DetectRepositoryAsync per session/cwd, which runs the live `gh pr view` / `glab api` provider round-trip (up to ~2s best-effort each). But every import path discards the PR fields: the scope-picker resolution and "Scanning repositories" step use only Owner/RepoName, the per-session ingest builds its `repository` node without any pr_* fields, and classification/import-source detection only need owner/repo. So that per-cwd round-trip is pure wasted latency — worst on GitLab, whose `glab api` is a network call rather than gh's local resolve. - DetectRepositoryAsync gains `detectPullRequest = true` (default preserves every live/hook/enrich caller) and an injectable `CommandRunner? run = null` (defaults to the real spawner) so the git and provider spawns are unit-testable. - All import-path callers pass `detectPullRequest: false`: ImportCommand (scope resolution, repo scan, per-session ingest), TranscriptFileClassification (exclusion key), and the Copilot/Pi/Kiro/ Cursor import-source default detectors. Base repo info (owner/repo/user/branch/host) still resolves from git alone. This is a behaviour-preserving latency win — imported sessions never carried PR info, so nothing in the imported payload changes; only the discarded network calls are removed. Adds ImportProviderDetectionTests (import path spawns no gh/glab; live path still detects the PR). Closes #232 Linear: AI-1122 Co-Authored-By: Claude Opus 4.8 --- .../Commands/CopilotImportSource.cs | 2 +- .../Commands/CursorImportSource.cs | 2 +- src/Capacitor.Cli/Commands/ImportCommand.cs | 10 ++-- .../Commands/KiroImportSource.cs | 2 +- src/Capacitor.Cli/Commands/PiImportSource.cs | 2 +- .../Commands/TranscriptFileClassification.cs | 3 +- src/Capacitor.Cli/RepositoryDetection.cs | 25 +++++---- .../ImportProviderDetectionTests.cs | 51 +++++++++++++++++++ 8 files changed, 80 insertions(+), 17 deletions(-) create mode 100644 test/Capacitor.Cli.Tests.Unit/ImportProviderDetectionTests.cs diff --git a/src/Capacitor.Cli/Commands/CopilotImportSource.cs b/src/Capacitor.Cli/Commands/CopilotImportSource.cs index e99fe69b..91d481d5 100644 --- a/src/Capacitor.Cli/Commands/CopilotImportSource.cs +++ b/src/Capacitor.Cli/Commands/CopilotImportSource.cs @@ -36,7 +36,7 @@ public CopilotImportSource( ) { _sessionStateDir = sessionStateDirOverride ?? CopilotPaths.SessionStateDir(); _legacySessionStateDir = legacyDirOverride ?? CopilotPaths.LegacySessionStateDir(); - _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd)); + _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false)); } static StringComparison PathComparison => diff --git a/src/Capacitor.Cli/Commands/CursorImportSource.cs b/src/Capacitor.Cli/Commands/CursorImportSource.cs index dedbe8db..3ea57a5e 100644 --- a/src/Capacitor.Cli/Commands/CursorImportSource.cs +++ b/src/Capacitor.Cli/Commands/CursorImportSource.cs @@ -42,7 +42,7 @@ public CursorImportSource( _projectsDir = projectsDirOverride ?? CursorPaths.ProjectsDir(); _workspaceStorageDir = workspaceStorageDirOverride ?? CursorPaths.Resolve().WorkspaceStorageDir; _sanitizedToFolder = new Lazy>(BuildSanitizedToFolderMap); - _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd)); + _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false)); } /// diff --git a/src/Capacitor.Cli/Commands/ImportCommand.cs b/src/Capacitor.Cli/Commands/ImportCommand.cs index 837aa6b8..cdb66f23 100644 --- a/src/Capacitor.Cli/Commands/ImportCommand.cs +++ b/src/Capacitor.Cli/Commands/ImportCommand.cs @@ -589,7 +589,8 @@ await Parallel.ForEachAsync( opts, async (cwd, _) => { try { - var repo = await RepositoryDetection.DetectRepositoryAsync(cwd); + // Import only needs owner/repo here — skip the PR/MR provider round-trip. + var repo = await RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false); repoByCwd[cwd] = repo is { Owner: { } o, RepoName: { } n } ? (o, n) : null; } catch { repoByCwd[cwd] = null; @@ -2003,7 +2004,8 @@ string sessionId var total = uniqueCwds.Count; async ValueTask DetectOne(string cwd) { - var repo = await RepositoryDetection.DetectRepositoryAsync(cwd); + // Import only needs owner/repo here — skip the PR/MR provider round-trip. + var repo = await RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false); repoByCwd[cwd] = repo is { Owner: { } o, RepoName: { } n } ? (o, n) : null; } @@ -2358,7 +2360,9 @@ CancellationToken ct var codexRepo = session.Vendor == "codex" ? ExtractCodexGitInfo(session.FilePath) : null; if (cwd is not null) { - var repo = await RepositoryDetection.DetectRepositoryAsync(cwd); + // The imported session-start payload carries no PR fields (only owner/repo/branch/user), + // so skip the PR/MR provider round-trip. + var repo = await RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false); if (repo is not null || codexRepo is not null) { var repoNode = new JsonObject(); diff --git a/src/Capacitor.Cli/Commands/KiroImportSource.cs b/src/Capacitor.Cli/Commands/KiroImportSource.cs index 48e5fe2d..d8f5e843 100644 --- a/src/Capacitor.Cli/Commands/KiroImportSource.cs +++ b/src/Capacitor.Cli/Commands/KiroImportSource.cs @@ -26,7 +26,7 @@ public KiroImportSource( Func>? repoDetector = null ) { _sessionsDir = sessionsDirOverride ?? KiroPaths.SessionsDir(); - _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd)); + _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false)); } static StringComparison PathComparison => diff --git a/src/Capacitor.Cli/Commands/PiImportSource.cs b/src/Capacitor.Cli/Commands/PiImportSource.cs index 7daced2e..69b1a82e 100644 --- a/src/Capacitor.Cli/Commands/PiImportSource.cs +++ b/src/Capacitor.Cli/Commands/PiImportSource.cs @@ -31,7 +31,7 @@ public PiImportSource( Func>? repoDetector = null ) { _sessionsDir = sessionsDirOverride ?? PiPaths.SessionsDir(); - _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd)); + _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false)); } static StringComparison PathComparison => diff --git a/src/Capacitor.Cli/Commands/TranscriptFileClassification.cs b/src/Capacitor.Cli/Commands/TranscriptFileClassification.cs index c0155577..7dbb5f7a 100644 --- a/src/Capacitor.Cli/Commands/TranscriptFileClassification.cs +++ b/src/Capacitor.Cli/Commands/TranscriptFileClassification.cs @@ -193,7 +193,8 @@ CancellationToken ct if (cwd is not null) { if (excludedRepos is { Length: > 0 }) { - var repo = await RepositoryDetection.DetectRepositoryAsync(cwd); + // Classification only needs owner/repo for the exclusion key — skip PR detection. + var repo = await RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false); if (repo?.Owner is not null && repo.RepoName is not null) { var key = $"{repo.Owner}/{repo.RepoName}"; diff --git a/src/Capacitor.Cli/RepositoryDetection.cs b/src/Capacitor.Cli/RepositoryDetection.cs index d9fd5794..4911c9dd 100644 --- a/src/Capacitor.Cli/RepositoryDetection.cs +++ b/src/Capacitor.Cli/RepositoryDetection.cs @@ -113,8 +113,15 @@ static bool RepoPayloadEquals(RepositoryPayload a, RepositoryPayload b) => && a.UserName == b.UserName && a.UserEmail == b.UserEmail; - public static async Task DetectRepositoryAsync(string cwd, TimeSpan? budget = null) { + // detectPullRequest=false skips the live PR/MR provider detection (the `gh pr view` / `glab api` + // round-trip) while still resolving base repo info (owner/repo/user/branch/host). Bulk import + // passes false: it never emits PR fields, so that per-cwd round-trip is pure wasted latency + // (AI-1122). `run` is an injectable command runner (defaults to the real process spawner) so the + // git/provider spawns are unit-testable. + public static async Task DetectRepositoryAsync( + string cwd, TimeSpan? budget = null, bool detectPullRequest = true, CommandRunner? run = null) { if (budget is { } b0 && b0 <= TimeSpan.Zero) return null; + run ??= DefaultRunner; try { // Bound the git/gh probes by the remaining hook budget so a slow repo never // overruns the deadline. When no budget is set, keep the historical 5s/2s caps. @@ -129,7 +136,7 @@ static bool RepoPayloadEquals(RepositoryPayload a, RepositoryPayload b) => string? userName, userEmail, remoteUrl, owner, repoName, branch, host; // Always detect branch fresh — it changes frequently during a session - var branchTask = RunCommandAsync("git", "branch --show-current", cwd, gitCap); + var branchTask = run("git", "branch --show-current", cwd, gitCap); if (cache is not null) { userName = cache.UserName; @@ -141,9 +148,9 @@ static bool RepoPayloadEquals(RepositoryPayload a, RepositoryPayload b) => branch = await branchTask; } else { // Run git commands in parallel - var userNameTask = RunCommandAsync("git", "config user.name", cwd, gitCap); - var userEmailTask = RunCommandAsync("git", "config user.email", cwd, gitCap); - var remoteUrlTask = RunCommandAsync("git", "remote get-url origin", cwd, gitCap); + var userNameTask = run("git", "config user.name", cwd, gitCap); + var userEmailTask = run("git", "config user.email", cwd, gitCap); + var remoteUrlTask = run("git", "remote get-url origin", cwd, gitCap); await Task.WhenAll(userNameTask, userEmailTask, remoteUrlTask, branchTask); @@ -191,16 +198,16 @@ static bool RepoPayloadEquals(RepositoryPayload a, RepositoryPayload b) => // (only ClaudeHookCommand does), else the historical 2s cap. Covers probe + detector. var providerCap = ghCap; - if (providerCap > TimeSpan.Zero && host is not null) { + if (detectPullRequest && providerCap > TimeSpan.Zero && host is not null) { var provSw = Stopwatch.StartNew(); - var kind = await GitProviderRouter.ResolveAsync(host, cwd, providerCap, DefaultRunner); + var kind = await GitProviderRouter.ResolveAsync(host, cwd, providerCap, run); var detectCap = providerCap - provSw.Elapsed; // combined ceiling: probe + detector if (detectCap > TimeSpan.Zero) { var pr = kind switch { - GitProviderKind.GitHub => await GitHubPrDetector.DetectAsync(cwd, detectCap, DefaultRunner), + GitProviderKind.GitHub => await GitHubPrDetector.DetectAsync(cwd, detectCap, run), GitProviderKind.GitLab when owner is not null && repoName is not null - => await GitLabPrDetector.DetectAsync(host, owner, repoName, branch, cwd, detectCap, DefaultRunner), + => await GitLabPrDetector.DetectAsync(host, owner, repoName, branch, cwd, detectCap, run), _ => null }; diff --git a/test/Capacitor.Cli.Tests.Unit/ImportProviderDetectionTests.cs b/test/Capacitor.Cli.Tests.Unit/ImportProviderDetectionTests.cs new file mode 100644 index 00000000..31c521e1 --- /dev/null +++ b/test/Capacitor.Cli.Tests.Unit/ImportProviderDetectionTests.cs @@ -0,0 +1,51 @@ +using Capacitor.Cli; +using Capacitor.Cli.PrDetection; + +namespace Capacitor.Cli.Tests.Unit; + +/// +/// Import discards PR/MR fields on every path, so running the live `gh pr view` / `glab api` +/// provider round-trip per cwd during a bulk import is wasted latency (AI-1122). These tests pin +/// the `detectPullRequest` switch on RepositoryDetection.DetectRepositoryAsync via an injected +/// command runner: import (false) resolves owner/repo with git only and never spawns a provider +/// CLI, while the default (live) path still does. +/// +public class ImportProviderDetectionTests { + // Records every command the detector spawns; answers git probes with a GitHub remote so + // owner/repo/host resolve, and answers `gh pr view` with a PR so the live path can populate it. + static CommandRunner Recording(List log) => (cmd, args, _, _) => { + log.Add($"{cmd} {args}"); + if (cmd == "git" && args.StartsWith("remote get-url")) return Task.FromResult("https://github.com/foo/bar"); + if (cmd == "git" && args.StartsWith("branch")) return Task.FromResult("main"); + if (cmd == "git") return Task.FromResult(""); + if (cmd == "gh") return Task.FromResult("""{"number":7,"title":"t","url":"https://github.com/foo/bar/pull/7","headRefName":"main"}"""); + return Task.FromResult(null); + }; + + [Test] + public async Task Import_detection_resolves_repo_without_spawning_a_provider_cli() { + var log = new List(); + + var repo = await RepositoryDetection.DetectRepositoryAsync( + "/fake/import/skip-pr", budget: null, detectPullRequest: false, run: Recording(log)); + + // Base repo info still resolves from git alone… + await Assert.That(repo!.Owner).IsEqualTo("foo"); + await Assert.That(repo.RepoName).IsEqualTo("bar"); + await Assert.That(repo.PrNumber).IsNull(); + // …but no `gh` / `glab` round-trip was made. + await Assert.That(log.Any(c => c.StartsWith("gh") || c.StartsWith("glab"))).IsFalse(); + await Assert.That(log.Any(c => c.StartsWith("git"))).IsTrue(); + } + + [Test] + public async Task Live_detection_still_runs_provider_detection() { + var log = new List(); + + var repo = await RepositoryDetection.DetectRepositoryAsync( + "/fake/live/do-pr", budget: null, detectPullRequest: true, run: Recording(log)); + + await Assert.That(log.Any(c => c.StartsWith("gh pr view"))).IsTrue(); // provider detection ran + await Assert.That(repo!.PrNumber).IsEqualTo(7); + } +} From 042c8b275e17f3c2ede0d59d37931ad287de342c Mon Sep 17 00:00:00 2001 From: realtonyyoung <6655045+realtonyyoung@users.noreply.github.com> Date: Fri, 3 Jul 2026 13:57:25 -0400 Subject: [PATCH 2/3] fix(import): keep PR detection for Cursor import (it emits pr_* in its session-start) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-up: CursorImportSource is the one import source whose synthetic session-start payload carries a `repository` node (AI-1152) built via RepositoryDetection.BuildRepositoryNode, which emits pr_number/pr_title/pr_url/pr_head_ref when a PR is detected. The bulk-import latency change flipped its default repo detector to detectPullRequest:false, which silently stripped those PR fields from imported Cursor sessions — a payload-shape regression. Revert only Cursor's default detector to PR-detecting. The other flipped paths (ImportCommand x3, TranscriptFileClassification, and the Pi/Kiro/Copilot sources) use their detector result only for the owner/repo exclusion key and never via BuildRepositoryNode, so they keep the latency win. Adds a test asserting the Cursor session-start payload carries pr_* when the repo has a PR. Co-Authored-By: Claude Opus 4.8 --- .../Commands/CursorImportSource.cs | 6 +++++- .../Import/CursorImportSourceTests.cs | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/Capacitor.Cli/Commands/CursorImportSource.cs b/src/Capacitor.Cli/Commands/CursorImportSource.cs index 3ea57a5e..c958819b 100644 --- a/src/Capacitor.Cli/Commands/CursorImportSource.cs +++ b/src/Capacitor.Cli/Commands/CursorImportSource.cs @@ -42,7 +42,11 @@ public CursorImportSource( _projectsDir = projectsDirOverride ?? CursorPaths.ProjectsDir(); _workspaceStorageDir = workspaceStorageDirOverride ?? CursorPaths.Resolve().WorkspaceStorageDir; _sanitizedToFolder = new Lazy>(BuildSanitizedToFolderMap); - _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd, detectPullRequest: false)); + // Cursor is the one import source that emits a `repository` node in its synthetic + // session-start (AI-1152) via BuildRepositoryNode, which includes pr_* when populated — + // so it keeps live PR detection (unlike the owner/repo-only detectors in the other + // sources). Dropping it here would silently strip PR tagging from imported Cursor sessions. + _repoDetector = repoDetector ?? (cwd => RepositoryDetection.DetectRepositoryAsync(cwd)); } /// diff --git a/test/Capacitor.Cli.Tests.Unit/Import/CursorImportSourceTests.cs b/test/Capacitor.Cli.Tests.Unit/Import/CursorImportSourceTests.cs index 0b30ea1c..c2959f8d 100644 --- a/test/Capacitor.Cli.Tests.Unit/Import/CursorImportSourceTests.cs +++ b/test/Capacitor.Cli.Tests.Unit/Import/CursorImportSourceTests.cs @@ -20,6 +20,27 @@ public async Task does_not_support_title_generation() { await Assert.That(src.SupportsTitleGeneration).IsFalse(); } + [Test] + public async Task session_start_payload_carries_pr_fields_when_repo_has_a_pr() { + // Cursor is the one import source whose synthetic session-start carries a `repository` + // node (AI-1152) via BuildRepositoryNode — including pr_* when a PR is detected. Guard + // that the payload propagates those fields (a regression dropped them when Cursor's repo + // detector was switched to skip PR detection during the import-latency work — AI-1122). + var repo = new RepositoryPayload { + Owner = "o", RepoName = "r", Host = "github.com", Branch = "main", + PrNumber = 7, PrTitle = "t", PrUrl = "https://github.com/o/r/pull/7", PrHeadRef = "main" + }; + var repoNode = RepositoryDetection.BuildRepositoryNode(repo); + + var payload = CursorImportSource.BuildSessionStartPayload( + "sid", "/ws", "/t.jsonl", DateTimeOffset.UtcNow, repoNode); + + var repository = payload["repository"] as JsonObject; + await Assert.That(repository).IsNotNull(); + await Assert.That(repository!["pr_number"]!.GetValue()).IsEqualTo(7); + await Assert.That(repository["pr_url"]!.GetValue()).IsEqualTo("https://github.com/o/r/pull/7"); + } + [Test] public async Task is_available_when_projects_dir_exists() { using var fx = new ProjectsDirFixture(); From d2263933de498484f1779ba653efe8e0f4f4c768 Mon Sep 17 00:00:00 2001 From: realtonyyoung <6655045+realtonyyoung@users.noreply.github.com> Date: Sun, 5 Jul 2026 07:51:40 -0400 Subject: [PATCH 3/3] docs(import): document the provider-detection skip rationale (#232) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Records the analysis behind skipping live PR/MR detection during bulk import (results are discarded on those paths — dead work, not a speculative latency optimization), why no live large-import benchmark was run, and the Cursor exception that keeps PR detection. Co-Authored-By: Claude Opus 4.8 --- docs/import-provider-detection-latency.md | 50 +++++++++++++++++++++++ 1 file changed, 50 insertions(+) create mode 100644 docs/import-provider-detection-latency.md diff --git a/docs/import-provider-detection-latency.md b/docs/import-provider-detection-latency.md new file mode 100644 index 00000000..53e5bed3 --- /dev/null +++ b/docs/import-provider-detection-latency.md @@ -0,0 +1,50 @@ +# Import provider (PR/MR) detection latency + +Tracking: [#232](https://github.com/kurrent-io/kcap-cli/issues/232) (AI-1122). + +## Question + +`ImportCommand`'s bulk loop calls `RepositoryDetection.DetectRepositoryAsync` per session/cwd, +which can spawn a live `gh pr view` / `glab api` per unique repo (bounded by a ~2 s best-effort +cap each). The per-host memo in `GitProviderRouter` stops the `gh auth status` **probe** from +multiplying, but the **detection call** still runs per cwd — and GitLab's `glab api` is a network +round-trip rather than gh's local resolution. Does this hurt large multi-repo imports, and if so, +should we cache per `(repo, branch)` or skip detection during import? + +## Finding: the PR/MR detection is *dead work* on the import paths + +The decisive observation is not a latency number — it's that **every bulk-import path discards the +PR fields**, so the `gh`/`glab` round-trip produces a result that is never used: + +- **Scope-picker resolution** and the **"Scanning repositories"** step use only `Owner`/`RepoName`. +- **Per-session ingest** (`ImportCommand`) builds its `repository` node manually with + `user_name`/`user_email`/`remote_url`/`owner`/`repo_name`/`branch` — **no `pr_*` fields**. +- **Classification** (`TranscriptFileClassification`) uses the detected repo only for the + owner/repo exclusion key. +- The **Copilot / Pi / Kiro** import sources use their repo detector only for the same exclusion + key. + +The **live/hook path** (`EnrichWithRepositoryInfo` → `BuildRepositoryNode`) *does* emit +`pr_number`/`pr_title`/`pr_url`/`pr_head_ref`, so PR detection genuinely matters there — just not +during import. + +## Decision + +Add a `detectPullRequest` flag (default `true`, preserving every live/hook caller) to +`DetectRepositoryAsync` and pass `detectPullRequest: false` on the bulk-import paths. This removes +the provably-unused `gh pr view` / `glab api` round-trip per cwd while still resolving base repo +info from local git. It is **behaviour-preserving**: imported payloads never carried PR fields on +those paths, so nothing observable changes — only the discarded network calls are dropped. + +Because the result is discarded, this is a stronger justification than a wall-clock measurement: +the work is unnecessary regardless of its per-call cost. A representative "large multi-repo import" +benchmark was **not** run — it needs live `gh`/`glab` authentication, many real repositories with +open PRs/MRs, and network access that isn't reproducible in the dev/CI sandbox — and would not +change the conclusion. + +### Exception: Cursor import keeps PR detection + +`CursorImportSource` is the one import source whose synthetic `session-start/cursor` payload carries +a `repository` node built via `RepositoryDetection.BuildRepositoryNode`, which **does** emit `pr_*` +when populated (AI-1152). It therefore keeps live PR detection (its default detector is *not* +flipped) so imported Cursor sessions retain their PR tagging.