From 9b11c4373f65dbe3cd8e7335176e902df501c9c7 Mon Sep 17 00:00:00 2001 From: kmajdoub Date: Thu, 28 May 2026 10:49:47 +0200 Subject: [PATCH] =?UTF-8?q?fix(runner):=20quarantine=20undeletable=20workt?= =?UTF-8?q?ree=20dirs=20=E2=80=94=20kills=20infinite=20worktree-create-fai?= =?UTF-8?q?led=20loop=20(#96)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a prior worker planted files owned by a different uid (subprocess context mismatch), chmod+rmtree in _prep_worktree fails silently, leaves the dir behind, and the next git worktree add fails with "path already exists" → worktree-create-failed → infinite retry loop until operator manually sudo-cleans /tmp/wt-loop-. Today this bit issue forge-loop#84: ~1h of ticks all failing the same way. Fix: if cleanup leaves the dir behind, rename it to wt-loop-.stale- so git worktree add proceeds. No sudo, no destruction (operator can inspect the planted state). Boot reaper sweeps .stale-* dirs on next restart. Applied to both _prep_worktree (fresh) and _prep_repair_worktree (repair). Test: PermissionError on rmtree → quarantine dir exists, original marker preserved, worktree add still called. --- src/forge_loop/runner/_helpers.py | 15 +++++++- src/forge_loop/worker.py | 37 +++++++++++++++++-- tests/test_worker.py | 59 +++++++++++++++++++++++++++++++ 3 files changed, 108 insertions(+), 3 deletions(-) diff --git a/src/forge_loop/runner/_helpers.py b/src/forge_loop/runner/_helpers.py index 11f385b..95d32e2 100644 --- a/src/forge_loop/runner/_helpers.py +++ b/src/forge_loop/runner/_helpers.py @@ -72,8 +72,21 @@ def reap_orphan_worktrees(repo: Path, events_file: Path) -> int: """ reaped = 0 for path in sorted(glob.glob("/tmp/wt-loop-*")): + name = Path(path).name + # Quarantined dirs (wt-loop-.stale-) from failed cleanups — + # try to rm them at boot. If still un-removable due to uid + # mismatch, leave them; operator sweep can take over. + if ".stale-" in name: + try: + import shutil + shutil.rmtree(path, ignore_errors=True) + except Exception: # noqa: BLE001 — best-effort boot cleanup + pass + if not Path(path).exists(): + reaped += 1 + continue try: - issue = int(Path(path).name.removeprefix("wt-loop-").split("-")[0]) + issue = int(name.removeprefix("wt-loop-").split("-")[0]) except ValueError: # Defensive: skip non-numeric suffixes without crashing the # boot path. A future operator tool might leave a marker dir diff --git a/src/forge_loop/worker.py b/src/forge_loop/worker.py index aae1ba2..92ed477 100644 --- a/src/forge_loop/worker.py +++ b/src/forge_loop/worker.py @@ -275,6 +275,26 @@ def _drop_permissive_settings(worktree: Path) -> None: cdir.chmod(0o555) +def _quarantine_if_blocking(wt: Path) -> Path | None: + """If `wt` still exists after normal cleanup (e.g. worker-planted files + owned by a different uid that we can't chmod/rm), rename it out of the + way so `git worktree add` can proceed. Returns the quarantined path, + or None if the dir is already gone. + + Quarantined dirs use the suffix ``.stale-`` so the boot + reaper + operator can find + sweep them later without risk of colliding + with the live path. + """ + if not wt.exists(): + return None + quarantine = wt.with_name(f"{wt.name}.stale-{int(time.time())}") + try: + wt.rename(quarantine) + except OSError: + return None + return quarantine + + def _prep_worktree( repo: Path, n: int, @@ -295,7 +315,16 @@ def _prep_worktree( capture_output=True, ) if wt.exists(): - shutil.rmtree(wt) + try: + shutil.rmtree(wt) + except (OSError, PermissionError): + pass + # If the worker planted files owned by a different uid (subprocess + # ran under a different namespace), chmod+rmtree above will silently + # fail and leave the dir behind. Quarantine it so the new worktree + # add doesn't collide. Without this, every retry of this issue hits + # `worktree-create-failed` → infinite loop until operator intervenes. + _quarantine_if_blocking(wt) # If a previous failed attempt left a local branch lying around, delete # it so `git worktree add -B` can recreate it cleanly off the freshest # origin/. `-B` would overwrite anyway, but we use plain `-b` @@ -345,7 +374,11 @@ def _prep_repair_worktree( subprocess.run(["chmod", "-R", "u+w", str(claude_dir)], capture_output=True) subprocess.run(["git", "worktree", "remove", "--force", str(wt)], cwd=repo, capture_output=True) if wt.exists(): - shutil.rmtree(wt) + try: + shutil.rmtree(wt) + except (OSError, PermissionError): + pass + _quarantine_if_blocking(wt) remote_ref = f"refs/remotes/origin/{branch}" subprocess.run( ["git", "fetch", "--prune", "origin", f"+refs/heads/{branch}:{remote_ref}"], diff --git a/tests/test_worker.py b/tests/test_worker.py index 613c6ae..ae5143a 100644 --- a/tests/test_worker.py +++ b/tests/test_worker.py @@ -323,6 +323,65 @@ def fake_run(cmd: list[str], **kwargs: Any) -> _Completed: assert ["git", "worktree", "add", str(worktree), "-B", "loop/12-demo", "origin/main"] in calls +def test_prep_worktree_quarantines_undeletable_dir( + tmp_path: Path, + monkeypatch: pytest.MonkeyPatch, +) -> None: + """When a prior worker planted files we can't chmod/rm (uid mismatch), + the dir is renamed out of the way so the new `git worktree add` + doesn't collide. Without this, every retry of the same issue hits + `worktree-create-failed` and the loop spins forever. + + Repro: simulate `shutil.rmtree` raising PermissionError, then + verify (a) `worktree add` still gets called, and (b) a quarantine + dir matching `wt-loop-.stale-` exists. + """ + import shutil as _real_shutil + + real_rmtree = _real_shutil.rmtree # capture before monkeypatch + + blocking = Path("/tmp/wt-loop-9999") + # Pre-clean from any prior failed run before we monkeypatch rmtree. + for q in Path("/tmp").glob("wt-loop-9999*"): + real_rmtree(q, ignore_errors=True) + blocking.mkdir(exist_ok=True) + (blocking / "marker").write_text("planted") + + class _Completed: + returncode = 0 + stderr = "" + + calls: list[list[str]] = [] + + def fake_run(cmd: list[str], **kwargs: Any) -> _Completed: + calls.append(cmd) + return _Completed() + + def boom_rmtree(_path: str | Path) -> None: + raise PermissionError("simulated: planted by another uid") + + monkeypatch.setattr("forge_loop.worker.subprocess.run", fake_run) + monkeypatch.setattr("forge_loop.worker.shutil.rmtree", boom_rmtree) + monkeypatch.setattr("forge_loop.worker._drop_permissive_settings", lambda _wt: None) + + try: + worktree, err = _prep_worktree(tmp_path, 9999, "loop/9999-demo") + assert err is None + # `git worktree add` MUST still get called — quarantine unblocked it. + assert any( + cmd[:3] == ["git", "worktree", "add"] for cmd in calls + ), f"worktree add was not called: {calls!r}" + # The blocking dir got renamed out of the way. + assert not blocking.exists(), "blocking dir should have been quarantined" + quarantined = list(Path("/tmp").glob("wt-loop-9999.stale-*")) + assert quarantined, "quarantine dir was not created" + # Quarantined dir still holds the original marker (rename, not delete). + assert (quarantined[0] / "marker").read_text() == "planted" + finally: + for q in Path("/tmp").glob("wt-loop-9999*"): + real_rmtree(q, ignore_errors=True) + + # Gradle/WSL-OOM guard tests removed: forge-loop is stack-agnostic; the # generic brief now says "avoid full-suite runs" without project-specific # JVM flag pinning. Operators add their own gates via project tooling.