From 5c781df6ee6716a1b11fbf674eefa0347e09bab8 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 18 Apr 2026 18:01:59 -0400
Subject: [PATCH 1/3] Guard TROP bootstrap loops against silent
 high-failure-rate runs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace the hard-coded "< 10 successes" warning threshold in the four
TROP bootstrap sites (local unit-resample, local Rao-Wu, global
unit-resample, global Rao-Wu) plus the Rust global happy path with a
proportional 5% failure-rate guard, matching the existing SyntheticDiD
bootstrap and placebo convention. A shared helper
`bootstrap_utils.warn_bootstrap_failure_rate` centralizes the check so
future axis-D work (e.g. PowerAnalysis simulation counter) can reuse it.

Before this change, a run with `n_bootstrap=200` and 11 successes
(94.5% failure rate) passed silently because 11 >= 10. Now any run
with failure rate > 5% emits a `UserWarning` surfacing the success
count, total attempts, and failure rate.

SDID bootstrap paths (`synthetic_did.py:1036-1070` and `:1229-1251`)
were verified during this work to already have the same 5% proportional
guard — D-2/D-3 in the audit are marked resolved rather than bundled.

Covered by audit axis D (degenerate-replicate handling). Findings
#13-#16 from `docs/audits/silent-failures-findings.md`.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 diff_diff/bootstrap_utils.py  |  42 +++++++++
 diff_diff/trop_global.py      |  60 ++++++++-----
 diff_diff/trop_local.py       |  44 ++++++----
 docs/methodology/REGISTRY.md  |   1 +
 tests/test_bootstrap_utils.py |  77 +++++++++++++---
 tests/test_trop.py            | 160 +++++++++++++++++++++++++++++++---
 6 files changed, 316 insertions(+), 68 deletions(-)

diff --git a/diff_diff/bootstrap_utils.py b/diff_diff/bootstrap_utils.py
index 4d80f4a6..4a0e3bdf 100644
--- a/diff_diff/bootstrap_utils.py
+++ b/diff_diff/bootstrap_utils.py
@@ -23,9 +23,51 @@
     "compute_bootstrap_pvalue",
     "compute_effect_bootstrap_stats",
     "compute_effect_bootstrap_stats_batch",
+    "warn_bootstrap_failure_rate",
 ]
 
 
+def warn_bootstrap_failure_rate(
+    n_success: int,
+    n_attempted: int,
+    context: str,
+    threshold: float = 0.05,
+    stacklevel: int = 3,
+) -> None:
+    """Emit one proportional failure-rate warning after a replicate loop.
+
+    Replaces the hard-coded ``< N successes`` pattern that lets high-failure
+    runs (e.g. 11 of 200) pass silently. Does not emit when the replicate
+    count is zero (callers handle the NaN-return path explicitly).
+
+    Parameters
+    ----------
+    n_success : int
+        Number of replicates that produced a finite estimate.
+    n_attempted : int
+        Total replicates attempted (``self.n_bootstrap``).
+    context : str
+        Short label for the caller (e.g. ``"TROP global bootstrap"``).
+    threshold : float, default=0.05
+        Failure-rate threshold above which a warning is emitted. ``0.05``
+        matches the existing SyntheticDiD bootstrap and placebo guards.
+    stacklevel : int, default=3
+        Passed to :func:`warnings.warn`.
+    """
+    if n_attempted <= 0 or n_success >= n_attempted:
+        return
+    failure_rate = 1.0 - (n_success / n_attempted)
+    if failure_rate <= threshold:
+        return
+    warnings.warn(
+        f"Only {n_success}/{n_attempted} bootstrap iterations succeeded in "
+        f"{context} ({failure_rate:.1%} failure rate). "
+        "Standard errors may be unreliable.",
+        UserWarning,
+        stacklevel=stacklevel,
+    )
+
+
 def generate_bootstrap_weights(
     n_units: int,
     weight_type: str,
diff --git a/diff_diff/trop_global.py b/diff_diff/trop_global.py
index 2008eca4..3d651fd9 100644
--- a/diff_diff/trop_global.py
+++ b/diff_diff/trop_global.py
@@ -24,6 +24,7 @@
     _rust_bootstrap_trop_variance_global,
     _rust_loocv_grid_search_global,
 )
+from diff_diff.bootstrap_utils import warn_bootstrap_failure_rate
 from diff_diff.trop_local import _soft_threshold_svd, _validate_and_pivot_treatment
 from diff_diff.trop_results import TROPResults
 from diff_diff.utils import safe_inference, warn_if_not_converged
@@ -169,7 +170,11 @@ def _solve_global_model(
             L = np.zeros((n_periods, n_units))
         else:
             mu, alpha, beta, L = self._solve_global_with_lowrank(
-                Y, delta, lambda_nn, self.max_iter, self.tol,
+                Y,
+                delta,
+                lambda_nn,
+                self.max_iter,
+                self.tol,
                 _nonconvergence_tracker=_nonconvergence_tracker,
             )
         return mu, alpha, beta, L
@@ -284,7 +289,9 @@ def _loocv_score_global(
 
             try:
                 mu, alpha, beta, L = self._solve_global_model(
-                    Y, delta_ex, lambda_nn,
+                    Y,
+                    delta_ex,
+                    lambda_nn,
                     _nonconvergence_tracker=nonconverg_tracker,
                 )
 
@@ -988,13 +995,13 @@ def _bootstrap_variance_global(
                     unit_weight_arr,
                 )
 
-                if len(bootstrap_estimates) < 10:
-                    warnings.warn(
-                        f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded.",
-                        UserWarning,
-                    )
-                    if len(bootstrap_estimates) == 0:
-                        return np.nan, np.array([])
+                warn_bootstrap_failure_rate(
+                    n_success=len(bootstrap_estimates),
+                    n_attempted=self.n_bootstrap,
+                    context="TROP global bootstrap (Rust)",
+                )
+                if len(bootstrap_estimates) == 0:
+                    return np.nan, np.array([])
 
                 return float(se), np.array(bootstrap_estimates)
 
@@ -1073,12 +1080,13 @@ def _bootstrap_variance_global(
                 self.tol,
             )
 
-        if len(bootstrap_estimates) < 10:
-            warnings.warn(
-                f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded.", UserWarning
-            )
-            if len(bootstrap_estimates) == 0:
-                return np.nan, np.array([])
+        warn_bootstrap_failure_rate(
+            n_success=len(bootstrap_estimates),
+            n_attempted=self.n_bootstrap,
+            context="TROP global bootstrap",
+        )
+        if len(bootstrap_estimates) == 0:
+            return np.nan, np.array([])
 
         se = np.std(bootstrap_estimates, ddof=1)
         return float(se), bootstrap_estimates
@@ -1236,7 +1244,9 @@ def _bootstrap_rao_wu_global(
                     Y, D, lambda_time, lambda_unit, treated_periods, n_units, n_periods
                 )
                 mu, alpha, beta, L = self._solve_global_model(
-                    Y, delta, lambda_nn,
+                    Y,
+                    delta,
+                    lambda_nn,
                     _nonconvergence_tracker=nonconverg_tracker,
                 )
 
@@ -1261,13 +1271,13 @@ def _bootstrap_rao_wu_global(
                 self.tol,
             )
 
-        if len(bootstrap_estimates) < 10:
-            warnings.warn(
-                f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded.",
-                UserWarning,
-            )
-            if len(bootstrap_estimates) == 0:
-                return np.nan, np.array([])
+        warn_bootstrap_failure_rate(
+            n_success=len(bootstrap_estimates),
+            n_attempted=self.n_bootstrap,
+            context="TROP global Rao-Wu bootstrap",
+        )
+        if len(bootstrap_estimates) == 0:
+            return np.nan, np.array([])
 
         se = np.std(bootstrap_estimates, ddof=1)
         return float(se), bootstrap_estimates
@@ -1325,7 +1335,9 @@ def _fit_global_with_fixed_lambda(
 
         # Fit model on control data and extract post-hoc tau
         mu, alpha, beta, L = self._solve_global_model(
-            Y, delta, lambda_nn,
+            Y,
+            delta,
+            lambda_nn,
             _nonconvergence_tracker=_nonconvergence_tracker,
         )
         att, _, _ = self._extract_posthoc_tau(
diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py
index 0ac731fd..49b7635d 100644
--- a/diff_diff/trop_local.py
+++ b/diff_diff/trop_local.py
@@ -24,6 +24,7 @@
     _rust_bootstrap_trop_variance,
     _rust_unit_distance_matrix,
 )
+from diff_diff.bootstrap_utils import warn_bootstrap_failure_rate
 from diff_diff.trop_results import _PrecomputedStructures
 from diff_diff.utils import warn_if_not_converged
 
@@ -727,8 +728,10 @@ def _estimate_model(
                 _nonconvergence_tracker.append(1)
             else:
                 warn_if_not_converged(
-                    converged, "TROP local alternating minimization",
-                    self.max_iter, self.tol,
+                    converged,
+                    "TROP local alternating minimization",
+                    self.max_iter,
+                    self.tol,
                 )
 
         return alpha, beta, L
@@ -1068,14 +1071,13 @@ def _bootstrap_variance(
                 self.tol,
             )
 
-        if len(bootstrap_estimates) < 10:
-            warnings.warn(
-                f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded. "
-                "Standard errors may be unreliable.",
-                UserWarning,
-            )
-            if len(bootstrap_estimates) == 0:
-                return np.nan, np.array([])
+        warn_bootstrap_failure_rate(
+            n_success=len(bootstrap_estimates),
+            n_attempted=self.n_bootstrap,
+            context="TROP local bootstrap",
+        )
+        if len(bootstrap_estimates) == 0:
+            return np.nan, np.array([])
 
         se = np.std(bootstrap_estimates, ddof=1)
         return float(se), bootstrap_estimates
@@ -1236,14 +1238,13 @@ def _bootstrap_rao_wu_local(
                 self.tol,
             )
 
-        if len(bootstrap_estimates) < 10:
-            warnings.warn(
-                f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded. "
-                "Standard errors may be unreliable.",
-                UserWarning,
-            )
-            if len(bootstrap_estimates) == 0:
-                return np.nan, np.array([])
+        warn_bootstrap_failure_rate(
+            n_success=len(bootstrap_estimates),
+            n_attempted=self.n_bootstrap,
+            context="TROP local Rao-Wu bootstrap",
+        )
+        if len(bootstrap_estimates) == 0:
+            return np.nan, np.array([])
 
         se = np.std(bootstrap_estimates, ddof=1)
         return float(se), bootstrap_estimates
@@ -1338,7 +1339,12 @@ def _fit_with_fixed_lambda(
 
             # Fit model with these weights
             alpha, beta, L = self._estimate_model(
-                Y, control_mask, weight_matrix, lambda_nn, n_units, n_periods,
+                Y,
+                control_mask,
+                weight_matrix,
+                lambda_nn,
+                n_units,
+                n_periods,
                 _nonconvergence_tracker=_nonconvergence_tracker,
             )
 
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 64d2e63a..6213f215 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -1971,6 +1971,7 @@ Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]²
 - Wrong D specification: if user provides event-style D (only first treatment period),
   the absorbing-state validation will raise ValueError with helpful guidance
 - **Bootstrap minimum**: `n_bootstrap` must be >= 2 (enforced via `ValueError`). TROP uses bootstrap for all variance estimation — there is no analytical SE formula.
+- **Note:** TROP bootstrap loops (`_bootstrap_variance`, `_bootstrap_rao_wu`, and their global counterparts, including the Rust happy path) emit a proportional `UserWarning` via `diff_diff.bootstrap_utils.warn_bootstrap_failure_rate` when the replicate failure rate exceeds 5%. The previous hard-coded `< 10 successes` threshold let high-failure runs (e.g. 11 of 200) pass silently; this was classified as a silent failure under the Phase 2 audit (axis D — degenerate-replicate handling). The 5% threshold matches the existing SyntheticDiD bootstrap and placebo guards. When zero replicates succeed, SE is set to `NaN` (unchanged).
 - **LOOCV failure metadata**: When LOOCV fits fail in the Rust backend, the first failed observation coordinates (t, i) are returned to Python for informative warning messages
 - **Inference CI distribution**: After `safe_inference()` migration, CI uses t-distribution (df = max(1, n_treated_obs - 1)), consistent with p_value. Previously CI used normal-distribution while p_value used t-distribution (inconsistent). This is a minor behavioral change; CIs may be slightly wider for small n_treated_obs.
 - **Note:** Both the `local` alternating-minimization solver (`_estimate_model`) and the `global` alternating-minimization solver (`_solve_global_with_lowrank`, including its hard-coded inner FISTA loop of 20 iterations) emit `UserWarning` via `diff_diff.utils.warn_if_not_converged` when the outer loop exhausts `max_iter` without reaching `tol`. The global-method warning surfaces the inner-FISTA non-convergence count as diagnostic context. Silent return of the current iterate was classified as a silent failure under the Phase 2 audit and replaced with an explicit signal to match the convention used across other iterative solvers in the library.
diff --git a/tests/test_bootstrap_utils.py b/tests/test_bootstrap_utils.py
index 097aeb11..9746bc3e 100644
--- a/tests/test_bootstrap_utils.py
+++ b/tests/test_bootstrap_utils.py
@@ -8,6 +8,7 @@
 from diff_diff.bootstrap_utils import (
     compute_effect_bootstrap_stats,
     compute_effect_bootstrap_stats_batch,
+    warn_bootstrap_failure_rate,
 )
 
 
@@ -76,9 +77,7 @@ def test_nonfinite_original_effect_with_finite_boot_dist(self, bad_value):
     def test_bootstrap_stats_normal_case(self):
         """Normal case with varied values: all fields finite."""
         boot_dist = np.arange(100.0)
-        se, ci, p_value = compute_effect_bootstrap_stats(
-            original_effect=50.0, boot_dist=boot_dist
-        )
+        se, ci, p_value = compute_effect_bootstrap_stats(original_effect=50.0, boot_dist=boot_dist)
         assert np.isfinite(se)
         assert se > 0
         assert np.isfinite(ci[0])
@@ -102,9 +101,7 @@ def test_batch_warns_insufficient_valid_samples(self):
 
         effects = np.array([1.0, 2.0, 3.0])
         with pytest.warns(RuntimeWarning, match="too few valid"):
-            ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(
-                effects, matrix
-            )
+            ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(effects, matrix)
         # Effect 1 (index 1) should be NaN
         assert np.isnan(ses[1])
         # Other effects should be finite
@@ -119,9 +116,7 @@ def test_batch_warns_zero_se(self):
 
         effects = np.array([5.0, 5.0])
         with pytest.warns(RuntimeWarning, match="non-finite or zero"):
-            ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(
-                effects, matrix
-            )
+            ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(effects, matrix)
         assert np.isnan(ses[0])
         assert np.isnan(ses[1])
 
@@ -135,6 +130,66 @@ def test_batch_no_warning_for_normal_case(self):
 
         with warnings.catch_warnings():
             warnings.simplefilter("error", RuntimeWarning)
-            ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(
-                effects, matrix
+            ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(effects, matrix)
+
+
+class TestWarnBootstrapFailureRate:
+    """Proportional failure-rate guard for replicate loops (axis-D)."""
+
+    def test_warns_above_threshold(self):
+        """11/200 successes = 94.5% failure rate — must warn."""
+        with pytest.warns(UserWarning, match=r"11/200 bootstrap iterations"):
+            warn_bootstrap_failure_rate(n_success=11, n_attempted=200, context="test case")
+
+    def test_warning_message_includes_context(self):
+        """Context label must appear verbatim in the warning."""
+        with pytest.warns(UserWarning, match="TROP global bootstrap") as rec:
+            warn_bootstrap_failure_rate(
+                n_success=50,
+                n_attempted=200,
+                context="TROP global bootstrap",
+            )
+        assert len(rec) == 1
+        assert "75.0% failure rate" in str(rec[0].message)
+
+    def test_silent_below_threshold(self):
+        """Default threshold=0.05 — 4% failure is below and must not warn."""
+        with warnings.catch_warnings():
+            warnings.simplefilter("error", UserWarning)
+            warn_bootstrap_failure_rate(n_success=960, n_attempted=1000, context="test case")
+
+    def test_silent_on_full_success(self):
+        """No warning when every replicate succeeded."""
+        with warnings.catch_warnings():
+            warnings.simplefilter("error", UserWarning)
+            warn_bootstrap_failure_rate(n_success=200, n_attempted=200, context="test case")
+
+    def test_silent_when_n_attempted_zero(self):
+        """Degenerate empty call must not divide by zero."""
+        with warnings.catch_warnings():
+            warnings.simplefilter("error", UserWarning)
+            warn_bootstrap_failure_rate(n_success=0, n_attempted=0, context="test case")
+
+    def test_custom_threshold(self):
+        """Higher threshold suppresses the 50% case."""
+        with warnings.catch_warnings():
+            warnings.simplefilter("error", UserWarning)
+            warn_bootstrap_failure_rate(
+                n_success=100,
+                n_attempted=200,
+                context="test case",
+                threshold=0.75,
             )
+
+        with pytest.warns(UserWarning, match="50.0% failure rate"):
+            warn_bootstrap_failure_rate(
+                n_success=100,
+                n_attempted=200,
+                context="test case",
+                threshold=0.25,
+            )
+
+    def test_all_failed_warns(self):
+        """0/N replicates succeeded — caller handles NaN return, but the warning fires."""
+        with pytest.warns(UserWarning, match=r"0/50 bootstrap iterations"):
+            warn_bootstrap_failure_rate(n_success=0, n_attempted=50, context="test case")
diff --git a/tests/test_trop.py b/tests/test_trop.py
index 5ea51db6..ef0a42ea 100644
--- a/tests/test_trop.py
+++ b/tests/test_trop.py
@@ -3703,6 +3703,132 @@ def test_local_bootstrap_zero_draws_returns_nan_se(self):
         assert len(dist) == 0
 
 
+class TestTROPBootstrapFailureRateGuard:
+    """Proportional failure-rate guard for TROP bootstrap replicate loops.
+
+    Before PR #5, all four TROP bootstrap sites warned only when
+    ``len(bootstrap_estimates) < 10``. A run with n_bootstrap=200 and 11
+    successes (94.5% failure rate) passed silently. After PR #5, any
+    run with failure rate > 5% warns via
+    ``bootstrap_utils.warn_bootstrap_failure_rate``.
+    """
+
+    @staticmethod
+    def _make_failing_fit(n_total, n_success, success_value=0.1):
+        """Return a side_effect callable that succeeds exactly ``n_success``
+        times out of ``n_total`` calls, raising ValueError otherwise."""
+        state = {"calls": 0}
+
+        def _fit(*args, **kwargs):
+            state["calls"] += 1
+            if state["calls"] <= n_success:
+                return success_value
+            raise ValueError("forced bootstrap failure")
+
+        return _fit
+
+    def test_local_bootstrap_warns_above_5pct_failure(self):
+        """Local unit-resample bootstrap: 4/20 successes (80% fail) → warn."""
+        from unittest.mock import patch
+
+        df = TestTROPNValidTreated._make_panel()
+
+        trop_est = TROP(
+            method="local",
+            lambda_time_grid=[1.0],
+            lambda_unit_grid=[1.0],
+            lambda_nn_grid=[np.inf],
+            n_bootstrap=20,
+            seed=42,
+        )
+
+        with patch.object(
+            TROP,
+            "_fit_with_fixed_lambda",
+            side_effect=self._make_failing_fit(20, 4),
+        ):
+            with pytest.warns(
+                UserWarning,
+                match=r"4/20 bootstrap iterations succeeded in TROP local bootstrap",
+            ):
+                se, dist = trop_est._bootstrap_variance(
+                    df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10)
+                )
+
+        assert np.isfinite(se)
+        assert len(dist) == 4
+
+    def test_global_bootstrap_warns_above_5pct_failure(self):
+        """Global unit-resample bootstrap (Python path): high failure rate → warn."""
+        import sys
+        from unittest.mock import patch
+
+        df = TestTROPNValidTreated._make_panel()
+
+        trop_est = TROP(
+            method="global",
+            lambda_time_grid=[1.0],
+            lambda_unit_grid=[1.0],
+            lambda_nn_grid=[np.inf],
+            n_bootstrap=20,
+            seed=42,
+        )
+
+        trop_global_module = sys.modules["diff_diff.trop_global"]
+        with (
+            patch.object(trop_global_module, "HAS_RUST_BACKEND", False),
+            patch.object(trop_global_module, "_rust_bootstrap_trop_variance_global", None),
+            patch.object(
+                TROP,
+                "_fit_global_with_fixed_lambda",
+                side_effect=self._make_failing_fit(20, 3),
+            ),
+        ):
+            with pytest.warns(
+                UserWarning,
+                match=r"3/20 bootstrap iterations succeeded in TROP global bootstrap",
+            ):
+                se, dist = trop_est._bootstrap_variance_global(
+                    df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10), 3
+                )
+
+        assert np.isfinite(se)
+        assert len(dist) == 3
+
+    def test_local_bootstrap_silent_on_full_success(self):
+        """No proportional warning when every replicate succeeds."""
+        from unittest.mock import patch
+
+        df = TestTROPNValidTreated._make_panel()
+
+        trop_est = TROP(
+            method="local",
+            lambda_time_grid=[1.0],
+            lambda_unit_grid=[1.0],
+            lambda_nn_grid=[np.inf],
+            n_bootstrap=20,
+            seed=42,
+        )
+
+        with patch.object(
+            TROP,
+            "_fit_with_fixed_lambda",
+            side_effect=self._make_failing_fit(20, 20),
+        ):
+            with warnings.catch_warnings(record=True) as w:
+                warnings.simplefilter("always")
+                se, dist = trop_est._bootstrap_variance(
+                    df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10)
+                )
+
+        failure_warnings = [x for x in w if "bootstrap iterations succeeded" in str(x.message)]
+        assert (
+            failure_warnings == []
+        ), f"No failure-rate warning expected on full success, got {failure_warnings}"
+        assert np.isfinite(se)
+        assert len(dist) == 20
+
+
 class TestTROPModuleSplit:
     """Regression tests for the trop.py -> trop_global.py / trop_local.py split."""
 
@@ -4023,8 +4149,9 @@ def test_local_alternating_min_warns_on_nonconvergence(self, simple_panel_data):
         W = np.where(D == 0, 1.0, 0.0)
 
         with pytest.warns(UserWarning, match="did not converge"):
-            trop_est._estimate_model(Y, control_mask, W, lambda_nn=0.1,
-                                     n_units=n_units, n_periods=n_periods)
+            trop_est._estimate_model(
+                Y, control_mask, W, lambda_nn=0.1, n_units=n_units, n_periods=n_periods
+            )
 
     def test_local_alternating_min_no_warning_on_convergence(self, simple_panel_data):
         """TROP local _estimate_model must not warn on a well-behaved fit."""
@@ -4044,8 +4171,9 @@ def test_local_alternating_min_no_warning_on_convergence(self, simple_panel_data
 
         with warnings.catch_warnings(record=True) as w:
             warnings.simplefilter("always")
-            trop_est._estimate_model(Y, control_mask, W, lambda_nn=0.1,
-                                     n_units=n_units, n_periods=n_periods)
+            trop_est._estimate_model(
+                Y, control_mask, W, lambda_nn=0.1, n_units=n_units, n_periods=n_periods
+            )
         assert not any("did not converge" in str(x.message) for x in w)
 
     def test_local_fit_emits_single_aggregate_warning(self, simple_panel_data):
@@ -4075,8 +4203,10 @@ def test_local_fit_emits_single_aggregate_warning(self, simple_panel_data):
 
         trop_mod = sys.modules["diff_diff.trop"]
         trop_local_mod = sys.modules["diff_diff.trop_local"]
-        with patch.object(trop_mod, "HAS_RUST_BACKEND", False), \
-             patch.object(trop_local_mod, "HAS_RUST_BACKEND", False):
+        with (
+            patch.object(trop_mod, "HAS_RUST_BACKEND", False),
+            patch.object(trop_local_mod, "HAS_RUST_BACKEND", False),
+        ):
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter("always")
                 trop_est.fit(
@@ -4104,9 +4234,9 @@ def matching(needle: str):
         loocv = matching("local LOOCV")
         assert len(loocv) >= 1, "expected at least one LOOCV aggregate warning"
         for msg in loocv:
-            assert "of" in msg and "per-observation fits" in msg, (
-                f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}"
-            )
+            assert (
+                "of" in msg and "per-observation fits" in msg
+            ), f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}"
 
     def test_global_fit_emits_single_aggregate_warning(self, simple_panel_data):
         """Global-method fit-level warning aggregation: mirrors the local test.
@@ -4127,8 +4257,10 @@ def test_global_fit_emits_single_aggregate_warning(self, simple_panel_data):
 
         trop_mod = sys.modules["diff_diff.trop"]
         trop_global_mod = sys.modules["diff_diff.trop_global"]
-        with patch.object(trop_mod, "HAS_RUST_BACKEND", False), \
-             patch.object(trop_global_mod, "HAS_RUST_BACKEND", False):
+        with (
+            patch.object(trop_mod, "HAS_RUST_BACKEND", False),
+            patch.object(trop_global_mod, "HAS_RUST_BACKEND", False),
+        ):
             with warnings.catch_warnings(record=True) as w:
                 warnings.simplefilter("always")
                 trop_est.fit(
@@ -4148,6 +4280,6 @@ def matching(needle: str):
         loocv = matching("global LOOCV")
         assert len(loocv) >= 1, "expected at least one LOOCV aggregate warning"
         for msg in loocv:
-            assert "of" in msg and "per-observation fits" in msg, (
-                f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}"
-            )
+            assert (
+                "of" in msg and "per-observation fits" in msg
+            ), f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}"

From 5755873dc03ad825c293c044bb27a74edc6a098e Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 18 Apr 2026 18:05:51 -0400
Subject: [PATCH 2/3] Address AI review: extend guard to Rust local path,
 clarify docstring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two P2 items from local AI review:

1. The local Rust bootstrap happy path (`trop_local.py:988`) previously
   returned silently whenever it received `>= 10` samples and otherwise
   fell back to Python. That left the same silent-failure window the
   main PR was closing — Rust returning 11 of 200 samples (94.5%
   failure rate) would return its SE with no warning. Replaced the
   `len >= 10` path-switch with the proportional
   `warn_bootstrap_failure_rate` guard; only a zero-success Rust result
   now triggers the Python fallback.

2. The helper docstring was ambiguous about which zero case was silent.
   Revised to distinguish `n_attempted == 0` (silent — caller handles)
   from `n_success == 0` with `n_attempted > 0` (warning fires).

Added a regression test covering the Rust local path via a mocked
`_rust_bootstrap_trop_variance` returning 11/200 samples.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 diff_diff/bootstrap_utils.py |  6 ++--
 diff_diff/trop_local.py      | 13 ++++----
 docs/methodology/REGISTRY.md |  2 +-
 tests/test_trop.py           | 64 ++++++++++++++++++++++++++++++++++++
 4 files changed, 76 insertions(+), 9 deletions(-)

diff --git a/diff_diff/bootstrap_utils.py b/diff_diff/bootstrap_utils.py
index 4a0e3bdf..d492b8a0 100644
--- a/diff_diff/bootstrap_utils.py
+++ b/diff_diff/bootstrap_utils.py
@@ -37,8 +37,10 @@ def warn_bootstrap_failure_rate(
     """Emit one proportional failure-rate warning after a replicate loop.
 
     Replaces the hard-coded ``< N successes`` pattern that lets high-failure
-    runs (e.g. 11 of 200) pass silently. Does not emit when the replicate
-    count is zero (callers handle the NaN-return path explicitly).
+    runs (e.g. 11 of 200) pass silently. Does not emit when
+    ``n_attempted == 0`` (callers handle that degenerate path explicitly);
+    when ``n_success == 0`` and ``n_attempted > 0``, the warning fires
+    describing the all-failed run.
 
     Parameters
     ----------
diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py
index 49b7635d..1b6bcccf 100644
--- a/diff_diff/trop_local.py
+++ b/diff_diff/trop_local.py
@@ -985,13 +985,14 @@ def _bootstrap_variance(
                     unit_weight_arr,
                 )
 
-                if len(bootstrap_estimates) >= 10:
+                if len(bootstrap_estimates) > 0:
+                    warn_bootstrap_failure_rate(
+                        n_success=len(bootstrap_estimates),
+                        n_attempted=self.n_bootstrap,
+                        context="TROP local bootstrap (Rust)",
+                    )
                     return float(se), bootstrap_estimates
-                # Fall through to Python if too few bootstrap samples
-                logger.debug(
-                    "Rust bootstrap returned only %d samples, falling back to Python",
-                    len(bootstrap_estimates),
-                )
+                logger.debug("Rust bootstrap returned 0 samples, falling back to Python")
             except Exception as e:
                 logger.debug("Rust bootstrap variance failed, falling back to Python: %s", e)
                 warnings.warn(
diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md
index 6213f215..a7f11455 100644
--- a/docs/methodology/REGISTRY.md
+++ b/docs/methodology/REGISTRY.md
@@ -1971,7 +1971,7 @@ Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]²
 - Wrong D specification: if user provides event-style D (only first treatment period),
   the absorbing-state validation will raise ValueError with helpful guidance
 - **Bootstrap minimum**: `n_bootstrap` must be >= 2 (enforced via `ValueError`). TROP uses bootstrap for all variance estimation — there is no analytical SE formula.
-- **Note:** TROP bootstrap loops (`_bootstrap_variance`, `_bootstrap_rao_wu`, and their global counterparts, including the Rust happy path) emit a proportional `UserWarning` via `diff_diff.bootstrap_utils.warn_bootstrap_failure_rate` when the replicate failure rate exceeds 5%. The previous hard-coded `< 10 successes` threshold let high-failure runs (e.g. 11 of 200) pass silently; this was classified as a silent failure under the Phase 2 audit (axis D — degenerate-replicate handling). The 5% threshold matches the existing SyntheticDiD bootstrap and placebo guards. When zero replicates succeed, SE is set to `NaN` (unchanged).
+- **Note:** TROP bootstrap loops (`_bootstrap_variance`, `_bootstrap_rao_wu`, and their global counterparts, including both Rust happy paths — local and global) emit a proportional `UserWarning` via `diff_diff.bootstrap_utils.warn_bootstrap_failure_rate` when the replicate failure rate exceeds 5%. The previous hard-coded `< 10 successes` threshold let high-failure runs (e.g. 11 of 200) pass silently; this was classified as a silent failure under the Phase 2 audit (axis D — degenerate-replicate handling). The 5% threshold matches the existing SyntheticDiD bootstrap and placebo guards. When zero replicates succeed, SE is set to `NaN` (unchanged). The local Rust path previously also used `len >= 10` as a Python-fallback trigger; it now accepts any non-zero Rust result and emits the proportional warning instead of path-switching silently.
 - **LOOCV failure metadata**: When LOOCV fits fail in the Rust backend, the first failed observation coordinates (t, i) are returned to Python for informative warning messages
 - **Inference CI distribution**: After `safe_inference()` migration, CI uses t-distribution (df = max(1, n_treated_obs - 1)), consistent with p_value. Previously CI used normal-distribution while p_value used t-distribution (inconsistent). This is a minor behavioral change; CIs may be slightly wider for small n_treated_obs.
 - **Note:** Both the `local` alternating-minimization solver (`_estimate_model`) and the `global` alternating-minimization solver (`_solve_global_with_lowrank`, including its hard-coded inner FISTA loop of 20 iterations) emit `UserWarning` via `diff_diff.utils.warn_if_not_converged` when the outer loop exhausts `max_iter` without reaching `tol`. The global-method warning surfaces the inner-FISTA non-convergence count as diagnostic context. Silent return of the current iterate was classified as a silent failure under the Phase 2 audit and replaced with an explicit signal to match the convention used across other iterative solvers in the library.
diff --git a/tests/test_trop.py b/tests/test_trop.py
index ef0a42ea..2f111497 100644
--- a/tests/test_trop.py
+++ b/tests/test_trop.py
@@ -3828,6 +3828,70 @@ def test_local_bootstrap_silent_on_full_success(self):
         assert np.isfinite(se)
         assert len(dist) == 20
 
+    def test_local_rust_bootstrap_warns_above_5pct_failure(self):
+        """Rust-local path previously returned silently whenever `len >= 10`.
+
+        Now the same proportional guard fires: Rust returning 11 successful
+        draws out of n_bootstrap=200 (94.5% failure rate) must warn.
+        """
+        import sys
+        from unittest.mock import patch
+
+        df = TestTROPNValidTreated._make_panel()
+
+        trop_est = TROP(
+            method="local",
+            lambda_time_grid=[1.0],
+            lambda_unit_grid=[1.0],
+            lambda_nn_grid=[np.inf],
+            n_bootstrap=200,
+            seed=42,
+        )
+
+        n_units = df["unit"].nunique()
+        n_periods = df["time"].nunique()
+        Y = np.zeros((n_periods, n_units), dtype=np.float64)
+        D = np.zeros((n_periods, n_units), dtype=np.float64)
+        trop_est._precomputed = {
+            "control_mask": np.ones((n_periods, n_units), dtype=bool),
+            "time_dist_matrix": np.abs(
+                np.arange(n_periods)[:, None] - np.arange(n_periods)[None, :]
+            ).astype(np.int64),
+        }
+
+        trop_local_module = sys.modules["diff_diff.trop_local"]
+        rng = np.random.default_rng(0)
+        fake_boot = rng.normal(size=11)
+
+        def _fake_rust_boot(*args, **kwargs):
+            return fake_boot, float(np.std(fake_boot, ddof=1))
+
+        with (
+            patch.object(trop_local_module, "HAS_RUST_BACKEND", True),
+            patch.object(
+                trop_local_module,
+                "_rust_bootstrap_trop_variance",
+                side_effect=_fake_rust_boot,
+            ),
+        ):
+            with pytest.warns(
+                UserWarning,
+                match=r"11/200 bootstrap iterations succeeded in TROP local bootstrap \(Rust\)",
+            ):
+                se, dist = trop_est._bootstrap_variance(
+                    df,
+                    "outcome",
+                    "treated",
+                    "unit",
+                    "time",
+                    (1.0, 1.0, 1e10),
+                    Y=Y,
+                    D=D,
+                )
+
+        assert np.isfinite(se)
+        assert len(dist) == 11
+
 
 class TestTROPModuleSplit:
     """Regression tests for the trop.py -> trop_global.py / trop_local.py split."""

From 356e3ecf5e9d87daf280cf2f9782b88f8c007934 Mon Sep 17 00:00:00 2001
From: igerber <isaac.gerber@gmail.com>
Date: Sat, 18 Apr 2026 18:24:48 -0400
Subject: [PATCH 3/3] Close Rao-Wu and global-Rust warning-path coverage gap

CI AI review on PR #324 flagged (P3) that the delta changed the Rao-Wu
local, Rao-Wu global, and global-Rust warning branches without direct
assertions on those paths. Add three targeted tests mirroring the
pattern of the existing four (mocked inner-fit side effects, direct
method invocation, pytest.warns with the context string in the match
regex):

- `_bootstrap_variance_global` Rust happy path
- `_bootstrap_rao_wu_local` (survey design with per-unit PSU)
- `_bootstrap_rao_wu_global` (same survey setup)

All six changed warning sites now have direct regression coverage.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 tests/test_trop.py | 151 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 151 insertions(+)

diff --git a/tests/test_trop.py b/tests/test_trop.py
index 2f111497..aa5de093 100644
--- a/tests/test_trop.py
+++ b/tests/test_trop.py
@@ -3892,6 +3892,157 @@ def _fake_rust_boot(*args, **kwargs):
         assert np.isfinite(se)
         assert len(dist) == 11
 
+    def test_global_rust_bootstrap_warns_above_5pct_failure(self):
+        """Global Rust happy path: 3/20 Rust successes (85% fail) warns."""
+        import sys
+        from unittest.mock import patch
+
+        df = TestTROPNValidTreated._make_panel()
+
+        trop_est = TROP(
+            method="global",
+            lambda_time_grid=[1.0],
+            lambda_unit_grid=[1.0],
+            lambda_nn_grid=[np.inf],
+            n_bootstrap=20,
+            seed=42,
+        )
+
+        trop_global_module = sys.modules["diff_diff.trop_global"]
+        rng = np.random.default_rng(0)
+        fake_boot = rng.normal(size=3)
+
+        def _fake_rust_boot_global(*args, **kwargs):
+            return fake_boot, float(np.std(fake_boot, ddof=1))
+
+        with (
+            patch.object(trop_global_module, "HAS_RUST_BACKEND", True),
+            patch.object(
+                trop_global_module,
+                "_rust_bootstrap_trop_variance_global",
+                side_effect=_fake_rust_boot_global,
+            ),
+        ):
+            with pytest.warns(
+                UserWarning,
+                match=r"3/20 bootstrap iterations succeeded in TROP global bootstrap \(Rust\)",
+            ):
+                se, dist = trop_est._bootstrap_variance_global(
+                    df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10), 3
+                )
+
+        assert np.isfinite(se)
+        assert len(dist) == 3
+
+    @staticmethod
+    def _make_survey_panel_and_design():
+        """Build a panel with per-unit PSU + weight columns and the matching
+        SurveyDesign/ResolvedSurveyDesign needed to reach the Rao-Wu path."""
+        from diff_diff import SurveyDesign
+        from diff_diff.survey import ResolvedSurveyDesign
+
+        df = TestTROPNValidTreated._make_panel().copy()
+        all_units = sorted(df["unit"].unique())
+        unit_to_psu = {u: i for i, u in enumerate(all_units)}
+        df["psu"] = df["unit"].map(unit_to_psu).astype(np.int64)
+        df["weight"] = 1.0
+        n_obs = len(df)
+
+        survey_design = SurveyDesign(weights="weight", psu="psu")
+        resolved_survey = ResolvedSurveyDesign(
+            weights=np.ones(n_obs, dtype=np.float64),
+            weight_type="pweight",
+            strata=None,
+            psu=df["psu"].values.astype(np.int64),
+            fpc=None,
+            n_strata=0,
+            n_psu=len(all_units),
+            lonely_psu="remove",
+        )
+        return df, survey_design, resolved_survey
+
+    def test_local_rao_wu_bootstrap_warns_above_5pct_failure(self):
+        """Local Rao-Wu survey bootstrap: forced failures → proportional warn."""
+        from unittest.mock import patch
+
+        df, survey_design, resolved_survey = self._make_survey_panel_and_design()
+
+        trop_est = TROP(
+            method="local",
+            lambda_time_grid=[1.0],
+            lambda_unit_grid=[1.0],
+            lambda_nn_grid=[np.inf],
+            n_bootstrap=20,
+            seed=42,
+        )
+
+        with patch.object(
+            TROP,
+            "_fit_with_fixed_lambda",
+            side_effect=self._make_failing_fit(20, 4),
+        ):
+            with pytest.warns(
+                UserWarning,
+                match=r"4/20 bootstrap iterations succeeded in TROP local Rao-Wu bootstrap",
+            ):
+                se, dist = trop_est._bootstrap_rao_wu_local(
+                    df,
+                    "outcome",
+                    "treated",
+                    "unit",
+                    "time",
+                    (1.0, 1.0, 1e10),
+                    resolved_survey,
+                    survey_design,
+                )
+
+        assert np.isfinite(se)
+        assert len(dist) == 4
+
+    def test_global_rao_wu_bootstrap_warns_above_5pct_failure(self):
+        """Global Rao-Wu survey bootstrap: forced failures → proportional warn."""
+        from unittest.mock import patch
+
+        df, survey_design, resolved_survey = self._make_survey_panel_and_design()
+
+        trop_est = TROP(
+            method="global",
+            lambda_time_grid=[1.0],
+            lambda_unit_grid=[1.0],
+            lambda_nn_grid=[np.inf],
+            n_bootstrap=20,
+            seed=42,
+        )
+
+        n_calls = {"count": 0}
+
+        def _flaky_solve(*args, **kwargs):
+            n_calls["count"] += 1
+            if n_calls["count"] <= 3:
+                n_periods, n_units = args[0].shape
+                return 0.0, np.zeros(n_units), np.zeros(n_periods), np.zeros((n_periods, n_units))
+            raise ValueError("forced Rao-Wu failure")
+
+        with patch.object(TROP, "_solve_global_model", side_effect=_flaky_solve):
+            with pytest.warns(
+                UserWarning,
+                match=r"3/20 bootstrap iterations succeeded in TROP global Rao-Wu bootstrap",
+            ):
+                se, dist = trop_est._bootstrap_rao_wu_global(
+                    df,
+                    "outcome",
+                    "treated",
+                    "unit",
+                    "time",
+                    (1.0, 1.0, 1e10),
+                    3,
+                    resolved_survey,
+                    survey_design,
+                )
+
+        assert np.isfinite(se) or np.isnan(se)
+        assert len(dist) == 3
+
 
 class TestTROPModuleSplit:
     """Regression tests for the trop.py -> trop_global.py / trop_local.py split."""