From 5c781df6ee6716a1b11fbf674eefa0347e09bab8 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:01:59 -0400 Subject: [PATCH 1/3] Guard TROP bootstrap loops against silent high-failure-rate runs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the hard-coded "< 10 successes" warning threshold in the four TROP bootstrap sites (local unit-resample, local Rao-Wu, global unit-resample, global Rao-Wu) plus the Rust global happy path with a proportional 5% failure-rate guard, matching the existing SyntheticDiD bootstrap and placebo convention. A shared helper `bootstrap_utils.warn_bootstrap_failure_rate` centralizes the check so future axis-D work (e.g. PowerAnalysis simulation counter) can reuse it. Before this change, a run with `n_bootstrap=200` and 11 successes (94.5% failure rate) passed silently because 11 >= 10. Now any run with failure rate > 5% emits a `UserWarning` surfacing the success count, total attempts, and failure rate. SDID bootstrap paths (`synthetic_did.py:1036-1070` and `:1229-1251`) were verified during this work to already have the same 5% proportional guard — D-2/D-3 in the audit are marked resolved rather than bundled. Covered by audit axis D (degenerate-replicate handling). Findings #13-#16 from `docs/audits/silent-failures-findings.md`. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/bootstrap_utils.py | 42 +++++++++ diff_diff/trop_global.py | 60 ++++++++----- diff_diff/trop_local.py | 44 ++++++---- docs/methodology/REGISTRY.md | 1 + tests/test_bootstrap_utils.py | 77 +++++++++++++--- tests/test_trop.py | 160 +++++++++++++++++++++++++++++++--- 6 files changed, 316 insertions(+), 68 deletions(-) diff --git a/diff_diff/bootstrap_utils.py b/diff_diff/bootstrap_utils.py index 4d80f4a6..4a0e3bdf 100644 --- a/diff_diff/bootstrap_utils.py +++ b/diff_diff/bootstrap_utils.py @@ -23,9 +23,51 @@ "compute_bootstrap_pvalue", "compute_effect_bootstrap_stats", "compute_effect_bootstrap_stats_batch", + "warn_bootstrap_failure_rate", ] +def warn_bootstrap_failure_rate( + n_success: int, + n_attempted: int, + context: str, + threshold: float = 0.05, + stacklevel: int = 3, +) -> None: + """Emit one proportional failure-rate warning after a replicate loop. + + Replaces the hard-coded ``< N successes`` pattern that lets high-failure + runs (e.g. 11 of 200) pass silently. Does not emit when the replicate + count is zero (callers handle the NaN-return path explicitly). + + Parameters + ---------- + n_success : int + Number of replicates that produced a finite estimate. + n_attempted : int + Total replicates attempted (``self.n_bootstrap``). + context : str + Short label for the caller (e.g. ``"TROP global bootstrap"``). + threshold : float, default=0.05 + Failure-rate threshold above which a warning is emitted. ``0.05`` + matches the existing SyntheticDiD bootstrap and placebo guards. + stacklevel : int, default=3 + Passed to :func:`warnings.warn`. + """ + if n_attempted <= 0 or n_success >= n_attempted: + return + failure_rate = 1.0 - (n_success / n_attempted) + if failure_rate <= threshold: + return + warnings.warn( + f"Only {n_success}/{n_attempted} bootstrap iterations succeeded in " + f"{context} ({failure_rate:.1%} failure rate). " + "Standard errors may be unreliable.", + UserWarning, + stacklevel=stacklevel, + ) + + def generate_bootstrap_weights( n_units: int, weight_type: str, diff --git a/diff_diff/trop_global.py b/diff_diff/trop_global.py index 2008eca4..3d651fd9 100644 --- a/diff_diff/trop_global.py +++ b/diff_diff/trop_global.py @@ -24,6 +24,7 @@ _rust_bootstrap_trop_variance_global, _rust_loocv_grid_search_global, ) +from diff_diff.bootstrap_utils import warn_bootstrap_failure_rate from diff_diff.trop_local import _soft_threshold_svd, _validate_and_pivot_treatment from diff_diff.trop_results import TROPResults from diff_diff.utils import safe_inference, warn_if_not_converged @@ -169,7 +170,11 @@ def _solve_global_model( L = np.zeros((n_periods, n_units)) else: mu, alpha, beta, L = self._solve_global_with_lowrank( - Y, delta, lambda_nn, self.max_iter, self.tol, + Y, + delta, + lambda_nn, + self.max_iter, + self.tol, _nonconvergence_tracker=_nonconvergence_tracker, ) return mu, alpha, beta, L @@ -284,7 +289,9 @@ def _loocv_score_global( try: mu, alpha, beta, L = self._solve_global_model( - Y, delta_ex, lambda_nn, + Y, + delta_ex, + lambda_nn, _nonconvergence_tracker=nonconverg_tracker, ) @@ -988,13 +995,13 @@ def _bootstrap_variance_global( unit_weight_arr, ) - if len(bootstrap_estimates) < 10: - warnings.warn( - f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded.", - UserWarning, - ) - if len(bootstrap_estimates) == 0: - return np.nan, np.array([]) + warn_bootstrap_failure_rate( + n_success=len(bootstrap_estimates), + n_attempted=self.n_bootstrap, + context="TROP global bootstrap (Rust)", + ) + if len(bootstrap_estimates) == 0: + return np.nan, np.array([]) return float(se), np.array(bootstrap_estimates) @@ -1073,12 +1080,13 @@ def _bootstrap_variance_global( self.tol, ) - if len(bootstrap_estimates) < 10: - warnings.warn( - f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded.", UserWarning - ) - if len(bootstrap_estimates) == 0: - return np.nan, np.array([]) + warn_bootstrap_failure_rate( + n_success=len(bootstrap_estimates), + n_attempted=self.n_bootstrap, + context="TROP global bootstrap", + ) + if len(bootstrap_estimates) == 0: + return np.nan, np.array([]) se = np.std(bootstrap_estimates, ddof=1) return float(se), bootstrap_estimates @@ -1236,7 +1244,9 @@ def _bootstrap_rao_wu_global( Y, D, lambda_time, lambda_unit, treated_periods, n_units, n_periods ) mu, alpha, beta, L = self._solve_global_model( - Y, delta, lambda_nn, + Y, + delta, + lambda_nn, _nonconvergence_tracker=nonconverg_tracker, ) @@ -1261,13 +1271,13 @@ def _bootstrap_rao_wu_global( self.tol, ) - if len(bootstrap_estimates) < 10: - warnings.warn( - f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded.", - UserWarning, - ) - if len(bootstrap_estimates) == 0: - return np.nan, np.array([]) + warn_bootstrap_failure_rate( + n_success=len(bootstrap_estimates), + n_attempted=self.n_bootstrap, + context="TROP global Rao-Wu bootstrap", + ) + if len(bootstrap_estimates) == 0: + return np.nan, np.array([]) se = np.std(bootstrap_estimates, ddof=1) return float(se), bootstrap_estimates @@ -1325,7 +1335,9 @@ def _fit_global_with_fixed_lambda( # Fit model on control data and extract post-hoc tau mu, alpha, beta, L = self._solve_global_model( - Y, delta, lambda_nn, + Y, + delta, + lambda_nn, _nonconvergence_tracker=_nonconvergence_tracker, ) att, _, _ = self._extract_posthoc_tau( diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py index 0ac731fd..49b7635d 100644 --- a/diff_diff/trop_local.py +++ b/diff_diff/trop_local.py @@ -24,6 +24,7 @@ _rust_bootstrap_trop_variance, _rust_unit_distance_matrix, ) +from diff_diff.bootstrap_utils import warn_bootstrap_failure_rate from diff_diff.trop_results import _PrecomputedStructures from diff_diff.utils import warn_if_not_converged @@ -727,8 +728,10 @@ def _estimate_model( _nonconvergence_tracker.append(1) else: warn_if_not_converged( - converged, "TROP local alternating minimization", - self.max_iter, self.tol, + converged, + "TROP local alternating minimization", + self.max_iter, + self.tol, ) return alpha, beta, L @@ -1068,14 +1071,13 @@ def _bootstrap_variance( self.tol, ) - if len(bootstrap_estimates) < 10: - warnings.warn( - f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded. " - "Standard errors may be unreliable.", - UserWarning, - ) - if len(bootstrap_estimates) == 0: - return np.nan, np.array([]) + warn_bootstrap_failure_rate( + n_success=len(bootstrap_estimates), + n_attempted=self.n_bootstrap, + context="TROP local bootstrap", + ) + if len(bootstrap_estimates) == 0: + return np.nan, np.array([]) se = np.std(bootstrap_estimates, ddof=1) return float(se), bootstrap_estimates @@ -1236,14 +1238,13 @@ def _bootstrap_rao_wu_local( self.tol, ) - if len(bootstrap_estimates) < 10: - warnings.warn( - f"Only {len(bootstrap_estimates)} bootstrap iterations succeeded. " - "Standard errors may be unreliable.", - UserWarning, - ) - if len(bootstrap_estimates) == 0: - return np.nan, np.array([]) + warn_bootstrap_failure_rate( + n_success=len(bootstrap_estimates), + n_attempted=self.n_bootstrap, + context="TROP local Rao-Wu bootstrap", + ) + if len(bootstrap_estimates) == 0: + return np.nan, np.array([]) se = np.std(bootstrap_estimates, ddof=1) return float(se), bootstrap_estimates @@ -1338,7 +1339,12 @@ def _fit_with_fixed_lambda( # Fit model with these weights alpha, beta, L = self._estimate_model( - Y, control_mask, weight_matrix, lambda_nn, n_units, n_periods, + Y, + control_mask, + weight_matrix, + lambda_nn, + n_units, + n_periods, _nonconvergence_tracker=_nonconvergence_tracker, ) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 64d2e63a..6213f215 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -1971,6 +1971,7 @@ Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]² - Wrong D specification: if user provides event-style D (only first treatment period), the absorbing-state validation will raise ValueError with helpful guidance - **Bootstrap minimum**: `n_bootstrap` must be >= 2 (enforced via `ValueError`). TROP uses bootstrap for all variance estimation — there is no analytical SE formula. +- **Note:** TROP bootstrap loops (`_bootstrap_variance`, `_bootstrap_rao_wu`, and their global counterparts, including the Rust happy path) emit a proportional `UserWarning` via `diff_diff.bootstrap_utils.warn_bootstrap_failure_rate` when the replicate failure rate exceeds 5%. The previous hard-coded `< 10 successes` threshold let high-failure runs (e.g. 11 of 200) pass silently; this was classified as a silent failure under the Phase 2 audit (axis D — degenerate-replicate handling). The 5% threshold matches the existing SyntheticDiD bootstrap and placebo guards. When zero replicates succeed, SE is set to `NaN` (unchanged). - **LOOCV failure metadata**: When LOOCV fits fail in the Rust backend, the first failed observation coordinates (t, i) are returned to Python for informative warning messages - **Inference CI distribution**: After `safe_inference()` migration, CI uses t-distribution (df = max(1, n_treated_obs - 1)), consistent with p_value. Previously CI used normal-distribution while p_value used t-distribution (inconsistent). This is a minor behavioral change; CIs may be slightly wider for small n_treated_obs. - **Note:** Both the `local` alternating-minimization solver (`_estimate_model`) and the `global` alternating-minimization solver (`_solve_global_with_lowrank`, including its hard-coded inner FISTA loop of 20 iterations) emit `UserWarning` via `diff_diff.utils.warn_if_not_converged` when the outer loop exhausts `max_iter` without reaching `tol`. The global-method warning surfaces the inner-FISTA non-convergence count as diagnostic context. Silent return of the current iterate was classified as a silent failure under the Phase 2 audit and replaced with an explicit signal to match the convention used across other iterative solvers in the library. diff --git a/tests/test_bootstrap_utils.py b/tests/test_bootstrap_utils.py index 097aeb11..9746bc3e 100644 --- a/tests/test_bootstrap_utils.py +++ b/tests/test_bootstrap_utils.py @@ -8,6 +8,7 @@ from diff_diff.bootstrap_utils import ( compute_effect_bootstrap_stats, compute_effect_bootstrap_stats_batch, + warn_bootstrap_failure_rate, ) @@ -76,9 +77,7 @@ def test_nonfinite_original_effect_with_finite_boot_dist(self, bad_value): def test_bootstrap_stats_normal_case(self): """Normal case with varied values: all fields finite.""" boot_dist = np.arange(100.0) - se, ci, p_value = compute_effect_bootstrap_stats( - original_effect=50.0, boot_dist=boot_dist - ) + se, ci, p_value = compute_effect_bootstrap_stats(original_effect=50.0, boot_dist=boot_dist) assert np.isfinite(se) assert se > 0 assert np.isfinite(ci[0]) @@ -102,9 +101,7 @@ def test_batch_warns_insufficient_valid_samples(self): effects = np.array([1.0, 2.0, 3.0]) with pytest.warns(RuntimeWarning, match="too few valid"): - ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch( - effects, matrix - ) + ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(effects, matrix) # Effect 1 (index 1) should be NaN assert np.isnan(ses[1]) # Other effects should be finite @@ -119,9 +116,7 @@ def test_batch_warns_zero_se(self): effects = np.array([5.0, 5.0]) with pytest.warns(RuntimeWarning, match="non-finite or zero"): - ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch( - effects, matrix - ) + ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(effects, matrix) assert np.isnan(ses[0]) assert np.isnan(ses[1]) @@ -135,6 +130,66 @@ def test_batch_no_warning_for_normal_case(self): with warnings.catch_warnings(): warnings.simplefilter("error", RuntimeWarning) - ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch( - effects, matrix + ses, ci_lo, ci_hi, pvals = compute_effect_bootstrap_stats_batch(effects, matrix) + + +class TestWarnBootstrapFailureRate: + """Proportional failure-rate guard for replicate loops (axis-D).""" + + def test_warns_above_threshold(self): + """11/200 successes = 94.5% failure rate — must warn.""" + with pytest.warns(UserWarning, match=r"11/200 bootstrap iterations"): + warn_bootstrap_failure_rate(n_success=11, n_attempted=200, context="test case") + + def test_warning_message_includes_context(self): + """Context label must appear verbatim in the warning.""" + with pytest.warns(UserWarning, match="TROP global bootstrap") as rec: + warn_bootstrap_failure_rate( + n_success=50, + n_attempted=200, + context="TROP global bootstrap", + ) + assert len(rec) == 1 + assert "75.0% failure rate" in str(rec[0].message) + + def test_silent_below_threshold(self): + """Default threshold=0.05 — 4% failure is below and must not warn.""" + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + warn_bootstrap_failure_rate(n_success=960, n_attempted=1000, context="test case") + + def test_silent_on_full_success(self): + """No warning when every replicate succeeded.""" + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + warn_bootstrap_failure_rate(n_success=200, n_attempted=200, context="test case") + + def test_silent_when_n_attempted_zero(self): + """Degenerate empty call must not divide by zero.""" + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + warn_bootstrap_failure_rate(n_success=0, n_attempted=0, context="test case") + + def test_custom_threshold(self): + """Higher threshold suppresses the 50% case.""" + with warnings.catch_warnings(): + warnings.simplefilter("error", UserWarning) + warn_bootstrap_failure_rate( + n_success=100, + n_attempted=200, + context="test case", + threshold=0.75, ) + + with pytest.warns(UserWarning, match="50.0% failure rate"): + warn_bootstrap_failure_rate( + n_success=100, + n_attempted=200, + context="test case", + threshold=0.25, + ) + + def test_all_failed_warns(self): + """0/N replicates succeeded — caller handles NaN return, but the warning fires.""" + with pytest.warns(UserWarning, match=r"0/50 bootstrap iterations"): + warn_bootstrap_failure_rate(n_success=0, n_attempted=50, context="test case") diff --git a/tests/test_trop.py b/tests/test_trop.py index 5ea51db6..ef0a42ea 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -3703,6 +3703,132 @@ def test_local_bootstrap_zero_draws_returns_nan_se(self): assert len(dist) == 0 +class TestTROPBootstrapFailureRateGuard: + """Proportional failure-rate guard for TROP bootstrap replicate loops. + + Before PR #5, all four TROP bootstrap sites warned only when + ``len(bootstrap_estimates) < 10``. A run with n_bootstrap=200 and 11 + successes (94.5% failure rate) passed silently. After PR #5, any + run with failure rate > 5% warns via + ``bootstrap_utils.warn_bootstrap_failure_rate``. + """ + + @staticmethod + def _make_failing_fit(n_total, n_success, success_value=0.1): + """Return a side_effect callable that succeeds exactly ``n_success`` + times out of ``n_total`` calls, raising ValueError otherwise.""" + state = {"calls": 0} + + def _fit(*args, **kwargs): + state["calls"] += 1 + if state["calls"] <= n_success: + return success_value + raise ValueError("forced bootstrap failure") + + return _fit + + def test_local_bootstrap_warns_above_5pct_failure(self): + """Local unit-resample bootstrap: 4/20 successes (80% fail) → warn.""" + from unittest.mock import patch + + df = TestTROPNValidTreated._make_panel() + + trop_est = TROP( + method="local", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=20, + seed=42, + ) + + with patch.object( + TROP, + "_fit_with_fixed_lambda", + side_effect=self._make_failing_fit(20, 4), + ): + with pytest.warns( + UserWarning, + match=r"4/20 bootstrap iterations succeeded in TROP local bootstrap", + ): + se, dist = trop_est._bootstrap_variance( + df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10) + ) + + assert np.isfinite(se) + assert len(dist) == 4 + + def test_global_bootstrap_warns_above_5pct_failure(self): + """Global unit-resample bootstrap (Python path): high failure rate → warn.""" + import sys + from unittest.mock import patch + + df = TestTROPNValidTreated._make_panel() + + trop_est = TROP( + method="global", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=20, + seed=42, + ) + + trop_global_module = sys.modules["diff_diff.trop_global"] + with ( + patch.object(trop_global_module, "HAS_RUST_BACKEND", False), + patch.object(trop_global_module, "_rust_bootstrap_trop_variance_global", None), + patch.object( + TROP, + "_fit_global_with_fixed_lambda", + side_effect=self._make_failing_fit(20, 3), + ), + ): + with pytest.warns( + UserWarning, + match=r"3/20 bootstrap iterations succeeded in TROP global bootstrap", + ): + se, dist = trop_est._bootstrap_variance_global( + df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10), 3 + ) + + assert np.isfinite(se) + assert len(dist) == 3 + + def test_local_bootstrap_silent_on_full_success(self): + """No proportional warning when every replicate succeeds.""" + from unittest.mock import patch + + df = TestTROPNValidTreated._make_panel() + + trop_est = TROP( + method="local", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=20, + seed=42, + ) + + with patch.object( + TROP, + "_fit_with_fixed_lambda", + side_effect=self._make_failing_fit(20, 20), + ): + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + se, dist = trop_est._bootstrap_variance( + df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10) + ) + + failure_warnings = [x for x in w if "bootstrap iterations succeeded" in str(x.message)] + assert ( + failure_warnings == [] + ), f"No failure-rate warning expected on full success, got {failure_warnings}" + assert np.isfinite(se) + assert len(dist) == 20 + + class TestTROPModuleSplit: """Regression tests for the trop.py -> trop_global.py / trop_local.py split.""" @@ -4023,8 +4149,9 @@ def test_local_alternating_min_warns_on_nonconvergence(self, simple_panel_data): W = np.where(D == 0, 1.0, 0.0) with pytest.warns(UserWarning, match="did not converge"): - trop_est._estimate_model(Y, control_mask, W, lambda_nn=0.1, - n_units=n_units, n_periods=n_periods) + trop_est._estimate_model( + Y, control_mask, W, lambda_nn=0.1, n_units=n_units, n_periods=n_periods + ) def test_local_alternating_min_no_warning_on_convergence(self, simple_panel_data): """TROP local _estimate_model must not warn on a well-behaved fit.""" @@ -4044,8 +4171,9 @@ def test_local_alternating_min_no_warning_on_convergence(self, simple_panel_data with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") - trop_est._estimate_model(Y, control_mask, W, lambda_nn=0.1, - n_units=n_units, n_periods=n_periods) + trop_est._estimate_model( + Y, control_mask, W, lambda_nn=0.1, n_units=n_units, n_periods=n_periods + ) assert not any("did not converge" in str(x.message) for x in w) def test_local_fit_emits_single_aggregate_warning(self, simple_panel_data): @@ -4075,8 +4203,10 @@ def test_local_fit_emits_single_aggregate_warning(self, simple_panel_data): trop_mod = sys.modules["diff_diff.trop"] trop_local_mod = sys.modules["diff_diff.trop_local"] - with patch.object(trop_mod, "HAS_RUST_BACKEND", False), \ - patch.object(trop_local_mod, "HAS_RUST_BACKEND", False): + with ( + patch.object(trop_mod, "HAS_RUST_BACKEND", False), + patch.object(trop_local_mod, "HAS_RUST_BACKEND", False), + ): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") trop_est.fit( @@ -4104,9 +4234,9 @@ def matching(needle: str): loocv = matching("local LOOCV") assert len(loocv) >= 1, "expected at least one LOOCV aggregate warning" for msg in loocv: - assert "of" in msg and "per-observation fits" in msg, ( - f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}" - ) + assert ( + "of" in msg and "per-observation fits" in msg + ), f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}" def test_global_fit_emits_single_aggregate_warning(self, simple_panel_data): """Global-method fit-level warning aggregation: mirrors the local test. @@ -4127,8 +4257,10 @@ def test_global_fit_emits_single_aggregate_warning(self, simple_panel_data): trop_mod = sys.modules["diff_diff.trop"] trop_global_mod = sys.modules["diff_diff.trop_global"] - with patch.object(trop_mod, "HAS_RUST_BACKEND", False), \ - patch.object(trop_global_mod, "HAS_RUST_BACKEND", False): + with ( + patch.object(trop_mod, "HAS_RUST_BACKEND", False), + patch.object(trop_global_mod, "HAS_RUST_BACKEND", False), + ): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("always") trop_est.fit( @@ -4148,6 +4280,6 @@ def matching(needle: str): loocv = matching("global LOOCV") assert len(loocv) >= 1, "expected at least one LOOCV aggregate warning" for msg in loocv: - assert "of" in msg and "per-observation fits" in msg, ( - f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}" - ) + assert ( + "of" in msg and "per-observation fits" in msg + ), f"LOOCV warning is not in aggregate format (fan-out not reduced): {msg}" From 5755873dc03ad825c293c044bb27a74edc6a098e Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:05:51 -0400 Subject: [PATCH 2/3] Address AI review: extend guard to Rust local path, clarify docstring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two P2 items from local AI review: 1. The local Rust bootstrap happy path (`trop_local.py:988`) previously returned silently whenever it received `>= 10` samples and otherwise fell back to Python. That left the same silent-failure window the main PR was closing — Rust returning 11 of 200 samples (94.5% failure rate) would return its SE with no warning. Replaced the `len >= 10` path-switch with the proportional `warn_bootstrap_failure_rate` guard; only a zero-success Rust result now triggers the Python fallback. 2. The helper docstring was ambiguous about which zero case was silent. Revised to distinguish `n_attempted == 0` (silent — caller handles) from `n_success == 0` with `n_attempted > 0` (warning fires). Added a regression test covering the Rust local path via a mocked `_rust_bootstrap_trop_variance` returning 11/200 samples. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/bootstrap_utils.py | 6 ++-- diff_diff/trop_local.py | 13 ++++---- docs/methodology/REGISTRY.md | 2 +- tests/test_trop.py | 64 ++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 9 deletions(-) diff --git a/diff_diff/bootstrap_utils.py b/diff_diff/bootstrap_utils.py index 4a0e3bdf..d492b8a0 100644 --- a/diff_diff/bootstrap_utils.py +++ b/diff_diff/bootstrap_utils.py @@ -37,8 +37,10 @@ def warn_bootstrap_failure_rate( """Emit one proportional failure-rate warning after a replicate loop. Replaces the hard-coded ``< N successes`` pattern that lets high-failure - runs (e.g. 11 of 200) pass silently. Does not emit when the replicate - count is zero (callers handle the NaN-return path explicitly). + runs (e.g. 11 of 200) pass silently. Does not emit when + ``n_attempted == 0`` (callers handle that degenerate path explicitly); + when ``n_success == 0`` and ``n_attempted > 0``, the warning fires + describing the all-failed run. Parameters ---------- diff --git a/diff_diff/trop_local.py b/diff_diff/trop_local.py index 49b7635d..1b6bcccf 100644 --- a/diff_diff/trop_local.py +++ b/diff_diff/trop_local.py @@ -985,13 +985,14 @@ def _bootstrap_variance( unit_weight_arr, ) - if len(bootstrap_estimates) >= 10: + if len(bootstrap_estimates) > 0: + warn_bootstrap_failure_rate( + n_success=len(bootstrap_estimates), + n_attempted=self.n_bootstrap, + context="TROP local bootstrap (Rust)", + ) return float(se), bootstrap_estimates - # Fall through to Python if too few bootstrap samples - logger.debug( - "Rust bootstrap returned only %d samples, falling back to Python", - len(bootstrap_estimates), - ) + logger.debug("Rust bootstrap returned 0 samples, falling back to Python") except Exception as e: logger.debug("Rust bootstrap variance failed, falling back to Python: %s", e) warnings.warn( diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 6213f215..a7f11455 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -1971,7 +1971,7 @@ Q(λ) = Σ_{j,s: D_js=0} [τ̂_js^loocv(λ)]² - Wrong D specification: if user provides event-style D (only first treatment period), the absorbing-state validation will raise ValueError with helpful guidance - **Bootstrap minimum**: `n_bootstrap` must be >= 2 (enforced via `ValueError`). TROP uses bootstrap for all variance estimation — there is no analytical SE formula. -- **Note:** TROP bootstrap loops (`_bootstrap_variance`, `_bootstrap_rao_wu`, and their global counterparts, including the Rust happy path) emit a proportional `UserWarning` via `diff_diff.bootstrap_utils.warn_bootstrap_failure_rate` when the replicate failure rate exceeds 5%. The previous hard-coded `< 10 successes` threshold let high-failure runs (e.g. 11 of 200) pass silently; this was classified as a silent failure under the Phase 2 audit (axis D — degenerate-replicate handling). The 5% threshold matches the existing SyntheticDiD bootstrap and placebo guards. When zero replicates succeed, SE is set to `NaN` (unchanged). +- **Note:** TROP bootstrap loops (`_bootstrap_variance`, `_bootstrap_rao_wu`, and their global counterparts, including both Rust happy paths — local and global) emit a proportional `UserWarning` via `diff_diff.bootstrap_utils.warn_bootstrap_failure_rate` when the replicate failure rate exceeds 5%. The previous hard-coded `< 10 successes` threshold let high-failure runs (e.g. 11 of 200) pass silently; this was classified as a silent failure under the Phase 2 audit (axis D — degenerate-replicate handling). The 5% threshold matches the existing SyntheticDiD bootstrap and placebo guards. When zero replicates succeed, SE is set to `NaN` (unchanged). The local Rust path previously also used `len >= 10` as a Python-fallback trigger; it now accepts any non-zero Rust result and emits the proportional warning instead of path-switching silently. - **LOOCV failure metadata**: When LOOCV fits fail in the Rust backend, the first failed observation coordinates (t, i) are returned to Python for informative warning messages - **Inference CI distribution**: After `safe_inference()` migration, CI uses t-distribution (df = max(1, n_treated_obs - 1)), consistent with p_value. Previously CI used normal-distribution while p_value used t-distribution (inconsistent). This is a minor behavioral change; CIs may be slightly wider for small n_treated_obs. - **Note:** Both the `local` alternating-minimization solver (`_estimate_model`) and the `global` alternating-minimization solver (`_solve_global_with_lowrank`, including its hard-coded inner FISTA loop of 20 iterations) emit `UserWarning` via `diff_diff.utils.warn_if_not_converged` when the outer loop exhausts `max_iter` without reaching `tol`. The global-method warning surfaces the inner-FISTA non-convergence count as diagnostic context. Silent return of the current iterate was classified as a silent failure under the Phase 2 audit and replaced with an explicit signal to match the convention used across other iterative solvers in the library. diff --git a/tests/test_trop.py b/tests/test_trop.py index ef0a42ea..2f111497 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -3828,6 +3828,70 @@ def test_local_bootstrap_silent_on_full_success(self): assert np.isfinite(se) assert len(dist) == 20 + def test_local_rust_bootstrap_warns_above_5pct_failure(self): + """Rust-local path previously returned silently whenever `len >= 10`. + + Now the same proportional guard fires: Rust returning 11 successful + draws out of n_bootstrap=200 (94.5% failure rate) must warn. + """ + import sys + from unittest.mock import patch + + df = TestTROPNValidTreated._make_panel() + + trop_est = TROP( + method="local", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=200, + seed=42, + ) + + n_units = df["unit"].nunique() + n_periods = df["time"].nunique() + Y = np.zeros((n_periods, n_units), dtype=np.float64) + D = np.zeros((n_periods, n_units), dtype=np.float64) + trop_est._precomputed = { + "control_mask": np.ones((n_periods, n_units), dtype=bool), + "time_dist_matrix": np.abs( + np.arange(n_periods)[:, None] - np.arange(n_periods)[None, :] + ).astype(np.int64), + } + + trop_local_module = sys.modules["diff_diff.trop_local"] + rng = np.random.default_rng(0) + fake_boot = rng.normal(size=11) + + def _fake_rust_boot(*args, **kwargs): + return fake_boot, float(np.std(fake_boot, ddof=1)) + + with ( + patch.object(trop_local_module, "HAS_RUST_BACKEND", True), + patch.object( + trop_local_module, + "_rust_bootstrap_trop_variance", + side_effect=_fake_rust_boot, + ), + ): + with pytest.warns( + UserWarning, + match=r"11/200 bootstrap iterations succeeded in TROP local bootstrap \(Rust\)", + ): + se, dist = trop_est._bootstrap_variance( + df, + "outcome", + "treated", + "unit", + "time", + (1.0, 1.0, 1e10), + Y=Y, + D=D, + ) + + assert np.isfinite(se) + assert len(dist) == 11 + class TestTROPModuleSplit: """Regression tests for the trop.py -> trop_global.py / trop_local.py split.""" From 356e3ecf5e9d87daf280cf2f9782b88f8c007934 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:24:48 -0400 Subject: [PATCH 3/3] Close Rao-Wu and global-Rust warning-path coverage gap CI AI review on PR #324 flagged (P3) that the delta changed the Rao-Wu local, Rao-Wu global, and global-Rust warning branches without direct assertions on those paths. Add three targeted tests mirroring the pattern of the existing four (mocked inner-fit side effects, direct method invocation, pytest.warns with the context string in the match regex): - `_bootstrap_variance_global` Rust happy path - `_bootstrap_rao_wu_local` (survey design with per-unit PSU) - `_bootstrap_rao_wu_global` (same survey setup) All six changed warning sites now have direct regression coverage. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_trop.py | 151 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/tests/test_trop.py b/tests/test_trop.py index 2f111497..aa5de093 100644 --- a/tests/test_trop.py +++ b/tests/test_trop.py @@ -3892,6 +3892,157 @@ def _fake_rust_boot(*args, **kwargs): assert np.isfinite(se) assert len(dist) == 11 + def test_global_rust_bootstrap_warns_above_5pct_failure(self): + """Global Rust happy path: 3/20 Rust successes (85% fail) warns.""" + import sys + from unittest.mock import patch + + df = TestTROPNValidTreated._make_panel() + + trop_est = TROP( + method="global", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=20, + seed=42, + ) + + trop_global_module = sys.modules["diff_diff.trop_global"] + rng = np.random.default_rng(0) + fake_boot = rng.normal(size=3) + + def _fake_rust_boot_global(*args, **kwargs): + return fake_boot, float(np.std(fake_boot, ddof=1)) + + with ( + patch.object(trop_global_module, "HAS_RUST_BACKEND", True), + patch.object( + trop_global_module, + "_rust_bootstrap_trop_variance_global", + side_effect=_fake_rust_boot_global, + ), + ): + with pytest.warns( + UserWarning, + match=r"3/20 bootstrap iterations succeeded in TROP global bootstrap \(Rust\)", + ): + se, dist = trop_est._bootstrap_variance_global( + df, "outcome", "treated", "unit", "time", (1.0, 1.0, 1e10), 3 + ) + + assert np.isfinite(se) + assert len(dist) == 3 + + @staticmethod + def _make_survey_panel_and_design(): + """Build a panel with per-unit PSU + weight columns and the matching + SurveyDesign/ResolvedSurveyDesign needed to reach the Rao-Wu path.""" + from diff_diff import SurveyDesign + from diff_diff.survey import ResolvedSurveyDesign + + df = TestTROPNValidTreated._make_panel().copy() + all_units = sorted(df["unit"].unique()) + unit_to_psu = {u: i for i, u in enumerate(all_units)} + df["psu"] = df["unit"].map(unit_to_psu).astype(np.int64) + df["weight"] = 1.0 + n_obs = len(df) + + survey_design = SurveyDesign(weights="weight", psu="psu") + resolved_survey = ResolvedSurveyDesign( + weights=np.ones(n_obs, dtype=np.float64), + weight_type="pweight", + strata=None, + psu=df["psu"].values.astype(np.int64), + fpc=None, + n_strata=0, + n_psu=len(all_units), + lonely_psu="remove", + ) + return df, survey_design, resolved_survey + + def test_local_rao_wu_bootstrap_warns_above_5pct_failure(self): + """Local Rao-Wu survey bootstrap: forced failures → proportional warn.""" + from unittest.mock import patch + + df, survey_design, resolved_survey = self._make_survey_panel_and_design() + + trop_est = TROP( + method="local", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=20, + seed=42, + ) + + with patch.object( + TROP, + "_fit_with_fixed_lambda", + side_effect=self._make_failing_fit(20, 4), + ): + with pytest.warns( + UserWarning, + match=r"4/20 bootstrap iterations succeeded in TROP local Rao-Wu bootstrap", + ): + se, dist = trop_est._bootstrap_rao_wu_local( + df, + "outcome", + "treated", + "unit", + "time", + (1.0, 1.0, 1e10), + resolved_survey, + survey_design, + ) + + assert np.isfinite(se) + assert len(dist) == 4 + + def test_global_rao_wu_bootstrap_warns_above_5pct_failure(self): + """Global Rao-Wu survey bootstrap: forced failures → proportional warn.""" + from unittest.mock import patch + + df, survey_design, resolved_survey = self._make_survey_panel_and_design() + + trop_est = TROP( + method="global", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=20, + seed=42, + ) + + n_calls = {"count": 0} + + def _flaky_solve(*args, **kwargs): + n_calls["count"] += 1 + if n_calls["count"] <= 3: + n_periods, n_units = args[0].shape + return 0.0, np.zeros(n_units), np.zeros(n_periods), np.zeros((n_periods, n_units)) + raise ValueError("forced Rao-Wu failure") + + with patch.object(TROP, "_solve_global_model", side_effect=_flaky_solve): + with pytest.warns( + UserWarning, + match=r"3/20 bootstrap iterations succeeded in TROP global Rao-Wu bootstrap", + ): + se, dist = trop_est._bootstrap_rao_wu_global( + df, + "outcome", + "treated", + "unit", + "time", + (1.0, 1.0, 1e10), + 3, + resolved_survey, + survey_design, + ) + + assert np.isfinite(se) or np.isnan(se) + assert len(dist) == 3 + class TestTROPModuleSplit: """Regression tests for the trop.py -> trop_global.py / trop_local.py split."""