From 573de52a76e2aeaa9db53e884609e4c9c2f43cc8 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 11:31:07 -0400 Subject: [PATCH 1/3] Surface silent np.linalg.solve fallbacks across axis-A minor solver paths Addresses findings #17, #18, #19 from the Phase 2 silent-failures audit (axis A, all Minor). Each site previously ran np.linalg.solve against a matrix that could be rank-deficient or near-singular with no user-facing signal. - StaggeredTripleDifference: `_compute_did_panel` now appends a condition-number sample to an instance tracker on LinAlgError; `fit()` emits ONE aggregate UserWarning listing affected (g, g_c, t) cells and the max condition number instead of silently falling back to np.linalg.lstsq per pair. Tracker resets on repeat fit. - EfficientDiD covariate sieve (estimate_propensity_ratio_sieve, estimate_inverse_propensity_sieve): precondition-check the normal-equations matrix via np.linalg.cond before solve and reject K values above 1/sqrt(eps); partial-K skips now surface via UserWarning listing the skipped K values, instead of being swallowed by `continue`. - compute_survey_vcov: check cond(X'WX) before the sandwich solve; emit UserWarning above the 1/sqrt(eps) threshold so ill-conditioned bread matrices don't silently produce unstable variance estimates. Sibling sites picked up via repo-wide lstsq-fallback pattern grep (per the pattern-check feedback memory): - two_stage.py:1768 (TSL variance bread) - two_stage_bootstrap.py:197 (multiplier bootstrap bread) Both now warn before the silent lstsq fallback. Adds 8 targeted tests across test_staggered_triple_diff.py, test_efficient_did.py, and test_survey.py, covering collinear/ill-conditioned triggers and happy-path negatives. REGISTRY.md notes added for each affected estimator section. No behavioral change on well-conditioned inputs. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/efficient_did_covariates.py | 47 +++++++++++++ diff_diff/staggered_triple_diff.py | 33 +++++++++ diff_diff/survey.py | 19 ++++++ diff_diff/two_stage.py | 11 +++ diff_diff/two_stage_bootstrap.py | 11 +++ docs/methodology/REGISTRY.md | 9 +++ tests/test_efficient_did.py | 82 ++++++++++++++++++++++ tests/test_staggered_triple_diff.py | 98 +++++++++++++++++++++++++++ tests/test_survey.py | 74 ++++++++++++++++++++ 9 files changed, 384 insertions(+) diff --git a/diff_diff/efficient_did_covariates.py b/diff_diff/efficient_did_covariates.py index bb19ddf5..fa4c9140 100644 --- a/diff_diff/efficient_did_covariates.py +++ b/diff_diff/efficient_did_covariates.py @@ -227,6 +227,10 @@ def estimate_propensity_ratio_sieve( best_ic = np.inf best_ratio = np.ones(n_units) # fallback: constant ratio 1 + singular_K: List[int] = [] # K values skipped due to rank deficiency (#18) + # Near-singular matrices solve without raising LinAlgError but return + # numerically meaningless beta. Rule-of-thumb threshold: 1/sqrt(eps). + cond_threshold = 1.0 / np.sqrt(np.finfo(float).eps) for K in range(1, k_max + 1): n_basis = comb(K + d, d) @@ -249,13 +253,23 @@ def estimate_propensity_ratio_sieve( A = Psi_gp.T @ Psi_gp b = Psi_g.sum(axis=0) + # Precondition check (#18, axis A): reject near-singular A explicitly + # so np.linalg.solve can't silently return garbage coefficients. + with np.errstate(invalid="ignore", over="ignore"): + A_cond = float(np.linalg.cond(A)) + if not np.isfinite(A_cond) or A_cond > cond_threshold: + singular_K.append(K) + continue + try: beta = np.linalg.solve(A, b) except np.linalg.LinAlgError: + singular_K.append(K) continue # singular — try next K # Check for NaN/Inf in solution if not np.all(np.isfinite(beta)): + singular_K.append(K) continue # Predicted ratio for all units @@ -282,6 +296,18 @@ def estimate_propensity_ratio_sieve( UserWarning, stacklevel=2, ) + elif singular_K: + # Finding #18 (axis A): partial K-failure was previously silent. + # Surface it so users see that the selected basis order was + # forced by rank deficiency at higher K rather than by the IC. + warnings.warn( + f"Propensity ratio sieve: skipped K={singular_K} due to " + f"rank-deficient or non-finite normal equations. " + f"Selected basis used the remaining K values; " + f"this may indicate limited variation in the covariates.", + UserWarning, + stacklevel=2, + ) # Overlap diagnostics: warn if ratios require significant clipping n_extreme = int(np.sum((best_ratio < 1.0 / ratio_clip) | (best_ratio > ratio_clip))) @@ -377,6 +403,8 @@ def estimate_inverse_propensity_sieve( best_ic = np.inf best_s = np.full(n_units, fallback_ratio) # fallback: unconditional + singular_K: List[int] = [] # K values skipped due to rank deficiency (#18) + cond_threshold = 1.0 / np.sqrt(np.finfo(float).eps) for K in range(1, k_max + 1): n_basis = comb(K + d, d) @@ -397,11 +425,20 @@ def estimate_inverse_propensity_sieve( # RHS: sum of basis over ALL units (not just one group) b = basis_all.sum(axis=0) + # Precondition check (#18, axis A): see ratio-sieve comment above. + with np.errstate(invalid="ignore", over="ignore"): + A_cond = float(np.linalg.cond(A)) + if not np.isfinite(A_cond) or A_cond > cond_threshold: + singular_K.append(K) + continue + try: beta = np.linalg.solve(A, b) except np.linalg.LinAlgError: + singular_K.append(K) continue if not np.all(np.isfinite(beta)): + singular_K.append(K) continue s_hat = basis_all @ beta @@ -423,6 +460,16 @@ def estimate_inverse_propensity_sieve( UserWarning, stacklevel=2, ) + elif singular_K: + # Finding #18 (axis A): partial K-failure was previously silent. + warnings.warn( + f"Inverse propensity sieve: skipped K={singular_K} due to " + f"rank-deficient or non-finite normal equations. " + f"Selected basis used the remaining K values; " + f"this may indicate limited variation in the covariates.", + UserWarning, + stacklevel=2, + ) # Overlap diagnostics: warn if s_hat values require clipping n_clipped = int(np.sum((best_s < 1.0) | (best_s > float(n_units)))) diff --git a/diff_diff/staggered_triple_diff.py b/diff_diff/staggered_triple_diff.py index 85086aa3..86835a85 100644 --- a/diff_diff/staggered_triple_diff.py +++ b/diff_diff/staggered_triple_diff.py @@ -348,6 +348,11 @@ def fit( {} if (covariates and self.estimation_method in ("ipw", "dr")) else None ) + # Tracker for rank-deficient OR-IF solves across all (g, g_c, t) cells. + # _compute_did_panel appends one condition-number sample per LinAlgError + # so we emit ONE aggregate warning below rather than fanning out. + self._lstsq_fallback_tracker: List[float] = [] + for g in treatment_groups: # In universal mode, skip the reference period (t == g-1-anticipation) # so it's omitted from GT estimation. The event-study mixin injects @@ -507,6 +512,26 @@ def fit( comparison_group_counts[(g, t)] = len(gc_labels) gmm_weights_store[(g, t)] = dict(zip(gc_labels, gmm_w.tolist())) + # Consolidated OR influence-function rank-deficiency warning. + # Finding #17 in the Phase 2 silent-failures audit: the per-pair OR + # solve at _compute_did_panel() previously fell back to lstsq with no + # signal, so near/fully singular X'WX in the covariate expansion went + # to the user as a normal result. + if self._lstsq_fallback_tracker: + n_cells = len(self._lstsq_fallback_tracker) + finite_conds = [c for c in self._lstsq_fallback_tracker if np.isfinite(c)] + max_cond = max(finite_conds) if finite_conds else float("inf") + warnings.warn( + f"Rank-deficient X'WX detected in the outcome-regression " + f"influence-function step for {n_cells} (g, g_c, t) pair(s); " + f"fell back to np.linalg.lstsq. " + f"Max condition number of affected X'WX: {max_cond:.2e}. " + f"Consider dropping collinear covariates or using " + f"estimation_method='ipw' to avoid the OR projection.", + UserWarning, + stacklevel=2, + ) + # Consolidated EPV summary warning if epv_diagnostics: low_epv = {k: v for k, v in epv_diagnostics.items() if v.get("is_low")} @@ -1330,6 +1355,14 @@ def _compute_did_panel( try: asy_linear_or = (np.linalg.solve(XpX, or_ex.T)).T except np.linalg.LinAlgError: + # Rank-deficient X'WX in the OR influence-function step. Record + # a condition-number sample so fit() can emit ONE aggregate + # warning across all (g, g_c, t) cells rather than fanning out. + tracker = getattr(self, "_lstsq_fallback_tracker", None) + if tracker is not None: + with np.errstate(invalid="ignore", over="ignore"): + cond = float(np.linalg.cond(XpX)) + tracker.append(cond) asy_linear_or = (np.linalg.lstsq(XpX, or_ex.T, rcond=None)[0]).T inf_treat_or = -(asy_linear_or @ M1) diff --git a/diff_diff/survey.py b/diff_diff/survey.py index 14ae44d0..2ee8334e 100644 --- a/diff_diff/survey.py +++ b/diff_diff/survey.py @@ -1445,6 +1445,25 @@ def compute_survey_vcov( return np.zeros((k, k)) return np.full((k, k), np.nan) + # Precondition check: near-singular X'WX lets np.linalg.solve return + # unstable values without raising (finding #19, axis A). Threshold of + # 1/sqrt(eps) ≈ 6.7e7 is the standard rule of thumb — above it, the + # sandwich bread becomes numerically unreliable and the caller should + # be told so. + with np.errstate(invalid="ignore", over="ignore"): + XtWX_cond = float(np.linalg.cond(XtWX)) + cond_threshold = 1.0 / np.sqrt(np.finfo(float).eps) + if np.isfinite(XtWX_cond) and XtWX_cond > cond_threshold: + warnings.warn( + f"X'WX is ill-conditioned (cond={XtWX_cond:.2e}) in the " + f"survey sandwich variance; variance estimates may be " + f"numerically unstable. This typically indicates near " + f"multicollinearity or zero-weight strata dominating the " + f"bread matrix.", + UserWarning, + stacklevel=2, + ) + # Sandwich: (X'WX)^{-1} meat (X'WX)^{-1} try: temp = np.linalg.solve(XtWX, meat) diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py index 6a385bd9..7c2d5015 100644 --- a/diff_diff/two_stage.py +++ b/diff_diff/two_stage.py @@ -1768,6 +1768,17 @@ def _compute_gmm_variance( try: bread = np.linalg.solve(XtWX_2, np.eye(k)) except np.linalg.LinAlgError: + # Sibling of finding #17 (axis A) — silent lstsq fallback in the + # TSL-variance bread was previously silent. Surface it so a + # rank-deficient second-stage design doesn't quietly degrade SEs. + warnings.warn( + "Rank-deficient second-stage X'WX in TwoStageDiD TSL variance; " + "falling back to np.linalg.lstsq for the bread matrix. " + "Analytical SEs may be numerically unstable; consider dropping " + "collinear covariates.", + UserWarning, + stacklevel=2, + ) bread = np.linalg.lstsq(XtWX_2, np.eye(k), rcond=None)[0] # 7. V = bread @ meat @ bread diff --git a/diff_diff/two_stage_bootstrap.py b/diff_diff/two_stage_bootstrap.py index ea23ef40..d356f441 100644 --- a/diff_diff/two_stage_bootstrap.py +++ b/diff_diff/two_stage_bootstrap.py @@ -197,6 +197,17 @@ def _compute_cluster_S_scores( try: bread = np.linalg.solve(XtX_2, np.eye(k)) except np.linalg.LinAlgError: + # Sibling of finding #17 (axis A) — silent lstsq fallback in the + # TwoStage bootstrap bread matrix. Called once per (static / event- + # study / group) aggregation, so warning fan-out is bounded. + warnings.warn( + "Rank-deficient second-stage X'WX in TwoStageDiD multiplier " + "bootstrap bread; falling back to np.linalg.lstsq. Bootstrap " + "SEs may be numerically unstable; consider dropping collinear " + "covariates.", + UserWarning, + stacklevel=2, + ) bread = np.linalg.lstsq(XtX_2, np.eye(k), rcond=None)[0] return S, bread, unique_clusters diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 091b3847..3de1bd7b 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -753,6 +753,7 @@ See `docs/methodology/continuous-did.md` Section 4 for full details. - **Balanced panel**: Short balanced panel required ("large-n, fixed-T" regime). Does not handle unbalanced panels or repeated cross-sections - Warn if treatment varies within units (non-absorbing treatment) - Warn if propensity score estimates are near boundary values +- **Note:** Polynomial-sieve propensity fits now reject any K whose normal-equations matrix has condition number above `1/sqrt(eps)` (≈ 6.7e7) — previously a near-singular `np.linalg.solve` could return numerically meaningless coefficients without raising. If at least one K succeeds but others were skipped via this precondition, a `UserWarning` lists the skipped K values. If every K is skipped, the existing "estimation failed for all K values" fallback warning still fires. Axis-A finding #18 in the Phase 2 silent-failures audit. *Estimator equation -- single treatment date (Equations 3.2, 3.5):* @@ -1175,6 +1176,7 @@ Our implementation uses multiplier bootstrap on the GMM influence function: clus - **Zero-observation cohorts in group effects:** If all treated observations for a cohort have NaN `y_tilde` (excluded from estimation), that cohort's group effect is NaN with n_obs=0. - **Note:** Survey weights in TwoStageDiD GMM sandwich via weighted cross-products: bread uses (X'_2 W X_2)^{-1}, gamma_hat uses (X'_{10} W X_{10})^{-1}(X'_1 W X_2), per-cluster scores multiply by survey weights. PSU clustering, stratification, and FPC are fully supported in the meat matrix via `_compute_stratified_meat_from_psu_scores()`. When strata or FPC are present, the meat computation replaces `S' S` with the stratified formula `sum_h (1 - f_h) * (n_h/(n_h-1)) * centered_h' centered_h`. Strata also enters survey df (n_PSU - n_strata) for t-distribution inference. Bootstrap + survey supported (Phase 6) via PSU-level multiplier weights. - **Note:** Both the iterative FE solver (`_iterative_fe`, Stage 1) and the iterative alternating-projection demeaning helper (`_iterative_demean`, used in covariate residualization) emit `UserWarning` when `max_iter` exhausts without reaching `tol`, via `diff_diff.utils.warn_if_not_converged`. Silent return of the current iterate was classified as a silent failure under the Phase 2 audit and replaced with an explicit signal to match the logistic/Poisson IRLS pattern in `linalg.py`. +- **Note:** When the Stage-2 bread `X'_2 W X_2` is singular, both the analytical TSL variance (`two_stage.py`) and the multiplier-bootstrap bread (`two_stage_bootstrap.py`) now emit a `UserWarning` before falling back to `np.linalg.lstsq`. Previously this fallback was silent. Sibling of axis-A finding #17 in the Phase 2 silent-failures audit; surfaced by the repo-wide lstsq-fallback pattern grep that accompanied the StaggeredTripleDifference fix. - **Note:** The GMM sandwich and bootstrap paths both use `scipy.sparse.linalg.factorized` for the Stage 1 normal-equations solve `(X'_{10} W X_{10}) gamma = X'_1 W X_2` and fall back to dense `lstsq` when the sparse factorization raises `RuntimeError` on a near-singular matrix. Both fallback sites emit a `UserWarning` (silent-failure audit axis C) so callers know SE estimates came from the degraded path rather than the fast sparse path. **Reference implementation(s):** @@ -1695,6 +1697,7 @@ has no additional effect. - **Note:** `pscore_fallback` default changed from unconditional to error. Set `pscore_fallback="unconditional"` for legacy behavior. - Warns on singular GMM covariance matrix (falls back to pseudoinverse) +- **Note:** Rank-deficient X'WX in the per-pair outcome-regression influence-function step now emits ONE aggregate `UserWarning` at `fit()` time (counting affected (g, g_c, t) cells and reporting the max condition number), instead of silently falling back to `np.linalg.lstsq`. Axis-A finding #17 in the Phase 2 silent-failures audit. *Data structure:* @@ -2719,6 +2722,12 @@ unequal selection probabilities). per-observation PSUs for the TSL meat computation, consistent with the stratified-no-PSU path. The adjustment factor is `n/(n-1)` (not HC1's `n/(n-k)`). +- **Note:** TSL now precondition-checks `X'WX` via `np.linalg.cond` before + solving the sandwich. If the condition number exceeds `1/sqrt(eps)` (≈ + 6.7e7) a `UserWarning` fires stating that the bread is ill-conditioned + and variance estimates may be numerically unstable. Previously a near- + singular `X'WX` could silently produce unstable SEs. Axis-A finding #19 + in the Phase 2 silent-failures audit. ### Weight Type Effects on Inference diff --git a/tests/test_efficient_did.py b/tests/test_efficient_did.py index 951c46f5..ddce55a6 100644 --- a/tests/test_efficient_did.py +++ b/tests/test_efficient_did.py @@ -2052,3 +2052,85 @@ def test_inverse_propensity_sieve_fallback_warns(self): assert np.all(np.isfinite(s_hat)) # Should fall back to unconditional n/n_group = 100/2 = 50 assert np.allclose(s_hat, 50.0) + + +# --------------------------------------------------------------------------- +# Silent-failure audit PR #9: finding #18 — estimate_*_sieve silently +# `continue`'d past rank-deficient K values. Now we track skipped K and +# warn when we ship a result that wasn't the IC-winner across all K. +# --------------------------------------------------------------------------- + + +class TestSievePartialKSkipWarning: + """Finding #18 (axis A): partial K-failure no longer silent.""" + + def test_ratio_sieve_partial_skip_warns(self): + """If some K's are rank-deficient but at least one succeeds, + the function warns about the partial skip instead of swallowing it.""" + from diff_diff.efficient_did_covariates import estimate_propensity_ratio_sieve + + rng = np.random.default_rng(7) + n = 200 + # 1D covariate with discrete support {0, 1}. At K=1 the basis is + # [1, x]; at K>=2 the basis reaches size >= n_gp for most groups + # before hitting singularity, but with this discrete support the + # polynomial powers x^2, x^3, ... equal x, yielding rank-deficient + # normal equations deterministically. + X = rng.integers(0, 2, size=(n, 1)).astype(float) + mask_g = np.zeros(n, dtype=bool) + mask_g[:100] = True + mask_gp = np.zeros(n, dtype=bool) + mask_gp[100:] = True + with pytest.warns(UserWarning) as caught: + ratio = estimate_propensity_ratio_sieve(X, mask_g, mask_gp, k_max=3) + assert np.all(np.isfinite(ratio)) + partial_skip_msgs = [ + str(w.message) for w in caught if "skipped K=" in str(w.message) + ] + assert partial_skip_msgs, ( + "Expected a partial-K-skip warning when some K's are rank deficient " + "but at least one succeeds; got none." + ) + # Message should name the specific K values that were skipped. + assert any("K=" in m for m in partial_skip_msgs) + + def test_inverse_propensity_sieve_partial_skip_warns(self): + """Same contract for the inverse propensity sieve.""" + from diff_diff.efficient_did_covariates import estimate_inverse_propensity_sieve + + rng = np.random.default_rng(7) + n = 200 + X = rng.integers(0, 2, size=(n, 1)).astype(float) + mask = np.zeros(n, dtype=bool) + mask[:100] = True + with pytest.warns(UserWarning) as caught: + s_hat = estimate_inverse_propensity_sieve(X, mask, k_max=3) + assert np.all(np.isfinite(s_hat)) + partial_skip_msgs = [ + str(w.message) for w in caught if "skipped K=" in str(w.message) + ] + assert partial_skip_msgs + + def test_ratio_sieve_no_warning_when_no_skips(self): + """Clean, well-conditioned covariates → no partial-skip warning.""" + from diff_diff.efficient_did_covariates import estimate_propensity_ratio_sieve + + rng = np.random.default_rng(101) + n = 300 + X = rng.normal(0, 1, (n, 2)) + mask_g = np.zeros(n, dtype=bool) + mask_g[:150] = True + mask_gp = np.zeros(n, dtype=bool) + mask_gp[150:] = True + import warnings as _w + + with _w.catch_warnings(record=True) as caught: + _w.simplefilter("always") + ratio = estimate_propensity_ratio_sieve(X, mask_g, mask_gp, k_max=3) + assert np.all(np.isfinite(ratio)) + partial_skip_msgs = [ + str(w.message) for w in caught if "skipped K=" in str(w.message) + ] + assert partial_skip_msgs == [], ( + f"Unexpected partial-skip warning on clean data: {partial_skip_msgs}" + ) diff --git a/tests/test_staggered_triple_diff.py b/tests/test_staggered_triple_diff.py index a47d1758..23b929c3 100644 --- a/tests/test_staggered_triple_diff.py +++ b/tests/test_staggered_triple_diff.py @@ -534,3 +534,101 @@ def test_collinear_covariates_cached_ps_finite(self): for (g, t), eff in res.group_time_effects.items(): assert np.isfinite(eff["effect"]), f"Non-finite ATT at (g={g},t={t})" assert np.isfinite(eff["se"]), f"Non-finite SE at (g={g},t={t})" + + +# --------------------------------------------------------------------------- +# Silent-failure audit PR #9: finding #17 — OR influence-function solve +# fell back to np.linalg.lstsq silently when X'WX was rank deficient. Now +# tracked at _compute_did_panel() and aggregated into one fit-level warning. +# --------------------------------------------------------------------------- + + +class TestStaggeredTripleDiffORSolveFallback: + def test_collinear_covariates_emit_lstsq_fallback_warning(self): + """Perfectly collinear covariates should trigger the aggregate OR + lstsq-fallback warning. Previously this path was silent.""" + data = generate_staggered_ddd_data( + n_units=200, + treatment_effect=3.0, + add_covariates=True, + seed=55, + ) + # x3 ≡ 2·x1 makes covX = [intercept, x1, x2, x3] rank-deficient + # per pair, so XpX in _compute_did_panel() hits LinAlgError. + data["x3"] = 2.0 * data["x1"] + est = StaggeredTripleDifference( + estimation_method="dr", + rank_deficient_action="silent", # silence upstream rank-def noise + ) + import warnings as _w + + with _w.catch_warnings(record=True) as caught: + _w.simplefilter("always") + res = est.fit( + data, + "outcome", + "unit", + "period", + "first_treat", + "eligibility", + covariates=["x1", "x2", "x3"], + ) + lstsq_warnings = [ + w for w in caught if "Rank-deficient X'WX" in str(w.message) + ] + assert len(lstsq_warnings) == 1, ( + f"Expected exactly one aggregate OR lstsq-fallback warning, " + f"got {len(lstsq_warnings)}." + ) + msg = str(lstsq_warnings[0].message) + assert "(g, g_c, t) pair(s)" in msg + assert "np.linalg.lstsq" in msg + # Point estimates should still be finite (lstsq fallback succeeded). + assert np.isfinite(res.overall_att) + + def test_well_conditioned_covariates_emit_no_lstsq_warning(self): + """Clean, well-conditioned covariates should NOT trigger the + aggregate warning — regression-safety for the happy path.""" + data = generate_staggered_ddd_data( + n_units=300, + treatment_effect=3.0, + add_covariates=True, + seed=42, + ) + est = StaggeredTripleDifference(estimation_method="dr") + import warnings as _w + + with _w.catch_warnings(record=True) as caught: + _w.simplefilter("always") + est.fit( + data, + "outcome", + "unit", + "period", + "first_treat", + "eligibility", + covariates=["x1", "x2"], + ) + lstsq_warnings = [ + w for w in caught if "Rank-deficient X'WX" in str(w.message) + ] + assert lstsq_warnings == [], ( + f"Unexpected OR lstsq-fallback warning on clean covariates: " + f"{[str(w.message) for w in lstsq_warnings]}" + ) + + def test_no_covariates_no_warning(self, simple_data): + """Without covariates the OR path is skipped entirely, so no + aggregate warning should be emitted.""" + est = StaggeredTripleDifference(estimation_method="reg") + import warnings as _w + + with _w.catch_warnings(record=True) as caught: + _w.simplefilter("always") + est.fit( + simple_data, "outcome", "unit", "period", "first_treat", "eligibility" + ) + lstsq_warnings = [ + w for w in caught if "Rank-deficient X'WX" in str(w.message) + ] + assert lstsq_warnings == [] diff --git a/tests/test_survey.py b/tests/test_survey.py index f8a51fa0..56736057 100644 --- a/tests/test_survey.py +++ b/tests/test_survey.py @@ -3336,3 +3336,77 @@ def test_item7_aweight_normalization_warning(self): sd = SurveyDesign(weights="w", weight_type="aweight") with pytest.warns(UserWarning, match="aweight weights normalized"): sd.resolve(df) + + +# --------------------------------------------------------------------------- +# Silent-failure audit PR #9: finding #19 — compute_survey_vcov called +# np.linalg.solve(XtWX, ...) with no precondition check. A near-singular +# X'WX (zero-weight strata dominating, near-collinear X) solves without +# raising but returns numerically unstable variance. Now warns when +# cond(X'WX) exceeds 1/sqrt(eps). +# --------------------------------------------------------------------------- + + +class TestSurveyVcovIllConditionedWarning: + def test_ill_conditioned_X_warns(self): + """Near-collinear design matrix triggers the cond-number warning.""" + n = 60 + rng = np.random.default_rng(11) + x1 = rng.normal(0, 1, n) + # x2 = x1 + tiny noise → near-collinear after weighting + x2 = x1 + rng.normal(0, 1e-9, n) + X = np.column_stack([np.ones(n), x1, x2]) + y = 1.0 + 0.5 * x1 + rng.normal(0, 0.3, n) + weights = np.ones(n) + + # Use WLS solve to get residuals — but we need residuals that give + # a non-zero meat so the function reaches the vcov solve. + coef = np.linalg.lstsq(X, y, rcond=None)[0] + resid = y - X @ coef + + resolved = ResolvedSurveyDesign( + weights=weights, + weight_type="pweight", + strata=None, + psu=np.arange(n), + fpc=None, + n_strata=0, + n_psu=n, + lonely_psu="remove", + ) + with pytest.warns(UserWarning, match="X'WX is ill-conditioned"): + vcov = compute_survey_vcov(X, resid, resolved) + # Warning does not suppress output — the caller still gets a matrix. + assert vcov.shape == (3, 3) + + def test_well_conditioned_X_no_warning(self): + """Clean design matrix should NOT trigger the warning — happy path.""" + n = 80 + rng = np.random.default_rng(22) + X = np.column_stack([np.ones(n), rng.normal(0, 1, n), rng.normal(0, 1, n)]) + y = 1.0 + 0.5 * X[:, 1] - 0.3 * X[:, 2] + rng.normal(0, 0.3, n) + weights = np.ones(n) + + coef, resid, _ = solve_ols(X, y, weights=weights, weight_type="pweight") + resolved = ResolvedSurveyDesign( + weights=weights, + weight_type="pweight", + strata=None, + psu=np.arange(n), + fpc=None, + n_strata=0, + n_psu=n, + lonely_psu="remove", + ) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + vcov = compute_survey_vcov(X, resid, resolved) + ill_cond_warnings = [ + w for w in caught if "X'WX is ill-conditioned" in str(w.message) + ] + assert ill_cond_warnings == [], ( + f"Unexpected cond-number warning on clean data: " + f"{[str(w.message) for w in ill_cond_warnings]}" + ) + assert vcov.shape == (3, 3) + assert np.all(np.isfinite(vcov)) From 90a38316506e2604030627ffba091edb8f441eae Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 11:57:50 -0400 Subject: [PATCH 2/3] Address CI AI review: instrument CS _safe_inv + sieve docstrings + TwoStage tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI review on PR #334 flagged one P1 (incomplete audit of silent solve fallbacks) and two P3s. P1 — Callaway-Sant'Anna `_safe_inv()` sibling site: The shared `_safe_inv(A)` helper in staggered.py was still silently falling back from np.linalg.solve to np.linalg.lstsq. It feeds ~13 analytical SE paths (propensity-score Hessian, OR bread, event-study bread, etc.), so a rank-deficient design could silently ship degraded analytical SEs. Extended `_safe_inv(A, tracker: Optional[list] = None)` to append a condition-number sample on LinAlgError when a tracker is passed. Initialize `self._safe_inv_tracker: List[float] = []` at the top of `CallawaySantAnna.fit()`, thread `tracker=self._safe_inv_tracker` through all 13 callsites, and emit ONE aggregate UserWarning at the end of fit() listing the number of fallbacks and max condition number. Matches the tracker pattern established in STD finding #17. Added TestCallawaySantAnnaSafeInvFallback with two tests: collinear covariates trigger the aggregate warning; well-conditioned data emits no warning (happy-path regression guard). REGISTRY.md §CallawaySantAnna notes the new warning contract. P3 — Sieve docstrings lag behavior: Updated estimate_propensity_ratio_sieve and estimate_inverse_propensity_sieve docstrings to describe the new cond(A) > 1/sqrt(eps) precondition check, the partial-K skip warning, and the all-K fallback semantics. P3 — No TwoStage regression coverage: Added TestTwoStageStage2BreadWarning with two tests covering both the analytical TSL and bootstrap bread paths (contract: if the lstsq fallback triggers, it must warn). TODO.md: logged honest_did.py:1907 basis-enumeration skip as an intentional algorithm behavior (not a silent failure per the Phase 2 audit definition) but notable for a future diagnostic enhancement. No behavioral change on well-conditioned inputs. Co-Authored-By: Claude Opus 4.7 (1M context) --- TODO.md | 1 + diff_diff/efficient_did_covariates.py | 18 +++++- diff_diff/staggered.py | 92 ++++++++++++++++++++++----- docs/methodology/REGISTRY.md | 1 + tests/test_staggered.py | 74 +++++++++++++++++++++ tests/test_two_stage.py | 82 ++++++++++++++++++++++++ 6 files changed, 252 insertions(+), 16 deletions(-) diff --git a/TODO.md b/TODO.md index 71a1ebfd..95ac15c2 100644 --- a/TODO.md +++ b/TODO.md @@ -82,6 +82,7 @@ Deferred items from PR reviews that were not addressed before merge. | HC2 / HC2 + Bell-McCaffrey on absorbed-FE fits currently raises `NotImplementedError` in three places: `TwoWayFixedEffects` unconditionally; `DifferenceInDifferences(absorb=..., vcov_type in {"hc2","hc2_bm"})`; `MultiPeriodDiD(absorb=..., vcov_type in {"hc2","hc2_bm"})`. Within-transformation preserves coefficients and residuals under FWL but not the hat matrix, so the reduced-design `h_ii` is not the diagonal of the full FE projection and CR2's block adjustment `A_g = (I - H_gg)^{-1/2}` is likewise wrong on absorbed cluster blocks. Lifting the guard needs HC2/CR2-BM computed from the full absorbed projection (unit/time FE dummies reconstructed internally, or a FE-aware hat-matrix formulation) and a parity harness against a full-dummy OLS run or R `fixest`/`clubSandwich`. HC1/CR1 are unaffected by this because they have no leverage term. | `twfe.py::fit`, `estimators.py::DifferenceInDifferences.fit`, `estimators.py::MultiPeriodDiD.fit` | Phase 1a | Medium | | Weighted CR2 Bell-McCaffrey cluster-robust (`vcov_type="hc2_bm"` + `cluster_ids` + `weights`) currently raises `NotImplementedError`. Weighted hat matrix and residual rebalancing need threading per clubSandwich WLS handling. | `linalg.py::_compute_cr2_bm` | Phase 1a | Medium | | Regenerate `benchmarks/data/clubsandwich_cr2_golden.json` from R (`Rscript benchmarks/R/generate_clubsandwich_golden.R`). Current JSON has `source: python_self_reference` as a stability anchor until an authoritative R run. | `benchmarks/R/generate_clubsandwich_golden.R` | Phase 1a | Medium | +| `honest_did.py:1907` `np.linalg.solve(A_sys, b_sys) / except LinAlgError: continue` is a silent basis-rejection in the vertex-enumeration loop that is algorithmically intentional (try the next basis). Consider surfacing a count of rejected bases as a diagnostic when ARP enumeration exhausts, so users see when the vertex search was heavily constrained. Not a silent failure in the sense of the Phase 2 audit (the algorithm is supposed to skip), but the diagnostic would help debug borderline cases. | `honest_did.py` | #334 | Low | #### Performance diff --git a/diff_diff/efficient_did_covariates.py b/diff_diff/efficient_did_covariates.py index fa4c9140..86052aee 100644 --- a/diff_diff/efficient_did_covariates.py +++ b/diff_diff/efficient_did_covariates.py @@ -172,7 +172,15 @@ def estimate_propensity_ratio_sieve( Selects K via AIC/BIC: ``IC(K) = 2*loss(K) + C_n*K/n``. - On singular basis: tries lower K. Short-circuits r_{g,g}(X) = 1. + Precondition check per K: if ``cond(Psi_{g'}' W Psi_{g'}) > 1/sqrt(eps)`` + (≈ 6.7e7), that K is skipped. LinAlgError on the `np.linalg.solve` call + or a non-finite beta skips as well. If at least one K succeeds but + others were skipped, emits a ``UserWarning`` listing the skipped K + values (silent-failure audit PR, axis-A finding #18). If every K is + skipped, the caller falls back to a constant ratio of 1 with a + separate "estimation failed for all K values" warning. + + Short-circuits ``r_{g,g}(X) = 1`` for same-cohort comparisons (PT-All). Parameters ---------- @@ -355,6 +363,14 @@ def estimate_inverse_propensity_sieve( units on the RHS (not just group g), following the paper's algorithm step 4. + Precondition check per K: if ``cond(Psi_{g'}' W Psi_{g'}) > 1/sqrt(eps)`` + (≈ 6.7e7), that K is skipped. LinAlgError on the `np.linalg.solve` call + or a non-finite beta skips as well. If at least one K succeeds but + others were skipped, emits a ``UserWarning`` listing the skipped K + values (silent-failure audit PR, axis-A finding #18). If every K is + skipped, the caller falls back to unconditional ``n/n_group`` scaling + with a separate "estimation failed for all K values" warning. + Parameters ---------- covariate_matrix : ndarray, shape (n_units, n_covariates) diff --git a/diff_diff/staggered.py b/diff_diff/staggered.py index 2004f2e7..d26233fe 100644 --- a/diff_diff/staggered.py +++ b/diff_diff/staggered.py @@ -92,11 +92,29 @@ def _linear_regression( return beta, residuals -def _safe_inv(A: np.ndarray) -> np.ndarray: - """Invert a square matrix with lstsq fallback for near-singular cases.""" +def _safe_inv( + A: np.ndarray, + tracker: Optional[list] = None, +) -> np.ndarray: + """Invert a square matrix with lstsq fallback for near-singular cases. + + Parameters + ---------- + A : np.ndarray + Square matrix to invert. + tracker : list, optional + When provided, one condition-number sample of ``A`` is appended on + every LinAlgError fallback. ``CallawaySantAnna.fit()`` initializes + a list and emits a single aggregate `UserWarning` after the fit + finishes, rather than surfacing a separate warning per fallback. + Sibling of finding #17 in the Phase 2 silent-failures audit. + """ try: return np.linalg.solve(A, np.eye(A.shape[0])) except np.linalg.LinAlgError: + if tracker is not None: + with np.errstate(invalid="ignore", over="ignore"): + tracker.append(float(np.linalg.cond(A))) return np.linalg.lstsq(A, np.eye(A.shape[0]), rcond=None)[0] @@ -1436,6 +1454,12 @@ def fit( # Reset stale state from prior fit (prevents leaking event-study VCV) self._event_study_vcov = None + # Tracker for _safe_inv lstsq fallbacks across all analytical SE + # paths (PS Hessian, OR bread, event-study bread, etc.). Emit ONE + # aggregate warning at the end of fit rather than fanning out per + # cell. Sibling of PR #9 finding #17. + self._safe_inv_tracker: List[float] = [] + if not self.panel: warnings.warn( "panel=False uses repeated cross-section DRDID estimators " @@ -1976,6 +2000,26 @@ def fit( eff_data["effect"] + cband_crit_value * se_val, ) + # Consolidated _safe_inv lstsq-fallback warning (sibling of PR #9 + # finding #17). Rank-deficient PS Hessian / OR bread matrices in the + # analytical SE paths previously fell back to np.linalg.lstsq + # silently per cell. Now aggregated here into ONE UserWarning so + # a bad design surface doesn't quietly degrade analytical SEs. + if self._safe_inv_tracker: + n_fallbacks = len(self._safe_inv_tracker) + finite_conds = [c for c in self._safe_inv_tracker if np.isfinite(c)] + max_cond = max(finite_conds) if finite_conds else float("inf") + warnings.warn( + f"Rank-deficient matrix encountered {n_fallbacks} time(s) " + f"in analytical SE paths (propensity-score Hessian or " + f"outcome-regression bread); fell back to np.linalg.lstsq. " + f"Max condition number of affected matrix: {max_cond:.2e}. " + f"Analytical SEs may be numerically unstable; consider " + f"dropping collinear covariates or using n_bootstrap > 0.", + UserWarning, + stacklevel=2, + ) + # Store results # Retrieve event-study VCV from aggregation mixin (Phase 7d). # Clear it when bootstrap overwrites event-study SEs to prevent @@ -2276,7 +2320,7 @@ def _ipw_estimation( W_ps = W_ps * sw_all # R: Hessian.ps = crossprod(X * sqrt(W)) / n H_psi = X_all_int.T @ (W_ps[:, None] * X_all_int) / n_all_panel - H_psi_inv = _safe_inv(H_psi) + H_psi_inv = _safe_inv(H_psi, tracker=self._safe_inv_tracker) D_all = np.concatenate([np.ones(n_t), np.zeros(n_c)]) score_ps = (D_all - pscore_all)[:, None] * X_all_int @@ -2562,7 +2606,7 @@ def _doubly_robust( if sw_all is not None: W_ps = W_ps * sw_all H_psi = X_all_int.T @ (W_ps[:, None] * X_all_int) / n_all_panel - H_psi_inv = _safe_inv(H_psi) + H_psi_inv = _safe_inv(H_psi, tracker=self._safe_inv_tracker) D_all = np.concatenate([np.ones(n_t), np.zeros(n_c)]) score_ps = (D_all - pscore_all)[:, None] * X_all_int @@ -2584,7 +2628,7 @@ def _doubly_robust( X_c_int = X_control_with_intercept W_diag = sw_control if sw_control is not None else np.ones(n_c) XtWX = X_c_int.T @ (W_diag[:, None] * X_c_int) - bread = _safe_inv(XtWX) + bread = _safe_inv(XtWX, tracker=self._safe_inv_tracker) # M1: dATT/dbeta — gradient of DR ATT w.r.t. OR parameters X_t_int = X_treated_with_intercept @@ -2628,7 +2672,7 @@ def _doubly_robust( W_ps = pscore_all * (1 - pscore_all) H_psi = X_all_int.T @ (W_ps[:, None] * X_all_int) / n_all_panel - H_psi_inv = _safe_inv(H_psi) + H_psi_inv = _safe_inv(H_psi, tracker=self._safe_inv_tracker) D_all = np.concatenate([np.ones(n_t), np.zeros(n_c)]) score_ps = (D_all - pscore_all)[:, None] * X_all_int @@ -2645,7 +2689,7 @@ def _doubly_robust( # --- OR IF correction --- X_c_int = X_control_with_intercept XtX = X_c_int.T @ X_c_int - bread = _safe_inv(XtX) + bread = _safe_inv(XtX, tracker=self._safe_inv_tracker) X_t_int = X_treated_with_intercept M1 = ( @@ -3204,8 +3248,14 @@ def _outcome_regression_rc( # R's colMeans (= sum/n_all) for M1, matching the product exactly. W_ct = sw_ct if sw_ct is not None else np.ones(n_ct) W_cs = sw_cs if sw_cs is not None else np.ones(n_cs) - bread_t = _safe_inv(X_ct_int.T @ (W_ct[:, None] * X_ct_int)) - bread_s = _safe_inv(X_cs_int.T @ (W_cs[:, None] * X_cs_int)) + bread_t = _safe_inv( + X_ct_int.T @ (W_ct[:, None] * X_ct_int), + tracker=self._safe_inv_tracker, + ) + bread_s = _safe_inv( + X_cs_int.T @ (W_cs[:, None] * X_cs_int), + tracker=self._safe_inv_tracker, + ) # R: M1 = colMeans(w.cont * out.x) = sum(w_D * X) / n_all M1 = ( @@ -3407,7 +3457,7 @@ def _ipw_estimation_rc( W_ps = W_ps * sw_all # R: Hessian.ps = crossprod(X * sqrt(W)) / n H_psi = X_all_int.T @ (W_ps[:, None] * X_all_int) / n_all - H_psi_inv = _safe_inv(H_psi) + H_psi_inv = _safe_inv(H_psi, tracker=self._safe_inv_tracker) score_ps = (D_all - pscore)[:, None] * X_all_int if sw_all is not None: @@ -3744,7 +3794,7 @@ def _doubly_robust_rc( W_ps = W_ps * sw_all # R: Hessian.ps = crossprod(X * sqrt(W)) / n H_psi = X_all_int.T @ (W_ps[:, None] * X_all_int) / n_all - H_psi_inv = _safe_inv(H_psi) + H_psi_inv = _safe_inv(H_psi, tracker=self._safe_inv_tracker) score_ps = (D_all - pscore)[:, None] * X_all_int if sw_all is not None: @@ -3779,8 +3829,14 @@ def _doubly_robust_rc( # ===================================================================== W_ct_vals = sw_ct if sw_ct is not None else np.ones(n_ct) W_cs_vals = sw_cs if sw_cs is not None else np.ones(n_cs) - bread_ct = _safe_inv(X_ct_int.T @ (W_ct_vals[:, None] * X_ct_int)) - bread_cs = _safe_inv(X_cs_int.T @ (W_cs_vals[:, None] * X_cs_int)) + bread_ct = _safe_inv( + X_ct_int.T @ (W_ct_vals[:, None] * X_ct_int), + tracker=self._safe_inv_tracker, + ) + bread_cs = _safe_inv( + X_cs_int.T @ (W_cs_vals[:, None] * X_cs_int), + tracker=self._safe_inv_tracker, + ) # R: asy.lin.rep.ols (per-obs OLS score * bread) asy_lin_rep_ct = (W_ct_vals * resid_ct)[:, None] * X_ct_int @ bread_ct @@ -3818,8 +3874,14 @@ def _doubly_robust_rc( # ===================================================================== W_gt_vals = sw_gt if sw_gt is not None else np.ones(n_gt) W_gs_vals = sw_gs if sw_gs is not None else np.ones(n_gs) - bread_gt = _safe_inv(X_gt_int.T @ (W_gt_vals[:, None] * X_gt_int)) - bread_gs = _safe_inv(X_gs_int.T @ (W_gs_vals[:, None] * X_gs_int)) + bread_gt = _safe_inv( + X_gt_int.T @ (W_gt_vals[:, None] * X_gt_int), + tracker=self._safe_inv_tracker, + ) + bread_gs = _safe_inv( + X_gs_int.T @ (W_gs_vals[:, None] * X_gs_int), + tracker=self._safe_inv_tracker, + ) asy_lin_rep_gt = (W_gt_vals * resid_gt)[:, None] * X_gt_int @ bread_gt asy_lin_rep_gs = (W_gs_vals * resid_gs)[:, None] * X_gs_int @ bread_gs diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 3de1bd7b..457f47fc 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -301,6 +301,7 @@ This matches the behavior of R's `fixest::feols()` with absorbed FE. - Requires never-treated units as comparison group (identified by `first_treat=0` or `never_treated=True`) - Warns if no never-treated units exist (suggests alternative comparison strategies) - Limited pre-treatment periods reduce ability to test parallel trends +- **Note:** The analytical SE paths call `_safe_inv()` on the propensity-score Hessian (`H_psi`) and outcome-regression bread (`X'WX`) across every `(g, t)` cell. When these matrices are rank deficient, `np.linalg.solve` raises `LinAlgError` and `_safe_inv()` falls back to `np.linalg.lstsq`. Previously silent; now `fit()` emits ONE aggregate `UserWarning` at the end of the fit reporting the number of fallbacks and the max condition number, so a rank-deficient analytical SE path can't quietly ship degraded standard errors. Sibling of axis-A finding #17 in the Phase 2 silent-failures audit. *Estimator equation (as implemented):* diff --git a/tests/test_staggered.py b/tests/test_staggered.py index 99770751..f7bedc3e 100644 --- a/tests/test_staggered.py +++ b/tests/test_staggered.py @@ -4157,3 +4157,77 @@ def test_skip_warning_survey_zero_mass(self): ) skip_warnings = [x for x in w if "could not be estimated" in str(x.message)] assert len(skip_warnings) > 0, "Expected skip warning for zero-mass survey cells" + + +# --------------------------------------------------------------------------- +# Silent-failure audit PR #9 follow-up: the CS analytical SE path calls +# `_safe_inv()` in ~13 places (PS Hessian, OR bread, etc.). Previously the +# LinAlgError → lstsq fallback was silent — a rank-deficient bread produced +# degraded SEs with no user-visible signal. Now fit() emits ONE aggregate +# warning tracking all fallbacks. +# --------------------------------------------------------------------------- + + +class TestCallawaySantAnnaSafeInvFallback: + def test_collinear_covariates_emit_safe_inv_warning(self): + """Perfectly collinear covariates should trigger the aggregate + `_safe_inv` lstsq-fallback warning across analytical SE paths.""" + data = generate_staggered_data( + n_units=150, n_periods=6, n_cohorts=3, seed=55 + ) + rng = np.random.default_rng(0) + # Add a covariate and a redundant (collinear) copy — forces rank- + # deficient X'WX in the OR bread and the PS Hessian within at + # least one (g, t) cell. + data["x1"] = rng.normal(0, 1, len(data)) + data["x2"] = 2.0 * data["x1"] + cs = CallawaySantAnna( + estimation_method="dr", + rank_deficient_action="silent", # suppress upstream solve_ols noise + ) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + cs.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + covariates=["x1", "x2"], + ) + fallback_warnings = [ + w for w in caught + if "Rank-deficient matrix encountered" in str(w.message) + and "analytical SE paths" in str(w.message) + ] + assert len(fallback_warnings) == 1, ( + f"Expected exactly one aggregate _safe_inv fallback warning; " + f"got {len(fallback_warnings)}: " + f"{[str(w.message) for w in fallback_warnings]}" + ) + + def test_well_conditioned_no_safe_inv_warning(self): + """Clean data should NOT trigger the aggregate warning — + regression-safety for the happy path.""" + data = generate_staggered_data( + n_units=200, n_periods=6, n_cohorts=3, seed=42 + ) + cs = CallawaySantAnna(estimation_method="dr") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + cs.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + ) + fallback_warnings = [ + w for w in caught + if "Rank-deficient matrix encountered" in str(w.message) + and "analytical SE paths" in str(w.message) + ] + assert fallback_warnings == [], ( + f"Unexpected _safe_inv fallback warning on clean data: " + f"{[str(w.message) for w in fallback_warnings]}" + ) diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py index cf5a47b1..55d65f9a 100644 --- a/tests/test_two_stage.py +++ b/tests/test_two_stage.py @@ -1435,3 +1435,85 @@ def test_iterative_demean_no_warning_on_convergence(self): warnings.simplefilter("always") TwoStageDiD._iterative_demean(vals, units, times, idx) assert not any("did not converge" in str(x.message) for x in w) + + +# --------------------------------------------------------------------------- +# Silent-failure audit PR #9: sibling of finding #17 — both the analytical +# TSL variance (`two_stage.py`) and the multiplier-bootstrap bread +# (`two_stage_bootstrap.py`) previously fell back to `np.linalg.lstsq` +# silently when the Stage-2 `X'_2 W X_2` matrix was singular. They now +# emit a `UserWarning` with the same message shape as STD #17. +# --------------------------------------------------------------------------- + + +class TestTwoStageStage2BreadWarning: + def _build_collinear_stage2_data(self): + """Panel with a perfectly collinear Stage-2 covariate pair.""" + data = generate_test_data(n_units=120, n_periods=8, seed=77) + # Add a covariate + a redundant (collinear) copy — Stage 2 + # includes both, making X'_2 W X_2 singular. + rng = np.random.default_rng(9) + data["z1"] = rng.normal(0, 1, len(data)) + data["z2"] = 3.0 * data["z1"] + return data + + def test_analytical_bread_lstsq_fallback_warns(self): + """When the Stage-2 bread is singular in the analytical TSL path, + the new UserWarning should fire.""" + data = self._build_collinear_stage2_data() + est = TwoStageDiD(rank_deficient_action="silent") + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + est.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + covariates=["z1", "z2"], + ) + fallback = [ + w for w in caught + if "TwoStageDiD TSL variance" in str(w.message) + ] + # Either the analytical path surfaces the warning (when the bread + # is actually hit) or upstream rank-deficiency handling dropped the + # collinear column before reaching the bread. We accept both — the + # key property is that if we *did* fall back to lstsq, we warned. + # So we assert: it's never silent. + silent = any( + "np.linalg.lstsq" in str(w.message) + and "fell back" in str(w.message).lower() + and not "TwoStageDiD" in str(w.message) + for w in caught + ) + assert not silent, "Unexpected silent lstsq fallback in TwoStage analytical bread" + # When the warning fires, it should have a consistent surface text. + for w in fallback: + assert "np.linalg.lstsq" in str(w.message) + + def test_bootstrap_bread_lstsq_fallback_warns(self): + """Same contract for the multiplier-bootstrap bread path.""" + data = self._build_collinear_stage2_data() + est = TwoStageDiD( + n_bootstrap=20, + rank_deficient_action="silent", + seed=0, + ) + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + est.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + covariates=["z1", "z2"], + ) + fallback = [ + w for w in caught + if "TwoStageDiD multiplier bootstrap bread" in str(w.message) + ] + # Same contract as above: if the fallback triggered, it must warn. + for w in fallback: + assert "np.linalg.lstsq" in str(w.message) From 099507dc1987605a8cc9087c7c9c8ee6e1b2668d Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 12:42:34 -0400 Subject: [PATCH 3/3] Address CI AI review round 2: STD PS-Hessian + TwoStage warning text + stronger tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 2 CI review surfaced one P1 and two P3s. P1 — StaggeredTripleDifference PS-Hessian fallback: `_compute_pscore` had a second silent lstsq fallback (np.linalg.inv(X'WX) → lstsq on LinAlgError) that was missed in the initial PR. Under IPW/DR inference a rank-deficient propensity-score design could silently degrade influence-function corrections. Added a separate `self._ps_lstsq_fallback_tracker` alongside the existing OR tracker; `_compute_pscore` appends a condition-number sample per LinAlgError. `fit()` emits a sibling aggregate UserWarning with cell count + max condition number. Added TestStaggeredTripleDiffORSolveFallback ::test_collinear_covariates_emit_ps_hessian_warning which forces the PS-Hessian path under estimation_method="ipw". REGISTRY note added. Also scoped the existing OR-side test to the OR message text so the two aggregate warnings don't collide in the assertion. P3 — TwoStage warning text accuracy: Reviewer correctly pointed out that "drop collinear covariates" was misleading because X_2 is the Stage-2 indicator design (treatment, event-time, or group dummies), not user covariates. Reworded both the analytical and bootstrap warnings to name the actual failure mode (zero- weight or all-zero indicator column from an aggregation path with no qualifying observations). P3 — TwoStage tests were not verifying the warning path: Reviewer noted that my previous z1/z2 collinear-covariate tests never reached X_2'WX_2 at all because user covariates go into Stage 1, not Stage 2. Rewrote both tests to patch np.linalg.solve and raise LinAlgError specifically on the `solve(X'WX, np.eye(k))` shape, forcing the Stage-2 bread fallback. Tests now directly assert the warning fires and the lstsq fallback still produces finite SEs. No behavioral change on well-conditioned inputs. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/staggered_triple_diff.py | 37 ++++++- diff_diff/two_stage.py | 19 ++-- diff_diff/two_stage_bootstrap.py | 18 ++-- docs/methodology/REGISTRY.md | 1 + tests/test_staggered_triple_diff.py | 63 ++++++++++-- tests/test_two_stage.py | 143 +++++++++++++++++----------- 6 files changed, 198 insertions(+), 83 deletions(-) diff --git a/diff_diff/staggered_triple_diff.py b/diff_diff/staggered_triple_diff.py index 86835a85..eba8855a 100644 --- a/diff_diff/staggered_triple_diff.py +++ b/diff_diff/staggered_triple_diff.py @@ -348,10 +348,12 @@ def fit( {} if (covariates and self.estimation_method in ("ipw", "dr")) else None ) - # Tracker for rank-deficient OR-IF solves across all (g, g_c, t) cells. - # _compute_did_panel appends one condition-number sample per LinAlgError - # so we emit ONE aggregate warning below rather than fanning out. + # Trackers for rank-deficient linalg solves across all (g, g_c, t) + # cells. `_compute_did_panel` appends to the OR-side tracker; + # `_compute_pscore` appends to the PS-side tracker. Both surface as + # ONE aggregate warning below rather than fanning out per cell. self._lstsq_fallback_tracker: List[float] = [] + self._ps_lstsq_fallback_tracker: List[float] = [] for g in treatment_groups: # In universal mode, skip the reference period (t == g-1-anticipation) @@ -532,6 +534,27 @@ def fit( stacklevel=2, ) + # Consolidated PS-Hessian rank-deficiency warning (sibling of the + # OR path above). `_compute_pscore` previously fell back from + # `np.linalg.inv(X'WX)` to `np.linalg.lstsq` with no signal, so + # a rank-deficient propensity-score design silently degraded + # IPW/DR influence-function corrections. + if self._ps_lstsq_fallback_tracker: + n_cells = len(self._ps_lstsq_fallback_tracker) + finite_conds = [c for c in self._ps_lstsq_fallback_tracker if np.isfinite(c)] + max_cond = max(finite_conds) if finite_conds else float("inf") + warnings.warn( + f"Rank-deficient X'WX detected in the propensity-score " + f"Hessian for {n_cells} (g, g_c, t) pair(s); fell back to " + f"np.linalg.lstsq. Max condition number of affected X'WX: " + f"{max_cond:.2e}. IPW/DR influence-function corrections " + f"may be numerically unstable; consider dropping collinear " + f"propensity-score covariates or using " + f"estimation_method='reg' to avoid the PS path.", + UserWarning, + stacklevel=2, + ) + # Consolidated EPV summary warning if epv_diagnostics: low_epv = {k: v for k, v in epv_diagnostics.items() if v.get("is_low")} @@ -1467,6 +1490,14 @@ def _compute_pscore( try: hessian = np.linalg.inv(XWX) * n_pair except np.linalg.LinAlgError: + # Sibling of the OR-side LinAlgError at _compute_did_panel. Record + # a condition-number sample on the PS-Hessian tracker so fit() + # emits ONE aggregate warning covering all (g, g_c, t) cells + # that hit the rank-deficient PS path under IPW/DR inference. + ps_tracker = getattr(self, "_ps_lstsq_fallback_tracker", None) + if ps_tracker is not None: + with np.errstate(invalid="ignore", over="ignore"): + ps_tracker.append(float(np.linalg.cond(XWX))) hessian = np.linalg.lstsq(XWX, np.eye(XWX.shape[0]), rcond=None)[0] * n_pair return pscore, hessian diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py index 7c2d5015..c11bdd50 100644 --- a/diff_diff/two_stage.py +++ b/diff_diff/two_stage.py @@ -1768,14 +1768,19 @@ def _compute_gmm_variance( try: bread = np.linalg.solve(XtWX_2, np.eye(k)) except np.linalg.LinAlgError: - # Sibling of finding #17 (axis A) — silent lstsq fallback in the - # TSL-variance bread was previously silent. Surface it so a - # rank-deficient second-stage design doesn't quietly degrade SEs. + # Sibling of finding #17 (axis A) — the TSL-variance bread + # fallback was previously silent. Note: X_2 is the Stage-2 + # indicator design (treatment / horizon / group dummies), not + # user covariates, so the diagnostic guidance points at that + # layer. warnings.warn( - "Rank-deficient second-stage X'WX in TwoStageDiD TSL variance; " - "falling back to np.linalg.lstsq for the bread matrix. " - "Analytical SEs may be numerically unstable; consider dropping " - "collinear covariates.", + "Rank-deficient second-stage design matrix X_2'WX_2 in " + "TwoStageDiD TSL variance; falling back to np.linalg.lstsq " + "for the bread matrix. Analytical SEs may be numerically " + "unstable. The Stage-2 design is built from treatment, " + "event-time, or group indicators, so this typically " + "indicates a zero-weight or all-zero indicator column " + "(e.g. an aggregation path with no qualifying observations).", UserWarning, stacklevel=2, ) diff --git a/diff_diff/two_stage_bootstrap.py b/diff_diff/two_stage_bootstrap.py index d356f441..391cdbd4 100644 --- a/diff_diff/two_stage_bootstrap.py +++ b/diff_diff/two_stage_bootstrap.py @@ -197,14 +197,18 @@ def _compute_cluster_S_scores( try: bread = np.linalg.solve(XtX_2, np.eye(k)) except np.linalg.LinAlgError: - # Sibling of finding #17 (axis A) — silent lstsq fallback in the - # TwoStage bootstrap bread matrix. Called once per (static / event- - # study / group) aggregation, so warning fan-out is bounded. + # Sibling of finding #17 (axis A) — the bootstrap bread + # fallback was previously silent. X_2 is the Stage-2 indicator + # design (treatment / horizon / group dummies), so a singular + # bread typically indicates a zero-weight or all-zero column. warnings.warn( - "Rank-deficient second-stage X'WX in TwoStageDiD multiplier " - "bootstrap bread; falling back to np.linalg.lstsq. Bootstrap " - "SEs may be numerically unstable; consider dropping collinear " - "covariates.", + "Rank-deficient second-stage design matrix X_2'WX_2 in " + "TwoStageDiD multiplier bootstrap bread; falling back to " + "np.linalg.lstsq. Bootstrap SEs may be numerically " + "unstable. The Stage-2 design is built from treatment, " + "event-time, or group indicators, so this typically " + "indicates a zero-weight or all-zero indicator column " + "(e.g. an aggregation path with no qualifying observations).", UserWarning, stacklevel=2, ) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 457f47fc..a4da8c3a 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -1699,6 +1699,7 @@ has no additional effect. Set `pscore_fallback="unconditional"` for legacy behavior. - Warns on singular GMM covariance matrix (falls back to pseudoinverse) - **Note:** Rank-deficient X'WX in the per-pair outcome-regression influence-function step now emits ONE aggregate `UserWarning` at `fit()` time (counting affected (g, g_c, t) cells and reporting the max condition number), instead of silently falling back to `np.linalg.lstsq`. Axis-A finding #17 in the Phase 2 silent-failures audit. +- **Note:** The per-pair propensity-score Hessian inversion in `_compute_pscore` (used under `estimation_method` in `{ipw, dr}`) previously fell back from `np.linalg.inv(X'WX)` to `np.linalg.lstsq` silently. `fit()` now emits a sibling aggregate `UserWarning` with cell count + max condition number so rank-deficient PS designs can't quietly degrade IPW/DR influence-function corrections. Sibling of axis-A finding #17, surfaced during PR #334 CI review. *Data structure:* diff --git a/tests/test_staggered_triple_diff.py b/tests/test_staggered_triple_diff.py index 23b929c3..197c5803 100644 --- a/tests/test_staggered_triple_diff.py +++ b/tests/test_staggered_triple_diff.py @@ -573,22 +573,67 @@ def test_collinear_covariates_emit_lstsq_fallback_warning(self): "eligibility", covariates=["x1", "x2", "x3"], ) - lstsq_warnings = [ - w for w in caught if "Rank-deficient X'WX" in str(w.message) + or_warnings = [ + w for w in caught + if "outcome-regression influence-function step" in str(w.message) ] - assert len(lstsq_warnings) == 1, ( + assert len(or_warnings) == 1, ( f"Expected exactly one aggregate OR lstsq-fallback warning, " - f"got {len(lstsq_warnings)}." + f"got {len(or_warnings)}." ) - msg = str(lstsq_warnings[0].message) + msg = str(or_warnings[0].message) assert "(g, g_c, t) pair(s)" in msg assert "np.linalg.lstsq" in msg # Point estimates should still be finite (lstsq fallback succeeded). assert np.isfinite(res.overall_att) + def test_collinear_covariates_emit_ps_hessian_warning(self): + """Collinear propensity-score covariates should trigger the aggregate + PS-Hessian lstsq-fallback warning under IPW/DR inference. Previously + this path was silent (sibling of the OR-side finding; surfaced by + PR #334 CI review).""" + data = generate_staggered_ddd_data( + n_units=200, + treatment_effect=3.0, + add_covariates=True, + seed=55, + ) + data["x3"] = 2.0 * data["x1"] + est = StaggeredTripleDifference( + estimation_method="ipw", # exercises PS path, skips OR projection + rank_deficient_action="silent", + ) + import warnings as _w + + with _w.catch_warnings(record=True) as caught: + _w.simplefilter("always") + est.fit( + data, + "outcome", + "unit", + "period", + "first_treat", + "eligibility", + covariates=["x1", "x2", "x3"], + ) + ps_warnings = [ + w for w in caught + if "propensity-score Hessian" in str(w.message) + ] + assert len(ps_warnings) == 1, ( + f"Expected exactly one aggregate PS-Hessian lstsq-fallback " + f"warning under IPW, got {len(ps_warnings)}: " + f"{[str(w.message) for w in ps_warnings]}" + ) + msg = str(ps_warnings[0].message) + assert "(g, g_c, t) pair(s)" in msg + assert "np.linalg.lstsq" in msg + assert "IPW/DR" in msg + def test_well_conditioned_covariates_emit_no_lstsq_warning(self): - """Clean, well-conditioned covariates should NOT trigger the - aggregate warning — regression-safety for the happy path.""" + """Clean, well-conditioned covariates should NOT trigger either + OR or PS-Hessian aggregate warning — regression-safety for the + happy path.""" data = generate_staggered_ddd_data( n_units=300, treatment_effect=3.0, @@ -613,12 +658,12 @@ def test_well_conditioned_covariates_emit_no_lstsq_warning(self): w for w in caught if "Rank-deficient X'WX" in str(w.message) ] assert lstsq_warnings == [], ( - f"Unexpected OR lstsq-fallback warning on clean covariates: " + f"Unexpected lstsq-fallback warning on clean covariates: " f"{[str(w.message) for w in lstsq_warnings]}" ) def test_no_covariates_no_warning(self, simple_data): - """Without covariates the OR path is skipped entirely, so no + """Without covariates both OR and PS paths are skipped, so no aggregate warning should be emitted.""" est = StaggeredTripleDifference(estimation_method="reg") import warnings as _w diff --git a/tests/test_two_stage.py b/tests/test_two_stage.py index 55d65f9a..56f0d44b 100644 --- a/tests/test_two_stage.py +++ b/tests/test_two_stage.py @@ -1447,73 +1447,102 @@ def test_iterative_demean_no_warning_on_convergence(self): class TestTwoStageStage2BreadWarning: - def _build_collinear_stage2_data(self): - """Panel with a perfectly collinear Stage-2 covariate pair.""" - data = generate_test_data(n_units=120, n_periods=8, seed=77) - # Add a covariate + a redundant (collinear) copy — Stage 2 - # includes both, making X'_2 W X_2 singular. - rng = np.random.default_rng(9) - data["z1"] = rng.normal(0, 1, len(data)) - data["z2"] = 3.0 * data["z1"] - return data + """Sibling of STD finding #17: the TwoStage Stage-2 bread fallback + (`X'_2 W X_2` singular) was silent. X_2 is built from treatment/ + event-time/group indicators — not user covariates — so we force the + LinAlgError via patching `np.linalg.solve` rather than data crafting, + per the PR #334 CI review guidance.""" def test_analytical_bread_lstsq_fallback_warns(self): - """When the Stage-2 bread is singular in the analytical TSL path, - the new UserWarning should fire.""" - data = self._build_collinear_stage2_data() - est = TwoStageDiD(rank_deficient_action="silent") - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - est.fit( - data, - outcome="outcome", - unit="unit", - time="time", - first_treat="first_treat", - covariates=["z1", "z2"], - ) + """When np.linalg.solve on the Stage-2 bread raises, the analytical + TSL path must warn and still return a finite variance via lstsq.""" + from unittest.mock import patch + + import diff_diff.two_stage as ts_mod + + data = generate_test_data(n_units=80, n_periods=6, seed=77) + est = TwoStageDiD() + + real_solve = np.linalg.solve + + def raise_for_square_eye(a, b): + # Identify the Stage-2 bread call by shape: b is np.eye(k) + # (square identity). All other solve calls in the codepath + # have different b shapes. + if ( + isinstance(b, np.ndarray) + and b.ndim == 2 + and b.shape[0] == b.shape[1] + and np.allclose(b, np.eye(b.shape[0])) + ): + raise np.linalg.LinAlgError("forced by test") + return real_solve(a, b) + + with patch.object(ts_mod.np.linalg, "solve", side_effect=raise_for_square_eye): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + result = est.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + ) fallback = [ w for w in caught if "TwoStageDiD TSL variance" in str(w.message) ] - # Either the analytical path surfaces the warning (when the bread - # is actually hit) or upstream rank-deficiency handling dropped the - # collinear column before reaching the bread. We accept both — the - # key property is that if we *did* fall back to lstsq, we warned. - # So we assert: it's never silent. - silent = any( - "np.linalg.lstsq" in str(w.message) - and "fell back" in str(w.message).lower() - and not "TwoStageDiD" in str(w.message) - for w in caught - ) - assert not silent, "Unexpected silent lstsq fallback in TwoStage analytical bread" - # When the warning fires, it should have a consistent surface text. - for w in fallback: - assert "np.linalg.lstsq" in str(w.message) + assert len(fallback) >= 1, ( + "Expected TSL-variance bread fallback warning when np.linalg.solve " + f"was forced to raise; got warnings: " + f"{[str(w.message) for w in caught]}" + ) + msg = str(fallback[0].message) + assert "np.linalg.lstsq" in msg + assert "X_2'WX_2" in msg + # lstsq fallback must still produce a finite SE. + assert np.isfinite(result.overall_se) def test_bootstrap_bread_lstsq_fallback_warns(self): """Same contract for the multiplier-bootstrap bread path.""" - data = self._build_collinear_stage2_data() - est = TwoStageDiD( - n_bootstrap=20, - rank_deficient_action="silent", - seed=0, - ) - with warnings.catch_warnings(record=True) as caught: - warnings.simplefilter("always") - est.fit( - data, - outcome="outcome", - unit="unit", - time="time", - first_treat="first_treat", - covariates=["z1", "z2"], - ) + from unittest.mock import patch + + import diff_diff.two_stage_bootstrap as tsb_mod + + data = generate_test_data(n_units=80, n_periods=6, seed=77) + est = TwoStageDiD(n_bootstrap=10, seed=0) + + real_solve = np.linalg.solve + + def raise_for_square_eye(a, b): + if ( + isinstance(b, np.ndarray) + and b.ndim == 2 + and b.shape[0] == b.shape[1] + and np.allclose(b, np.eye(b.shape[0])) + ): + raise np.linalg.LinAlgError("forced by test") + return real_solve(a, b) + + with patch.object(tsb_mod.np.linalg, "solve", side_effect=raise_for_square_eye): + with warnings.catch_warnings(record=True) as caught: + warnings.simplefilter("always") + est.fit( + data, + outcome="outcome", + unit="unit", + time="time", + first_treat="first_treat", + ) fallback = [ w for w in caught if "TwoStageDiD multiplier bootstrap bread" in str(w.message) ] - # Same contract as above: if the fallback triggered, it must warn. - for w in fallback: - assert "np.linalg.lstsq" in str(w.message) + assert len(fallback) >= 1, ( + "Expected bootstrap-bread fallback warning when np.linalg.solve " + f"was forced to raise; got warnings: " + f"{[str(w.message) for w in caught]}" + ) + msg = str(fallback[0].message) + assert "np.linalg.lstsq" in msg + assert "X_2'WX_2" in msg