diff --git a/TODO.md b/TODO.md index 95ac15c2..8cc3475c 100644 --- a/TODO.md +++ b/TODO.md @@ -83,6 +83,9 @@ Deferred items from PR reviews that were not addressed before merge. | Weighted CR2 Bell-McCaffrey cluster-robust (`vcov_type="hc2_bm"` + `cluster_ids` + `weights`) currently raises `NotImplementedError`. Weighted hat matrix and residual rebalancing need threading per clubSandwich WLS handling. | `linalg.py::_compute_cr2_bm` | Phase 1a | Medium | | Regenerate `benchmarks/data/clubsandwich_cr2_golden.json` from R (`Rscript benchmarks/R/generate_clubsandwich_golden.R`). Current JSON has `source: python_self_reference` as a stability anchor until an authoritative R run. | `benchmarks/R/generate_clubsandwich_golden.R` | Phase 1a | Medium | | `honest_did.py:1907` `np.linalg.solve(A_sys, b_sys) / except LinAlgError: continue` is a silent basis-rejection in the vertex-enumeration loop that is algorithmically intentional (try the next basis). Consider surfacing a count of rejected bases as a diagnostic when ARP enumeration exhausts, so users see when the vertex search was heavily constrained. Not a silent failure in the sense of the Phase 2 audit (the algorithm is supposed to skip), but the diagnostic would help debug borderline cases. | `honest_did.py` | #334 | Low | +| `compute_synthetic_weights` backend algorithm mismatch: Rust path uses Frank-Wolfe (`_rust_synthetic_weights` in `utils.py:1184`); Python fallback uses projected gradient descent (`_compute_synthetic_weights_numpy` in `utils.py:1228`). Both solve the same constrained QP but converge to different simplex vertices on near-degenerate / extreme-scale inputs (e.g. `Y~1e9`, or near-singular `Y'Y`). Unified backend (one algorithm) would close the parity gap surfaced by audit finding #22. Two `@pytest.mark.xfail(strict=True)` tests in `tests/test_rust_backend.py::TestSyntheticWeightsBackendParity` baseline the divergence so we notice when/if the algorithms align. | `utils.py`, `rust/` | follow-up | Medium | +| TROP Rust vs Python grid-search divergence on rank-deficient Y: on two near-parallel control units, LOOCV grid-search ATT diverges ~6% between Rust (`trop_global.py:688`) and Python fallback (`trop_global.py:753`). Either grid-winner ties are broken differently or the per-λ solver reaches different stationary points under rank deficiency. Audit finding #23 flagged this surface. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_grid_search_rank_deficient_Y` baselines the gap. | `trop_global.py`, `rust/` | follow-up | Medium | +| TROP Rust vs Python bootstrap SE divergence under fixed seed: `seed=42` on a tiny panel produces ~28% bootstrap-SE gap. Root cause: Rust bootstrap uses its own RNG (`rand` crate) while Python uses `numpy.random.default_rng`; same seed value maps to different bytestreams across backends. Audit axis-H (RNG/seed) adjacent. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_bootstrap_seed_reproducibility` baselines the gap. Unifying RNG (threading a numpy-generated seed-sequence into Rust, or porting Python to ChaCha) would close it. | `trop_global.py`, `rust/` | follow-up | Medium | #### Performance diff --git a/tests/test_rust_backend.py b/tests/test_rust_backend.py index 4c316f7e..25ae10ef 100644 --- a/tests/test_rust_backend.py +++ b/tests/test_rust_backend.py @@ -2103,6 +2103,357 @@ def test_full_sdid_rust_vs_python(self): ) +# --------------------------------------------------------------------------- +# Silent-failure audit axis-G coverage (findings #21, #22, #23). +# The motivating incident (CS covariate scaling, early 2026) was a silent +# Rust-vs-Python divergence on rank-deficient / mixed-scale designs that +# sailed through the existing happy-path parity tests. These three classes +# extend backend-parity coverage to the edge cases the Phase-2 audit flagged. +# --------------------------------------------------------------------------- + + +@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") +class TestSolveOLSSkipRankCheckParity: + """Finding #21: `solve_ols(..., skip_rank_check=True)` parity. + + Both backends skip the pivoted-QR rank check and trust the caller's + full-rank assertion. Rust uses SVD; Python uses scipy.linalg.lstsq + with ``cond=1e-7``. Parity is expected in well-conditioned cases, but + near-singular and mixed-scale inputs could divergence on singular- + value truncation ordering between LAPACK backends. + + Rust dispatch is gated by ``vcov_type='hc1'`` and ``weights is None`` + (linalg.py:621-634), so tests scope to that intersection — the only + path where Rust runs under ``skip_rank_check=True``. + + Assertions target fitted values (X @ beta), not beta directly: for + rank-deficient designs, kept-column beta values depend on pivot order + while fitted values are backend-invariant. + """ + + def _run_both_backends(self, X, y): + """Call solve_ols with skip_rank_check=True under Rust and under + forced-Python, return (coef_rust, coef_py).""" + import sys + from unittest.mock import patch + + from diff_diff.linalg import solve_ols + + coef_rust, resid_rust, _ = solve_ols( + X, y, skip_rank_check=True, vcov_type="hc1", return_vcov=False + ) + + linalg_module = sys.modules["diff_diff.linalg"] + with patch.object(linalg_module, "HAS_RUST_BACKEND", False), \ + patch.object(linalg_module, "_rust_solve_ols", None): + coef_py, resid_py, _ = solve_ols( + X, y, skip_rank_check=True, vcov_type="hc1", return_vcov=False + ) + return coef_rust, coef_py + + def test_mixed_scale_full_rank(self): + """Full-rank X with column-norm ratio > 1e6. Both SVD backends + should truncate the same singular values.""" + rng = np.random.default_rng(11) + n, k = 80, 3 + X = np.column_stack([ + rng.normal(0, 1, n), # unit scale + rng.normal(0, 1e6, n), # 1e6 scale + rng.normal(0, 1, n), # unit scale + ]) + y = 1.0 + 0.5 * X[:, 0] + 1e-7 * X[:, 1] + 0.3 * X[:, 2] + rng.normal(0, 0.1, n) + + coef_rust, coef_py = self._run_both_backends(X, y) + + fitted_rust = X @ coef_rust + fitted_py = X @ coef_py + np.testing.assert_allclose( + fitted_rust, fitted_py, rtol=1e-6, atol=1e-8, + err_msg="Rust vs Python fitted-value divergence on mixed-scale X", + ) + + def test_near_singular_full_rank(self): + """Near-collinear full-rank X (cond(X'X) > 1e10). Backends use + the same SVD threshold (cond=1e-7), so should agree on truncation.""" + rng = np.random.default_rng(22) + n = 80 + x1 = rng.normal(0, 1, n) + x2 = x1 + rng.normal(0, 1e-6, n) # nearly parallel to x1 + x3 = rng.normal(0, 1, n) + X = np.column_stack([x1, x2, x3]) + y = 1.0 + 0.5 * x1 - 0.3 * x3 + rng.normal(0, 0.1, n) + + coef_rust, coef_py = self._run_both_backends(X, y) + + fitted_rust = X @ coef_rust + fitted_py = X @ coef_py + np.testing.assert_allclose( + fitted_rust, fitted_py, rtol=1e-6, atol=1e-8, + err_msg="Rust vs Python fitted-value divergence on near-singular X", + ) + + def test_rank_deficient_collinear(self): + """Perfectly collinear columns. ``skip_rank_check=True`` bypasses + the QR detector; both backends must still produce matching fitted + values via minimum-norm SVD solve, even if individual coefficients + differ on dropped columns.""" + rng = np.random.default_rng(33) + n = 80 + x1 = rng.normal(0, 1, n) + X = np.column_stack([x1, 2.0 * x1, rng.normal(0, 1, n)]) # x2 ≡ 2*x1 + y = 1.0 + 0.5 * x1 + 0.3 * X[:, 2] + rng.normal(0, 0.1, n) + + coef_rust, coef_py = self._run_both_backends(X, y) + + fitted_rust = X @ coef_rust + fitted_py = X @ coef_py + # Fitted values are the backend-invariant object under rank deficiency. + np.testing.assert_allclose( + fitted_rust, fitted_py, rtol=1e-6, atol=1e-8, + err_msg="Rust vs Python fitted-value divergence on rank-deficient X", + ) + + +@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") +class TestSyntheticWeightsBackendParity: + """Finding #22: `compute_synthetic_weights` Rust vs Python parity. + + Both backends solve the same constrained QP (FW + simplex projection) + with ``_OPTIMIZATION_TOL=1e-8``. PR #312 normalized Y inside the SDID + fit path; the shared utility at ``utils.py:1134-1199`` is unchanged + and is called from ``prep.py:990`` (DGP) only. + + Tolerance: ``atol=1e-7, rtol=1e-6`` on weights — loose enough to + absorb the FW convergence tolerance (1e-8) plus BLAS platform drift + (~1e-12 to 1e-14 on float64), tight enough to catch real algorithmic + divergence. + """ + + def _run_both_backends(self, Y_control, Y_treated, lambda_reg=0.0): + import sys + from unittest.mock import patch + + from diff_diff.utils import compute_synthetic_weights + + w_rust = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=lambda_reg) + + utils_module = sys.modules["diff_diff.utils"] + with patch.object(utils_module, "HAS_RUST_BACKEND", False): + w_py = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=lambda_reg) + return w_rust, w_py + + def test_near_singular_Y_prime_Y(self): + """Highly collinear control units make Y'Y near-singular. Both + backends should converge to the same (near-uniform or zeroed-out) + weight vector.""" + rng = np.random.default_rng(7) + n_pre, n_control = 5, 4 + base = rng.normal(0, 1, n_pre) + # Three of four control units are nearly parallel to `base` + Y_control = np.column_stack([ + base, + base + 1e-10 * rng.normal(0, 1, n_pre), + base + 1e-10 * rng.normal(0, 1, n_pre), + rng.normal(0, 1, n_pre), + ]) + Y_treated = base + 1e-8 * rng.normal(0, 1, n_pre) + + w_rust, w_py = self._run_both_backends(Y_control, Y_treated) + + np.testing.assert_allclose( + w_rust, w_py, atol=1e-7, rtol=1e-6, + err_msg="Near-singular Y'Y: weight divergence between Rust and Python", + ) + + @pytest.mark.xfail( + strict=True, + reason="Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is " + "projected gradient descent. PGD and FW can converge to different " + "vertices under near-degenerate / extreme-scale inputs even though " + "both solve the same constrained QP. Algorithmic unification is a " + "larger fix (P1 follow-up in TODO.md); this xfail baselines the gap " + "so we notice if/when the algorithms align.", + ) + def test_extreme_Y_scale(self): + """Y at 1e9 scale, well-conditioned panel. Known Rust-vs-Python + divergence: Rust/FW concentrates on one unit, Python/PGD returns + near-uniform. See xfail reason.""" + rng = np.random.default_rng(44) + n_pre, n_control = 8, 5 + Y_control = rng.normal(0, 1, (n_pre, n_control)) + 1e9 + Y_treated = rng.normal(0, 1, n_pre) + 1e9 + + w_rust, w_py = self._run_both_backends(Y_control, Y_treated) + + np.testing.assert_allclose( + w_rust, w_py, atol=1e-7, rtol=1e-6, + err_msg="Extreme Y scale: weight divergence " + "(known FW-vs-PGD algorithmic mismatch; see xfail reason)", + ) + + @pytest.mark.xfail( + strict=True, + reason="Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is " + "projected gradient descent. Same algorithmic mismatch as " + "test_extreme_Y_scale — backends reach different simplex vertices " + "on moderate-scale random data. P1 follow-up in TODO.md.", + ) + def test_regularization_lambda_variations(self): + """Parity across lambda_reg=0 (unregularized) and lambda_reg>0 + (shrink toward uniform). Known divergence; see xfail reason.""" + rng = np.random.default_rng(55) + n_pre, n_control = 8, 5 + Y_control = rng.normal(0, 1, (n_pre, n_control)) + Y_treated = rng.normal(0, 1, n_pre) + + for lr in (0.0, 0.01, 1.0): + w_rust, w_py = self._run_both_backends(Y_control, Y_treated, lambda_reg=lr) + np.testing.assert_allclose( + w_rust, w_py, atol=1e-7, rtol=1e-6, + err_msg=f"lambda_reg={lr}: weight divergence between Rust and Python", + ) + + +@pytest.mark.slow +@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available") +class TestTROPRustEdgeCaseParity: + """Finding #23: TROP Rust grid-search + bootstrap parity on edge cases. + + Existing parity tests (this file, ~line 1687 / 1757) compare ATT and + effect dictionaries on well-conditioned random data at ``atol=1e-6``. + Gap: rank-deficient Y on the grid-search path, seed reproducibility + on the bootstrap path. + + Sizing kept minimal (n_units=6, n_periods=5–6) per the + `feedback_trop_heavy_tests` memory. + """ + + @staticmethod + def _make_correlated_panel(n_units=6, n_periods=5, n_treated=2): + """Panel with two control units nearly parallel to each other, + making the pre-period Y matrix near rank-deficient.""" + import pandas as pd + rng = np.random.default_rng(13) + data = [] + shared_path = rng.normal(0, 1, n_periods) + for i in range(n_units): + is_treated = i < n_treated + if i in (n_treated, n_treated + 1): + # Two control units share a near-identical trajectory + base = shared_path + 1e-10 * rng.normal(0, 1, n_periods) + else: + base = rng.normal(0, 1, n_periods) + for t in range(n_periods): + y = 5.0 + i * 0.3 + base[t] + treated = 1 if (is_treated and t >= n_periods - 2) else 0 + if treated: + y += 1.5 + data.append({"unit": i, "time": t, "outcome": y, "treated": treated}) + return pd.DataFrame(data) + + @pytest.mark.xfail( + strict=True, + reason="TROP Rust grid-search at trop_global.py:688 and Python fallback " + "at trop_global.py:753 diverge on rank-deficient Y: empirical ATT " + "gap ~6% on two near-parallel control units. Either (a) grid-winner " + "ties break differently between backends, or (b) the per-λ solver " + "itself reaches different stationary points under rank deficiency. " + "Finding #23 flagged this exact surface as the Phase-2 gap. Rust " + "vs Python unification is a P1 follow-up (TODO.md). This xfail " + "baselines the divergence so we notice when/if the backends align.", + ) + def test_grid_search_rank_deficient_Y(self): + """Grid-search ATT parity on rank-deficient Y. + + Known to fail: two near-parallel control units produce ~6% ATT + divergence between Rust and Python LOOCV grid-search. See xfail + reason for follow-up plan. + """ + import sys + from unittest.mock import patch + + from diff_diff import TROP + + df = self._make_correlated_panel() + # n_bootstrap>=2 is required by TROP.__init__; we set the minimum. + # Bootstrap SE is NOT asserted here (see separate test below + + # xfail baseline for the RNG-algorithm mismatch between backends). + trop_params = dict( + method="global", + lambda_time_grid=[0.1, 1.0, 10.0], + lambda_unit_grid=[0.1, 1.0, 10.0], + lambda_nn_grid=[np.inf], + n_bootstrap=2, + seed=42, + ) + + trop_rust = TROP(**trop_params) + res_rust = trop_rust.fit(df.copy(), "outcome", "treated", "unit", "time") + + trop_global_module = sys.modules["diff_diff.trop_global"] + with patch.object(trop_global_module, "HAS_RUST_BACKEND", False), \ + patch.object(trop_global_module, "_rust_loocv_grid_search_global", None), \ + patch.object(trop_global_module, "_rust_bootstrap_trop_variance_global", None): + trop_py = TROP(**trop_params) + res_py = trop_py.fit(df.copy(), "outcome", "treated", "unit", "time") + + # Primary assertion: the ATT point estimate at the chosen λ matches. + # This catches both (a) same λ chosen and (b) tied λ producing same fit. + np.testing.assert_allclose( + res_rust.att, res_py.att, atol=1e-6, + err_msg="Grid-search ATT divergence on rank-deficient Y: " + f"Rust={res_rust.att:.8f}, Python={res_py.att:.8f}", + ) + + @pytest.mark.xfail( + strict=True, + reason="Rust bootstrap uses its own RNG (rand crate) while Python uses " + "numpy's default_rng; same `seed=42` produces different bootstrap " + "replicate indices. Empirical SE divergence ~28%. Unifying the RNG " + "across backends is a P1 follow-up (TODO.md). This xfail baselines " + "the gap so we notice if/when the backends share an RNG seed-to-" + "bytestream mapping. Grid-search ATT parity is validated in the " + "previous test (rank-deficient Y path).", + ) + def test_bootstrap_seed_reproducibility(self): + """Bootstrap SE parity under a fixed seed. + + Known to fail: Rust and Python use different RNG backends, so + identical ``seed=42`` produces different bootstrap replicates. + See xfail reason for follow-up plan. + """ + import sys + from unittest.mock import patch + + from diff_diff import TROP + + df = self._make_correlated_panel(n_units=6, n_periods=6, n_treated=2) + trop_params = dict( + method="global", + lambda_time_grid=[1.0], + lambda_unit_grid=[1.0], + lambda_nn_grid=[np.inf], + n_bootstrap=10, + seed=42, + ) + + trop_rust = TROP(**trop_params) + res_rust = trop_rust.fit(df.copy(), "outcome", "treated", "unit", "time") + + trop_global_module = sys.modules["diff_diff.trop_global"] + with patch.object(trop_global_module, "HAS_RUST_BACKEND", False), \ + patch.object(trop_global_module, "_rust_loocv_grid_search_global", None), \ + patch.object(trop_global_module, "_rust_bootstrap_trop_variance_global", None): + trop_py = TROP(**trop_params) + res_py = trop_py.fit(df.copy(), "outcome", "treated", "unit", "time") + + np.testing.assert_allclose( + res_rust.se, res_py.se, atol=1e-8, + err_msg=f"Bootstrap SE divergence under seed=42: " + f"Rust={res_rust.se:.10f}, Python={res_py.se:.10f}", + ) + + class TestFallbackWhenNoRust: """Test that pure Python fallback works when Rust is unavailable."""