Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ Deferred items from PR reviews that were not addressed before merge.
| Weighted CR2 Bell-McCaffrey cluster-robust (`vcov_type="hc2_bm"` + `cluster_ids` + `weights`) currently raises `NotImplementedError`. Weighted hat matrix and residual rebalancing need threading per clubSandwich WLS handling. | `linalg.py::_compute_cr2_bm` | Phase 1a | Medium |
| Regenerate `benchmarks/data/clubsandwich_cr2_golden.json` from R (`Rscript benchmarks/R/generate_clubsandwich_golden.R`). Current JSON has `source: python_self_reference` as a stability anchor until an authoritative R run. | `benchmarks/R/generate_clubsandwich_golden.R` | Phase 1a | Medium |
| `honest_did.py:1907` `np.linalg.solve(A_sys, b_sys) / except LinAlgError: continue` is a silent basis-rejection in the vertex-enumeration loop that is algorithmically intentional (try the next basis). Consider surfacing a count of rejected bases as a diagnostic when ARP enumeration exhausts, so users see when the vertex search was heavily constrained. Not a silent failure in the sense of the Phase 2 audit (the algorithm is supposed to skip), but the diagnostic would help debug borderline cases. | `honest_did.py` | #334 | Low |
| `compute_synthetic_weights` backend algorithm mismatch: Rust path uses Frank-Wolfe (`_rust_synthetic_weights` in `utils.py:1184`); Python fallback uses projected gradient descent (`_compute_synthetic_weights_numpy` in `utils.py:1228`). Both solve the same constrained QP but converge to different simplex vertices on near-degenerate / extreme-scale inputs (e.g. `Y~1e9`, or near-singular `Y'Y`). Unified backend (one algorithm) would close the parity gap surfaced by audit finding #22. Two `@pytest.mark.xfail(strict=True)` tests in `tests/test_rust_backend.py::TestSyntheticWeightsBackendParity` baseline the divergence so we notice when/if the algorithms align. | `utils.py`, `rust/` | follow-up | Medium |
| TROP Rust vs Python grid-search divergence on rank-deficient Y: on two near-parallel control units, LOOCV grid-search ATT diverges ~6% between Rust (`trop_global.py:688`) and Python fallback (`trop_global.py:753`). Either grid-winner ties are broken differently or the per-λ solver reaches different stationary points under rank deficiency. Audit finding #23 flagged this surface. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_grid_search_rank_deficient_Y` baselines the gap. | `trop_global.py`, `rust/` | follow-up | Medium |
| TROP Rust vs Python bootstrap SE divergence under fixed seed: `seed=42` on a tiny panel produces ~28% bootstrap-SE gap. Root cause: Rust bootstrap uses its own RNG (`rand` crate) while Python uses `numpy.random.default_rng`; same seed value maps to different bytestreams across backends. Audit axis-H (RNG/seed) adjacent. `@pytest.mark.xfail(strict=True)` in `tests/test_rust_backend.py::TestTROPRustEdgeCaseParity::test_bootstrap_seed_reproducibility` baselines the gap. Unifying RNG (threading a numpy-generated seed-sequence into Rust, or porting Python to ChaCha) would close it. | `trop_global.py`, `rust/` | follow-up | Medium |

#### Performance

Expand Down
351 changes: 351 additions & 0 deletions tests/test_rust_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -2103,6 +2103,357 @@ def test_full_sdid_rust_vs_python(self):
)


# ---------------------------------------------------------------------------
# Silent-failure audit axis-G coverage (findings #21, #22, #23).
# The motivating incident (CS covariate scaling, early 2026) was a silent
# Rust-vs-Python divergence on rank-deficient / mixed-scale designs that
# sailed through the existing happy-path parity tests. These three classes
# extend backend-parity coverage to the edge cases the Phase-2 audit flagged.
# ---------------------------------------------------------------------------


@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available")
class TestSolveOLSSkipRankCheckParity:
    """Finding #21: `solve_ols(..., skip_rank_check=True)` parity.

    Both backends skip the pivoted-QR rank check and trust the caller's
    full-rank assertion. Rust uses SVD; Python uses scipy.linalg.lstsq
    with ``cond=1e-7``. Parity is expected in well-conditioned cases, but
    near-singular and mixed-scale inputs could diverge on singular-
    value truncation ordering between LAPACK backends.

    Rust dispatch is gated by ``vcov_type='hc1'`` and ``weights is None``
    (linalg.py:621-634), so tests scope to that intersection — the only
    path where Rust runs under ``skip_rank_check=True``.

    Assertions target fitted values (X @ beta), not beta directly: for
    rank-deficient designs, kept-column beta values depend on pivot order
    while fitted values are backend-invariant.
    """

    def _run_both_backends(self, X, y):
        """Call solve_ols with skip_rank_check=True under Rust and under
        forced-Python, return (coef_rust, coef_py)."""
        import sys
        from unittest.mock import patch

        from diff_diff.linalg import solve_ols

        # Residuals and vcov are irrelevant to the parity assertion;
        # discard them explicitly.
        coef_rust, _, _ = solve_ols(
            X, y, skip_rank_check=True, vcov_type="hc1", return_vcov=False
        )

        # Hide both the backend flag and the Rust entry point from the
        # already-imported module to force the pure-Python path.
        linalg_module = sys.modules["diff_diff.linalg"]
        with patch.object(linalg_module, "HAS_RUST_BACKEND", False), \
             patch.object(linalg_module, "_rust_solve_ols", None):
            coef_py, _, _ = solve_ols(
                X, y, skip_rank_check=True, vcov_type="hc1", return_vcov=False
            )
        return coef_rust, coef_py

    def _assert_fitted_parity(self, X, y, err_msg):
        """Run both backends on (X, y) and assert fitted-value parity.

        Fitted values (X @ beta) are the backend-invariant object: under
        rank deficiency, individual coefficients depend on pivot/truncation
        order while the projection onto col(X) does not.
        """
        coef_rust, coef_py = self._run_both_backends(X, y)
        np.testing.assert_allclose(
            X @ coef_rust, X @ coef_py, rtol=1e-6, atol=1e-8,
            err_msg=err_msg,
        )

    def test_mixed_scale_full_rank(self):
        """Full-rank X with column-norm ratio > 1e6. Both SVD backends
        should truncate the same singular values."""
        rng = np.random.default_rng(11)
        n = 80
        X = np.column_stack([
            rng.normal(0, 1, n),    # unit scale
            rng.normal(0, 1e6, n),  # 1e6 scale
            rng.normal(0, 1, n),    # unit scale
        ])
        y = 1.0 + 0.5 * X[:, 0] + 1e-7 * X[:, 1] + 0.3 * X[:, 2] + rng.normal(0, 0.1, n)

        self._assert_fitted_parity(
            X, y, "Rust vs Python fitted-value divergence on mixed-scale X"
        )

    def test_near_singular_full_rank(self):
        """Near-collinear full-rank X (cond(X'X) > 1e10). Backends use
        the same SVD threshold (cond=1e-7), so should agree on truncation."""
        rng = np.random.default_rng(22)
        n = 80
        x1 = rng.normal(0, 1, n)
        x2 = x1 + rng.normal(0, 1e-6, n)  # nearly parallel to x1
        x3 = rng.normal(0, 1, n)
        X = np.column_stack([x1, x2, x3])
        y = 1.0 + 0.5 * x1 - 0.3 * x3 + rng.normal(0, 0.1, n)

        self._assert_fitted_parity(
            X, y, "Rust vs Python fitted-value divergence on near-singular X"
        )

    def test_rank_deficient_collinear(self):
        """Perfectly collinear columns. ``skip_rank_check=True`` bypasses
        the QR detector; both backends must still produce matching fitted
        values via minimum-norm SVD solve, even if individual coefficients
        differ on dropped columns."""
        rng = np.random.default_rng(33)
        n = 80
        x1 = rng.normal(0, 1, n)
        X = np.column_stack([x1, 2.0 * x1, rng.normal(0, 1, n)])  # x2 ≡ 2*x1
        y = 1.0 + 0.5 * x1 + 0.3 * X[:, 2] + rng.normal(0, 0.1, n)

        self._assert_fitted_parity(
            X, y, "Rust vs Python fitted-value divergence on rank-deficient X"
        )


@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available")
class TestSyntheticWeightsBackendParity:
    """Finding #22: `compute_synthetic_weights` Rust vs Python parity.

    Both backends solve the same constrained QP with
    ``_OPTIMIZATION_TOL=1e-8``, but via different algorithms: the Rust
    path uses Frank-Wolfe while the Python fallback (utils.py:1228) uses
    projected gradient descent — see the xfail reasons below. PR #312
    normalized Y inside the SDID fit path; the shared utility at
    ``utils.py:1134-1199`` is unchanged and is called from
    ``prep.py:990`` (DGP) only.

    Tolerance: ``atol=1e-7, rtol=1e-6`` on weights — loose enough to
    absorb the optimizer convergence tolerance (1e-8) plus BLAS platform
    drift (~1e-12 to 1e-14 on float64), tight enough to catch real
    algorithmic divergence.
    """

    def _run_both_backends(self, Y_control, Y_treated, lambda_reg=0.0):
        """Compute weights under Rust and under forced-Python fallback,
        return (w_rust, w_py)."""
        import sys
        from unittest.mock import patch

        from diff_diff.utils import compute_synthetic_weights

        w_rust = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=lambda_reg)

        # Hide the backend flag from the already-imported module so the
        # pure-Python implementation runs.
        utils_module = sys.modules["diff_diff.utils"]
        with patch.object(utils_module, "HAS_RUST_BACKEND", False):
            w_py = compute_synthetic_weights(Y_control, Y_treated, lambda_reg=lambda_reg)
        return w_rust, w_py

    def test_near_singular_Y_prime_Y(self):
        """Highly collinear control units make Y'Y near-singular. Both
        backends should converge to the same (near-uniform or zeroed-out)
        weight vector."""
        rng = np.random.default_rng(7)
        n_pre = 5
        base = rng.normal(0, 1, n_pre)
        # Three of four control units are nearly parallel to `base`
        Y_control = np.column_stack([
            base,
            base + 1e-10 * rng.normal(0, 1, n_pre),
            base + 1e-10 * rng.normal(0, 1, n_pre),
            rng.normal(0, 1, n_pre),
        ])
        Y_treated = base + 1e-8 * rng.normal(0, 1, n_pre)

        w_rust, w_py = self._run_both_backends(Y_control, Y_treated)

        np.testing.assert_allclose(
            w_rust, w_py, atol=1e-7, rtol=1e-6,
            err_msg="Near-singular Y'Y: weight divergence between Rust and Python",
        )

    @pytest.mark.xfail(
        strict=True,
        reason="Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is "
        "projected gradient descent. PGD and FW can converge to different "
        "vertices under near-degenerate / extreme-scale inputs even though "
        "both solve the same constrained QP. Algorithmic unification is a "
        "larger fix (P1 follow-up in TODO.md); this xfail baselines the gap "
        "so we notice if/when the algorithms align.",
    )
    def test_extreme_Y_scale(self):
        """Y at 1e9 scale, well-conditioned panel. Known Rust-vs-Python
        divergence: Rust/FW concentrates on one unit, Python/PGD returns
        near-uniform. See xfail reason."""
        rng = np.random.default_rng(44)
        n_pre, n_control = 8, 5
        Y_control = rng.normal(0, 1, (n_pre, n_control)) + 1e9
        Y_treated = rng.normal(0, 1, n_pre) + 1e9

        w_rust, w_py = self._run_both_backends(Y_control, Y_treated)

        np.testing.assert_allclose(
            w_rust, w_py, atol=1e-7, rtol=1e-6,
            err_msg="Extreme Y scale: weight divergence "
            "(known FW-vs-PGD algorithmic mismatch; see xfail reason)",
        )

    @pytest.mark.xfail(
        strict=True,
        reason="Rust path is Frank-Wolfe, Python fallback at utils.py:1228 is "
        "projected gradient descent. Same algorithmic mismatch as "
        "test_extreme_Y_scale — backends reach different simplex vertices "
        "on moderate-scale random data. P1 follow-up in TODO.md.",
    )
    def test_regularization_lambda_variations(self):
        """Parity across lambda_reg=0 (unregularized) and lambda_reg>0
        (shrink toward uniform). Known divergence; see xfail reason."""
        rng = np.random.default_rng(55)
        n_pre, n_control = 8, 5
        Y_control = rng.normal(0, 1, (n_pre, n_control))
        Y_treated = rng.normal(0, 1, n_pre)

        for lr in (0.0, 0.01, 1.0):
            w_rust, w_py = self._run_both_backends(Y_control, Y_treated, lambda_reg=lr)
            np.testing.assert_allclose(
                w_rust, w_py, atol=1e-7, rtol=1e-6,
                err_msg=f"lambda_reg={lr}: weight divergence between Rust and Python",
            )


@pytest.mark.slow
@pytest.mark.skipif(not HAS_RUST_BACKEND, reason="Rust backend not available")
class TestTROPRustEdgeCaseParity:
    """Finding #23: TROP Rust grid-search + bootstrap parity on edge cases.

    Existing parity tests (this file, ~line 1687 / 1757) compare ATT and
    effect dictionaries on well-conditioned random data at ``atol=1e-6``.
    Gap: rank-deficient Y on the grid-search path, seed reproducibility
    on the bootstrap path.

    Sizing kept minimal (n_units=6, n_periods=5–6) per the
    `feedback_trop_heavy_tests` memory.
    """

    @staticmethod
    def _make_correlated_panel(n_units=6, n_periods=5, n_treated=2):
        """Panel with two control units nearly parallel to each other,
        making the pre-period Y matrix near rank-deficient."""
        import pandas as pd
        rng = np.random.default_rng(13)
        shared_path = rng.normal(0, 1, n_periods)
        rows = []
        for i in range(n_units):
            is_treated = i < n_treated
            if i in (n_treated, n_treated + 1):
                # Two control units share a near-identical trajectory
                base = shared_path + 1e-10 * rng.normal(0, 1, n_periods)
            else:
                base = rng.normal(0, 1, n_periods)
            for t in range(n_periods):
                y = 5.0 + i * 0.3 + base[t]
                treated = 1 if (is_treated and t >= n_periods - 2) else 0
                if treated:
                    y += 1.5
                rows.append({"unit": i, "time": t, "outcome": y, "treated": treated})
        return pd.DataFrame(rows)

    @staticmethod
    def _fit_both_backends(df, trop_params):
        """Fit TROP under the Rust backend and under forced-Python,
        return (res_rust, res_py). Shared by both parity tests below."""
        import sys
        from unittest.mock import patch

        from diff_diff import TROP

        res_rust = TROP(**trop_params).fit(
            df.copy(), "outcome", "treated", "unit", "time"
        )

        # Hide the backend flag and both Rust entry points from the
        # already-imported module to force the pure-Python fallback.
        trop_global_module = sys.modules["diff_diff.trop_global"]
        with patch.object(trop_global_module, "HAS_RUST_BACKEND", False), \
             patch.object(trop_global_module, "_rust_loocv_grid_search_global", None), \
             patch.object(trop_global_module, "_rust_bootstrap_trop_variance_global", None):
            res_py = TROP(**trop_params).fit(
                df.copy(), "outcome", "treated", "unit", "time"
            )
        return res_rust, res_py

    @pytest.mark.xfail(
        strict=True,
        reason="TROP Rust grid-search at trop_global.py:688 and Python fallback "
        "at trop_global.py:753 diverge on rank-deficient Y: empirical ATT "
        "gap ~6% on two near-parallel control units. Either (a) grid-winner "
        "ties break differently between backends, or (b) the per-λ solver "
        "itself reaches different stationary points under rank deficiency. "
        "Finding #23 flagged this exact surface as the Phase-2 gap. Rust "
        "vs Python unification is a P1 follow-up (TODO.md). This xfail "
        "baselines the divergence so we notice when/if the backends align.",
    )
    def test_grid_search_rank_deficient_Y(self):
        """Grid-search ATT parity on rank-deficient Y.

        Known to fail: two near-parallel control units produce ~6% ATT
        divergence between Rust and Python LOOCV grid-search. See xfail
        reason for follow-up plan.
        """
        df = self._make_correlated_panel()
        # n_bootstrap>=2 is required by TROP.__init__; we set the minimum.
        # Bootstrap SE is NOT asserted here (see separate test below +
        # xfail baseline for the RNG-algorithm mismatch between backends).
        trop_params = dict(
            method="global",
            lambda_time_grid=[0.1, 1.0, 10.0],
            lambda_unit_grid=[0.1, 1.0, 10.0],
            lambda_nn_grid=[np.inf],
            n_bootstrap=2,
            seed=42,
        )

        res_rust, res_py = self._fit_both_backends(df, trop_params)

        # Primary assertion: the ATT point estimate at the chosen λ matches.
        # This catches both (a) same λ chosen and (b) tied λ producing same fit.
        np.testing.assert_allclose(
            res_rust.att, res_py.att, atol=1e-6,
            err_msg="Grid-search ATT divergence on rank-deficient Y: "
            f"Rust={res_rust.att:.8f}, Python={res_py.att:.8f}",
        )

    @pytest.mark.xfail(
        strict=True,
        reason="Rust bootstrap uses its own RNG (rand crate) while Python uses "
        "numpy's default_rng; same `seed=42` produces different bootstrap "
        "replicate indices. Empirical SE divergence ~28%. Unifying the RNG "
        "across backends is a P1 follow-up (TODO.md). This xfail baselines "
        "the gap so we notice if/when the backends share an RNG seed-to-"
        "bytestream mapping. Grid-search ATT parity is validated in the "
        "previous test (rank-deficient Y path).",
    )
    def test_bootstrap_seed_reproducibility(self):
        """Bootstrap SE parity under a fixed seed.

        Known to fail: Rust and Python use different RNG backends, so
        identical ``seed=42`` produces different bootstrap replicates.
        See xfail reason for follow-up plan.
        """
        df = self._make_correlated_panel(n_units=6, n_periods=6, n_treated=2)
        # Single-point grids pin λ so only the bootstrap path differs.
        trop_params = dict(
            method="global",
            lambda_time_grid=[1.0],
            lambda_unit_grid=[1.0],
            lambda_nn_grid=[np.inf],
            n_bootstrap=10,
            seed=42,
        )

        res_rust, res_py = self._fit_both_backends(df, trop_params)

        np.testing.assert_allclose(
            res_rust.se, res_py.se, atol=1e-8,
            err_msg=f"Bootstrap SE divergence under seed=42: "
            f"Rust={res_rust.se:.10f}, Python={res_py.se:.10f}",
        )


class TestFallbackWhenNoRust:
"""Test that pure Python fallback works when Rust is unavailable."""

Expand Down
Loading