From 2ccedb9923f15bd79db3555d2907507e90e46456 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 1 Jun 2026 09:11:04 -0400 Subject: [PATCH 1/4] Reject covariate names that collide with reserved structural terms (DiD family) DifferenceInDifferences, MultiPeriodDiD, and TwoWayFixedEffects build their coefficients dict by zipping a var_names list (structural term names + user covariate column names appended verbatim) with the fitted coefficient vector. A covariate whose name equaled a reserved structural name (const; the treatment/time column names; the {treatment}:{time} interaction; MPD period_{p} dummies and {treatment}:period_{p} interactions; TWFE ATT; fixed-effect / unit / time dummy names; or an internal _-prefixed working column such as _treat_time / _did_treatment / _treatment_post) silently overwrote that structural coefficient via dict last-write-wins -- e.g. a covariate named const dropped the intercept -- with no error. Add a shared validate_covariate_names helper (utils.py) called in each fit() before the design matrix is built: it raises ValueError on a collision (case-sensitive, so Const is allowed) and on duplicate covariate names. Fixed-effect/unit/time dummy reserved names are taken from the same pd.get_dummies(..., drop_first=True) call used to build them, so they match exactly (including for Categorical columns with a non-default category order). The TWFE guard fires on both variance paths: the within-transform path returns only {"ATT": att}, but a covariate named _treatment_post would still clobber the internal interaction column. Potentially breaking: a fit that previously succeeded with a colliding (or duplicated) covariate name -- silently returning a corrupted coefficient dict -- now raises. The staggered / influence-function estimators key results by (g, t) tuples / relative-time indices and are unaffected; SyntheticDiD overrides fit() and never reaches the base-class guard. Tests: tests/test_utils.py::TestValidateCovariateNames, tests/test_estimators.py::TestCovariateNameCollision, tests/test_estimators_vcov_type.py::TestTWFECovariateNameCollision. Co-Authored-By: Claude Opus 4.8 (1M context) --- CHANGELOG.md | 3 + diff_diff/estimators.py | 57 ++++++++++- diff_diff/twfe.py | 30 +++++- diff_diff/utils.py | 69 ++++++++++++- docs/methodology/REGISTRY.md | 3 + tests/test_estimators.py | 153 +++++++++++++++++++++++++++++ tests/test_estimators_vcov_type.py | 54 ++++++++++ tests/test_utils.py | 40 ++++++++ 8 files changed, 405 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 93406cbe..f5ca3c8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Fixed +- **Covariate names that collide with reserved structural terms now raise `ValueError` instead of silently corrupting the coefficient dict (`DifferenceInDifferences`, `MultiPeriodDiD`, `TwoWayFixedEffects`).** These estimators build their `coefficients` dict by zipping a variable-name list -- structural term names PLUS the user covariate column names appended verbatim -- with the fitted coefficient vector. A covariate whose name equaled a reserved structural name (`const`; the treatment/time column names; the `{treatment}:{time}` interaction; MultiPeriodDiD `period_{p}` dummies and `{treatment}:period_{p}` interactions; `TwoWayFixedEffects` `ATT`; fixed-effect / unit / time dummy names; or an internal `_`-prefixed working column such as `_treat_time` / `_did_treatment` / `_treatment_post`) silently **overwrote** that structural coefficient via Python dict last-write-wins -- e.g. a covariate named `const` dropped the intercept -- with no error or warning. A new shared `validate_covariate_names` helper (`diff_diff/utils.py`) is now called in each of the three `fit()` methods before the design matrix is built; it raises `ValueError` on a collision (the comparison is case-sensitive, so e.g. `Const` is still allowed) **and** on duplicate names within `covariates` (which collapse to a single dict entry the same way). Fixed-effect/unit/time dummy reserved names are taken from the same `pd.get_dummies(..., drop_first=True)` call used to build them, so they match exactly (including for pandas `Categorical` columns with a non-default category order). For `TwoWayFixedEffects` the guard fires on **all** variance paths: the default within-transform path returns only `{"ATT": att}` (no covariate is a dict key there), but a covariate named `_treatment_post` would still clobber the internal interaction column, so guarding both paths is uniform and forward-compatible. **Potentially breaking:** a fit that previously *succeeded* with a colliding (or duplicated) covariate name -- silently returning a corrupted coefficient dict -- now raises; rename the covariate column(s). The staggered / influence-function estimators (CallawaySantAnna, SunAbraham, StaggeredTripleDifference, EfficientDiD, TwoStageDiD, ImputationDiD, WooldridgeDiD, dCDH, StackedDiD) key results by `(g, t)` tuples / relative-time indices, never covariate names, and `TripleDifference` / `SyntheticControl` / `SyntheticDiD` do not expose covariates by name, so none are affected. New tests in `tests/test_utils.py`, `tests/test_estimators.py`, and `tests/test_estimators_vcov_type.py`. + ## [3.5.0] - 2026-06-01 ### Added diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index 8c71dd76..bb9aa180 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -32,6 +32,7 @@ demean_by_group, safe_inference, validate_binary, + validate_covariate_names, wild_bootstrap_se, ) @@ -255,6 +256,11 @@ def fit( If provided, overrides outcome, treatment, and time parameters. covariates : list, optional List of covariate column names to include as linear controls. + Names must not collide with reserved structural terms (``const``, + the treatment/time column names, the ``{treatment}:{time}`` + interaction, fixed-effect dummy names, or internal working columns) + and must be unique; a collision or duplicate raises ``ValueError`` + (it would otherwise silently overwrite a structural coefficient). fixed_effects : list, optional List of categorical column names to include as fixed effects. Creates dummy variables for each category (drops first level). @@ -286,7 +292,9 @@ def fit( Raises ------ ValueError - If required parameters are missing or data validation fails. + If required parameters are missing or data validation fails, or if + a covariate name collides with a reserved structural term name or + duplicates another covariate. Examples -------- @@ -465,6 +473,22 @@ def fit( else: dt = d * t + # Reject covariate names that collide with reserved structural terms. + # Covariate names are appended verbatim to var_names below and zipped + # into coef_dict, so a covariate named like a structural term would + # silently overwrite that coefficient (dict last-write-wins). The + # reserved set covers the intercept, treatment/time indicators, the + # interaction, the internal _treat_time working column, and any + # fixed-effect dummy names (derived from the SAME get_dummies call used + # to build them below, so the names match exactly). + _reserved = {"const", treatment, time, f"{treatment}:{time}", "_treat_time"} + if fixed_effects: + for fe in fixed_effects: + _reserved.update( + pd.get_dummies(working_data[fe], prefix=fe, drop_first=True).columns + ) + validate_covariate_names(covariates, _reserved, estimator="DifferenceInDifferences") + # Build design matrix X = np.column_stack([np.ones(len(y)), d, t, dt]) var_names = ["const", treatment, time, f"{treatment}:{time}"] @@ -1244,6 +1268,12 @@ def fit( # type: ignore[override] All other periods are treated as pre-treatment. covariates : list, optional List of covariate column names to include as linear controls. + Names must not collide with reserved structural terms (``const``, + the treatment column name, ``period_{p}`` dummies, the + ``{treatment}:period_{p}`` interactions, fixed-effect dummy names, or + internal working columns) and must be unique; a collision or + duplicate raises ``ValueError`` (it would otherwise silently + overwrite a structural coefficient). fixed_effects : list, optional List of categorical column names to include as fixed effects. absorb : list, optional @@ -1271,7 +1301,9 @@ def fit( # type: ignore[override] Raises ------ ValueError - If required parameters are missing or data validation fails. + If required parameters are missing or data validation fails, or if + a covariate name collides with a reserved structural term name or + duplicates another covariate. """ # Fall back to analytical inference if wild bootstrap requested # (must happen before _resolve_survey_for_fit which rejects bootstrap+survey). @@ -1567,6 +1599,27 @@ def fit( # type: ignore[override] d = working_data[treatment].values.astype(float) t = working_data[time].values + # Reject covariate names that collide with reserved structural terms. + # Covariates are appended verbatim to var_names below and zipped into + # coef_dict, so a covariate named like a structural term (intercept, + # treatment, a period dummy, a treatment-period interaction, an internal + # _did_* working column, or a fixed-effect dummy) would silently + # overwrite that coefficient (dict last-write-wins). FE dummy names use + # the SAME get_dummies call (and fe==time skip) as the construction below. + _reserved = {"const", treatment, "_did_treatment"} + _reserved.update(f"period_{p}" for p in non_ref_periods) + _reserved.update(f"{treatment}:period_{p}" for p in non_ref_periods) + _reserved.update(f"_did_period_{p}" for p in non_ref_periods) + _reserved.update(f"_did_interact_{p}" for p in non_ref_periods) + if fixed_effects: + for fe in fixed_effects: + if fe == time: + continue + _reserved.update( + pd.get_dummies(working_data[fe], prefix=fe, drop_first=True).columns + ) + validate_covariate_names(covariates, _reserved, estimator="MultiPeriodDiD") + # Build design matrix # Start with intercept and treatment main effect X = np.column_stack([np.ones(len(y)), d]) diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py index 72948cb9..e090ce85 100644 --- a/diff_diff/twfe.py +++ b/diff_diff/twfe.py @@ -14,6 +14,9 @@ from diff_diff.estimators import DifferenceInDifferences from diff_diff.linalg import LinearRegression from diff_diff.results import DiDResults +from diff_diff.utils import ( + validate_covariate_names, +) from diff_diff.utils import ( within_transform as _within_transform_util, ) @@ -135,7 +138,12 @@ def fit( # type: ignore[override] unit : str Name of unit identifier column. covariates : list, optional - List of covariate column names. + List of covariate column names. Names must not collide with reserved + structural terms (``const``, ``ATT``, unit/time fixed-effect dummy + names, or the internal ``_treatment_post`` column) and must be + unique; a collision or duplicate raises ``ValueError`` (it would + otherwise silently overwrite a structural coefficient on the + full-dummy HC2/HC2-BM path). survey_design : SurveyDesign, optional Survey design specification for design-based inference. When provided, uses Taylor Series Linearization for variance estimation and @@ -145,6 +153,12 @@ def fit( # type: ignore[override] ------- DiDResults Estimation results. + + Raises + ------ + ValueError + If a covariate name collides with a reserved structural term name + or duplicates another covariate. """ # Validate unit column exists if unit not in data.columns: @@ -282,6 +296,20 @@ def fit( # type: ignore[override] n_units = data[unit].nunique() n_times = data[time].nunique() + # Reject covariate names that collide with reserved structural terms. + # On the full-dummy (HC2/HC2-BM) path covariates are zipped into the + # coefficient dict alongside "const"/"ATT" and the unit/time dummies, so + # a colliding covariate would silently overwrite that coefficient (dict + # last-write-wins). The within-transform path does not expose covariates + # in the dict, but the covariate is still in X and a covariate named + # "_treatment_post" would clobber the internal interaction column; we + # therefore guard ALL paths. Unit/time dummy names use the SAME + # get_dummies call as the full-dummy construction below. + _reserved = {"const", "ATT", "_treatment_post"} + _reserved.update(pd.get_dummies(data[unit], prefix=f"_fe_{unit}", drop_first=True).columns) + _reserved.update(pd.get_dummies(data[time], prefix=f"_fe_{time}", drop_first=True).columns) + validate_covariate_names(covariates, _reserved, estimator="TwoWayFixedEffects") + if use_full_dummy: # HC2 / HC2-BM full-dummy build: bypass the within-transform # and stack [intercept, treated×post, covariates, unit_dummies, diff --git a/diff_diff/utils.py b/diff_diff/utils.py index 41f2f574..817c8e5d 100644 --- a/diff_diff/utils.py +++ b/diff_diff/utils.py @@ -4,7 +4,7 @@ import warnings from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple import numpy as np import pandas as pd @@ -67,6 +67,73 @@ def validate_binary(arr: np.ndarray, name: str) -> None: raise ValueError(f"{name} must be binary (0 or 1). " f"Found values: {unique_values}") +def validate_covariate_names( + covariates: Optional[List[str]], + reserved_names: Iterable[str], + *, + estimator: str = "estimator", +) -> None: + """ + Validate that covariate column names do not collide with reserved + structural term names (and are not duplicated within ``covariates``). + + Fitted coefficients are stored in a ``name -> value`` dict built by zipping + a variable-name list -- structural term names PLUS the user covariate column + names appended verbatim -- with the coefficient vector. A covariate whose + name equals a reserved structural name (the intercept ``const``, the + treatment/time indicators, the interaction term, period dummies, + fixed-effect dummies, or an internal working column) would silently + overwrite the structural coefficient (Python dict last-write-wins), + corrupting the result with no error. Duplicate names within ``covariates`` + collapse to a single dict entry the same way. + + The comparison is case-sensitive: column names and dict keys are + case-sensitive, so e.g. ``Const`` does not actually collide with ``const`` + and is allowed. + + Parameters + ---------- + covariates : list of str or None + User-supplied covariate column names. ``None`` or empty is a no-op. + reserved_names : iterable of str + Reserved structural term names this estimator builds (estimator-specific). + estimator : str + Estimator name, used in the error message. + + Raises + ------ + ValueError + If a covariate name collides with a reserved structural name, or if + ``covariates`` contains duplicate names. + """ + if not covariates: + return + reserved = set(reserved_names) + collisions = sorted({c for c in covariates if c in reserved}) + if collisions: + raise ValueError( + f"{estimator}: covariate name(s) {collisions} collide with reserved " + f"structural term name(s). These names are used internally for the " + f"intercept, the treatment/time indicators, the interaction term, " + f"period dummies, fixed-effect dummies, or internal working columns, " + f"and a colliding covariate would silently overwrite the structural " + f"coefficient. Rename the covariate column(s). Reserved names for " + f"this fit: {sorted(reserved)}." + ) + seen: set = set() + duplicates = [] + for c in covariates: + if c in seen: + duplicates.append(c) + seen.add(c) + if duplicates: + raise ValueError( + f"{estimator}: duplicate covariate name(s) {sorted(set(duplicates))} " + f"in `covariates`. Each covariate maps to one coefficient; duplicates " + f"collapse to a single entry. Remove the duplicate(s)." + ) + + def warn_if_not_converged( converged: bool, method_name: str, diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index cb838ec6..5bf5cc1e 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -78,6 +78,7 @@ where τ is the ATT. - Tolerance: `1e-07` (matches R's `qr()` default), relative to largest diagonal element of R in QR decomposition - Controllable via `rank_deficient_action` parameter: "warn" (default), "error", or "silent" - **Note (scale invariance):** the shared `diff_diff/linalg.py` rank check re-detects on column-equilibrated (unit 2-norm) columns when the raw pass reports a deficiency (adopting the higher equilibrated rank and its scale-corrected pivot selection only when a large-scale column inflated the threshold; genuine collinearity keeps the raw selection), and the least-squares solve equilibrates columns then unscales the coefficients — so rank detection and the fit are invariant to per-column scaling; for a well-scaled collinear design the dropped column is unchanged, while a scale-induced under-count adopts the scale-corrected equilibrated selection (which may differ from the raw choice but is guaranteed to retain an identified subset). This covers every covariate fit routed through `solve_ols` — DiD, TwoWayFixedEffects, MultiPeriodDiD, ImputationDiD, TwoStageDiD, and TripleDifference. CallawaySantAnna and StaggeredTripleDifference fit their covariate outcome-regression nuisance via estimator-local `cho_solve`(X'X) / `lstsq` that are not yet equilibrated — see the CallawaySantAnna rank-deficiency Note's scope caveat. +- **Note (covariate-name collision guard):** `DifferenceInDifferences`, `MultiPeriodDiD`, and `TwoWayFixedEffects` build their `coefficients` dict by zipping a variable-name list (structural terms + user covariate column names, appended verbatim) with the coefficient vector. A covariate whose name equals a reserved structural name — `const`, the treatment/time column names, the `{treatment}:{time}` interaction (DiD), the `period_{p}` dummies / `{treatment}:period_{p}` interactions (MultiPeriodDiD), `ATT` (TwoWayFixedEffects), any fixed-effect / unit / time dummy name, or an internal `_`-prefixed working column (`_treat_time`, `_did_treatment`, `_treatment_post`) — would silently overwrite that structural coefficient (dict last-write-wins) with no error. `fit()` now calls `utils.validate_covariate_names()` before building the design, raising `ValueError` on such a collision (case-sensitive, so `Const` is allowed) and on duplicate covariate names. Reserved fixed-effect/unit/time dummy names are taken from the same `pd.get_dummies(..., drop_first=True)` call used to build them (exact match, including for `Categorical` columns). The TwoWayFixedEffects guard fires on both variance paths — the within-transform path exposes only `{"ATT": att}`, but a `_treatment_post`-named covariate would still clobber the internal interaction column. The influence-function estimators (CallawaySantAnna, etc.) key results by `(g, t)` and are unaffected. **Reference implementation(s):** - R: `fixest::feols()` with interaction term @@ -207,6 +208,7 @@ where V is the VCV sub-matrix for post-treatment δ_e coefficients. - Rank-deficient design matrix (collinearity): warns and sets NA for dropped coefficients (R-style, matches `lm()`) - **Note (scale invariance):** shared `diff_diff/linalg.py` behavior — rank detection re-checks on column-equilibrated columns and the solve equilibrates/unscales, so detection and fit are invariant to per-column scaling. For a well-scaled collinear design the dropped column is unchanged; a scale-induced under-count adopts the scale-corrected equilibrated selection (which may differ from the raw choice but retains an identified subset). See the CallawaySantAnna rank-deficiency Note. + - **Note (covariate-name collision guard):** a covariate named like a reserved structural term (`const`, the treatment column, a `period_{p}` dummy, a `{treatment}:period_{p}` interaction, a fixed-effect dummy, or an internal `_did_*` column) — or a duplicate covariate name — raises `ValueError` (would otherwise silently overwrite a structural coefficient in the coef dict). See the DifferenceInDifferences "covariate-name collision guard" Note. - Average ATT (`avg_att`) is NA if any post-period effect is unidentified (R-style NA propagation) - NaN inference for undefined statistics: @@ -279,6 +281,7 @@ This matches the behavior of R's `fixest::feols()` with absorbed FE. - Covariate collinearity emits warning but estimation continues (ATT still identified) - Rank-deficient design matrix: warns and sets NA for dropped coefficients (R-style, matches `lm()`) - **Note (scale invariance):** shared `diff_diff/linalg.py` behavior — rank detection re-checks on column-equilibrated columns and the solve equilibrates/unscales, so detection and fit are invariant to per-column scaling. For a well-scaled collinear design the dropped column is unchanged; a scale-induced under-count adopts the scale-corrected equilibrated selection (which may differ from the raw choice but retains an identified subset). See the CallawaySantAnna rank-deficiency Note. + - **Note (covariate-name collision guard):** a covariate named `const`, `ATT`, `_treatment_post`, or a unit/time fixed-effect dummy name — or a duplicate covariate name — raises `ValueError` on both variance paths (would otherwise silently overwrite a structural coefficient on the full-dummy HC2/HC2-BM path). See the DifferenceInDifferences "covariate-name collision guard" Note. - Unbalanced panels handled via proper demeaning - Multi-period `time` parameter: only binary (0/1) post indicator is recommended; multi-period values produce `treated × period_number` rather than `treated × post_indicator`. A `UserWarning` is diff --git a/tests/test_estimators.py b/tests/test_estimators.py index 77382274..e4880b25 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -3555,3 +3555,156 @@ def test_twfe_with_absorbed_covariate(self): assert np.isfinite(results.att) assert results.se > 0 + + +class TestCovariateNameCollision: + """PR3: a covariate whose name equals a reserved structural term silently + overwrote that coefficient in the zip()-built coef_dict (dict last-write-wins). + The DiD-family estimators now raise ValueError; non-colliding covariates leave + the structural coefficients intact. TWFE collision tests live in + tests/test_estimators_vcov_type.py (next to the full-dummy invariant).""" + + @staticmethod + def _did_data(): + rng = np.random.default_rng(7) + rows = [] + for unit in range(60): + treated = int(unit < 30) + for post in (0, 1): + y = 1.0 + 2.0 * treated * post + rng.normal() + rows.append({"unit": unit, "treated": treated, "post": post, "outcome": y}) + df = pd.DataFrame(rows) + df["x1"] = np.random.default_rng(11).normal(size=len(df)) + return df + + @staticmethod + def _mpd_data(): + rng = np.random.default_rng(7) + rows = [] + for unit in range(60): + treated = int(unit < 30) + ueff = rng.normal() + for period in range(6): + y = ( + 10.0 + ueff + 0.5 * period + + (3.0 if (treated and period >= 3) else 0.0) + + rng.normal(0, 0.5) + ) + rows.append( + {"unit": unit, "treated": treated, "time": period, "outcome": y} + ) + df = pd.DataFrame(rows) + df["x1"] = np.random.default_rng(11).normal(size=len(df)) + return df + + def test_did_collision_for_each_structural_name(self): + # Drive the reserved set from a clean fit's actual coefficient keys so + # the test cannot drift from the real var_names (const/treated/post/ + # treated:post), plus the internal _treat_time working column. + data = self._did_data() + clean = DifferenceInDifferences().fit( + data, "outcome", "treated", "post", covariates=["x1"] + ) + reserved = [k for k in clean.coefficients if k != "x1"] + ["_treat_time"] + assert "const" in reserved and "treated:post" in reserved # sanity + for name in reserved: + d2 = data.copy() + if name not in d2.columns: + d2[name] = np.random.default_rng(5).normal(size=len(d2)) + with pytest.raises(ValueError, match="collide"): + DifferenceInDifferences().fit( + d2, "outcome", "treated", "post", covariates=[name] + ) + + def test_did_fixed_effects_dummy_collision(self): + # get_dummies(region, prefix="region", drop_first=True) drops "region_A" + # and keeps "region_B" -> a covariate named "region_B" collides. + data = self._did_data() + data["region"] = np.where(data["unit"] % 2 == 0, "A", "B") + data["region_B"] = np.random.default_rng(2).normal(size=len(data)) + with pytest.raises(ValueError, match="collide"): + DifferenceInDifferences().fit( + data, "outcome", "treated", "post", + covariates=["region_B"], fixed_effects=["region"], + ) + + def test_did_noncolliding_preserves_structural_coefs(self): + data = self._did_data() + r = DifferenceInDifferences().fit( + data, "outcome", "treated", "post", covariates=["x1"] + ) + ck = r.coefficients + assert np.isfinite(ck["const"]) and np.isfinite(ck["treated:post"]) + assert "x1" in ck and ck["x1"] != ck["treated:post"] + # No key was overwritten: dict size matches the vcov rank. + assert len(ck) == r.vcov.shape[0] + + def test_did_formula_colliding_raises(self): + data = self._did_data() + data["const"] = np.random.default_rng(3).normal(size=len(data)) + with pytest.raises(ValueError, match="collide"): + DifferenceInDifferences().fit(data, formula="outcome ~ treated*post + const") + + def test_did_formula_noncolliding_works(self): + data = self._did_data() + r = DifferenceInDifferences().fit(data, formula="outcome ~ treated*post + x1") + assert "x1" in r.coefficients + + def test_did_duplicate_covariates_raise(self): + data = self._did_data() + with pytest.raises(ValueError, match="duplicate"): + DifferenceInDifferences().fit( + data, "outcome", "treated", "post", covariates=["x1", "x1"] + ) + + def test_mpd_collision_for_each_structural_name(self): + data = self._mpd_data() + clean = MultiPeriodDiD().fit( + data, "outcome", "treated", "time", covariates=["x1"] + ) + # const/treated/period_*/treated:period_* (actual keys) + internal column. + reserved = [k for k in clean.coefficients if k != "x1"] + ["_did_treatment"] + assert any(k.startswith("period_") for k in reserved) # sanity + for name in reserved: + d2 = data.copy() + if name not in d2.columns: + d2[name] = np.random.default_rng(5).normal(size=len(d2)) + with pytest.raises(ValueError, match="collide"): + MultiPeriodDiD().fit( + d2, "outcome", "treated", "time", covariates=[name] + ) + + def test_mpd_noncolliding_preserves_structural_coefs(self): + data = self._mpd_data() + r = MultiPeriodDiD().fit( + data, "outcome", "treated", "time", covariates=["x1"] + ) + ck = r.coefficients + assert "x1" in ck and np.isfinite(ck["const"]) + assert any(k.startswith("period_") for k in ck) + assert len(ck) == r.vcov.shape[0] + + def test_synthetic_did_unaffected_by_guard(self): + # SyntheticDiD overrides fit() and never reaches the base-class guard; + # a covariate must still fit (regression lock that the guard didn't leak). + rng = np.random.default_rng(42) + rows = [] + for unit in range(30): + treated = int(unit < 5) + ueff = rng.normal(0, 3) + for period in range(8): + y = ( + 10.0 + ueff + 0.5 * period + + (5.0 if (treated and period >= 4) else 0.0) + + rng.normal(0, 0.5) + ) + rows.append( + {"unit": unit, "period": period, "treated": treated, "outcome": y} + ) + d = pd.DataFrame(rows) + d["x1"] = rng.normal(size=len(d)) + r = SyntheticDiD().fit( + d, "outcome", "treated", unit="unit", time="period", + post_periods=[4, 5, 6, 7], covariates=["x1"], + ) + assert np.isfinite(r.att) diff --git a/tests/test_estimators_vcov_type.py b/tests/test_estimators_vcov_type.py index b25a577d..ea5926bf 100644 --- a/tests/test_estimators_vcov_type.py +++ b/tests/test_estimators_vcov_type.py @@ -2151,3 +2151,57 @@ def test_absorb_hc2_bm_replicate_weights_auto_routes(self): np.testing.assert_allclose(pe_a.se, pe_f.se, atol=1e-12) np.testing.assert_allclose(res_absorb.avg_att, res_fe.avg_att, atol=1e-12) np.testing.assert_allclose(res_absorb.avg_se, res_fe.avg_se, atol=1e-12) + + +class TestTWFECovariateNameCollision: + """PR3: TwoWayFixedEffects covariate-name collision guard. + + On the full-dummy HC2/HC2-BM path covariates are zipped into the coefficient + dict alongside "const"/"ATT" and the unit/time dummies, so a colliding + covariate would silently overwrite that coefficient. The within-transform + (default HC1) path exposes only {"ATT": att}, but the covariate is still in + X and a covariate named "_treatment_post" would clobber the internal + interaction column, so the guard fires on ALL paths. Lives here next to the + full-dummy ``len(coefficients) == vcov.shape[0]`` invariant. + """ + + @staticmethod + def _panel_with(covname: str) -> pd.DataFrame: + df = _make_did_panel(n_units=30, seed=20260420) + # unit 0..29 / time {0,1}: get_dummies(drop_first=True) keeps + # "_fe_unit_1".."_fe_unit_29" and "_fe_time_1". + df[covname] = np.random.default_rng(99).normal(size=len(df)) + return df + + @pytest.mark.parametrize("vcov_type", ["hc1", "hc2"]) + @pytest.mark.parametrize( + "name", ["const", "ATT", "_treatment_post", "_fe_unit_1", "_fe_time_1"] + ) + def test_collision_raises_on_all_paths(self, vcov_type, name): + df = self._panel_with(name) + with pytest.raises(ValueError, match="collide"): + TwoWayFixedEffects(vcov_type=vcov_type).fit( + df, outcome="y", treatment="treated", time="time", unit="unit", + covariates=[name], + ) + + def test_hc2_full_dummy_noncolliding_preserves_coefs(self): + df = self._panel_with("x1") + r = TwoWayFixedEffects(vcov_type="hc2").fit( + df, outcome="y", treatment="treated", time="time", unit="unit", + covariates=["x1"], + ) + ck = r.coefficients + assert "ATT" in ck and "x1" in ck + # No key overwritten: dict size matches the full-dummy vcov rank. + assert len(ck) == r.vcov.shape[0] + + def test_within_transform_noncolliding_returns_att_only(self): + df = self._panel_with("x1") + r = TwoWayFixedEffects().fit( # default hc1 -> within-transform + df, outcome="y", treatment="treated", time="time", unit="unit", + covariates=["x1"], + ) + # The within-transform path exposes only the ATT coefficient by design; + # the covariate is NOT a dict key there (so there is no overwrite surface). + assert set(r.coefficients.keys()) == {"ATT"} diff --git a/tests/test_utils.py b/tests/test_utils.py index 9d605955..c30fb3f0 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -29,6 +29,7 @@ equivalence_test_trends, safe_inference, validate_binary, + validate_covariate_names, ) # ============================================================================= @@ -1302,3 +1303,42 @@ def test_multiple_post_periods(self): # Control DiD: 14 - 10 = 4 # tau = 9 - 4 = 5 assert abs(tau - 5.0) < 1e-6 + + +class TestValidateCovariateNames: + """Tests for validate_covariate_names (covariate/reserved-term collision guard).""" + + def test_none_is_noop(self): + validate_covariate_names(None, {"const", "treated"}) + + def test_empty_is_noop(self): + validate_covariate_names([], {"const", "treated"}) + + def test_non_colliding_passes(self): + # Should not raise. + validate_covariate_names(["x1", "x2"], {"const", "treated", "treated:post"}) + + def test_collision_raises(self): + with pytest.raises(ValueError, match="collide"): + validate_covariate_names(["const"], {"const", "treated"}) + + def test_collision_lists_offender_and_estimator(self): + with pytest.raises(ValueError, match="MyEstimator") as exc: + validate_covariate_names( + ["age", "const"], {"const", "treated"}, estimator="MyEstimator" + ) + # The offending name is reported (not the innocent one as a collision). + assert "const" in str(exc.value) + + def test_case_sensitive_does_not_collide(self): + # Column names / dict keys are case-sensitive; "Const" != "const". + validate_covariate_names(["Const"], {"const", "treated"}) + + def test_duplicate_covariates_raise(self): + with pytest.raises(ValueError, match="duplicate"): + validate_covariate_names(["x1", "x1"], {"const"}) + + def test_duplicate_takes_precedence_only_when_no_collision(self): + # A name that both collides AND is duplicated reports the collision first. + with pytest.raises(ValueError, match="collide"): + validate_covariate_names(["const", "const"], {"const"}) From f6344d448722ed37ae34ec60ee3004eb4af8019f Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 1 Jun 2026 09:31:48 -0400 Subject: [PATCH 2/4] Add validate_design_term_names backstop for fixed-effect dummy collisions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upfront validate_covariate_names guard only validates the user `covariates` list. Fixed-effect dummy names (`{fe}_{value}`) are appended to var_names AFTER that guard and were not themselves checked against structural names — so MultiPeriodDiD(fixed_effects=["period"]) (a non-time FE whose dummies are named period_1, period_2, ...) could still collide with the structural period_{p} event-study keys and silently overwrite them in the coefficients dict, the exact bug class this PR addresses (caught in local review). Add a shared validate_design_term_names(var_names) backstop in utils.py that enforces uniqueness on the FINAL assembled term list (structural + covariates + fixed-effect dummies) right before the coefficients dict is built, in all three estimators. This catches data-dependent collisions (FE-dummy vs structural, FE-dummy vs FE-dummy) that cannot be known up front. The upfront covariate guard is kept for its precise, fail-fast messaging on the common case. Tests: tests/test_utils.py::TestValidateDesignTermNames + an MPD fixed-effect- dummy-vs-period_{p} regression in TestCovariateNameCollision. Full estimator + TWFE-methodology suites pass with the backstop (no false positives on existing fixed-effect fits). Co-Authored-By: Claude Opus 4.8 (1M context) --- diff_diff/estimators.py | 6 +++++ diff_diff/twfe.py | 5 +++++ diff_diff/utils.py | 48 ++++++++++++++++++++++++++++++++++++++++ tests/test_estimators.py | 13 +++++++++++ tests/test_utils.py | 23 +++++++++++++++++++ 5 files changed, 95 insertions(+) diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index bb9aa180..3c0c35dd 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -33,6 +33,7 @@ safe_inference, validate_binary, validate_covariate_names, + validate_design_term_names, wild_bootstrap_se, ) @@ -685,6 +686,7 @@ def _refit_did_absorb(w_r): n_control = n_control_raw # Create coefficient dictionary + validate_design_term_names(var_names, estimator="DifferenceInDifferences") coef_dict = {name: coef for name, coef in zip(var_names, coefficients)} # Determine inference method and bootstrap info @@ -2036,6 +2038,10 @@ def _refit_mp_absorb(w_r): n_treated = n_treated_raw n_control = n_control_raw + # Backstop: reject any duplicate in the FINAL term list (e.g. a + # fixed-effect dummy colliding with a structural `period_{p}` key) + # before it silently overwrites a coefficient in the dict below. + validate_design_term_names(var_names, estimator="MultiPeriodDiD") # Create coefficient dictionary coef_dict = {name: coef for name, coef in zip(var_names, coefficients)} diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py index e090ce85..9a6597a4 100644 --- a/diff_diff/twfe.py +++ b/diff_diff/twfe.py @@ -16,6 +16,7 @@ from diff_diff.results import DiDResults from diff_diff.utils import ( validate_covariate_names, + validate_design_term_names, ) from diff_diff.utils import ( within_transform as _within_transform_util, @@ -367,6 +368,10 @@ def fit( # type: ignore[override] + list(unit_dummies_df.columns) + list(time_dummies_df.columns) ) + # Backstop: reject any duplicate in the FINAL term list (e.g. a + # unit/time dummy colliding with a structural term or another dummy) + # before it silently overwrites a coefficient in the dict below. + validate_design_term_names(_twfe_var_names, estimator="TwoWayFixedEffects") else: # Default within-transform path (HC1 / classical / Conley): # demean outcome, covariates, AND interaction in a single pass diff --git a/diff_diff/utils.py b/diff_diff/utils.py index 817c8e5d..90e5785e 100644 --- a/diff_diff/utils.py +++ b/diff_diff/utils.py @@ -134,6 +134,54 @@ def validate_covariate_names( ) +def validate_design_term_names( + var_names: Iterable[str], + *, + estimator: str = "estimator", +) -> None: + """ + Raise if the assembled design term-name list contains duplicates. + + Backstop for :func:`validate_covariate_names`: even after the user + covariates are cleared, a fixed-effect dummy name (``{fe}_{value}``) can + still collide with a structural term — most notably a ``MultiPeriodDiD`` + ``period_{p}`` event-study key when a non-time fixed effect produces matching + dummy names — or with another dummy. Such a duplicate would silently + overwrite a coefficient when ``var_names`` is zipped into the result's + ``coefficients`` dict (Python dict last-write-wins). This checks the FINAL + name list (structural terms + covariates + fixed-effect dummies) right + before the dict is built, catching collisions that depend on the data and so + cannot be known up front. + + Parameters + ---------- + var_names : iterable of str + The fully assembled design-matrix column-name list. + estimator : str + Estimator name, used in the error message. + + Raises + ------ + ValueError + If any name appears more than once. + """ + seen: set = set() + duplicates = [] + for name in var_names: + if name in seen: + duplicates.append(name) + seen.add(name) + if duplicates: + raise ValueError( + f"{estimator}: the fitted design has duplicate term name(s) " + f"{sorted(set(duplicates))} — a covariate or fixed-effect dummy name " + f"collides with a structural term (intercept, treatment/time " + f"indicators, the interaction, or period dummies) or with another " + f"column. This would silently overwrite a coefficient in the result. " + f"Rename the offending fixed-effect category or covariate column." + ) + + def warn_if_not_converged( converged: bool, method_name: str, diff --git a/tests/test_estimators.py b/tests/test_estimators.py index e4880b25..7de586d4 100644 --- a/tests/test_estimators.py +++ b/tests/test_estimators.py @@ -3684,6 +3684,19 @@ def test_mpd_noncolliding_preserves_structural_coefs(self): assert any(k.startswith("period_") for k in ck) assert len(ck) == r.vcov.shape[0] + def test_mpd_fixed_effect_dummy_collides_with_period_keys(self): + # Backstop: a NON-time fixed effect whose get_dummies names match the + # structural period_{p} event-study keys must raise (would otherwise + # silently overwrite those coefficients) — the FE dummy is appended to + # var_names directly, so the upfront covariate guard cannot catch it. + data = self._mpd_data() + data["period"] = data["time"] # FE column 'period' -> 'period_1'... dummies + with pytest.raises(ValueError, match="collide"): + MultiPeriodDiD().fit( + data, "outcome", "treated", "time", + covariates=["x1"], fixed_effects=["period"], + ) + def test_synthetic_did_unaffected_by_guard(self): # SyntheticDiD overrides fit() and never reaches the base-class guard; # a covariate must still fit (regression lock that the guard didn't leak). diff --git a/tests/test_utils.py b/tests/test_utils.py index c30fb3f0..f1288674 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -30,6 +30,7 @@ safe_inference, validate_binary, validate_covariate_names, + validate_design_term_names, ) # ============================================================================= @@ -1342,3 +1343,25 @@ def test_duplicate_takes_precedence_only_when_no_collision(self): # A name that both collides AND is duplicated reports the collision first. with pytest.raises(ValueError, match="collide"): validate_covariate_names(["const", "const"], {"const"}) + + +class TestValidateDesignTermNames: + """Tests for validate_design_term_names (final var_names uniqueness backstop).""" + + def test_unique_passes(self): + validate_design_term_names(["const", "treated", "post", "treated:post", "x1"]) + + def test_duplicate_raises(self): + # e.g. a fixed-effect dummy "period_2" colliding with a structural key. + with pytest.raises(ValueError, match="collide"): + validate_design_term_names( + ["const", "treated", "period_2", "period_2"], estimator="MultiPeriodDiD" + ) + + def test_message_names_duplicate_and_estimator(self): + with pytest.raises(ValueError, match="MultiPeriodDiD") as exc: + validate_design_term_names(["a", "a"], estimator="MultiPeriodDiD") + assert "a" in str(exc.value) + + def test_empty_passes(self): + validate_design_term_names([]) From ef4ea276781a295277026b9c3af6c05499db35e7 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 1 Jun 2026 09:44:52 -0400 Subject: [PATCH 3/4] Derive reserved FE-dummy names without materializing the dummy matrix (perf) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The collision guard reserved fixed-effect/unit/time dummy names via pd.get_dummies(col, ...).columns, which materializes the full (n x G) dummy matrix. In TwoWayFixedEffects this ran UNCONDITIONALLY — including the default within-transform (hc1/classical/conley) path whose entire purpose is to avoid expanding high-cardinality FE dummies — so a large-G panel paid an avoidable O(n x G) allocation (latency / potential OOM) before estimation (caught in local review). Add utils.fe_dummy_names(col, prefix): derives the same names get_dummies would produce, from the category levels only (O(G)), via pd.Categorical(col).categories (sorted unique for a plain column, declared order for a Categorical) + drop_first. Verified to match pd.get_dummies(drop_first=True).columns exactly across int / str / float / NaN / non-default-order Categorical dtypes. Wire it into the DiD, MPD, and TWFE reserved-name construction in place of get_dummies. The within- transform TWFE path now performs no FE-dummy materialization at all. Tests: tests/test_utils.py::TestFeDummyNames (equivalence to get_dummies, incl. Categorical order) + a TwoWayFixedEffects within-path regression that monkeypatches pd.get_dummies to raise and asserts the hc1 fit still succeeds. Co-Authored-By: Claude Opus 4.8 (1M context) --- diff_diff/estimators.py | 20 ++++++++--------- diff_diff/twfe.py | 13 +++++++---- diff_diff/utils.py | 35 ++++++++++++++++++++++++++++++ tests/test_estimators_vcov_type.py | 20 +++++++++++++++++ tests/test_utils.py | 21 ++++++++++++++++++ 5 files changed, 95 insertions(+), 14 deletions(-) diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index 3c0c35dd..9545f316 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -30,6 +30,7 @@ from diff_diff.utils import ( WildBootstrapResults, demean_by_group, + fe_dummy_names, safe_inference, validate_binary, validate_covariate_names, @@ -480,14 +481,13 @@ def fit( # silently overwrite that coefficient (dict last-write-wins). The # reserved set covers the intercept, treatment/time indicators, the # interaction, the internal _treat_time working column, and any - # fixed-effect dummy names (derived from the SAME get_dummies call used - # to build them below, so the names match exactly). + # fixed-effect dummy names (derived via fe_dummy_names WITHOUT + # materializing the dummy matrix; names match the get_dummies build + # below exactly). validate_design_term_names re-checks the FINAL list. _reserved = {"const", treatment, time, f"{treatment}:{time}", "_treat_time"} if fixed_effects: for fe in fixed_effects: - _reserved.update( - pd.get_dummies(working_data[fe], prefix=fe, drop_first=True).columns - ) + _reserved.update(fe_dummy_names(working_data[fe], fe)) validate_covariate_names(covariates, _reserved, estimator="DifferenceInDifferences") # Build design matrix @@ -1606,8 +1606,10 @@ def fit( # type: ignore[override] # coef_dict, so a covariate named like a structural term (intercept, # treatment, a period dummy, a treatment-period interaction, an internal # _did_* working column, or a fixed-effect dummy) would silently - # overwrite that coefficient (dict last-write-wins). FE dummy names use - # the SAME get_dummies call (and fe==time skip) as the construction below. + # overwrite that coefficient (dict last-write-wins). FE dummy names are + # derived via fe_dummy_names (no dummy-matrix materialization), matching + # the construction below (and applying the same fe==time skip). + # validate_design_term_names re-checks the FINAL list before coef_dict. _reserved = {"const", treatment, "_did_treatment"} _reserved.update(f"period_{p}" for p in non_ref_periods) _reserved.update(f"{treatment}:period_{p}" for p in non_ref_periods) @@ -1617,9 +1619,7 @@ def fit( # type: ignore[override] for fe in fixed_effects: if fe == time: continue - _reserved.update( - pd.get_dummies(working_data[fe], prefix=fe, drop_first=True).columns - ) + _reserved.update(fe_dummy_names(working_data[fe], fe)) validate_covariate_names(covariates, _reserved, estimator="MultiPeriodDiD") # Build design matrix diff --git a/diff_diff/twfe.py b/diff_diff/twfe.py index 9a6597a4..ce7ff445 100644 --- a/diff_diff/twfe.py +++ b/diff_diff/twfe.py @@ -15,6 +15,7 @@ from diff_diff.linalg import LinearRegression from diff_diff.results import DiDResults from diff_diff.utils import ( + fe_dummy_names, validate_covariate_names, validate_design_term_names, ) @@ -304,11 +305,15 @@ def fit( # type: ignore[override] # last-write-wins). The within-transform path does not expose covariates # in the dict, but the covariate is still in X and a covariate named # "_treatment_post" would clobber the internal interaction column; we - # therefore guard ALL paths. Unit/time dummy names use the SAME - # get_dummies call as the full-dummy construction below. + # therefore guard ALL paths. Unit/time dummy names are derived via + # fe_dummy_names WITHOUT materializing the dummy matrix — critical here + # because the within-transform path (the default hc1/classical/conley + # branch) deliberately never expands the full FE dummies (its scaling + # contract for high-cardinality panels); a get_dummies build would + # defeat that. validate_design_term_names re-checks the full-dummy list. _reserved = {"const", "ATT", "_treatment_post"} - _reserved.update(pd.get_dummies(data[unit], prefix=f"_fe_{unit}", drop_first=True).columns) - _reserved.update(pd.get_dummies(data[time], prefix=f"_fe_{time}", drop_first=True).columns) + _reserved.update(fe_dummy_names(data[unit], f"_fe_{unit}")) + _reserved.update(fe_dummy_names(data[time], f"_fe_{time}")) validate_covariate_names(covariates, _reserved, estimator="TwoWayFixedEffects") if use_full_dummy: diff --git a/diff_diff/utils.py b/diff_diff/utils.py index 90e5785e..7723dd79 100644 --- a/diff_diff/utils.py +++ b/diff_diff/utils.py @@ -182,6 +182,41 @@ def validate_design_term_names( ) +def fe_dummy_names(col: pd.Series, prefix: str) -> List[str]: + """ + Reserved fixed-effect dummy column names for the collision guard, matching + ``pd.get_dummies(col, prefix=prefix, drop_first=True).columns`` WITHOUT + materializing the dense ``(n x G)`` dummy matrix. + + The within-transform ``TwoWayFixedEffects`` path is specifically designed to + avoid expanding high-cardinality fixed-effect dummies (that is its scaling + contract), so the collision guard must reserve those names without building + the dummy block. ``pd.get_dummies`` orders categories via + ``pd.Categorical(col).categories`` — sorted unique values for a plain column, + the declared category order for a ``Categorical`` — then ``drop_first=True`` + drops the first. This derivation reproduces that exactly (including + ``Categorical`` columns with a non-default category order) at ``O(G)`` memory. + + Parameters + ---------- + col : pandas.Series + The fixed-effect / unit / time column. + prefix : str + Dummy-name prefix (the project uses ``fe`` for ``fixed_effects`` and + ``_fe_{unit}`` / ``_fe_{time}`` for TWFE unit/time dummies). + + Returns + ------- + list of str + The kept (post ``drop_first``) dummy column names. + """ + if isinstance(col.dtype, pd.CategoricalDtype): + cats = list(col.cat.categories) + else: + cats = list(pd.Categorical(col).categories) + return [f"{prefix}_{c}" for c in cats[1:]] + + def warn_if_not_converged( converged: bool, method_name: str, diff --git a/tests/test_estimators_vcov_type.py b/tests/test_estimators_vcov_type.py index ea5926bf..a41f6a23 100644 --- a/tests/test_estimators_vcov_type.py +++ b/tests/test_estimators_vcov_type.py @@ -2205,3 +2205,23 @@ def test_within_transform_noncolliding_returns_att_only(self): # The within-transform path exposes only the ATT coefficient by design; # the covariate is NOT a dict key there (so there is no overwrite surface). assert set(r.coefficients.keys()) == {"ATT"} + + def test_within_path_does_not_materialize_fe_dummies(self, monkeypatch): + # Regression: the within-transform (default hc1) path must NOT build full + # unit/time dummy matrices merely to reserve collision names — that would + # defeat its high-cardinality scaling contract. Reserved names come from + # fe_dummy_names (category levels only), so pd.get_dummies must never be + # called on this path. + df = self._panel_with("x1") + + def _boom(*args, **kwargs): + raise AssertionError( + "pd.get_dummies must not be called on the within-transform path" + ) + + monkeypatch.setattr(pd, "get_dummies", _boom) + r = TwoWayFixedEffects().fit( + df, outcome="y", treatment="treated", time="time", unit="unit", + covariates=["x1"], + ) + assert set(r.coefficients.keys()) == {"ATT"} diff --git a/tests/test_utils.py b/tests/test_utils.py index f1288674..97f36d32 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -27,6 +27,7 @@ compute_sdid_estimator, compute_time_weights, equivalence_test_trends, + fe_dummy_names, safe_inference, validate_binary, validate_covariate_names, @@ -1365,3 +1366,23 @@ def test_message_names_duplicate_and_estimator(self): def test_empty_passes(self): validate_design_term_names([]) + + +class TestFeDummyNames: + """fe_dummy_names must match pd.get_dummies(drop_first=True).columns exactly + (including Categorical non-default order) without materializing the matrix.""" + + @pytest.mark.parametrize( + "col", + [ + pd.Series([3, 1, 2, 1]), + pd.Series(["b", "a", "c", "a"]), + pd.Series(pd.Categorical(["m", "a", "z", "a"], categories=["m", "a", "z"])), + pd.Series([2.0, 1.0, 3.0]), + pd.Series(["b", "a", np.nan, "a"]), + ], + ) + def test_matches_get_dummies(self, col): + assert fe_dummy_names(col, "fe") == list( + pd.get_dummies(col, prefix="fe", drop_first=True).columns + ) From 7ecffd21460cd64040dfa6e3934ff1a5772c19c4 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 1 Jun 2026 09:52:26 -0400 Subject: [PATCH 4/4] Move design-term uniqueness check before the regression (fail-fast) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit validate_design_term_names ran just before coef_dict construction — i.e. AFTER the OLS fit — in DifferenceInDifferences and MultiPeriodDiD, so a duplicate assembled term name (e.g. an MPD fixed-effect dummy colliding with a structural period_{p} key) drove a wasted rank-deficient fit and emitted a misleading multicollinearity warning before the intended ValueError (local review P3). Move the check to immediately after var_names is fully assembled and before the regression call in both estimators (TwoWayFixedEffects already checked pre-fit). var_names is not mutated between assembly and coef_dict, so this is behavior- preserving except that the ValueError now fires fast and warning-free. Co-Authored-By: Claude Opus 4.8 (1M context) --- diff_diff/estimators.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/diff_diff/estimators.py b/diff_diff/estimators.py index 9545f316..fcaa0520 100644 --- a/diff_diff/estimators.py +++ b/diff_diff/estimators.py @@ -510,6 +510,12 @@ def fit( X = np.column_stack([X, dummies[col].values.astype(float)]) var_names.append(col) + # Reject any duplicate in the FINAL term list (e.g. a fixed-effect dummy + # colliding with a structural term) BEFORE the regression — so the fit is + # not wasted and no misleading multicollinearity warning is emitted ahead + # of the intended ValueError. + validate_design_term_names(var_names, estimator="DifferenceInDifferences") + # Extract ATT index (coefficient on interaction term) att_idx = 3 # Index of interaction term att_var_name = f"{treatment}:{time}" @@ -686,7 +692,6 @@ def _refit_did_absorb(w_r): n_control = n_control_raw # Create coefficient dictionary - validate_design_term_names(var_names, estimator="DifferenceInDifferences") coef_dict = {name: coef for name, coef in zip(var_names, coefficients)} # Determine inference method and bootstrap info @@ -1680,6 +1685,12 @@ def fit( # type: ignore[override] X = np.column_stack([X, dummies[col].values.astype(float)]) var_names.append(col) + # Reject any duplicate in the FINAL term list (e.g. a fixed-effect dummy + # colliding with a structural period_{p} key) BEFORE the regression — so + # the fit is not wasted and no misleading multicollinearity warning is + # emitted ahead of the intended ValueError. + validate_design_term_names(var_names, estimator="MultiPeriodDiD") + # Fit OLS using unified backend # Pass cluster_ids to solve_ols for proper vcov computation # This handles rank-deficient matrices by returning NaN for dropped columns @@ -2038,11 +2049,8 @@ def _refit_mp_absorb(w_r): n_treated = n_treated_raw n_control = n_control_raw - # Backstop: reject any duplicate in the FINAL term list (e.g. a - # fixed-effect dummy colliding with a structural `period_{p}` key) - # before it silently overwrites a coefficient in the dict below. - validate_design_term_names(var_names, estimator="MultiPeriodDiD") - # Create coefficient dictionary + # Create coefficient dictionary (var_names uniqueness already enforced + # before the fit above). coef_dict = {name: coef for name, coef in zip(var_names, coefficients)} # Store results