Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 59 additions & 3 deletions diff_diff/continuous_did.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,50 @@ def fit(
f"Dose must be time-invariant. Units with varying dose: {bad_units[:5]}"
)

# Normalize first_treat: inf → 0
# Normalize first_treat: +inf → 0 (R-style never-treated encoding).
# Count rows recategorized so users can see how many units just
# crossed from "treated at some point" to "never treated" — silent
# recategorization here would shift the control composition (axis-E
# silent coercion). Only positive infinity is recoded (to match the
# existing `.replace([np.inf, float("inf")], 0)` call, which runs after
# the NaN / inf / negative validation checks below).
first_treat_vals = df[first_treat].values
# Reject NaN first_treat explicitly. NaN survives preprocessing but
# satisfies neither the treated (g > 0) nor never-treated (g == 0)
# mask, so affected units would be silently excluded from the
# estimator (same silent-failure shape as `first_treat < 0`).
nan_mask = pd.isna(df[first_treat])
n_nan_first_treat = int(nan_mask.sum())
if n_nan_first_treat > 0:
raise ValueError(
f"{n_nan_first_treat} row(s) have NaN '{first_treat}' "
f"values. Valid values are 0 (never-treated) or a positive "
f"treatment period; such units would otherwise be silently "
f"excluded from both treated and control pools."
)
inf_mask = np.isposinf(first_treat_vals)
n_inf_first_treat = int(inf_mask.sum())
if n_inf_first_treat > 0:
warnings.warn(
f"{n_inf_first_treat} row(s) have inf in '{first_treat}'; "
f"treating the corresponding units as never-treated. Pass an "
f"explicit never-treated marker (0) if this is not intended.",
UserWarning,
stacklevel=2,
)
# Reject negative first_treat values (including -inf) explicitly.
# Without this guard they would survive preprocessing but fall out of
# both the treated (g > 0) and never-treated (g == 0) masks, silently
# excluding the affected units.
negative_mask = first_treat_vals < 0
n_negative_first_treat = int(negative_mask.sum())
if n_negative_first_treat > 0:
raise ValueError(
f"{n_negative_first_treat} row(s) have negative '{first_treat}' "
f"values (including -inf). Valid values are 0 (never-treated) "
f"or a positive treatment period; such units would otherwise "
f"be silently excluded from both treated and control pools."
)
df[first_treat] = df[first_treat].replace([np.inf, float("inf")], 0)

# Drop units with positive first_treat but zero dose (R convention)
Expand Down Expand Up @@ -265,9 +308,22 @@ def fit(
stacklevel=2,
)

# Force dose=0 for never-treated units with nonzero dose
# Force dose=0 for never-treated units with nonzero dose. Report the
# affected row count via UserWarning so users can see whether their
# never-treated rows had unintended nonzero doses — silent zeroing
# here would quietly shift part of the control trajectory (axis-E
# silent coercion, paired with the `first_treat=inf -> 0` fix above).
never_treated_mask = df[first_treat] == 0
if (df.loc[never_treated_mask, dose] != 0).any():
nonzero_dose_rows = never_treated_mask & (df[dose] != 0)
n_nonzero_dose_never_treated = int(nonzero_dose_rows.sum())
if n_nonzero_dose_never_treated > 0:
warnings.warn(
f"{n_nonzero_dose_never_treated} row(s) have '{first_treat}'=0 "
f"(never-treated) but nonzero '{dose}'; zeroing the dose. Pass "
f"dose=0 for never-treated rows to avoid this coercion.",
UserWarning,
stacklevel=2,
)
df.loc[never_treated_mask, dose] = 0.0

# Verify balanced panel
Expand Down
13 changes: 13 additions & 0 deletions diff_diff/staggered_triple_diff.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,19 @@ def fit(

if first_treat != "first_treat":
df["first_treat"] = df[first_treat]
# Surface the inf → 0 recategorization the same way StaggeredDiD does
# (see `staggered.py:1508-1519`). Silently recoding inf would shift
# units between treated and never-treated pools with no signal
# (axis-E silent coercion under the Phase 2 audit).
_inf_mask = np.isposinf(df["first_treat"].values)
if _inf_mask.any():
n_inf_rows = int(_inf_mask.sum())
warnings.warn(
f"{n_inf_rows} row(s) have first_treat=inf; recoding to 0 "
f"(never-treated). Use first_treat=0 to suppress this warning.",
UserWarning,
stacklevel=2,
)
df["first_treat"] = df["first_treat"].replace([np.inf, float("inf")], 0)

precomputed = self._precompute_structures(
Expand Down
32 changes: 28 additions & 4 deletions diff_diff/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -821,7 +821,8 @@ def check_parallel_trends_robust(

# Compute outcome changes
treated_changes, control_changes = _compute_outcome_changes(
pre_data, outcome, time, treatment_group, unit
pre_data, outcome, time, treatment_group, unit,
caller_label="check_parallel_trends_robust",
)

if len(treated_changes) < 2 or len(control_changes) < 2:
Expand Down Expand Up @@ -897,7 +898,12 @@ def check_parallel_trends_robust(


def _compute_outcome_changes(
data: pd.DataFrame, outcome: str, time: str, treatment_group: str, unit: Optional[str] = None
data: pd.DataFrame,
outcome: str,
time: str,
treatment_group: str,
unit: Optional[str] = None,
caller_label: str = "parallel-trend diagnostic",
) -> Tuple[np.ndarray, np.ndarray]:
"""
Compute period-to-period outcome changes for treated and control groups.
Expand Down Expand Up @@ -925,7 +931,24 @@ def _compute_outcome_changes(
data_sorted = data.sort_values([unit, time])
data_sorted["_outcome_change"] = data_sorted.groupby(unit)[outcome].diff()

# Remove NaN from first period of each unit
# Remove NaN from first period of each unit. The first period per unit
# has no prior observation to diff against, so n_units drops are
# expected. Anything beyond that is a silent side-effect of gaps or
# NaN outcomes — surface the excess via warning (axis-E drop counter).
n_units_observed = int(data_sorted[unit].nunique())
n_dropped = int(data_sorted["_outcome_change"].isna().sum())
n_unexpected_drops = max(0, n_dropped - n_units_observed)
if n_unexpected_drops > 0:
warnings.warn(
f"{caller_label}: dropped {n_dropped} row(s) with NaN "
f"first-differences; {n_units_observed} are the expected "
f"first-period-per-unit drops, and {n_unexpected_drops} are "
f"additional NaN first-differences (e.g. NaN outcomes or "
f"unit-period gaps upstream). Parallel-trend statistics are "
f"computed on the remaining rows.",
UserWarning,
stacklevel=3,
)
changes_data = data_sorted.dropna(subset=["_outcome_change"])

treated_changes = changes_data[changes_data[treatment_group] == 1]["_outcome_change"].values
Expand Down Expand Up @@ -1001,7 +1024,8 @@ def equivalence_test_trends(

# Compute outcome changes
treated_changes, control_changes = _compute_outcome_changes(
pre_data, outcome, time, treatment_group, unit
pre_data, outcome, time, treatment_group, unit,
caller_label="equivalence_test_trends",
)

# Need at least 2 observations per group to compute variance
Expand Down
26 changes: 23 additions & 3 deletions diff_diff/wooldridge.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

from __future__ import annotations

import warnings
from typing import Any, Dict, List, Optional, Tuple

import numpy as np
Expand Down Expand Up @@ -112,6 +113,26 @@ def _resolve_survey_for_wooldridge(survey_design, sample, cluster_ids, cluster_n
return resolved, survey_weights, survey_weight_type, survey_metadata, df_inf


def _warn_and_fill_nan_cohort(df: pd.DataFrame, cohort: str, stacklevel: int) -> pd.DataFrame:
    """Recode missing cohort values to 0 and report how many rows were affected.

    Shared by `_filter_sample` and `fit()` so that the NaN -> 0
    (never-treated) recoding is announced on either entry path instead of
    happening without a signal.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the cohort column. The column is overwritten in place
        on this object, so callers should pass a copy if the original must
        stay untouched.
    cohort : str
        Name of the cohort column in ``df``.
    stacklevel : int
        Forwarded unchanged to ``warnings.warn``; it is counted from this
        helper's own frame.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, with NaN cohort entries replaced by 0.

    Warns
    -----
    UserWarning
        When at least one row has a NaN cohort value; the message carries
        the affected row count.
    """
    cohort_values = df[cohort]
    n_nan_cohort = int(cohort_values.isna().sum())

    if n_nan_cohort:
        message = (
            f"{n_nan_cohort} row(s) have NaN cohort values; filling with 0 "
            f"and treating the corresponding units as never-treated. Pass "
            f"an explicit never-treated marker (0) if this is not intended."
        )
        warnings.warn(message, UserWarning, stacklevel=stacklevel)

    # The column is reassigned even when nothing was missing, matching the
    # unconditional fill performed before the warning was introduced.
    df[cohort] = cohort_values.fillna(0)
    return df


def _filter_sample(
data: pd.DataFrame,
unit: str,
Expand All @@ -128,8 +149,7 @@ def _filter_sample(
(see _build_interaction_matrix).
"""
df = data.copy()
# Normalise never-treated: fill NaN cohort with 0
df[cohort] = df[cohort].fillna(0)
df = _warn_and_fill_nan_cohort(df, cohort, stacklevel=3)

treated_mask = df[cohort] > 0

Expand Down Expand Up @@ -396,7 +416,7 @@ def fit(
``NotImplementedError``.
"""
df = data.copy()
df[cohort] = df[cohort].fillna(0)
df = _warn_and_fill_nan_cohort(df, cohort, stacklevel=2)

# 0a. Validate cohort is time-invariant within unit
cohort_per_unit = df.groupby(unit)[cohort].nunique()
Expand Down
4 changes: 4 additions & 0 deletions docs/methodology/REGISTRY.md
Original file line number Diff line number Diff line change
Expand Up @@ -720,6 +720,8 @@ See `docs/methodology/continuous-did.md` Section 4 for full details.
- [ ] Lowest-dose-as-control (Remark 3.1)
- [x] Survey design support (Phase 3): weighted B-spline OLS, TSL on influence functions; bootstrap+survey supported (Phase 6)
- **Note:** ContinuousDiD bootstrap with survey weights supported (Phase 6) via PSU-level multiplier weights
- **Note:** The R-style convention of coding never-treated units as `first_treat=inf` is still accepted and normalized to `first_treat=0` internally, but the estimator now emits a `UserWarning` reporting the row count so the silent recategorization is surfaced (axis-E silent coercion under the Phase 2 audit). Only `+inf` is recoded (matching the R convention). Any **negative** `first_treat` value (including `-inf`) raises `ValueError` with the row count, since such units would otherwise silently fall out of both the treated (`g > 0`) and never-treated (`g == 0`) masks. Any **NaN** `first_treat` value likewise raises `ValueError` with the row count, for the same reason. Pass `0` directly for never-treated units to avoid the warning.
- **Note:** Rows with `first_treat=0` (never-treated) that carry a nonzero `dose` have their dose set to zero for internal consistency (never-treated cells must have `D=0` in the dose response). The estimator now emits a `UserWarning` with the affected row count before the zeroing, so unintended nonzero doses on never-treated rows are no longer absorbed without a signal (axis-E silent coercion).

---

Expand Down Expand Up @@ -1303,6 +1305,7 @@ The saturated ETWFE regression includes:
The interaction coefficient `δ_{g,t}` identifies `ATT(g, t)` under parallel trends.
- **Note:** OLS path uses iterative alternating-projection within-transformation (uniform weights) for exact FE absorption on both balanced and unbalanced panels. One-pass demeaning (`y - ȳ_i - ȳ_t + ȳ`) is only exact for balanced panels.
- **Note:** The weighted within-transformation (`utils.within_transform` with `weights`) is invoked on every WooldridgeDiD fit (survey weights when provided, `np.ones` otherwise) and emits a `UserWarning` on non-convergence per the shared convention documented under *Absorbed Fixed Effects with Survey Weights*.
- **Note:** NaN values in the `cohort` column are filled with 0 (treated as never-treated), both in `_filter_sample` and in `fit()`. This recategorization now emits a `UserWarning` reporting the affected row count so it is no longer silent (axis-E silent coercion under the Phase 2 audit). Pass `0` directly for never-treated units to avoid the warning.

*Nonlinear extensions (Wooldridge 2023):*

Expand Down Expand Up @@ -1689,6 +1692,7 @@ Balanced panel. Key variables:
- `Q_i` (`eligibility`): binary, time-invariant eligibility indicator
- Treatment: `D_{i,t} = 1{t >= S_i AND Q_i = 1}` (absorbing)
- Covariates `X_i`: time-invariant (first observation per unit used)
- **Note:** `first_treat=inf` (R-style never-enabled marker) is accepted and normalized to `0` internally. The recoding now emits a `UserWarning` reporting the affected row count so the reclassification is not silent (axis-E silent coercion under the Phase 2 audit, mirroring the StaggeredDiD behavior). Pass `first_treat=0` directly to avoid the warning.

*Estimator equation (Equation 4.1 in paper, as implemented):*

Expand Down
Loading
Loading