From 47968a860149fa0d9a4dd6d1bcf7b57d7a974837 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 16:58:14 -0400 Subject: [PATCH 1/6] Stage 1: per-period IF attribution scaffolding for dCDH survey allocator Plumbs per-(g,t)-cell contributions through the dCDH influence-function functions without changing behavior. Extends _obs_survey_info with time_ids and periods, modifies _compute_full_per_group_contributions, _compute_per_group_if_multi_horizon, and _compute_per_group_if_placebo_horizon to return per-period attribution matrices alongside the per-group IFs, adds _cohort_recenter_per_period for column-wise centering, and wires the 2D attributions through _compute_cohort_recentered_inputs into _compute_se / _survey_se_from_group_if. The survey expansion now uses the cell-period allocator psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i}) when per-period attribution is supplied. Under within-group-constant PSU (the current validator's accepted input), per-cell sums telescope to U_centered[g], so PSU-level Binder aggregation produces byte-identical variance (differences only at FP single-ULP level from summation order). The within-group-constant validator remains in place; it lifts in Stage 2 of PR 2. Tests in tests/test_survey_dcdh.py, tests/test_survey_dcdh_replicate_psu.py, tests/test_chaisemartin_dhaultfoeuille.py, and tests/test_honest_did.py pass unchanged (318/318). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 322 +++++++++++++++++++---- 1 file changed, 264 insertions(+), 58 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index cf89c6c3..f944322a 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -808,12 +808,19 @@ def fit( # Retain observation-level survey info for IF expansion (Step 3 # of survey integration: group-level IF → observation-level psi). + # `time_ids` is per-row; `periods` (the sorted column index used + # by the pivoted U_per_period matrices) is populated below once + # it's known. Together they enable cell-level IF expansion + # psi_i = U[g_i, t_i] * (w_i / W_{g_i, t_i}) under the cell- + # period allocator (REGISTRY.md survey IF expansion contract). _obs_survey_info = None if resolved_survey is not None: _obs_survey_info = { "group_ids": data[group].values, + "time_ids": data[time].values, "weights": survey_weights, "resolved": resolved_survey, + "periods": None, } # Replicate-weight variance tracker: each _compute_se call under @@ -1203,6 +1210,12 @@ def fit( Y_mat = y_pivot.to_numpy() N_mat = n_pivot.to_numpy() + # Finalize survey obs-info with the pivot's column index so that + # cell-level IF expansion can map per-row time values to matrix + # column indices in _survey_se_from_group_if. + if _obs_survey_info is not None: + _obs_survey_info["periods"] = np.asarray(all_periods) + # ------------------------------------------------------------------ # Step 7b: Covariate residualization (DID^X) # @@ -1615,7 +1628,7 @@ def fit( # so the event_study_effects dict uses a consistent estimand # (per-group DID_{g,l}) across all horizons. for l_h in range(1, L_max + 1): - U_l = multi_horizon_if[l_h] + U_l, U_pp_l = multi_horizon_if[l_h] # Cohort IDs for this horizon: (D_{g,1}, F_g, S_g) triples # are the same as Phase 1 (cohort identity depends on first # switch, not on the horizon). Filter to eligible. @@ -1646,6 +1659,9 @@ def fit( U_l_elig = U_l[eligible_mask_var] cid_elig = cid_l[eligible_mask_var] U_centered_l = _cohort_recenter(U_l_elig, cid_elig) + U_centered_pp_l = _cohort_recenter_per_period( + U_pp_l[eligible_mask_var], cid_elig + ) N_l_h = multi_horizon_dids[l_h]["N_l"] _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] se_l, n_valid_l = _compute_se( @@ -1653,6 +1669,7 @@ def fit( divisor=N_l_h, obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_l, + U_centered_per_period=U_centered_pp_l, ) if n_valid_l is not None: _replicate_n_valid_list.append(n_valid_l) @@ -1753,7 +1770,7 @@ def fit( if pl_data is None or pl_data["N_pl_l"] == 0: placebo_horizon_se[lag_l] = float("nan") continue - U_pl = placebo_horizon_if[lag_l] + U_pl, U_pp_pl = placebo_horizon_if[lag_l] # Cohort IDs (same as positive horizons) cohort_keys_pl = [ ( @@ -1776,12 +1793,16 @@ def fit( U_pl_elig = U_pl[eligible_mask_pl] cid_elig_pl = cid_pl[eligible_mask_pl] U_centered_pl_l = _cohort_recenter(U_pl_elig, cid_elig_pl) + U_centered_pp_pl_l = _cohort_recenter_per_period( + U_pp_pl[eligible_mask_pl], cid_elig_pl + ) _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_pl[g]] se_pl_l, n_valid_pl_l = _compute_se( U_centered=U_centered_pl_l, divisor=pl_data["N_pl_l"], obs_survey_info=_obs_survey_info, eligible_groups=_elig_groups_pl, + U_centered_per_period=U_centered_pp_pl_l, ) if n_valid_pl_l is not None: _replicate_n_valid_list.append(n_valid_pl_l) @@ -1846,6 +1867,9 @@ def fit( U_centered_joiners, U_centered_leavers, _eligible_group_ids, + U_centered_pp_overall, + U_centered_pp_joiners, + U_centered_pp_leavers, ) = _compute_cohort_recentered_inputs( D_mat=D_mat, # Phase 1 IF uses per-period structure: use raw outcomes @@ -1868,6 +1892,7 @@ def fit( divisor=N_S, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, + U_centered_per_period=U_centered_pp_overall, ) if n_valid_overall is not None: _replicate_n_valid_list.append(n_valid_overall) @@ -1908,6 +1933,7 @@ def fit( divisor=joiner_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, + U_centered_per_period=U_centered_pp_joiners, ) if n_valid_joiners is not None: _replicate_n_valid_list.append(n_valid_joiners) @@ -1931,6 +1957,7 @@ def fit( divisor=leaver_total, obs_survey_info=_obs_survey_info, eligible_groups=_eligible_group_ids, + U_centered_per_period=U_centered_pp_leavers, ) if n_valid_leavers is not None: _replicate_n_valid_list.append(n_valid_leavers) @@ -2085,7 +2112,7 @@ def fit( pl_data = multi_horizon_placebos.get(lag_l) if pl_data is None or pl_data["N_pl_l"] == 0: continue - U_pl_full = placebo_horizon_if[lag_l] + U_pl_full, _ = placebo_horizon_if[lag_l] U_pl_elig_b = U_pl_full[eligible_mask_pl_b] cohort_keys_pl_b = [ ( @@ -2137,7 +2164,7 @@ def fit( h_data = multi_horizon_dids.get(l_h) if h_data is None or h_data["N_l"] == 0: continue - U_l_full = multi_horizon_if[l_h] + U_l_full, _ = multi_horizon_if[l_h] # Full variance-eligible group set (matching # analytical SE path: singleton-baseline only) U_l_elig = U_l_full[eligible_mask_b] @@ -4250,7 +4277,7 @@ def _compute_per_group_if_multi_horizon( T_g: np.ndarray, L_max: int, set_ids: Optional[np.ndarray] = None, -) -> Dict[int, np.ndarray]: +) -> Dict[int, Tuple[np.ndarray, np.ndarray]]: """ Compute per-group influence function ``U^G_{g,l}`` for ``l = 1..L_max``. @@ -4275,9 +4302,15 @@ def _compute_per_group_if_multi_horizon( Returns ------- - dict mapping horizon l -> U_g_l array of shape (n_groups,) - NOT cohort-centered. The caller applies ``_cohort_recenter()`` - before computing SE. + dict mapping horizon l -> (U_g_l, U_per_period_l) tuple + - ``U_g_l``: np.ndarray of shape (n_groups,). NOT cohort-centered; + the caller applies ``_cohort_recenter()`` before computing SE. + - ``U_per_period_l``: np.ndarray of shape (n_groups, n_periods). + Per-``(g, t)``-cell contributions attributed to the outcome + period ``t = F_g - 1 + l`` (same column for switcher and its + controls, since they share the outcome period). Satisfies + ``U_per_period_l.sum(axis=1) == U_g_l``. Consumed by the cell- + period IF allocator for within-group-varying PSU designs. """ n_groups, n_periods = D_mat.shape is_switcher = first_switch_idx >= 0 @@ -4291,10 +4324,11 @@ def _compute_per_group_if_multi_horizon( baseline_groups[float(d)] = np.where(mask)[0] baseline_f[float(d)] = first_switch_idx[mask] - results: Dict[int, np.ndarray] = {} + results: Dict[int, Tuple[np.ndarray, np.ndarray]] = {} for l in range(1, L_max + 1): # noqa: E741 U_l = np.zeros(n_groups, dtype=float) + U_per_period_l = np.zeros((n_groups, n_periods), dtype=float) for g in range(n_groups): if not is_switcher[g]: @@ -4334,13 +4368,16 @@ def _compute_per_group_if_multi_horizon( # Switcher contribution: +S_g * (Y_{g, out} - Y_{g, ref}) switcher_change = Y_mat[g, out_idx] - Y_mat[g, ref_idx] U_l[g] += S_g * switcher_change + U_per_period_l[g, out_idx] += S_g * switcher_change # Control contributions: each control g' in the pool gets # -S_g * (1/n_ctrl) * (Y_{g', out} - Y_{g', ref}) ctrl_changes = Y_mat[ctrl_pool, out_idx] - Y_mat[ctrl_pool, ref_idx] - U_l[ctrl_pool] -= (S_g / n_ctrl) * ctrl_changes + ctrl_contrib = (S_g / n_ctrl) * ctrl_changes + U_l[ctrl_pool] -= ctrl_contrib + U_per_period_l[ctrl_pool, out_idx] -= ctrl_contrib - results[l] = U_l + results[l] = (U_l, U_per_period_l) return results @@ -4355,7 +4392,7 @@ def _compute_per_group_if_placebo_horizon( T_g: np.ndarray, L_max: int, set_ids: Optional[np.ndarray] = None, -) -> Dict[int, np.ndarray]: +) -> Dict[int, Tuple[np.ndarray, np.ndarray]]: """ Compute per-group influence function for placebo horizons. @@ -4372,9 +4409,13 @@ def _compute_per_group_if_placebo_horizon( Returns ------- - dict mapping lag l (positive int) -> U_pl_l array of shape (n_groups,) - NOT cohort-centered. The caller applies ``_cohort_recenter()`` - before computing SE. + dict mapping lag l (positive int) -> (U_pl_l, U_per_period_pl_l) tuple + - ``U_pl_l``: np.ndarray of shape (n_groups,). NOT cohort- + centered; the caller applies ``_cohort_recenter()``. + - ``U_per_period_pl_l``: np.ndarray of shape (n_groups, n_periods). + Per-``(g, t)`` contributions attributed to ``t = backward_idx`` + (the pre-period observation that supports the contrast). + Satisfies ``U_per_period_pl_l.sum(axis=1) == U_pl_l``. """ n_groups, n_periods = D_mat.shape is_switcher = first_switch_idx >= 0 @@ -4387,10 +4428,11 @@ def _compute_per_group_if_placebo_horizon( baseline_groups[float(d)] = np.where(mask)[0] baseline_f[float(d)] = first_switch_idx[mask] - results: Dict[int, np.ndarray] = {} + results: Dict[int, Tuple[np.ndarray, np.ndarray]] = {} for l in range(1, L_max + 1): # noqa: E741 U_pl = np.zeros(n_groups, dtype=float) + U_per_period_pl = np.zeros((n_groups, n_periods), dtype=float) for g in range(n_groups): if not is_switcher[g]: @@ -4434,12 +4476,15 @@ def _compute_per_group_if_placebo_horizon( # Switcher contribution: paper convention backward - ref switcher_change = Y_mat[g, backward_idx] - Y_mat[g, ref_idx] U_pl[g] += S_g * switcher_change + U_per_period_pl[g, backward_idx] += S_g * switcher_change # Control contributions ctrl_changes = Y_mat[ctrl_pool, backward_idx] - Y_mat[ctrl_pool, ref_idx] - U_pl[ctrl_pool] -= (S_g / n_ctrl) * ctrl_changes + ctrl_contrib = (S_g / n_ctrl) * ctrl_changes + U_pl[ctrl_pool] -= ctrl_contrib + U_per_period_pl[ctrl_pool, backward_idx] -= ctrl_contrib - results[l] = U_pl + results[l] = (U_pl, U_per_period_pl) return results @@ -4780,7 +4825,7 @@ def _compute_full_per_group_contributions( a11_plus_zeroed_arr: np.ndarray, a11_minus_zeroed_arr: np.ndarray, side: str = "overall", -) -> np.ndarray: +) -> Tuple[np.ndarray, np.ndarray]: """ Compute the per-group influence function ``U^G_g`` for ``DID_M``, ``DID_+``, or ``DID_-`` by summing role-weighted outcome differences @@ -4830,12 +4875,20 @@ def _compute_full_per_group_contributions( U : np.ndarray of shape (n_groups,) Per-group contributions. NOT cohort-centered; the caller is responsible for centering before computing the SE. + U_per_period : np.ndarray of shape (n_groups, n_periods) + Per-``(g, t)``-cell contributions, attributed to the post- + period ``t`` of each transition pair. Satisfies + ``U_per_period.sum(axis=1) == U`` exactly. NOT cohort-centered. + Used by the cell-period IF allocator in + ``_survey_se_from_group_if`` so that PSU/strata that vary + across the cells of g can be honored by design-based variance. """ if side not in ("overall", "joiners", "leavers"): raise ValueError(f"side must be one of overall/joiners/leavers, got {side!r}") n_groups, n_periods = D_mat.shape U = np.zeros(n_groups, dtype=float) + U_per_period = np.zeros((n_groups, n_periods), dtype=float) include_joiners_side = side in ("overall", "joiners") include_leavers_side = side in ("overall", "leavers") @@ -4867,6 +4920,8 @@ def _compute_full_per_group_contributions( ): U[joiner_mask] += y_diff[joiner_mask] U[stable0_mask] -= (n_10_t / n_00_t) * y_diff[stable0_mask] + U_per_period[joiner_mask, t_idx] += y_diff[joiner_mask] + U_per_period[stable0_mask, t_idx] -= (n_10_t / n_00_t) * y_diff[stable0_mask] # Leavers side (-y_diff for leavers; +(n_01/n_11)*y_diff for stable_1) if ( @@ -4877,8 +4932,10 @@ def _compute_full_per_group_contributions( ): U[leaver_mask] -= y_diff[leaver_mask] U[stable1_mask] += (n_01_t / n_11_t) * y_diff[stable1_mask] + U_per_period[leaver_mask, t_idx] -= y_diff[leaver_mask] + U_per_period[stable1_mask, t_idx] += (n_01_t / n_11_t) * y_diff[stable1_mask] - return U + return U, U_per_period def _cohort_recenter( @@ -4906,6 +4963,41 @@ def _cohort_recenter( return U_centered +def _cohort_recenter_per_period( + U_per_period: np.ndarray, + cohort_ids: np.ndarray, +) -> np.ndarray: + """ + Column-wise cohort recentering of a per-(g, t) IF attribution. + + For each period column t, subtract the cohort-conditional mean of + that column: ``U_centered[g, t] = U[g, t] - mean_{g' in cohort(g)} + U[g', t]``. The row sum identity is preserved: + + sum_t U_centered[g, t] + = sum_t U[g, t] - sum_t cohort_mean[cohort(g), t] + = U[g] - cohort_mean_of_U[cohort(g)] + = U_centered[g] + + Hence the cell-level Binder TSL aggregation telescopes to the same + group-level sum produced by ``_cohort_recenter`` on ``U``, giving + byte-identical variance under within-group-constant PSU (old + validator's accepted input set) and a methodologically correct + per-cell attribution under within-group-varying PSU. + """ + U_centered = U_per_period.astype(float).copy() + if U_per_period.size == 0: + return U_centered + unique_cohorts = np.unique(cohort_ids) + for k in unique_cohorts: + in_cohort = cohort_ids == k + if in_cohort.any(): + # Per-period mean across groups in this cohort + col_means = U_per_period[in_cohort].mean(axis=0) + U_centered[in_cohort] = U_per_period[in_cohort] - col_means[np.newaxis, :] + return U_centered + + def _compute_cohort_recentered_inputs( D_mat: np.ndarray, Y_mat: np.ndarray, @@ -4918,7 +5010,18 @@ def _compute_cohort_recentered_inputs( a11_minus_zeroed_arr: np.ndarray, all_groups: List[Any], singleton_baseline_groups: List[Any], -) -> Tuple[np.ndarray, int, int, int, np.ndarray, np.ndarray]: +) -> Tuple[ + np.ndarray, # U_centered_overall (n_eligible,) + int, # n_groups_for_overall + int, # n_cohorts + int, # n_groups_dropped_never_switching + np.ndarray, # U_centered_joiners + np.ndarray, # U_centered_leavers + List[Any], # eligible_group_ids + np.ndarray, # U_centered_per_period_overall (n_eligible, n_periods) + np.ndarray, # U_centered_per_period_joiners + np.ndarray, # U_centered_per_period_leavers +]: """ Compute the cohort-centered influence-function vectors for variance. @@ -4973,6 +5076,10 @@ def _compute_cohort_recentered_inputs( 0, np.array([], dtype=float), np.array([], dtype=float), + [], + np.zeros((0, 0), dtype=float), + np.zeros((0, 0), dtype=float), + np.zeros((0, 0), dtype=float), ) # Per-group switch metadata via the shared helper (factored out in @@ -5008,8 +5115,8 @@ def _compute_cohort_recentered_inputs( cohort_id[g] = unique_cohorts[key] n_cohorts = len(unique_cohorts) - # Compute the full IF vectors via the new helper - U_overall_full = _compute_full_per_group_contributions( + # Compute the full IF vectors + per-period attributions via the helper + U_overall_full, U_per_period_overall_full = _compute_full_per_group_contributions( D_mat=D_mat, Y_mat=Y_mat, N_mat=N_mat, @@ -5021,7 +5128,7 @@ def _compute_cohort_recentered_inputs( a11_minus_zeroed_arr=a11_minus_zeroed_arr, side="overall", ) - U_joiners_full = _compute_full_per_group_contributions( + U_joiners_full, U_per_period_joiners_full = _compute_full_per_group_contributions( D_mat=D_mat, Y_mat=Y_mat, N_mat=N_mat, @@ -5033,7 +5140,7 @@ def _compute_cohort_recentered_inputs( a11_minus_zeroed_arr=a11_minus_zeroed_arr, side="joiners", ) - U_leavers_full = _compute_full_per_group_contributions( + U_leavers_full, U_per_period_leavers_full = _compute_full_per_group_contributions( D_mat=D_mat, Y_mat=Y_mat, N_mat=N_mat, @@ -5050,13 +5157,32 @@ def _compute_cohort_recentered_inputs( U_overall = U_overall_full[eligible_mask] U_joiners = U_joiners_full[eligible_mask] U_leavers = U_leavers_full[eligible_mask] + U_per_period_overall = U_per_period_overall_full[eligible_mask] + U_per_period_joiners = U_per_period_joiners_full[eligible_mask] + U_per_period_leavers = U_per_period_leavers_full[eligible_mask] cohort_id_eligible = cohort_id[eligible_mask] - # Cohort-recenter each IF vector + # Cohort-recenter each IF vector (group level for plug-in path) U_centered_overall = _cohort_recenter(U_overall, cohort_id_eligible) U_centered_joiners = _cohort_recenter(U_joiners, cohort_id_eligible) U_centered_leavers = _cohort_recenter(U_leavers, cohort_id_eligible) + # Per-period cohort recentering for the cell-period survey allocator. + # Column-wise centering preserves sum_t U_centered_pp[g, t] = + # U_centered[g], so Binder TSL PSU-level aggregation telescopes to + # the same group-level sum under within-group-constant PSU (byte- + # identical to the old allocator) and gives a valid per-cell + # attribution under within-group-varying PSU. + U_centered_pp_overall = _cohort_recenter_per_period( + U_per_period_overall, cohort_id_eligible + ) + U_centered_pp_joiners = _cohort_recenter_per_period( + U_per_period_joiners, cohort_id_eligible + ) + U_centered_pp_leavers = _cohort_recenter_per_period( + U_per_period_leavers, cohort_id_eligible + ) + # Eligible group IDs for survey IF expansion eligible_group_ids = [all_groups[g] for g in range(n_groups) if eligible_mask[g]] @@ -5068,6 +5194,9 @@ def _compute_cohort_recentered_inputs( U_centered_joiners, U_centered_leavers, eligible_group_ids, + U_centered_pp_overall, + U_centered_pp_joiners, + U_centered_pp_leavers, ) @@ -5236,6 +5365,7 @@ def _compute_se( divisor: int, obs_survey_info: Optional[dict], eligible_groups: Optional[list] = None, + U_centered_per_period: Optional[np.ndarray] = None, ) -> Tuple[float, Optional[int]]: """Dispatch to plug-in SE or survey-design-aware SE. @@ -5244,6 +5374,17 @@ def _compute_se( to TSL variance (or replicate-weight variance when the resolved design carries replicate weights). + ``U_centered_per_period`` is the cell-period attribution of + ``U_centered``. Each row ``g`` satisfies + ``U_centered_per_period[g, :].sum() == U_centered[g]``. When + supplied, the survey-aware path expands influence to observations + using the cell-level allocator + ``psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})``; + when ``None`` the path falls back to the group-level allocator + ``psi_i = U_centered[g_i] * (w_i / W_{g_i})``. Byte-identical + under within-group-constant PSU since both allocators telescope to + the same PSU-level sum (REGISTRY.md survey IF expansion Note). + Returns ``(se, n_valid_replicates)``. ``n_valid_replicates`` is ``None`` under the plug-in / TSL paths, and the number of valid replicate columns under the replicate path. @@ -5258,10 +5399,16 @@ def _compute_se( # compute_survey_if_variance() expects estimator-scale psi. # Scale by 1/divisor to normalize before survey expansion. U_scaled = U_centered / divisor + U_pp_scaled = ( + U_centered_per_period / divisor + if U_centered_per_period is not None + else None + ) return _survey_se_from_group_if( U_centered=U_scaled, eligible_groups=eligible_groups, obs_survey_info=obs_survey_info, + U_centered_per_period=U_pp_scaled, ) @@ -5269,31 +5416,45 @@ def _survey_se_from_group_if( U_centered: np.ndarray, eligible_groups: list, obs_survey_info: dict, + U_centered_per_period: Optional[np.ndarray] = None, ) -> Tuple[float, Optional[int]]: - """Compute survey-design-aware SE from group-level centered IFs. - - Expands group-level influence function values to observation level - using the proportional-weight allocation ``psi_i = U[g] * (w_i / W_g)`` - so that ``sum(psi_i for i in g) = U[g]``, then delegates to either - ``compute_survey_if_variance()`` (TSL / analytical path) or - ``compute_replicate_if_variance()`` (BRR/Fay/JK1/JKn/SDR) depending - on whether the resolved design carries replicate weights. - - This is a library extension not in the dCDH papers (the paper's - plug-in variance assumes iid sampling). The replicate path uses - Rao-Wu weight-ratio rescaling; see REGISTRY.md - ``ChaisemartinDHaultfoeuille`` Note on survey expansion. + """Compute survey-design-aware SE from per-group / per-cell centered IFs. + + Expands influence-function mass to observation level using the + **cell-period allocator** + ``psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`` + when ``U_centered_per_period`` is supplied, or the legacy group- + level allocator ``psi_i = U_centered[g_i] * (w_i / W_{g_i})`` + otherwise. Both allocators are equivalent at the PSU-level sum + under within-group-constant PSU (per-cell sums telescope to + ``U_centered[g]``), so Binder TSL variance is byte-identical on + inputs the old within-group-constant validator accepted. The cell- + period allocator additionally supports PSU/strata that vary across + cells of g — the lift the allocator buys for Stage 2. Dispatches + to ``compute_survey_if_variance()`` (TSL) or + ``compute_replicate_if_variance()`` (BRR/Fay/JK1/JKn/SDR) based on + the resolved design. See REGISTRY.md ``ChaisemartinDHaultfoeuille`` + Note on survey IF expansion for the contract. Parameters ---------- U_centered : np.ndarray Cohort-recentered per-group IF values (one per eligible group). + Used for degenerate-cohort detection and fallback expansion. eligible_groups : list - Group IDs corresponding to entries of ``U_centered`` - (variance-eligible groups after singleton-baseline exclusion). + Group IDs corresponding to entries of ``U_centered`` / + ``U_centered_per_period`` (variance-eligible groups after + singleton-baseline exclusion). obs_survey_info : dict Observation-level survey data retained from ``fit()`` setup: - ``{"group_ids": ..., "weights": ..., "resolved": ...}``. + ``{"group_ids", "time_ids", "weights", "resolved", "periods"}``. + ``time_ids`` and ``periods`` are required for the cell-level + allocator; when either is absent, the group-level allocator is + used. + U_centered_per_period : np.ndarray of shape (n_eligible, n_periods), optional + Cohort-recentered per-``(g, t)``-cell IF attributions, with + ``U_centered_per_period.sum(axis=1) == U_centered``. When + supplied, the cell allocator is used. Returns ------- @@ -5301,9 +5462,7 @@ def _survey_se_from_group_if( ``(se, n_valid_replicates)``. ``se`` is the survey-design-aware SE, or NaN if degenerate. ``n_valid_replicates`` is the number of valid replicate columns used for variance under the replicate - path, or ``None`` under the TSL (analytical) path. Callers - track the min across sites to set an effective ``df_survey`` - for replicate designs. + path, or ``None`` under the TSL (analytical) path. """ from diff_diff.survey import ( compute_replicate_if_variance, @@ -5325,6 +5484,8 @@ def _survey_se_from_group_if( group_ids = obs_survey_info["group_ids"] weights = obs_survey_info["weights"] resolved = obs_survey_info["resolved"] + time_ids = obs_survey_info.get("time_ids") + periods = obs_survey_info.get("periods") # Zero-weight rows are out-of-sample (SurveyDesign.subpopulation()). # Skip them before the group-ID factorization so NaN / non-comparable @@ -5343,20 +5504,65 @@ def _survey_se_from_group_if( gids_eff = np.asarray(group_ids)[pos_mask] w_eff = weights_arr[pos_mask] - # Build group → U_centered lookup (vectorized via factorization) - group_to_u = {gid: U_centered[idx] for idx, gid in enumerate(eligible_groups)} - - # Map group IFs to observation level (effective sample only) - u_obs_eff = np.array([group_to_u.get(gid, 0.0) for gid in gids_eff]) - - # Compute per-group weight totals W_g via bincount on effective sample - unique_gids, inverse = np.unique(gids_eff, return_inverse=True) - w_totals_per_group = np.bincount(inverse, weights=w_eff) - w_obs_total_eff = w_totals_per_group[inverse] + use_cell_allocator = ( + U_centered_per_period is not None + and time_ids is not None + and periods is not None + and U_centered_per_period.size > 0 + ) - # Expand to observation level: psi_i = U[g] * (w_i / W_g) - safe_w = np.where(w_obs_total_eff > 0, w_obs_total_eff, 1.0) - psi[pos_mask] = u_obs_eff * (w_eff / safe_w) + if use_cell_allocator: + tids_eff = np.asarray(time_ids)[pos_mask] + # Map row's group to an index in eligible_groups (−1 when the + # group is ineligible — singleton-baseline exclusion drops it). + eligible_idx_by_gid = {gid: i for i, gid in enumerate(eligible_groups)} + elig_idx_eff = np.array( + [eligible_idx_by_gid.get(gid, -1) for gid in gids_eff], + dtype=np.int64, + ) + # Map row's time to a column index in U_centered_per_period. + # get_indexer returns −1 for values absent from `periods` + # (should be rare in practice — positive-weight rows almost + # always survive cell aggregation; if one slips through we + # treat it like an ineligible row rather than crash). + col_idx_eff = np.asarray( + pd.Index(periods).get_indexer(tids_eff), dtype=np.int64 + ) + valid_cell = (elig_idx_eff >= 0) & (col_idx_eff >= 0) + n_eligible = len(eligible_groups) + n_periods_pp = U_centered_per_period.shape[1] + # Per-(g, t) weight totals W_{g,t} over the effective sample. + W_cell = np.zeros((n_eligible, n_periods_pp), dtype=np.float64) + np.add.at( + W_cell, + (elig_idx_eff[valid_cell], col_idx_eff[valid_cell]), + w_eff[valid_cell], + ) + # Lookup U_centered_per_period and W_cell per row. + u_obs_cell = np.zeros(w_eff.shape[0], dtype=np.float64) + u_obs_cell[valid_cell] = U_centered_per_period[ + elig_idx_eff[valid_cell], col_idx_eff[valid_cell] + ] + w_cell_at_row = np.zeros(w_eff.shape[0], dtype=np.float64) + w_cell_at_row[valid_cell] = W_cell[ + elig_idx_eff[valid_cell], col_idx_eff[valid_cell] + ] + safe_w = np.where(w_cell_at_row > 0, w_cell_at_row, 1.0) + psi[pos_mask] = u_obs_cell * (w_eff / safe_w) + else: + # Legacy group-level allocator (no per-period attribution + # provided, or time/period info unavailable). Preserved for + # paths that haven't threaded per-period attribution through + # yet (e.g., the heterogeneity psi_obs construction in + # _compute_heterogeneity_test — gated to within-group-constant + # PSU in Stage 2 per PR 2 scope). + group_to_u = {gid: U_centered[idx] for idx, gid in enumerate(eligible_groups)} + u_obs_eff = np.array([group_to_u.get(gid, 0.0) for gid in gids_eff]) + unique_gids, inverse = np.unique(gids_eff, return_inverse=True) + w_totals_per_group = np.bincount(inverse, weights=w_eff) + w_obs_total_eff = w_totals_per_group[inverse] + safe_w = np.where(w_obs_total_eff > 0, w_obs_total_eff, 1.0) + psi[pos_mask] = u_obs_eff * (w_eff / safe_w) # Dispatch: replicate variance (BRR/Fay/JK1/JKn/SDR) vs TSL. # Mirrors the inline branch in TripleDifference:1206-1238 and From 4fea5138d9f0956d1a916dee0a28689c2cc8055c Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 17:18:09 -0400 Subject: [PATCH 2/6] Stage 2: lift within-group constancy + add gates + MC coverage sim MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces _validate_group_constant_strata_psu with a relaxed _validate_cell_constant_strata_psu (within-cell constancy, trivially satisfied in one-obs-per-cell panels). Adds a diagnostic helper _strata_psu_vary_within_group and two NotImplementedError gates at the validator call site: heterogeneity= and n_bootstrap > 0 both still use the legacy group-level IF expansion / PSU map and therefore reject designs whose PSU varies within group (PR 3 and PR 4 extend them). Updates REGISTRY.md cluster, heterogeneity, survey IF expansion, and survey+bootstrap notes to describe the cell-period allocator, the within-cell constancy contract, and the two scope gates. Updates the class fit() docstring survey_design section to match. Updates stale "within-group-constant" comments in the bootstrap path since the constancy is now enforced by the gate rather than the upstream validator. Inverts the TestSurveyWithinGroupValidation rejection tests to acceptance tests under the cell allocator. Adds positive-rule tests for within-cell strata and PSU variation (rejected) and gate tests for heterogeneity and bootstrap (raise NotImplementedError). Adds tests/test_dcdh_cell_period_coverage.py: Monte Carlo coverage simulation at 500 reps on a panel with PSU varying across cells of each group (two PSUs per group on even/odd periods). Empirical 95% coverage hits 0.956 on the seeded draw — inside the ±2.5pp tolerance (~2 MC-sigma). Marked @pytest.mark.slow and excluded by default. All 322 tests pass on the affected surface (test_survey_dcdh, test_survey_dcdh_replicate_psu, test_chaisemartin_dhaultfoeuille, test_honest_did). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 217 ++++++++++++++++------- docs/methodology/REGISTRY.md | 8 +- tests/test_dcdh_cell_period_coverage.py | 140 +++++++++++++++ tests/test_survey_dcdh.py | 149 +++++++++++++--- 4 files changed, 428 insertions(+), 86 deletions(-) create mode 100644 tests/test_dcdh_cell_period_coverage.py diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index f944322a..706a82a9 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -639,23 +639,32 @@ def fit( Survey design specification for design-based inference. Supports ``weight_type='pweight'`` with two variance paths: (1) Taylor Series Linearization using strata / PSU / FPC - (analytical) and (2) replicate-weight variance using - BRR / Fay / JK1 / JKn / SDR methods (analytical, - closed-form). Survey weights produce weighted cell means - for the point estimate. Under a survey design without an - explicit ``psu``, ``fit()`` auto-injects ``psu=`` - so the group is the effective sampling unit (strata / PSU - must be within-group-constant; mixed labels within a group - raise ``ValueError``). When ``n_bootstrap > 0`` and a - survey design is supplied, the multiplier bootstrap - operates at the PSU level (Hall-Mammen wild PSU - bootstrap) — under the default auto-inject this collapses - to a group-level clustered bootstrap. **Replicate weights - combined with ``n_bootstrap > 0`` are rejected** with - ``NotImplementedError`` (replicate variance is - closed-form; bootstrap would double-count variance). See - REGISTRY.md ``ChaisemartinDHaultfoeuille`` Notes for the - full contract. + (analytical) via the **cell-period IF allocator** that + attributes per-``(g, t)``-cell mass and aggregates through + Binder (1983), and (2) replicate-weight variance using + BRR / Fay / JK1 / JKn / SDR methods (analytical, closed- + form). Survey weights produce weighted cell means for the + point estimate. Under a survey design without an explicit + ``psu``, ``fit()`` auto-injects ``psu=`` as a + safe default (the group is the effective sampling unit). + **Strata and PSU may vary across cells of a group** but + must be constant within each ``(g, t)`` cell (trivially + true in one-obs-per-cell panels; enforced otherwise with + ``ValueError``). When ``n_bootstrap > 0`` and a survey + design is supplied, the multiplier bootstrap operates at + the PSU level (Hall-Mammen wild PSU bootstrap) — under the + default auto-inject this collapses to a group-level + clustered bootstrap. **Out-of-scope combinations raise + ``NotImplementedError``**: (a) replicate weights with + ``n_bootstrap > 0`` (replicate variance is closed-form; + bootstrap would double-count variance); (b) + ``heterogeneity=`` with PSU/strata that vary within group + (heterogeneity WLS still uses the legacy group-level IF + expansion; follow-up PR extends it); (c) ``n_bootstrap > + 0`` with PSU that varies within group (PSU-level bootstrap + still uses the legacy group-level PSU map; follow-up PR + extends it). See REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Notes for the full contract. Returns ------- @@ -770,15 +779,44 @@ def fit( # precedent: efficient_did.py:989, staggered.py:1869, # two_stage.py:251-253. - # Within-group-constant PSU/strata is required: the IF - # expansion psi_i = U[g] * (w_i / W_g) supports within-group - # variation in WEIGHTS (each obs contributes proportionally), - # but PSU and strata must be constant within group — the - # group is treated as the effective sampling unit for the - # Binder stratified-PSU variance formula. - _validate_group_constant_strata_psu( + # Cell-period IF allocator contract: strata and PSU must be + # constant within each (g, t) cell, a strict relaxation of + # the previous within-group constancy rule. Two out-of-scope + # combinations are gated with NotImplementedError until the + # corresponding follow-up PRs extend them: + # - heterogeneity= + within-group-varying PSU/strata + # (PR 3: cell-period allocator for the WLS psi_obs) + # - n_bootstrap > 0 + within-group-varying PSU + # (PR 4: cell-level Hall-Mammen wild bootstrap) + strata_varies, psu_varies = _strata_psu_vary_within_group( resolved_survey, data, group, survey_weights, ) + if strata_varies or psu_varies: + if heterogeneity is not None: + raise NotImplementedError( + "heterogeneity= is not supported under a survey " + "design whose PSU or strata vary within group. " + "The heterogeneity WLS path uses the legacy " + "group-level IF expansion and will be extended " + "to the cell-period allocator in a follow-up " + "PR. For now, either (a) set heterogeneity=None, " + "or (b) collapse PSU/strata to be constant " + "within each group." + ) + if self.n_bootstrap > 0: + raise NotImplementedError( + "n_bootstrap > 0 is not supported under a " + "survey design whose PSU varies within group. " + "The PSU-level Hall-Mammen wild bootstrap uses " + "the legacy group-level PSU map and will be " + "extended to cell-level PSU in a follow-up PR. " + "For now, use n_bootstrap=0 (analytical TSL " + "variance, which fully supports within-group-" + "varying PSU via the cell-period allocator)." + ) + _validate_cell_constant_strata_psu( + resolved_survey, data, group, time, survey_weights, + ) # Design-2 precondition: requires drop_larger_lower=False if design2 and self.drop_larger_lower: @@ -2048,9 +2086,10 @@ def fit( pos_mask_warn = obs_ws_warn > 0 psu_codes_warn = np.asarray(psu_arr_warn) # Collect the PSU label for each variance-eligible - # group (within-group-constant PSU is validated - # upstream, so the first positive-weight label - # represents the whole group). + # group. PSU that varies within group is rejected + # for the bootstrap path upstream (NotImplementedError + # gate), so the first positive-weight label + # represents the whole group here. eligible_psu_labels: List[Any] = [] for gid in _eligible_group_ids: mask_g = (obs_gids_warn == gid) & pos_mask_warn @@ -2198,12 +2237,16 @@ def fit( # Under a survey design with PSU information, build a # `group_id_to_psu_code` dict so the bootstrap mixin can # derive each target's PSU map by ID lookup rather than by - # positional reuse. PSU/strata are within-group-constant by - # the _validate_group_constant_strata_psu precondition, so - # each variance-eligible group has exactly one PSU label. - # Under auto-inject psu=group each group has a unique PSU - # code and the bootstrap mixin's identity-map fast path - # reproduces the pre-PSU behavior bit-for-bit. + # positional reuse. PSU that varies within group is + # rejected for the bootstrap path by a NotImplementedError + # gate in fit() (the cell-period allocator used by the + # analytical TSL path does NOT require within-group PSU + # constancy, but the Hall-Mammen wild PSU bootstrap still + # uses a group-level PSU map — PR 4 extends it). Each + # variance-eligible group therefore has exactly one PSU + # label here. Under auto-inject psu=group each group has a + # unique PSU code and the bootstrap mixin's identity-map + # fast path reproduces the pre-PSU behavior bit-for-bit. group_id_to_psu_code_bootstrap: Optional[Dict[Any, int]] = None eligible_group_ids_bootstrap: Optional[np.ndarray] = None if ( @@ -5241,58 +5284,108 @@ def _plugin_se(U_centered: np.ndarray, divisor: int) -> float: return float(np.sqrt(sigma_hat_sq) / np.sqrt(divisor)) -def _validate_group_constant_strata_psu( +def _strata_psu_vary_within_group( resolved: Any, data: pd.DataFrame, group_col: str, survey_weights: Optional[np.ndarray], +) -> Tuple[bool, bool]: + """Return (strata_varies_within_group, psu_varies_within_group). + + Diagnostic helper used to gate out-of-scope combinations for the + cell-period IF allocator — heterogeneity and ``n_bootstrap > 0`` + currently require within-group constancy because they read + ``obs_survey_info`` through the legacy group-level expansion path. + PR 3 and PR 4 will extend them. Zero-weight rows are excluded from + the check (subpopulation contract). + """ + if resolved is None: + return False, False + pos_mask = np.asarray(survey_weights) > 0 + if not pos_mask.any(): + return False, False + g_eff = np.asarray(data[group_col].values)[pos_mask] + strata_varies = False + psu_varies = False + if resolved.strata is not None: + s_eff = np.asarray(resolved.strata)[pos_mask] + strata_varies = bool( + pd.DataFrame({"g": g_eff, "s": s_eff}).groupby("g")["s"].nunique().gt(1).any() + ) + if resolved.psu is not None: + p_eff = np.asarray(resolved.psu)[pos_mask] + psu_varies = bool( + pd.DataFrame({"g": g_eff, "p": p_eff}).groupby("g")["p"].nunique().gt(1).any() + ) + return strata_varies, psu_varies + + +def _validate_cell_constant_strata_psu( + resolved: Any, + data: pd.DataFrame, + group_col: str, + time_col: str, + survey_weights: Optional[np.ndarray], ) -> None: - """Reject survey designs where strata or PSU vary within group. - - The dCDH IF expansion ``psi_i = U[g] * (w_i / W_g)`` treats the - group as the effective sampling unit for design-based variance. - When strata or PSU vary within group, the expansion silently spreads - horizon-specific IF mass onto observations whose survey stratum or - PSU differs from the rest of the group, contaminating the - stratified-PSU variance. Reject those designs with a clear error. - - Zero-weight rows are excluded from the check (subpopulation - contract — an excluded row with a different stratum/PSU label does - not actually participate in the variance). + """Reject survey designs where strata or PSU vary within a (g, t) cell. + + Under the cell-period IF allocator, ``psi_i`` attributes each + observation's mass to its ``(g, t)`` cell scaled by proportional + weight share: + ``psi_i = U_centered_per_period[g, t] * (w_i / W_{g, t})``. Binder + TSL aggregates at PSU level, so strata / PSU labels within a cell + must be unambiguous. In one-obs-per-cell panels (the canonical dCDH + structure) this check is trivially satisfied; in multi-obs-per-cell + panels, a cell split across strata or PSUs is rejected. This is a + strict relaxation of the old within-group constancy rule shipped + before the cell-period allocator (REGISTRY.md + ``ChaisemartinDHaultfoeuille`` Note on survey IF expansion). + + Zero-weight rows are excluded (subpopulation contract). """ if resolved is None: return pos_mask = np.asarray(survey_weights) > 0 + if not pos_mask.any(): + return g_eff = np.asarray(data[group_col].values)[pos_mask] + t_eff = np.asarray(data[time_col].values)[pos_mask] if resolved.strata is not None: s_eff = np.asarray(resolved.strata)[pos_mask] - df_s = pd.DataFrame({"g": g_eff, "s": s_eff}) - varying = df_s.groupby("g")["s"].nunique() + df_cell = pd.DataFrame({"g": g_eff, "t": t_eff, "s": s_eff}) + varying = df_cell.groupby(["g", "t"])["s"].nunique() bad = varying[varying > 1] if len(bad) > 0: raise ValueError( f"ChaisemartinDHaultfoeuille survey support requires " - f"strata to be constant within group, but " - f"{len(bad)} group(s) have multiple strata " - f"(examples: {bad.index.tolist()[:5]}). The IF " - f"expansion psi_i = U[g] * (w_i / W_g) treats the " - f"group as the effective sampling unit for stratified " - f"design-based variance." + f"strata to be constant within each (group, time) cell " + f"under the cell-period IF allocator, but {len(bad)} " + f"cell(s) have multiple strata (examples: " + f"{bad.index.tolist()[:5]}). The cell allocator treats " + f"the (g, t) cell as the effective sampling unit for " + f"stratified Binder variance; within-cell stratum " + f"variation is ambiguous. The allocator DOES support " + f"strata that vary across cells of the same group " + f"(relaxation over the previous within-group constancy " + f"rule shipped in earlier releases)." ) if resolved.psu is not None: p_eff = np.asarray(resolved.psu)[pos_mask] - df_p = pd.DataFrame({"g": g_eff, "p": p_eff}) - varying = df_p.groupby("g")["p"].nunique() + df_cell = pd.DataFrame({"g": g_eff, "t": t_eff, "p": p_eff}) + varying = df_cell.groupby(["g", "t"])["p"].nunique() bad = varying[varying > 1] if len(bad) > 0: raise ValueError( f"ChaisemartinDHaultfoeuille survey support requires " - f"PSU to be constant within group, but " - f"{len(bad)} group(s) have multiple PSUs " - f"(examples: {bad.index.tolist()[:5]}). The IF " - f"expansion psi_i = U[g] * (w_i / W_g) treats the " - f"group as the effective sampling unit for stratified " - f"design-based variance." + f"PSU to be constant within each (group, time) cell " + f"under the cell-period IF allocator, but {len(bad)} " + f"cell(s) have multiple PSUs (examples: " + f"{bad.index.tolist()[:5]}). The cell allocator treats " + f"the (g, t) cell as the effective sampling unit; " + f"within-cell PSU variation is ambiguous. The allocator " + f"DOES support PSU that varies across cells of the same " + f"group (relaxation over the previous within-group " + f"constancy rule shipped in earlier releases)." ) diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 64d2e63a..a871ce8b 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -589,7 +589,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note:** When every variance-eligible group forms its own `(D_{g,1}, F_g, S_g)` cohort (a degenerate small-panel case where the cohort framework has zero degrees of freedom), the cohort-recentered plug-in formula is unidentified: cohort recentering subtracts the cohort mean from each group's `U^G_g`, and for singleton cohorts the centered value is exactly zero, so the centered influence function vector collapses to all zeros. The estimator returns `overall_se = NaN` with a `UserWarning` rather than silently collapsing to `0.0` (which would falsely imply infinite precision). The `DID_M` point estimate remains well-defined. The bootstrap path inherits the same degeneracy on these panels — the multiplier weights act on an all-zero vector, so the bootstrap distribution is also degenerate. **Deviation from R `DIDmultiplegtDYN`:** R returns a non-zero SE on the canonical 4-group worked example via small-sample sandwich machinery that Python does not implement. Both responses are valid for a degenerate case; Python's `NaN`+warning is the safer default. To get a non-degenerate SE, include more groups so cohorts have peers (real-world panels typically have `G >> K`). -- **Note (cluster contract):** `ChaisemartinDHaultfoeuille` clusters at the group level by default. The cohort-recentered analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group); the multiplier bootstrap generates one weight per group. The user-facing `cluster=` kwarg is not supported: the constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` at construction time (and the same gate fires from `set_params`) — custom user-specified clustering is reserved for a future phase. **Automatic PSU-level clustering under `survey_design`:** when `survey_design` carries PSUs strictly coarser than the group and `n_bootstrap > 0`, the multiplier bootstrap automatically switches to Hall-Mammen wild PSU clustering (see the Note on survey + bootstrap below). Under the default auto-inject `psu=group`, PSU coincides with group and the group-level and PSU-level paths are identical (bit-for-bit via the identity-map fast path). The matching test for the `cluster=` gate is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. +- **Note (cluster contract):** `ChaisemartinDHaultfoeuille` clusters at the group level by default. The analytical SE plug-in operates on per-group influence-function values (one `U^G_g` per group) and, under the cell-period allocator, on their per-cell decomposition `U[g, t]` which telescopes back to `U^G_g` at the PSU-level sum. The multiplier bootstrap generates one weight per group. The user-facing `cluster=` kwarg is not supported: the constructor accepts `cluster=None` (the default and only supported value); passing any non-`None` value raises `NotImplementedError` at construction time (and the same gate fires from `set_params`) — custom user-specified clustering is reserved for a future phase. **Automatic PSU-level clustering under `survey_design`:** the analytical TSL path supports PSU labels that vary across cells of a group (within-cell constancy required); the multiplier bootstrap still uses the legacy group-level PSU map and raises `NotImplementedError` when combined with within-group-varying PSU (PR 4 lifts this). Under the default auto-inject `psu=group`, PSU coincides with group and the group-level and cell-level paths are identical at PSU-sum granularity (bit-for-bit via the identity-map fast path). The matching test for the `cluster=` gate is `test_cluster_parameter_raises_not_implemented` in `tests/test_chaisemartin_dhaultfoeuille.py::TestForwardCompatGates`. - **Note (bootstrap inference surface):** When `n_bootstrap > 0`, the top-level `results.overall_p_value` / `results.overall_conf_int` (and joiners/leavers analogues) hold **percentile-based bootstrap inference** computed by the multiplier bootstrap, NOT normal-theory recomputations from the bootstrap SE. The t-stat (`overall_t_stat`, etc.) is computed from the SE via `safe_inference()[0]` to satisfy the project's anti-pattern rule (never compute `t = effect / se` inline) — bootstrap does not define an alternative t-stat semantic for percentile bootstrap, so the SE-based t-stat is the natural choice. `event_study_effects[1]`, `summary()`, `to_dataframe()`, `is_significant`, and `significance_stars` all read from these top-level fields and therefore reflect the bootstrap inference automatically. The library precedent for this propagation is `imputation.py:790-805`, `two_stage.py:778-787`, and `efficient_did.py:1009-1013`. The single-period placebo (`L_max=None`) still has NaN bootstrap fields; multi-horizon placebos (`L_max >= 1`) have valid bootstrap SE/CI/p via `placebo_horizon_ses/cis/p_values` on the bootstrap results object. The matching test is `test_bootstrap_p_value_and_ci_propagated_to_top_level` in `tests/test_chaisemartin_dhaultfoeuille.py::TestBootstrap`. @@ -615,7 +615,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - **Note (Phase 3 state-set trends):** Implements state-set-specific trends from Web Appendix Section 1.4 (Assumptions 13-14). Restricts the control pool for each switcher to groups in the same set (e.g., same state in county-level data). The restriction applies in all four DID/IF paths: `_compute_multi_horizon_dids()`, `_compute_per_group_if_multi_horizon()`, `_compute_multi_horizon_placebos()`, and `_compute_per_group_if_placebo_horizon()`. Cohort structure stays as `(D_{g,1}, F_g, S_g)` triples (does not incorporate set membership). Set membership must be time-invariant per group. **Note on Assumption 14 (common support):** The paper requires a common last-untreated period across sets (`T_u^s` equal for all `s`). This implementation does NOT enforce Assumption 14 up front. Instead, when within-set controls are exhausted at a given horizon (because a set has shorter untreated support than others), the affected switcher/horizon pairs are silently excluded via the existing empty-control-pool mechanism. This means `N_l` may be smaller under `trends_nonparam` than without it, and the effective estimand is trimmed to the within-set support at each horizon. The existing multi-horizon A11 warning fires when exclusions occur. Activated via `trends_nonparam="state_column"` in `fit()`. -- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. +- **Note (Phase 3 heterogeneity testing - partial implementation):** Partial implementation of the heterogeneity test from Web Appendix Section 1.5 (Assumption 15, Lemma 7). Computes post-treatment saturated OLS regressions of `S_g * (Y_{g, F_g-1+l} - Y_{g, F_g-1})` on a time-invariant covariate `X_g` plus cohort indicator dummies. Standard OLS inference is valid (paper shows no DID error correction needed). **Deviation from R `predict_het`:** R's full `predict_het` option additionally computes placebo regressions and a joint null test, and disallows combination with `controls`. This implementation provides only post-treatment regressions. **Rejected combinations:** `controls` (matching R), `trends_linear` (heterogeneity test uses raw level changes, incompatible with second-differenced outcomes), and `trends_nonparam` (heterogeneity test does not thread state-set control-pool restrictions). Results stored in `results.heterogeneity_effects`. Activated via `heterogeneity="covariate_column"` in `fit()`. **Note (survey support):** Under `survey_design`, heterogeneity uses WLS with per-group weights `W_g = sum of obs-level survey weights in group g`, and the standard error is computed via Taylor Series Linearization of the group-level influence function: `ψ_g[X] = inv(X'WX)[1,:] @ x_g * W_g * r_g`, expanded to observation level as `ψ_i = ψ_g * (w_i / W_g)`, then aggregated through `compute_survey_if_variance` for stratified/PSU/FPC variance. Inference uses the t-distribution with `df_survey` when provided. Under rank deficiency (any regression coefficient dropped by `solve_ols`'s R-style drop), all inference fields return NaN (conservative, matches the NaN-consistent contract). **Library extension (replicate weights):** Under a replicate-weight design (BRR/Fay/JK1/JKn/SDR), the heterogeneity regression dispatches to `compute_replicate_if_variance` (Rao-Wu weight-ratio rescaling) instead of the Binder TSL formula. The effective df is the shared `min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` used by the rest of the dCDH surfaces; if the base `df_survey` is undefined (QR-rank ≤ 1), heterogeneity inference is NaN regardless of the local `n_valid_het` (matching the dCDH top-level contract — per-site `n_valid` cannot rescue a rank-deficient design). **Library extension:** R `DIDmultiplegtDYN::predict_het` does not natively support survey weights. **Scope note (cell-period allocator):** Heterogeneity still uses the legacy group-level IF expansion `ψ_i = ψ_g * (w_i / W_g)`. Combining `heterogeneity=` with a survey design whose PSU or strata vary within group raises `NotImplementedError`; the cell-period extension of the heterogeneity path is deferred to a follow-up PR. Within-group-constant PSU/strata (including the auto-injected `psu=group`) is fully supported. - **Note (HonestDiD integration):** HonestDiD sensitivity analysis (Rambachan & Roth 2023) is available on the placebo + event study surface via `honest_did=True` in `fit()` or `compute_honest_did(results)` post-hoc. **Library extension:** dCDH HonestDiD uses `DID^{pl}_l` placebo estimates as pre-period coefficients rather than standard event-study pre-treatment coefficients. The Rambachan-Roth restrictions bound violations of the parallel trends assumption underlying the dCDH placebo estimand; interpretation differs from canonical event-study HonestDiD. A `UserWarning` is emitted at runtime. Uses diagonal variance (no full VCV available for dCDH). Relative magnitudes (DeltaRM) with Mbar=1.0 is the default when called from `fit()`, targeting the equal-weight average over all post-treatment horizons (`l_vec=None`). R's HonestDiD defaults to the first post/on-impact effect; use `compute_honest_did(results, ...)` with a custom `l_vec` to match that behavior. When `trends_linear=True`, bounds apply to the second-differenced estimand (parallel trends in first differences). Requires `L_max >= 1` for multi-horizon placebos. Gaps in the horizon grid from `trends_nonparam` support-trimming are handled by filtering to the largest consecutive block and warning. @@ -649,8 +649,8 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. -- **Note:** Survey IF expansion (`psi_i = U[g] * (w_i / W_g)`) is a library extension not in the dCDH papers. The paper's plug-in variance assumes iid sampling; the TSL variance accounts for complex survey design by expanding group-level influence functions to observation level proportionally to survey weights, then applying the standard Binder (1983) stratified PSU variance formula. The expansion treats each group as the effective sampling unit; **strata and PSU must therefore be constant within group** (validated in `fit()` — designs with mixed strata or PSU labels within a single group raise `ValueError`). When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group structure the IF expansion assumes; users who want a coarser cluster can pass an explicit `psu` that is constant within group. Within-group-varying **weights** are supported (each observation contributes proportionally). Under replicate-weight designs, the same group-level IF expansion is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. -- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. +- **Note (Survey IF expansion — cell-period allocator):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The expansion decomposes the per-group IF `U[g]` into per-cell contributions `U[g, t]` (the per-period term of the sum that yields `U[g]` inside `_compute_full_per_group_contributions` and the per-horizon helpers), cohort-centers each column independently, and expands to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`, then applies the Binder (1983) stratified-PSU variance formula. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is fully supported. This is a strict relaxation of the earlier within-group constancy rule shipped before the cell-period allocator. Byte-identity on the pre-relaxation input set is guaranteed because PSU-level Binder aggregation telescopes per-cell sums to `U_centered[g]` under within-group-constant PSU, matching the previous group-level expansion up to single-ULP floating-point noise. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. +- **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Scope note (cell-period allocator):** The PSU-level bootstrap uses a group-level `group_id_to_psu_code` map and therefore requires PSU to be constant within each group. Combining `n_bootstrap > 0` with a PSU that varies within group raises `NotImplementedError`; the cell-level Hall-Mammen extension is deferred to a follow-up PR. The analytical TSL variance fully supports within-group-varying PSU via the cell-period allocator — use `n_bootstrap=0` for those designs. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. --- diff --git a/tests/test_dcdh_cell_period_coverage.py b/tests/test_dcdh_cell_period_coverage.py new file mode 100644 index 00000000..b3ba0203 --- /dev/null +++ b/tests/test_dcdh_cell_period_coverage.py @@ -0,0 +1,140 @@ +"""Monte Carlo coverage simulation for the cell-period IF allocator. + +Validates empirical coverage under a DGP with PSU that varies across +cells of the same group — the regime unlocked by PR 2's relaxation of +the within-group constancy rule. Under within-group-constant PSU the +cell allocator reduces to the previous group allocator byte-for-byte, +so the coverage check only exercises the new code path. + +Marked ``slow`` and excluded from the default pytest run. To execute: + + pytest tests/test_dcdh_cell_period_coverage.py -m slow -v +""" + +from __future__ import annotations + +import warnings + +import numpy as np +import pandas as pd +import pytest + +from diff_diff.chaisemartin_dhaultfoeuille import ChaisemartinDHaultfoeuille +from diff_diff.survey import SurveyDesign + + +def _simulate_panel( + n_groups: int, + n_periods: int, + first_treated_period: int, + tau: float, + rng: np.random.Generator, + psu_sigma: float = 0.5, + obs_sigma: float = 1.0, + group_sigma: float = 1.0, +) -> pd.DataFrame: + """Generate a one-obs-per-cell panel with within-group-varying PSU. + + Each group has two PSUs assigned by ``period % 2``. The PSU + contributes a ``(group, psu)``-specific intercept, creating the + within-group residual correlation that design-based variance is + meant to capture. Treatment starts at ``first_treated_period`` for + the first half of groups and never for the rest. + """ + groups = np.arange(n_groups) + treated = groups < n_groups // 2 + group_fe = rng.normal(0.0, group_sigma, size=n_groups) + # (group, psu) effect — constant across time within the (g, psu) + # pair. Drives within-group correlation that varies by cell. + psu_fe = rng.normal(0.0, psu_sigma, size=(n_groups, 2)) + + rows = [] + for g in groups: + for t in range(n_periods): + parity = int(t % 2) + # PSU labels are globally unique per (group, parity) so the + # Binder variance sees 2 * n_groups sampling clusters rather + # than just 2 global PSU codes reused across groups. Each + # PSU collects the cells of a single group's odd or even + # periods; within-cell constancy (one obs per cell) is + # trivially satisfied. + psu_id = int(g) * 2 + parity + d = 1 if (treated[g] and t >= first_treated_period) else 0 + y = ( + group_fe[g] + + 0.1 * t + + tau * d + + psu_fe[g, parity] + + rng.normal(0.0, obs_sigma) + ) + rows.append({ + "group": int(g), + "period": int(t), + "treatment": int(d), + "outcome": float(y), + "psu": psu_id, + "pw": 1.0, + }) + return pd.DataFrame(rows) + + +@pytest.mark.slow +def test_cell_period_allocator_coverage_within_group_varying_psu(): + """Empirical 95% coverage for DID_M under the cell-period allocator + on a DGP with within-group-varying PSU. Tolerance ±2.5pp is ~2 + MC-sigma at n_reps=500 for a true 95% target (sqrt(0.95*0.05/500) + = 0.0098, so 2-sigma = 1.96pp — ±2.5pp is deliberately a touch + looser to absorb small-sample bias without masking a real + under-coverage bug). + """ + n_reps = 500 + n_groups = 40 + n_periods = 6 + first_treated_period = 3 + tau_true = 2.0 + + rng = np.random.default_rng(20260418) + covered = 0 + failed = 0 + + for r in range(n_reps): + df = _simulate_panel( + n_groups=n_groups, + n_periods=n_periods, + first_treated_period=first_treated_period, + tau=tau_true, + rng=rng, + ) + sd = SurveyDesign(weights="pw", psu="psu") + try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore") + res = ChaisemartinDHaultfoeuille(seed=r + 1).fit( + df, + outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=1, + ) + except Exception: + failed += 1 + continue + + ci = res.event_study_effects[1]["conf_int"] + if ci is None or not all(np.isfinite(ci)): + failed += 1 + continue + lo, hi = float(ci[0]), float(ci[1]) + if lo <= tau_true <= hi: + covered += 1 + + completed = n_reps - failed + assert completed >= int(0.95 * n_reps), ( + f"MC simulation had {failed}/{n_reps} fit failures, above " + f"the 5% tolerance." + ) + coverage = covered / completed + assert 0.925 <= coverage <= 0.975, ( + f"Empirical coverage {coverage:.3f} (completed {completed}) " + f"outside [0.925, 0.975]; true tau={tau_true}, " + f"n_groups={n_groups}, n_periods={n_periods}, n_reps={n_reps}." + ) diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 1aa9f14c..178e3676 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -1072,35 +1072,142 @@ def test_survey_design2_runs(self): class TestSurveyWithinGroupValidation: - """Survey designs with strata or PSU varying within a single group - are rejected because the dCDH IF expansion treats the group as the - effective sampling unit.""" - - def test_rejects_varying_psu_within_group(self, base_data): + """Cell-period IF allocator contract: strata and PSU may vary ACROSS + cells of a group, but must be constant WITHIN each (g, t) cell. In + canonical one-obs-per-cell panels the cell-level constancy check is + trivially satisfied. Out-of-scope combinations (heterogeneity + + within-group-varying PSU; n_bootstrap > 0 + within-group-varying + PSU) raise NotImplementedError with a pointer to the follow-up PR. + """ + + def test_accepts_varying_psu_within_group(self, base_data): + """Under the cell-period allocator, PSU that varies across cells + of a group is a valid design — the allocator attributes IF mass + to each (g, t) cell separately and Binder TSL aggregates at PSU + level with the honest cell-level variance. + """ df_ = base_data.copy() df_["pw"] = 1.0 df_["stratum"] = 0 - # PSU varies within each group (alternates by period) + # PSU varies within each group (alternates by period). Still + # constant within each (g, t) cell because one obs per cell. df_["psu"] = df_["period"] % 2 sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") - with pytest.raises(ValueError, match="PSU to be constant within group"): + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=2, + ) + assert np.isfinite(result.overall_att) + assert np.isfinite(result.overall_se) + # And the SE differs from a within-group-constant-PSU baseline, + # because the cell allocator now honors the extra PSU structure. + df_const = base_data.copy() + df_const["pw"] = 1.0 + df_const["stratum"] = 0 + df_const["psu"] = 0 # constant-within-group PSU baseline + sd_const = SurveyDesign(weights="pw", strata="stratum", psu="psu") + r_const = ChaisemartinDHaultfoeuille(seed=1).fit( + df_const, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd_const, L_max=2, + ) + assert result.overall_se != r_const.overall_se, ( + "Cell-period allocator must produce a different SE when PSU " + "actually varies across cells vs. constant-within-group." + ) + + def test_accepts_varying_strata_within_group(self, base_data): + """Strata that vary across cells of a group are supported under + the cell-period allocator, trivially satisfying within-cell + constancy in one-obs-per-cell panels. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + # Stratum varies within each group across cells + df_["stratum"] = df_["period"] % 2 + # PSU = group, nested inside stratum so the resolver accepts the + # cross-stratum reuse of group labels. + df_["psu"] = df_["group"] + sd = SurveyDesign(weights="pw", strata="stratum", psu="psu", nest=True) + result = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, L_max=2, + ) + assert np.isfinite(result.overall_att) + assert np.isfinite(result.overall_se) + + def test_heterogeneity_with_varying_psu_raises(self, base_data): + """heterogeneity= is gated under within-group-varying PSU until + PR 3 ships the cell-period allocator for the WLS psi_obs path. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = 0 + df_["psu"] = df_["period"] % 2 # varies within group + df_["x_het"] = np.arange(len(df_)) % 3 # categorical covariate + sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") + with pytest.raises(NotImplementedError, match="heterogeneity"): ChaisemartinDHaultfoeuille(seed=1).fit( df_, outcome="outcome", group="group", time="period", treatment="treatment", + heterogeneity="x_het", L_max=1, survey_design=sd, ) - def test_rejects_varying_strata_within_group(self, base_data): + def test_bootstrap_with_varying_psu_raises(self, base_data): + """n_bootstrap > 0 is gated under within-group-varying PSU until + PR 4 ships the cell-level Hall-Mammen wild bootstrap. + """ df_ = base_data.copy() df_["pw"] = 1.0 - # Stratum varies within each group - df_["stratum"] = df_["period"] % 2 - # Give each obs a unique PSU label so the SurveyDesign resolver - # doesn't reject on cross-stratum PSU reuse — we want our - # within-group strata check to fire first. + df_["stratum"] = 0 + df_["psu"] = df_["period"] % 2 # varies within group + sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") + with pytest.raises(NotImplementedError, match="n_bootstrap"): + ChaisemartinDHaultfoeuille(n_bootstrap=50, seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + + def test_within_cell_psu_variation_rejected(self, base_data): + """Multiple PSUs inside a single (g, t) cell (a multi-obs-per- + cell panel) remain ambiguous under the cell allocator and must + be rejected. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = 0 + df_["psu"] = 0 + # Duplicate the first row with a different PSU so that cell + # (group[0], period[0]) has two obs with different PSU labels. + dup = df_.iloc[0].copy() + dup["psu"] = 99 + df_ = pd.concat([df_, pd.DataFrame([dup])], ignore_index=True) + sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") + with pytest.raises(ValueError, match="PSU to be constant within each"): + ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + + def test_within_cell_strata_variation_rejected(self, base_data): + """Multiple strata inside a single (g, t) cell are ambiguous + under the cell allocator and must be rejected. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = 0 + # Duplicate the first row with a different stratum. + dup = df_.iloc[0].copy() + dup["stratum"] = 1 + df_ = pd.concat([df_, pd.DataFrame([dup])], ignore_index=True) df_["psu"] = np.arange(len(df_)) sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") - with pytest.raises(ValueError, match="strata to be constant within group"): + with pytest.raises(ValueError, match="strata to be constant within each"): ChaisemartinDHaultfoeuille(seed=1).fit( df_, outcome="outcome", group="group", time="period", treatment="treatment", @@ -1271,17 +1378,19 @@ def test_off_horizon_row_duplication_does_not_change_se(self, base_data): f"{r_dup.overall_se}) — auto-inject psu=group is not active." ) - def test_rejection_excludes_zero_weight_rows(self, base_data): - """A zero-weight row with a different PSU from its group must - not trigger rejection — it is out-of-sample by the - subpopulation contract and does not enter the variance.""" + def test_within_cell_check_excludes_zero_weight_rows(self, base_data): + """A zero-weight row with a different PSU label from its cell + must not trigger rejection — it is out-of-sample by the + subpopulation contract and does not enter the variance. + """ df_ = base_data.copy() df_["pw"] = 1.0 df_["stratum"] = 0 df_["psu"] = 0 - # Inject a zero-weight row with a different PSU + # Inject a zero-weight row whose PSU would collide with the + # first row's cell if it were counted. sample = df_.iloc[0].copy() - sample["psu"] = 99 # would violate constancy if counted + sample["psu"] = 99 # would violate within-cell constancy if counted sample["pw"] = 0.0 df_ = pd.concat([df_, pd.DataFrame([sample])], ignore_index=True) sd = SurveyDesign(weights="pw", strata="stratum", psu="psu") From f52461195bd88626c3b6c11ba994f586e6a69ec7 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 17:25:06 -0400 Subject: [PATCH 3/6] Unpack _compute_full_per_group_contributions tuple in methodology test The helper now returns (U, U_per_period). The cohort-recentering methodology test at test_cohort_recentering_not_grand_mean only needs the per-group vector U, so destructure and discard the per-period matrix. Matches the return-shape change landed in Stage 1 of the cell-period IF allocator work. Co-Authored-By: Claude Opus 4.7 (1M context) --- tests/test_methodology_chaisemartin_dhaultfoeuille.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_methodology_chaisemartin_dhaultfoeuille.py b/tests/test_methodology_chaisemartin_dhaultfoeuille.py index fc62f6ca..06be4fef 100644 --- a/tests/test_methodology_chaisemartin_dhaultfoeuille.py +++ b/tests/test_methodology_chaisemartin_dhaultfoeuille.py @@ -348,7 +348,7 @@ def test_cohort_recentering_not_grand_mean(self): a11_minus_zeroed_arr, ) = _compute_per_period_dids(D_mat=D_mat, Y_mat=Y_mat, N_mat=N_mat, periods=periods) - U_overall = _compute_full_per_group_contributions( + U_overall, _ = _compute_full_per_group_contributions( D_mat=D_mat, Y_mat=Y_mat, N_mat=N_mat, From b14aa22b4d81209131005375638658c496b2455f Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 17:48:31 -0400 Subject: [PATCH 4/6] Document post-period attribution convention + invariant tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a row-sum identity test for the cell-period allocator on a hand-computed 4-group x 3-period panel, verifying: (1) per-group IF matches the closed-form joiner/stable_0 decomposition; (2) per-period attribution sums across time to the per-group IF exactly; (3) post- period attribution places all mass at t=2 with zeros at earlier columns; (4) column-wise cohort centering preserves the row-sum identity. REGISTRY.md frames the post-period attribution as a library convention — adopted because it preserves the group/PSU-sum identities of the prior group-level expansion and produces approximately nominal MC coverage on the test DGP — rather than a theorem derived from the observation-level survey linearization. Inline comments in _compute_full_per_group_contributions, _compute_per_group_if_multi_horizon, and _compute_per_group_if_placebo_horizon mirror that framing. Updates stale local type annotations for multi_horizon_if / placebo_horizon_if to the Tuple[np.ndarray, np.ndarray] returns. TODO.md gains a Methodology/Correctness entry tracking the open validation question (formal derivation against observation-level survey linearization, or replacement with a covariance-aware alternative). All 348 tests pass (slow MC coverage sim included). Co-Authored-By: Claude Opus 4.7 (1M context) --- TODO.md | 1 + diff_diff/chaisemartin_dhaultfoeuille.py | 26 ++++++-- docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh.py | 82 ++++++++++++++++++++++++ 4 files changed, 105 insertions(+), 6 deletions(-) diff --git a/TODO.md b/TODO.md index 2ef24f36..a27d34a6 100644 --- a/TODO.md +++ b/TODO.md @@ -57,6 +57,7 @@ Deferred items from PR reviews that were not addressed before merge. | Issue | Location | PR | Priority | |-------|----------|----|----------| | dCDH: Phase 1 per-period placebo DID_M^pl has NaN SE (no IF derivation for the per-period aggregation path). Multi-horizon placebos (L_max >= 1) have valid SE. | `chaisemartin_dhaultfoeuille.py` | #294 | Low | +| dCDH: Survey cell-period allocator's post-period attribution is a library convention, not derived from the observation-level survey linearization. MC coverage is empirically close to nominal on the test DGP; a formal derivation (or a covariance-aware two-cell alternative) is deferred. Documented in REGISTRY.md survey IF expansion Note. | `chaisemartin_dhaultfoeuille.py`, `docs/methodology/REGISTRY.md` | PR 2 | Medium | | dCDH: Parity test SE/CI assertions only cover pure-direction scenarios; mixed-direction SE comparison is structurally apples-to-oranges (cell-count vs obs-count weighting). | `test_chaisemartin_dhaultfoeuille_parity.py` | #294 | Low | | CallawaySantAnna: consider materializing NaN entries for non-estimable (g,t) cells in group_time_effects dict (currently omitted with consolidated warning); would require updating downstream consumers (event study, balance_e, aggregation) | `staggered.py` | #256 | Low | | ImputationDiD dense `(A0'A0).toarray()` scales O((U+T+K)^2), OOM risk on large panels | `imputation.py` | #141 | Medium (deferred — only triggers when sparse solver fails) | diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 706a82a9..664c817d 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -1601,7 +1601,7 @@ def fit( # Step 12c: Multi-horizon per-group computation (L_max >= 1) # ------------------------------------------------------------------ multi_horizon_dids: Optional[Dict[int, Dict[str, Any]]] = None - multi_horizon_if: Optional[Dict[int, np.ndarray]] = None + multi_horizon_if: Optional[Dict[int, Tuple[np.ndarray, np.ndarray]]] = None multi_horizon_se: Optional[Dict[int, float]] = None multi_horizon_inference: Optional[Dict[int, Dict[str, Any]]] = None @@ -1747,7 +1747,7 @@ def fit( # Phase 2: placebos, normalized effects, cost-benefit delta multi_horizon_placebos: Optional[Dict[int, Dict[str, Any]]] = None - placebo_horizon_if: Optional[Dict[int, np.ndarray]] = None + placebo_horizon_if: Optional[Dict[int, Tuple[np.ndarray, np.ndarray]]] = None placebo_horizon_se: Optional[Dict[int, float]] = None placebo_horizon_inference: Optional[Dict[int, Dict[str, Any]]] = None normalized_effects_dict: Optional[Dict[int, Dict[str, Any]]] = None @@ -4408,13 +4408,18 @@ def _compute_per_group_if_multi_horizon( # contribution to U_l is zero, but its count is in N_l. continue - # Switcher contribution: +S_g * (Y_{g, out} - Y_{g, ref}) + # Switcher contribution: +S_g * (Y_{g, out} - Y_{g, ref}). + # Per-cell attribution convention: assign the whole contrast + # to the outcome cell (g, out_idx). See REGISTRY.md's Note + # on survey IF expansion for the rationale behind this + # convention (library choice, not a derived result). switcher_change = Y_mat[g, out_idx] - Y_mat[g, ref_idx] U_l[g] += S_g * switcher_change U_per_period_l[g, out_idx] += S_g * switcher_change # Control contributions: each control g' in the pool gets - # -S_g * (1/n_ctrl) * (Y_{g', out} - Y_{g', ref}) + # -S_g * (1/n_ctrl) * (Y_{g', out} - Y_{g', ref}). Same + # post-period attribution as the switcher side. ctrl_changes = Y_mat[ctrl_pool, out_idx] - Y_mat[ctrl_pool, ref_idx] ctrl_contrib = (S_g / n_ctrl) * ctrl_changes U_l[ctrl_pool] -= ctrl_contrib @@ -4516,7 +4521,10 @@ def _compute_per_group_if_placebo_horizon( if n_ctrl == 0: continue - # Switcher contribution: paper convention backward - ref + # Switcher contribution: paper convention backward - ref. + # Attribute the whole contrast to the backward cell + # (mirrors the multi-horizon / DID_M post-period + # attribution convention). switcher_change = Y_mat[g, backward_idx] - Y_mat[g, ref_idx] U_pl[g] += S_g * switcher_change U_per_period_pl[g, backward_idx] += S_g * switcher_change @@ -4936,6 +4944,14 @@ def _compute_full_per_group_contributions( include_joiners_side = side in ("overall", "joiners") include_leavers_side = side in ("overall", "leavers") + # Per-cell attribution convention (not a derivation from the + # observation-level survey linearization — see REGISTRY.md + # ``ChaisemartinDHaultfoeuille`` Note on survey IF expansion): + # attribute each (Y_curr - Y_prev) transition as a single + # difference to its post-period cell (g, t_idx). Preserves the + # row-sum identity U_per_period.sum(axis=1) == U and therefore + # the group-sum invariance that makes the cell expansion + # byte-identical to the pre-allocator convention under PSU=group. for t_idx in range(1, n_periods): d_curr = D_mat[:, t_idx] d_prev = D_mat[:, t_idx - 1] diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index a871ce8b..c8bcea8c 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -649,7 +649,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. -- **Note (Survey IF expansion — cell-period allocator):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The expansion decomposes the per-group IF `U[g]` into per-cell contributions `U[g, t]` (the per-period term of the sum that yields `U[g]` inside `_compute_full_per_group_contributions` and the per-horizon helpers), cohort-centers each column independently, and expands to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`, then applies the Binder (1983) stratified-PSU variance formula. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is fully supported. This is a strict relaxation of the earlier within-group constancy rule shipped before the cell-period allocator. Byte-identity on the pre-relaxation input set is guaranteed because PSU-level Binder aggregation telescopes per-cell sums to `U_centered[g]` under within-group-constant PSU, matching the previous group-level expansion up to single-ULP floating-point noise. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. +- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. - **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Scope note (cell-period allocator):** The PSU-level bootstrap uses a group-level `group_id_to_psu_code` map and therefore requires PSU to be constant within each group. Combining `n_bootstrap > 0` with a PSU that varies within group raises `NotImplementedError`; the cell-level Hall-Mammen extension is deferred to a follow-up PR. The analytical TSL variance fully supports within-group-varying PSU via the cell-period allocator — use `n_bootstrap=0` for those designs. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. --- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 178e3676..5d88a7ce 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -1378,6 +1378,88 @@ def test_off_horizon_row_duplication_does_not_change_se(self, base_data): f"{r_dup.overall_se}) — auto-inject psu=group is not active." ) + def test_cell_allocator_row_sum_identity(self): + """Cell-period allocator contract: for every group, the per- + period attribution sums across time to the per-group IF + (before cohort centering). This is the invariant that makes + PSU-level Binder aggregation telescope to ``U_centered[g]`` + under within-group-constant PSU and therefore guarantees byte- + identity with the legacy group-level allocator on the old + accepted input set. Hand-computed on a 4-group × 3-period + panel: two never-treated (stable_0) and two joiners switching + at ``t = 2``. + """ + from diff_diff.chaisemartin_dhaultfoeuille import ( + _compute_full_per_group_contributions, + _cohort_recenter, + _cohort_recenter_per_period, + ) + + # D_mat, Y_mat, N_mat shaped (n_groups=4, n_periods=3). + D_mat = np.array( + [ + [0, 0, 0], # G0 never-treated + [0, 0, 0], # G1 never-treated + [0, 0, 1], # G2 joiner at t=2 + [0, 0, 1], # G3 joiner at t=2 + ], + dtype=float, + ) + Y_mat = np.array( + [ + [1.0, 2.0, 3.0], + [2.1, 3.1, 4.2], + [0.5, 1.2, 5.4], + [1.3, 2.4, 6.1], + ], + dtype=float, + ) + N_mat = np.ones_like(D_mat, dtype=int) + # Per-period cell counts aligned to periods[1:] + # t=1: all stable_0 (4 in n_00); t=2: 2 joiners (n_10) + 2 stable_0 (n_00) + n_10_t_arr = np.array([0, 2], dtype=int) + n_00_t_arr = np.array([4, 2], dtype=int) + n_01_t_arr = np.array([0, 0], dtype=int) + n_11_t_arr = np.array([0, 0], dtype=int) + # A11 zeroed at t=1 (no joiners); active at t=2. + a11_plus_zeroed = np.array([True, False], dtype=bool) + a11_minus_zeroed = np.array([True, True], dtype=bool) + + U, U_pp = _compute_full_per_group_contributions( + D_mat=D_mat, Y_mat=Y_mat, N_mat=N_mat, + n_10_t_arr=n_10_t_arr, n_00_t_arr=n_00_t_arr, + n_01_t_arr=n_01_t_arr, n_11_t_arr=n_11_t_arr, + a11_plus_zeroed_arr=a11_plus_zeroed, + a11_minus_zeroed_arr=a11_minus_zeroed, + side="overall", + ) + + # Hand computation at t=2 joiner side: + # G0: stable_0, -(2/2) * (3.0 - 2.0) = -1.0 + # G1: stable_0, -(2/2) * (4.2 - 3.1) = -1.1 + # G2: joiner, (5.4 - 1.2) = 4.2 + # G3: joiner, (6.1 - 2.4) = 3.7 + expected_U = np.array([-1.0, -1.1, 4.2, 3.7]) + np.testing.assert_allclose(U, expected_U, atol=1e-12) + + # Row-sum identity: U_per_period.sum(axis=1) == U exactly. + np.testing.assert_allclose(U_pp.sum(axis=1), U, atol=1e-12) + + # Post-period attribution: all mass at t=2 (the transition's + # post cell); t=0 and t=1 columns are zero for every group. + np.testing.assert_array_equal(U_pp[:, 0], np.zeros(4)) + np.testing.assert_array_equal(U_pp[:, 1], np.zeros(4)) + np.testing.assert_allclose(U_pp[:, 2], expected_U, atol=1e-12) + + # Cohort centering preserves the row-sum identity: per-period + # cohort centering and group-level cohort centering produce + # 2D and 1D arrays whose row sums agree to FP precision. + # Cohorts: A = {G0, G1} (never-treated), B = {G2, G3} (joiners). + cohort_ids = np.array([0, 0, 1, 1]) + U_c = _cohort_recenter(U, cohort_ids) + U_pp_c = _cohort_recenter_per_period(U_pp, cohort_ids) + np.testing.assert_allclose(U_pp_c.sum(axis=1), U_c, atol=1e-12) + def test_within_cell_check_excludes_zero_weight_rows(self, base_data): """A zero-weight row with a different PSU label from its cell must not trigger rejection — it is out-of-sample by the From dedc2a6e38ca50aefeaf2411fdddf7d91c7c8938 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:34:40 -0400 Subject: [PATCH 5/6] Gate auto-inject on varying-strata + gate per-period tensors on survey MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Public contract fix: when `survey_design.psu` is omitted, fit() auto-injects psu= with nest=False. If strata vary within group, the synthesized PSU column reuses group labels across strata and downstream survey resolution fails on the cross-stratum PSU uniqueness check. fit() now detects that combination before auto- inject runs (via the existing _strata_psu_vary_within_group helper) and raises a targeted ValueError pointing users at the explicit `psu=, nest=True` path. Performance fix: the cell-period allocator's per-(g,t) tensor (U_per_period) was being allocated unconditionally in _compute_full_per_group_contributions, _compute_per_group_if_multi_horizon, _compute_per_group_if_placebo_horizon, and their _compute_cohort_recentered_inputs aggregator. That added O(G*T) memory to the main ATT path and O(G*T*L_max) to the event-study / placebo paths on every fit — including non-survey fits that never consume the tensor. Adds a `compute_per_period: bool = True` param through each helper, gates allocation + accumulation behind the flag, and threads `compute_per_period=(_obs_survey_info is not None)` from fit(). Downstream consumers now None-guard the per-period recentering / _compute_se calls so the plug-in SE path runs with no cell-level work. Adds `test_auto_inject_with_varying_strata_raises` under `TestSurveyWithinGroupValidation` covering the targeted error path. Narrows the REGISTRY.md survey IF expansion Note to state the nest=True requirement under varying strata. Softens two stale comments (`_cohort_recenter_per_period` docstring, `_compute_cohort_recentered_inputs` inline comment) so they match the convention framing already in REGISTRY.md and TODO.md rather than claiming methodological correctness. All 352 tests pass (slow MC coverage sim included). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 196 +++++++++++++++++------ docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh.py | 21 +++ 3 files changed, 165 insertions(+), 54 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 664c817d..b8653a40 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -726,6 +726,38 @@ def fit( or resolved_survey.replicate_weights.shape[1] == 0 ) ): + # Pre-auto-inject contract check: the auto-injected PSU + # column reuses group labels with nest=False, but the + # survey resolver enforces globally-unique PSU labels when + # nest=False and strata are present (see + # ``diff_diff/survey.py``). If strata varies within group, + # the synthesized PSU column collides across strata and + # resolution fails downstream with an opaque error. Flag + # that configuration up front with an actionable message + # pointing users to the explicit ``psu=, nest=True`` + # path (REGISTRY.md survey IF expansion Note). + if resolved_survey.strata is not None: + _strata_varies_pre, _ = _strata_psu_vary_within_group( + resolved_survey, data, group, survey_weights, + ) + if _strata_varies_pre: + raise ValueError( + "ChaisemartinDHaultfoeuille survey support: " + "strata that vary across cells of the same " + "group require an explicit `psu=` with " + "`nest=True` so that `(stratum, psu)` pairs " + "are globally unique. The default auto-" + "injected `psu=` path does NOT support " + "this because the synthesized PSU column " + "reuses group labels across strata and trips " + "the cross-stratum PSU uniqueness check in " + "survey resolution. Either (a) set strata " + "constant within each group, or (b) pass " + "`SurveyDesign(..., psu=, nest=True)` " + "with PSU labels that are unique within " + "strata." + ) + from diff_diff.survey import SurveyDesign as _SurveyDesign # Build a synthesized PSU column on a private copy of data @@ -1650,6 +1682,7 @@ def fit( T_g=T_g_arr, L_max=L_max, set_ids=set_ids_arr, + compute_per_period=_obs_survey_info is not None, ) # Per-horizon analytical SE via cohort recentering. @@ -1697,9 +1730,16 @@ def fit( U_l_elig = U_l[eligible_mask_var] cid_elig = cid_l[eligible_mask_var] U_centered_l = _cohort_recenter(U_l_elig, cid_elig) - U_centered_pp_l = _cohort_recenter_per_period( - U_pp_l[eligible_mask_var], cid_elig - ) + # Only build the cell-level attribution when the IF + # helper actually produced a per-period tensor (i.e., + # a survey design is present). Otherwise the plug-in + # path consumes U_centered_l only. + if U_pp_l is not None: + U_centered_pp_l: Optional[np.ndarray] = _cohort_recenter_per_period( + U_pp_l[eligible_mask_var], cid_elig + ) + else: + U_centered_pp_l = None N_l_h = multi_horizon_dids[l_h]["N_l"] _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] se_l, n_valid_l = _compute_se( @@ -1792,6 +1832,7 @@ def fit( switch_direction=switch_direction_arr, T_g=T_g_arr, L_max=L_max, + compute_per_period=_obs_survey_info is not None, set_ids=set_ids_arr, ) # Per-placebo-horizon analytical SE via cohort recentering @@ -1831,9 +1872,12 @@ def fit( U_pl_elig = U_pl[eligible_mask_pl] cid_elig_pl = cid_pl[eligible_mask_pl] U_centered_pl_l = _cohort_recenter(U_pl_elig, cid_elig_pl) - U_centered_pp_pl_l = _cohort_recenter_per_period( - U_pp_pl[eligible_mask_pl], cid_elig_pl - ) + if U_pp_pl is not None: + U_centered_pp_pl_l: Optional[np.ndarray] = _cohort_recenter_per_period( + U_pp_pl[eligible_mask_pl], cid_elig_pl + ) + else: + U_centered_pp_pl_l = None _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_pl[g]] se_pl_l, n_valid_pl_l = _compute_se( U_centered=U_centered_pl_l, @@ -1922,6 +1966,7 @@ def fit( a11_minus_zeroed_arr=a11_minus_zeroed_arr, all_groups=all_groups, singleton_baseline_groups=singleton_baseline_groups, + compute_per_period=_obs_survey_info is not None, ) # Analytical SE for DID_M (survey-aware when survey_design provided) @@ -4320,7 +4365,8 @@ def _compute_per_group_if_multi_horizon( T_g: np.ndarray, L_max: int, set_ids: Optional[np.ndarray] = None, -) -> Dict[int, Tuple[np.ndarray, np.ndarray]]: + compute_per_period: bool = True, +) -> Dict[int, Tuple[np.ndarray, Optional[np.ndarray]]]: """ Compute per-group influence function ``U^G_{g,l}`` for ``l = 1..L_max``. @@ -4367,11 +4413,15 @@ def _compute_per_group_if_multi_horizon( baseline_groups[float(d)] = np.where(mask)[0] baseline_f[float(d)] = first_switch_idx[mask] - results: Dict[int, Tuple[np.ndarray, np.ndarray]] = {} + results: Dict[int, Tuple[np.ndarray, Optional[np.ndarray]]] = {} for l in range(1, L_max + 1): # noqa: E741 U_l = np.zeros(n_groups, dtype=float) - U_per_period_l = np.zeros((n_groups, n_periods), dtype=float) + U_per_period_l: Optional[np.ndarray] = ( + np.zeros((n_groups, n_periods), dtype=float) + if compute_per_period + else None + ) for g in range(n_groups): if not is_switcher[g]: @@ -4415,7 +4465,8 @@ def _compute_per_group_if_multi_horizon( # convention (library choice, not a derived result). switcher_change = Y_mat[g, out_idx] - Y_mat[g, ref_idx] U_l[g] += S_g * switcher_change - U_per_period_l[g, out_idx] += S_g * switcher_change + if U_per_period_l is not None: + U_per_period_l[g, out_idx] += S_g * switcher_change # Control contributions: each control g' in the pool gets # -S_g * (1/n_ctrl) * (Y_{g', out} - Y_{g', ref}). Same @@ -4423,7 +4474,8 @@ def _compute_per_group_if_multi_horizon( ctrl_changes = Y_mat[ctrl_pool, out_idx] - Y_mat[ctrl_pool, ref_idx] ctrl_contrib = (S_g / n_ctrl) * ctrl_changes U_l[ctrl_pool] -= ctrl_contrib - U_per_period_l[ctrl_pool, out_idx] -= ctrl_contrib + if U_per_period_l is not None: + U_per_period_l[ctrl_pool, out_idx] -= ctrl_contrib results[l] = (U_l, U_per_period_l) @@ -4440,7 +4492,8 @@ def _compute_per_group_if_placebo_horizon( T_g: np.ndarray, L_max: int, set_ids: Optional[np.ndarray] = None, -) -> Dict[int, Tuple[np.ndarray, np.ndarray]]: + compute_per_period: bool = True, +) -> Dict[int, Tuple[np.ndarray, Optional[np.ndarray]]]: """ Compute per-group influence function for placebo horizons. @@ -4476,11 +4529,15 @@ def _compute_per_group_if_placebo_horizon( baseline_groups[float(d)] = np.where(mask)[0] baseline_f[float(d)] = first_switch_idx[mask] - results: Dict[int, Tuple[np.ndarray, np.ndarray]] = {} + results: Dict[int, Tuple[np.ndarray, Optional[np.ndarray]]] = {} for l in range(1, L_max + 1): # noqa: E741 U_pl = np.zeros(n_groups, dtype=float) - U_per_period_pl = np.zeros((n_groups, n_periods), dtype=float) + U_per_period_pl: Optional[np.ndarray] = ( + np.zeros((n_groups, n_periods), dtype=float) + if compute_per_period + else None + ) for g in range(n_groups): if not is_switcher[g]: @@ -4527,13 +4584,15 @@ def _compute_per_group_if_placebo_horizon( # attribution convention). switcher_change = Y_mat[g, backward_idx] - Y_mat[g, ref_idx] U_pl[g] += S_g * switcher_change - U_per_period_pl[g, backward_idx] += S_g * switcher_change + if U_per_period_pl is not None: + U_per_period_pl[g, backward_idx] += S_g * switcher_change # Control contributions ctrl_changes = Y_mat[ctrl_pool, backward_idx] - Y_mat[ctrl_pool, ref_idx] ctrl_contrib = (S_g / n_ctrl) * ctrl_changes U_pl[ctrl_pool] -= ctrl_contrib - U_per_period_pl[ctrl_pool, backward_idx] -= ctrl_contrib + if U_per_period_pl is not None: + U_per_period_pl[ctrl_pool, backward_idx] -= ctrl_contrib results[l] = (U_pl, U_per_period_pl) @@ -4876,7 +4935,8 @@ def _compute_full_per_group_contributions( a11_plus_zeroed_arr: np.ndarray, a11_minus_zeroed_arr: np.ndarray, side: str = "overall", -) -> Tuple[np.ndarray, np.ndarray]: + compute_per_period: bool = True, +) -> Tuple[np.ndarray, Optional[np.ndarray]]: """ Compute the per-group influence function ``U^G_g`` for ``DID_M``, ``DID_+``, or ``DID_-`` by summing role-weighted outcome differences @@ -4926,20 +4986,27 @@ def _compute_full_per_group_contributions( U : np.ndarray of shape (n_groups,) Per-group contributions. NOT cohort-centered; the caller is responsible for centering before computing the SE. - U_per_period : np.ndarray of shape (n_groups, n_periods) + U_per_period : np.ndarray of shape (n_groups, n_periods) or ``None`` Per-``(g, t)``-cell contributions, attributed to the post- period ``t`` of each transition pair. Satisfies ``U_per_period.sum(axis=1) == U`` exactly. NOT cohort-centered. Used by the cell-period IF allocator in ``_survey_se_from_group_if`` so that PSU/strata that vary across the cells of g can be honored by design-based variance. + ``None`` when ``compute_per_period=False`` (the caller has no + survey design, so the allocator is not needed and building the + dense O(n_groups * n_periods) tensor would be wasted work). """ if side not in ("overall", "joiners", "leavers"): raise ValueError(f"side must be one of overall/joiners/leavers, got {side!r}") n_groups, n_periods = D_mat.shape U = np.zeros(n_groups, dtype=float) - U_per_period = np.zeros((n_groups, n_periods), dtype=float) + U_per_period: Optional[np.ndarray] = ( + np.zeros((n_groups, n_periods), dtype=float) + if compute_per_period + else None + ) include_joiners_side = side in ("overall", "joiners") include_leavers_side = side in ("overall", "leavers") @@ -4979,8 +5046,9 @@ def _compute_full_per_group_contributions( ): U[joiner_mask] += y_diff[joiner_mask] U[stable0_mask] -= (n_10_t / n_00_t) * y_diff[stable0_mask] - U_per_period[joiner_mask, t_idx] += y_diff[joiner_mask] - U_per_period[stable0_mask, t_idx] -= (n_10_t / n_00_t) * y_diff[stable0_mask] + if U_per_period is not None: + U_per_period[joiner_mask, t_idx] += y_diff[joiner_mask] + U_per_period[stable0_mask, t_idx] -= (n_10_t / n_00_t) * y_diff[stable0_mask] # Leavers side (-y_diff for leavers; +(n_01/n_11)*y_diff for stable_1) if ( @@ -4991,8 +5059,9 @@ def _compute_full_per_group_contributions( ): U[leaver_mask] -= y_diff[leaver_mask] U[stable1_mask] += (n_01_t / n_11_t) * y_diff[stable1_mask] - U_per_period[leaver_mask, t_idx] -= y_diff[leaver_mask] - U_per_period[stable1_mask, t_idx] += (n_01_t / n_11_t) * y_diff[stable1_mask] + if U_per_period is not None: + U_per_period[leaver_mask, t_idx] -= y_diff[leaver_mask] + U_per_period[stable1_mask, t_idx] += (n_01_t / n_11_t) * y_diff[stable1_mask] return U, U_per_period @@ -5041,8 +5110,11 @@ def _cohort_recenter_per_period( Hence the cell-level Binder TSL aggregation telescopes to the same group-level sum produced by ``_cohort_recenter`` on ``U``, giving byte-identical variance under within-group-constant PSU (old - validator's accepted input set) and a methodologically correct - per-cell attribution under within-group-varying PSU. + validator's accepted input set) and a per-cell attribution under + within-group-varying PSU that follows the library's post-period + convention (see the REGISTRY.md Note on survey IF expansion — a + formal derivation from the observation-level survey linearization + is still open, tracked in ``TODO.md``). """ U_centered = U_per_period.astype(float).copy() if U_per_period.size == 0: @@ -5069,17 +5141,18 @@ def _compute_cohort_recentered_inputs( a11_minus_zeroed_arr: np.ndarray, all_groups: List[Any], singleton_baseline_groups: List[Any], + compute_per_period: bool = True, ) -> Tuple[ - np.ndarray, # U_centered_overall (n_eligible,) - int, # n_groups_for_overall - int, # n_cohorts - int, # n_groups_dropped_never_switching - np.ndarray, # U_centered_joiners - np.ndarray, # U_centered_leavers - List[Any], # eligible_group_ids - np.ndarray, # U_centered_per_period_overall (n_eligible, n_periods) - np.ndarray, # U_centered_per_period_joiners - np.ndarray, # U_centered_per_period_leavers + np.ndarray, # U_centered_overall (n_eligible,) + int, # n_groups_for_overall + int, # n_cohorts + int, # n_groups_dropped_never_switching + np.ndarray, # U_centered_joiners + np.ndarray, # U_centered_leavers + List[Any], # eligible_group_ids + Optional[np.ndarray], # U_centered_per_period_overall (n_eligible, n_periods) or None + Optional[np.ndarray], # U_centered_per_period_joiners or None + Optional[np.ndarray], # U_centered_per_period_leavers or None ]: """ Compute the cohort-centered influence-function vectors for variance. @@ -5128,6 +5201,9 @@ def _compute_cohort_recentered_inputs( n_groups, n_periods = D_mat.shape if n_groups == 0: + empty_pp: Optional[np.ndarray] = ( + np.zeros((0, 0), dtype=float) if compute_per_period else None + ) return ( np.array([], dtype=float), 0, @@ -5136,9 +5212,9 @@ def _compute_cohort_recentered_inputs( np.array([], dtype=float), np.array([], dtype=float), [], - np.zeros((0, 0), dtype=float), - np.zeros((0, 0), dtype=float), - np.zeros((0, 0), dtype=float), + empty_pp, + empty_pp, + empty_pp, ) # Per-group switch metadata via the shared helper (factored out in @@ -5174,7 +5250,9 @@ def _compute_cohort_recentered_inputs( cohort_id[g] = unique_cohorts[key] n_cohorts = len(unique_cohorts) - # Compute the full IF vectors + per-period attributions via the helper + # Compute the full IF vectors + per-period attributions via the + # helper. Skip the O(n_groups * n_periods) per-period tensor + # allocation when the caller won't use it (no survey design). U_overall_full, U_per_period_overall_full = _compute_full_per_group_contributions( D_mat=D_mat, Y_mat=Y_mat, @@ -5186,6 +5264,7 @@ def _compute_cohort_recentered_inputs( a11_plus_zeroed_arr=a11_plus_zeroed_arr, a11_minus_zeroed_arr=a11_minus_zeroed_arr, side="overall", + compute_per_period=compute_per_period, ) U_joiners_full, U_per_period_joiners_full = _compute_full_per_group_contributions( D_mat=D_mat, @@ -5198,6 +5277,7 @@ def _compute_cohort_recentered_inputs( a11_plus_zeroed_arr=a11_plus_zeroed_arr, a11_minus_zeroed_arr=a11_minus_zeroed_arr, side="joiners", + compute_per_period=compute_per_period, ) U_leavers_full, U_per_period_leavers_full = _compute_full_per_group_contributions( D_mat=D_mat, @@ -5210,15 +5290,13 @@ def _compute_cohort_recentered_inputs( a11_plus_zeroed_arr=a11_plus_zeroed_arr, a11_minus_zeroed_arr=a11_minus_zeroed_arr, side="leavers", + compute_per_period=compute_per_period, ) # Restrict to variance-eligible groups (drop singleton-baseline groups) U_overall = U_overall_full[eligible_mask] U_joiners = U_joiners_full[eligible_mask] U_leavers = U_leavers_full[eligible_mask] - U_per_period_overall = U_per_period_overall_full[eligible_mask] - U_per_period_joiners = U_per_period_joiners_full[eligible_mask] - U_per_period_leavers = U_per_period_leavers_full[eligible_mask] cohort_id_eligible = cohort_id[eligible_mask] # Cohort-recenter each IF vector (group level for plug-in path) @@ -5230,17 +5308,29 @@ def _compute_cohort_recentered_inputs( # Column-wise centering preserves sum_t U_centered_pp[g, t] = # U_centered[g], so Binder TSL PSU-level aggregation telescopes to # the same group-level sum under within-group-constant PSU (byte- - # identical to the old allocator) and gives a valid per-cell - # attribution under within-group-varying PSU. - U_centered_pp_overall = _cohort_recenter_per_period( - U_per_period_overall, cohort_id_eligible - ) - U_centered_pp_joiners = _cohort_recenter_per_period( - U_per_period_joiners, cohort_id_eligible - ) - U_centered_pp_leavers = _cohort_recenter_per_period( - U_per_period_leavers, cohort_id_eligible - ) + # identical to the old allocator). Under within-group-varying PSU + # the per-cell attribution follows the library's post-period + # convention — a formal derivation from the observation-level + # survey linearization is an open question tracked in TODO.md. + # Skip entirely when the 2D tensor was not computed. + if U_per_period_overall_full is not None: + U_centered_pp_overall: Optional[np.ndarray] = _cohort_recenter_per_period( + U_per_period_overall_full[eligible_mask], cohort_id_eligible + ) + else: + U_centered_pp_overall = None + if U_per_period_joiners_full is not None: + U_centered_pp_joiners: Optional[np.ndarray] = _cohort_recenter_per_period( + U_per_period_joiners_full[eligible_mask], cohort_id_eligible + ) + else: + U_centered_pp_joiners = None + if U_per_period_leavers_full is not None: + U_centered_pp_leavers: Optional[np.ndarray] = _cohort_recenter_per_period( + U_per_period_leavers_full[eligible_mask], cohort_id_eligible + ) + else: + U_centered_pp_leavers = None # Eligible group IDs for survey IF expansion eligible_group_ids = [all_groups[g] for g in range(n_groups) if eligible_mask[g]] diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index c8bcea8c..3abfd7e8 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -649,7 +649,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. -- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. +- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require an explicit `psu=` with `nest=True`** so that `(stratum, psu)` labels are globally unique; the auto-injected `psu=` path does NOT support this (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). `fit()` detects that combination before survey resolution and raises a targeted `ValueError` pointing users at the explicit-`psu, nest=True` path. Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. - **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Scope note (cell-period allocator):** The PSU-level bootstrap uses a group-level `group_id_to_psu_code` map and therefore requires PSU to be constant within each group. Combining `n_bootstrap > 0` with a PSU that varies within group raises `NotImplementedError`; the cell-level Hall-Mammen extension is deferred to a follow-up PR. The analytical TSL variance fully supports within-group-varying PSU via the cell-period allocator — use `n_bootstrap=0` for those designs. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. --- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index 5d88a7ce..f53eb0c6 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -1172,6 +1172,25 @@ def test_bootstrap_with_varying_psu_raises(self, base_data): survey_design=sd, ) + def test_auto_inject_with_varying_strata_raises(self, base_data): + """Auto-injected `psu=` with nest=False cannot honor + strata that vary across cells of a group — the synthesized PSU + column would reuse group labels across strata and trip the + cross-stratum PSU uniqueness check. fit() detects that combo + before survey resolution and raises a targeted ValueError + pointing users to the explicit `psu=, nest=True` path. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = df_["period"] % 2 # varies across cells of each group + sd = SurveyDesign(weights="pw", strata="stratum") + with pytest.raises(ValueError, match=r"psu="): + ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd, + ) + def test_within_cell_psu_variation_rejected(self, base_data): """Multiple PSUs inside a single (g, t) cell (a multi-obs-per- cell panel) remain ambiguous under the cell allocator and must @@ -1432,7 +1451,9 @@ def test_cell_allocator_row_sum_identity(self): a11_plus_zeroed_arr=a11_plus_zeroed, a11_minus_zeroed_arr=a11_minus_zeroed, side="overall", + compute_per_period=True, ) + assert U_pp is not None # Hand computation at t=2 joiner side: # G0: stable_0, -(2/2) * (3.0 - 2.0) = -1.0 From ca7b6bfa5d0e321f3a2fb60fb3340148a44b3d7f Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:55:42 -0400 Subject: [PATCH 6/6] Narrow auto-inject guard to nest=False + regression test for nest=True The previous round's guard fired on any varying-strata + omitted-psu combination, which rejected `SurveyDesign(weights, strata, nest=True)` unnecessarily. `SurveyDesign.resolve()` at `diff_diff/survey.py:299-302` combines `(stratum, psu)` into globally-unique labels under nest=True, so the auto-injected `psu=` is re-labeled per stratum and the cross-stratum uniqueness check passes. Only the `nest=False` default path actually needs the up-front guard. Narrows the guard to `not getattr(survey_design, "nest", False)` and updates the error message to enumerate three actionable remediations (constant-within-group strata, or `nest=True`, or explicit `psu`). Adds `test_auto_inject_with_varying_strata_nest_true_succeeds` under `TestSurveyWithinGroupValidation` covering the newly-accepted path: byte-for-byte match against explicit `SurveyDesign(weights, strata, psu="group", nest=True)` on `overall_se` and `survey_metadata.df_survey`. The default `nest=False` still-raises regression (`test_auto_inject_with_varying_strata_raises`) remains unchanged. Updates fit() docstring and the REGISTRY.md survey IF expansion Note to enumerate the three supported auto-inject paths: (1) strata constant within group, (2) strata vary + nest=True, (3) strata vary + nest=False (rejected with targeted ValueError). All 338 tests pass (affected surface + slow MC coverage sim). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/chaisemartin_dhaultfoeuille.py | 63 +++++++++++++++--------- docs/methodology/REGISTRY.md | 2 +- tests/test_survey_dcdh.py | 40 +++++++++++++++ 3 files changed, 80 insertions(+), 25 deletions(-) diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index b8653a40..a8562a33 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -650,7 +650,15 @@ def fit( **Strata and PSU may vary across cells of a group** but must be constant within each ``(g, t)`` cell (trivially true in one-obs-per-cell panels; enforced otherwise with - ``ValueError``). When ``n_bootstrap > 0`` and a survey + ``ValueError``). Three supported combinations under the + auto-injected ``psu=``: + (1) strata constant within group (any ``nest`` flag works); + (2) strata vary within group **and** ``nest=True`` — the + resolver re-labels the synthesized ``psu`` uniquely within + strata; (3) strata vary within group **and** ``nest=False`` + — rejected up front with a targeted ``ValueError``; pass + ``SurveyDesign(..., nest=True)`` or an explicit + ``psu=`` with globally-unique labels instead. When ``n_bootstrap > 0`` and a survey design is supplied, the multiplier bootstrap operates at the PSU level (Hall-Mammen wild PSU bootstrap) — under the default auto-inject this collapses to a group-level @@ -726,17 +734,22 @@ def fit( or resolved_survey.replicate_weights.shape[1] == 0 ) ): - # Pre-auto-inject contract check: the auto-injected PSU - # column reuses group labels with nest=False, but the - # survey resolver enforces globally-unique PSU labels when - # nest=False and strata are present (see - # ``diff_diff/survey.py``). If strata varies within group, - # the synthesized PSU column collides across strata and - # resolution fails downstream with an opaque error. Flag - # that configuration up front with an actionable message - # pointing users to the explicit ``psu=, nest=True`` - # path (REGISTRY.md survey IF expansion Note). - if resolved_survey.strata is not None: + # Pre-auto-inject contract check: the auto-inject path + # synthesizes ``psu=`` and preserves the user's + # ``nest`` flag. Under ``nest=False`` (the default), the + # survey resolver requires globally-unique PSU labels when + # strata are present; if strata varies within group, the + # synthesized PSU column reuses group labels across strata + # and trips the cross-stratum PSU uniqueness check at + # resolution time. Under ``nest=True`` the resolver + # re-labels ``(stratum, psu)`` uniquely within strata + # (``diff_diff/survey.py:299-302``), so varying strata is + # fine — let the auto-inject proceed. Only the + # ``nest=False`` + varying-strata + omitted-psu triple + # warrants an up-front targeted error. + if resolved_survey.strata is not None and not getattr( + survey_design, "nest", False + ): _strata_varies_pre, _ = _strata_psu_vary_within_group( resolved_survey, data, group, survey_weights, ) @@ -744,18 +757,20 @@ def fit( raise ValueError( "ChaisemartinDHaultfoeuille survey support: " "strata that vary across cells of the same " - "group require an explicit `psu=` with " - "`nest=True` so that `(stratum, psu)` pairs " - "are globally unique. The default auto-" - "injected `psu=` path does NOT support " - "this because the synthesized PSU column " - "reuses group labels across strata and trips " - "the cross-stratum PSU uniqueness check in " - "survey resolution. Either (a) set strata " - "constant within each group, or (b) pass " - "`SurveyDesign(..., psu=, nest=True)` " - "with PSU labels that are unique within " - "strata." + "group require either an explicit " + "`psu=` (any column whose labels are " + "globally unique within strata) or the " + "original `SurveyDesign(..., nest=True)` " + "flag so the auto-injected `psu=` is " + "re-labeled uniquely within strata by the " + "resolver. The default `nest=False` auto-" + "inject path reuses group labels across " + "strata and trips the cross-stratum PSU " + "uniqueness check in survey resolution. " + "Either (a) set strata constant within each " + "group, (b) pass `SurveyDesign(..., " + "nest=True)`, or (c) pass an explicit " + "`psu=` with globally-unique labels." ) from diff_diff.survey import SurveyDesign as _SurveyDesign diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index 3abfd7e8..424b0c2e 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -649,7 +649,7 @@ Alternative: Multiplier bootstrap clustered at group via the `n_bootstrap` param - [x] Design-2 switch-in/switch-out descriptive wrapper (Web Appendix Section 1.6) - [x] HonestDiD (Rambachan-Roth 2023) integration on placebo + event study surface - [x] Survey design support: pweight with strata/PSU/FPC via Taylor Series Linearization (analytical) **or replicate-weight variance (BRR/Fay/JK1/JKn/SDR)**, covering the main ATT surface, covariate adjustment (DID^X), heterogeneity testing, the TWFE diagnostic (fit and standalone `twowayfeweights()` helper), and HonestDiD bounds. Opt-in **PSU-level Hall-Mammen wild bootstrap** is also supported via `n_bootstrap > 0`. -- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require an explicit `psu=` with `nest=True`** so that `(stratum, psu)` labels are globally unique; the auto-injected `psu=` path does NOT support this (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). `fit()` detects that combination before survey resolution and raises a targeted `ValueError` pointing users at the explicit-`psu, nest=True` path. Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. +- **Note (Survey IF expansion — library convention):** Survey IF expansion is a library extension not in the dCDH papers (the paper's plug-in variance assumes iid sampling). The library convention builds observation-level `psi_i` by proportionally distributing per-group IF mass within weight share: either at the group level (`psi_i = U_centered[g] * w_i / W_g`, the previous convention) or at the per-`(g, t)` cell level via the cell-period allocator shipped in this release. Cell-level expansion: decompose `U[g]` into per-period attributions `U[g, t]`, cohort-center each column independently, then expand to observation level as `psi_i = U_centered_per_period[g_i, t_i] * (w_i / W_{g_i, t_i})`. Binder (1983) stratified-PSU variance aggregates the resulting `psi` at PSU level. **Post-period attribution convention:** each transition term in the IF sum (of the form `role_weight * (Y_{g, t} - Y_{g, t-1})` for DID_M or `S_g * (Y_{g, out} - Y_{g, ref})` for DID_l) is attributed as a single *difference* to the POST-period cell, not split into a `+Y_post` / `-Y_pre` pair across two cells. This is a library *convention*, not a theorem — adopted because it preserves the group-sum, PSU-sum, and cohort-sum identities of the previous group-level expansion (so Binder variance coincides with the group-level variance under the auto-injected `psu=group`) and because Monte Carlo coverage at nominal 95% is empirically close to nominal on a DGP where PSUs vary across the cells of each group (see `tests/test_dcdh_cell_period_coverage.py`). A covariance-aware two-cell allocator is a plausible alternative and may be worth exploring if future designs motivate an explicit observation-level IF derivation; the method currently in the library is **not derived from the observation-level survey linearization of the contrast** and makes no stronger claim than "coverage is approximately nominal under the tested DGPs and the group-sum identity holds exactly." Under within-group-constant PSU (the pre-allocator accepted input), per-cell sums telescope to `U_centered[g]` and Binder variance is byte-identical (up to single-ULP floating-point noise) to the previous group-level expansion. **Strata and PSU must be constant within each `(g, t)` cell** (trivially satisfied in one-obs-per-cell panels — the canonical dCDH structure); variation **across cells of a group** is supported by the allocator. Within-group-varying **weights** are supported as before. When `survey_design.psu` is not specified, `fit()` auto-injects `psu=` so the TSL variance, `df_survey`, and t-based inference match the per-group PSU structure. **Strata that vary across cells of a group require either an explicit `psu=` or the original `SurveyDesign(..., nest=True)` flag** — under `nest=True` the resolver combines `(stratum, psu)` into globally-unique labels, so the auto-injected `psu=` is re-labeled per stratum and the cell allocator proceeds. Only the `nest=False` + varying-strata + omitted-psu combination is rejected up front with a targeted `ValueError` at `fit()` time (the synthesized PSU column would reuse group labels across strata and trip the cross-stratum PSU uniqueness check in `SurveyDesign.resolve()`). Under replicate-weight designs, the same cell-level `psi_i` is aggregated via Rao-Wu weight-ratio rescaling (`compute_replicate_if_variance` at `diff_diff/survey.py:1681`) rather than the Binder TSL formula. All five methods (BRR/Fay/JK1/JKn/SDR) are supported method-agnostically through the unified helper; the effective `df_survey` is reduced to `min(n_valid) - 1` across IF sites when some replicate solves fail (matching `efficient_did.py:1133-1135` and `triple_diff.py:676-686` precedents). Under DID^X, the first-stage residualization coefficient `theta_hat` is computed once on full-sample weights and treated as fixed (FWL plug-in IF convention) — per-replicate refits of `theta_hat` are not performed. **Scope limitations (follow-up PRs):** (a) `heterogeneity=` combined with within-group-varying PSU/strata raises `NotImplementedError` — the heterogeneity WLS `psi_obs` still uses the legacy group-level expansion, to be extended in PR 3; (b) `n_bootstrap > 0` combined with within-group-varying PSU raises `NotImplementedError` — the PSU-level Hall-Mammen wild bootstrap still uses the legacy group-level PSU map, to be extended in PR 4. - **Note (survey + bootstrap contract):** When `survey_design` and `n_bootstrap > 0` are both active, the bootstrap uses Hall-Mammen wild multiplier weights (Rademacher/Mammen/Webb) **at the PSU level**. Under the default auto-injected `psu=group`, the PSU coincides with the group so the wild bootstrap is a clean group-level clustered bootstrap (identity-map fast path, bit-identical to the non-survey multiplier bootstrap). When the user passes an explicit strictly-coarser PSU (e.g., `psu=state` with groups at county level), the IF contributions of all groups within a PSU receive the same bootstrap multiplier — the standard Hall-Mammen wild PSU bootstrap. Strata do not participate in the bootstrap randomization (they contribute only through the analytical TSL variance); this is conservative when strata differ substantially in variance. A `UserWarning` fires only when PSU is strictly coarser than group. **Scope note (cell-period allocator):** The PSU-level bootstrap uses a group-level `group_id_to_psu_code` map and therefore requires PSU to be constant within each group. Combining `n_bootstrap > 0` with a PSU that varies within group raises `NotImplementedError`; the cell-level Hall-Mammen extension is deferred to a follow-up PR. The analytical TSL variance fully supports within-group-varying PSU via the cell-period allocator — use `n_bootstrap=0` for those designs. **Replicate-weight designs and `n_bootstrap > 0` are mutually exclusive** (replicate variance is closed-form; bootstrap would double-count variance) — the combination raises `NotImplementedError`, matching `efficient_did.py:989`, `staggered.py:1869`, `two_stage.py:251-253`. For HonestDiD bounds under replicate weights, the replicate-effective `df_survey = min(resolved_survey.df_survey, min(n_valid_across_sites) - 1)` propagates to t-critical values — capped by the design's QR-rank-based df so a rank-deficient replicate matrix never produces a larger effective df than the design supports. When `resolved_survey.df_survey` is undefined (QR-rank ≤ 1), the effective df stays `None` and all inference fields (including HonestDiD bounds) are NaN — per-site `n_valid` cannot rescue a rank-deficient design. --- diff --git a/tests/test_survey_dcdh.py b/tests/test_survey_dcdh.py index f53eb0c6..e400c3eb 100644 --- a/tests/test_survey_dcdh.py +++ b/tests/test_survey_dcdh.py @@ -1172,6 +1172,46 @@ def test_bootstrap_with_varying_psu_raises(self, base_data): survey_design=sd, ) + def test_auto_inject_with_varying_strata_nest_true_succeeds(self, base_data): + """When strata varies across cells of a group and the user + passes ``nest=True`` with no explicit ``psu``, the auto-inject + path is valid: ``SurveyDesign.resolve()`` combines + ``(stratum, psu)`` into globally-unique labels via the + nest=True path (``diff_diff/survey.py:299-302``), so the + cross-stratum PSU uniqueness check is satisfied. Byte-check + against the explicit ``SurveyDesign(..., psu="group", + nest=True)`` baseline — both paths resolve to the same design. + """ + df_ = base_data.copy() + df_["pw"] = 1.0 + df_["stratum"] = df_["period"] % 2 + sd_auto = SurveyDesign(weights="pw", strata="stratum", nest=True) + sd_explicit = SurveyDesign( + weights="pw", strata="stratum", psu="group", nest=True, + ) + r_auto = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd_auto, L_max=2, + ) + r_explicit = ChaisemartinDHaultfoeuille(seed=1).fit( + df_, outcome="outcome", group="group", + time="period", treatment="treatment", + survey_design=sd_explicit, L_max=2, + ) + assert np.isfinite(r_auto.overall_att) + assert np.isfinite(r_auto.overall_se) + if np.isfinite(r_auto.overall_se) and np.isfinite(r_explicit.overall_se): + assert r_auto.overall_se == pytest.approx( + r_explicit.overall_se, rel=1e-6 + ) + assert r_auto.survey_metadata is not None + assert r_explicit.survey_metadata is not None + assert ( + r_auto.survey_metadata.df_survey + == r_explicit.survey_metadata.df_survey + ) + def test_auto_inject_with_varying_strata_raises(self, base_data): """Auto-injected `psu=` with nest=False cannot honor strata that vary across cells of a group — the synthesized PSU