diff --git a/CHANGELOG.md b/CHANGELOG.md index 11351d7c..5092219c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- **`target_parameter` block in BR/DR schemas (experimental; schema version bumped to 2.0)** — `BUSINESS_REPORT_SCHEMA_VERSION` and `DIAGNOSTIC_REPORT_SCHEMA_VERSION` bumped from `"1.0"` to `"2.0"` because the new `"no_scalar_by_design"` value on the `headline.status` / `headline_metric.status` enum (dCDH `trends_linear=True, L_max>=2` configuration) is a breaking change per the REPORTING.md stability policy. BusinessReport and DiagnosticReport now emit a top-level `target_parameter` block naming what the headline scalar actually represents for each of the 16 result classes. Closes BR/DR foundation gap #6 (target-parameter clarity). Fields: `name`, `definition`, `aggregation` (machine-readable dispatch tag), `headline_attribute` (raw result attribute), `reference` (citation pointer). BR's summary emits the short `name` right after the headline; DR's overall-interpretation paragraph does the same; both full reports carry a "## Target Parameter" section with the full definition. Per-estimator dispatch is sourced from REGISTRY.md and lives in the new `diff_diff/_reporting_helpers.py::describe_target_parameter`. A few branches read fit-time config (`EfficientDiDResults.pt_assumption`, `StackedDiDResults.clean_control`, `ChaisemartinDHaultfoeuilleResults.L_max` / `covariate_residuals` / `linear_trends_effects`); others emit a fixed tag (the fit-time `aggregate` kwarg on CS / Imputation / TwoStage / Wooldridge does not change the `overall_att` scalar — disambiguating horizon / group tables is tracked under gap #9). See `docs/methodology/REPORTING.md` "Target parameter" section. + ## [3.2.0] - 2026-04-19 ### Added diff --git a/diff_diff/_reporting_helpers.py b/diff_diff/_reporting_helpers.py new file mode 100644 index 00000000..f5711344 --- /dev/null +++ b/diff_diff/_reporting_helpers.py @@ -0,0 +1,634 @@ +"""Shared helpers for BusinessReport and DiagnosticReport. + +This module hosts per-estimator dispatch logic that both BR and DR +consume. It lives in its own module rather than in ``business_report`` +or ``diagnostic_report`` to avoid a circular-import risk: BR already +imports from DR (for ``DiagnosticReportResults``), so placing shared +helpers here keeps the dependency graph tidy if DR ever needs to +reference a BR symbol. + +Current contents: + +- ``describe_target_parameter(results)`` — returns the + ``target_parameter`` block documenting what scalar the headline + represents. Introduced for BR/DR gap #6 (target-parameter + clarity); see ``docs/methodology/REPORTING.md`` and per-estimator + entries in ``docs/methodology/REGISTRY.md`` for the canonical + wording. +""" + +from __future__ import annotations + +from typing import Any, Dict + + +def describe_target_parameter(results: Any) -> Dict[str, Any]: + """Return the target-parameter block for a fitted result. + + The block names what scalar the headline number actually + represents (overall ATT, DID_M, dose-response ATT(d|d), etc.) so + BR/DR output is self-contained. Baker et al. (2025) Step 2 is + "Define the target parameter"; this helper does that work for the + user. + + Shape:: + + { + "name": str, # stakeholder-facing name + "definition": str, # plain-English description + "aggregation": str, # machine-readable tag + "headline_attribute": str, # which raw attribute the scalar comes from + "reference": str, # citation pointer + } + + Each per-estimator branch cites REGISTRY.md as the single source + of truth for the canonical wording (per + ``feedback_verify_claims.md``). All wording choices are + deliberate: + + - ``ImputationDiD`` / ``TwoStageDiD``: the ``aggregate`` fit-time + kwarg controls which horizon / group tables get populated but + does NOT change ``overall_att``. The headline is always the + sample-mean overall ATT (per BJS 2024 Step 3 with + ``w_it = 1/N_1``); disambiguate via the event-study or group + aggregate if you need the horizon / group target. + - ``CallawaySantAnna``: ``overall_att`` is cohort-size-weighted + across post-treatment ``ATT(g, t)`` cells regardless of the + fit-time ``aggregate`` kwarg. The event-study / group + aggregations live on dedicated fields + (``event_study_effects`` / ``group_effects``). + - ``ContinuousDiD``: the regime (PT vs. SPT) is a user-level + assumption, not a library setting. The ``definition`` names + both regime readings (``ATT^loc`` under PT, + ``ATT^glob`` under SPT) so the user can pick the + interpretation that matches their assumption. + - ``WooldridgeDiD``: ``overall_att`` depends on the fit-time + ``method`` — for OLS ETWFE (``method="ols"``) it is the + observation-count-weighted average of ``ATT(g, t)`` + coefficients from the saturated regression; for nonlinear + ETWFE (``method="logit"`` / ``"poisson"``) it is the + average-structural-function (ASF) contrast across cohort x + time cells. Both paths preserve ``overall_att`` across + ``.aggregate("event")`` calls (which only populate additional + event-study tables). + """ + name = type(results).__name__ + + if name == "DiDResults": + # Covers both ``DifferenceInDifferences`` (2x2 DiD) and + # ``TwoWayFixedEffects`` (TWFE with unit + time FE). Both + # estimators return ``DiDResults``; there is no separate + # ``TwoWayFixedEffectsResults`` class as of this PR + # (confirmed in PR #347 R1 review). The description covers + # both interpretations because the result carries no + # estimator-provenance marker BR/DR can dispatch on. Adding a + # dedicated TWFE result class (or persisting provenance on + # DiDResults) is queued as follow-up so this branch can split + # in a future PR. + # + # PR #347 R12 P2: the ``aggregation`` tag is + # ``"did_or_twfe"`` (not ``"2x2"``) because real TWFE fits + # can be weighted averages of 2x2 comparisons, potentially + # with forbidden later-vs-earlier weights. Downstream agents + # dispatching on ``aggregation`` must not treat this as a + # clean 2x2 fit — the ambiguity is intrinsic until an + # estimator-provenance marker exists. + return { + "name": "ATT (2x2 or TWFE within-transformed coefficient)", + "definition": ( + "The average treatment effect on the treated. For " + "``DifferenceInDifferences``, this is the 2x2 DiD " + "contrast between treated-unit change and control-unit " + "change across pre / post. For ``TwoWayFixedEffects``, " + "this is the coefficient on the treatment-by-post " + "interaction in a regression with unit and time fixed " + "effects; under homogeneous treatment effects it is " + "the ATT, and under heterogeneous effects with staggered " + "adoption it is a weighted average of 2x2 comparisons " + "that may include forbidden later-vs-earlier comparisons " + "(see Goodman-Bacon)." + ), + "aggregation": "did_or_twfe", + "headline_attribute": "att", + "reference": ("REGISTRY.md Sec. DifferenceInDifferences / TwoWayFixedEffects"), + } + + if name == "MultiPeriodDiDResults": + return { + "name": "average ATT over post-treatment periods (event-study average)", + "definition": ( + "The equally-weighted average of the post-treatment event-study " + "coefficients. Each coefficient is an ATT at a single event time " + "relative to treatment onset; the headline averages across " + "post-treatment horizons." + ), + "aggregation": "event_study", + "headline_attribute": "avg_att", + "reference": "REGISTRY.md Sec. MultiPeriodDiD", + } + + if name == "CallawaySantAnnaResults": + return { + "name": "overall ATT (cohort-size-weighted average of ATT(g,t))", + "definition": ( + "A cohort-size-weighted average of group-time ATTs " + "``ATT(g, t)`` across post-treatment cells (``t >= g``). " + "``overall_att`` is the simple-aggregation headline regardless " + "of the fit-time ``aggregate`` kwarg; event-study and group " + "aggregations populate ``event_study_effects`` / " + "``group_effects`` fields when requested." + ), + "aggregation": "simple", + "headline_attribute": "overall_att", + "reference": "Callaway & Sant'Anna (2021); REGISTRY.md Sec. CallawaySantAnna", + } + + if name == "SunAbrahamResults": + return { + "name": "overall ATT (interaction-weighted average of CATT(g, e))", + "definition": ( + "An interaction-weighted (IW) average of cohort-specific " + "ATT(g, e) effects estimated via saturated cohort-by-event-time " + "interactions. Weights are the sample shares of each cohort at " + "each event time; the headline is the resulting overall ATT." + ), + "aggregation": "iw", + "headline_attribute": "overall_att", + "reference": "Sun & Abraham (2021); REGISTRY.md Sec. SunAbraham", + } + + if name == "ImputationDiDResults": + return { + "name": "overall ATT (sample-mean imputation average)", + "definition": ( + "The sample-mean overall ATT ``tau_hat_w = (1/N_1) * sum " + "tau_hat_it`` across treated observations, where " + "``tau_hat_it = Y_it - Y_hat_it(0)`` and ``Y_hat_it(0)`` is " + "imputed from a unit+time fixed-effects model fitted on " + "untreated observations only (BJS 2024 Step 3). The fit-time " + "``aggregate`` kwarg populates additional horizon / group " + "tables but does NOT change ``overall_att`` — for the " + "horizon or group estimand, consult the event-study or group " + "aggregate directly." + ), + "aggregation": "simple", + "headline_attribute": "overall_att", + "reference": "Borusyak-Jaravel-Spiess (2024); REGISTRY.md Sec. ImputationDiD", + } + + if name == "TwoStageDiDResults": + return { + "name": "overall ATT (two-stage residualization average)", + "definition": ( + "The sample-mean overall ATT recovered from Stage 2 of " + "Gardner's two-stage procedure: Stage 1 fits unit+time fixed " + "effects on untreated observations only, and Stage 2 regresses " + "the residualized outcome on the treatment indicator across " + "treated observations. Point estimate is algebraically " + "equivalent to Borusyak-Jaravel-Spiess imputation. As with " + "ImputationDiD, the fit-time ``aggregate`` kwarg populates " + "additional tables but does NOT change ``overall_att``." + ), + "aggregation": "simple", + "headline_attribute": "overall_att", + "reference": "Gardner (2022); REGISTRY.md Sec. TwoStageDiD", + } + + if name == "StackedDiDResults": + clean_control = getattr(results, "clean_control", None) + if clean_control == "never_treated": + control_clause = "Controls are the never-treated units (``A_s = infinity``)." + elif clean_control == "strict": + control_clause = ( + "Controls for each sub-experiment ``a`` are units strictly " + "untreated across the full pre/post event window " + "(``A_s > a + kappa_post + kappa_pre``)." + ) + else: # "not_yet_treated" or unknown -> default rule + control_clause = ( + "Controls for each sub-experiment ``a`` are units not yet " + "treated through the event's post-window " + "(``A_s > a + kappa_post``)." + ) + return { + "name": "overall ATT (average of post-treatment event-study coefficients)", + "definition": ( + "The average of post-treatment event-study coefficients " + "``delta_h`` (h >= -anticipation), estimated from the stacked " + "sub-experiment panel with delta-method SE. Each sub-" + "experiment aligns a treated cohort with its clean-control " + "set over the event window ``[-kappa_pre, +kappa_post]``; " + "each per-horizon ``delta_h`` is the paper's " + "``theta_kappa^e`` treated-share-weighted cross-event " + "aggregate. The ``overall_att`` headline is the equally-" + "weighted average of these per-horizon coefficients, not a " + "separate cross-event weighted aggregate at the ATT level. " + control_clause + ), + "aggregation": "stacked", + "headline_attribute": "overall_att", + "reference": "Wing-Freedman-Hollingsworth (2024); REGISTRY.md Sec. StackedDiD", + } + + if name == "WooldridgeDiDResults": + # PR #347 R4 P2: Wooldridge ETWFE has two identification paths + # (REGISTRY.md splits them at Sec. WooldridgeDiD): the OLS + # path computes ``overall_att`` as an observation-count- + # weighted aggregation of ``ATT(g, t)`` coefficients from the + # saturated regression, while the nonlinear (logit / Poisson) + # paths produce an ASF-based ATT from the average-structural- + # function contrast. ``WooldridgeDiDResults.method`` persists + # the choice; branch on it so OLS fits aren't mislabeled with + # nonlinear-ASF wording. + method = getattr(results, "method", "ols") + if method == "ols": + return { + "name": ( + "overall ATT (observation-count-weighted average of " + "ATT(g,t) from saturated OLS ETWFE)" + ), + "definition": ( + "The overall ATT under OLS ETWFE (Wooldridge 2025): the " + "saturated regression fits cohort x time ATT(g, t) " + "coefficients, and ``overall_att`` is their " + "observation-count-weighted average across post-" + 'treatment cells. Calling ``.aggregate("event")`` ' + "populates additional event-study tables but does NOT " + "change the ``overall_att`` scalar." + ), + "aggregation": "simple", + "headline_attribute": "overall_att", + "reference": ("Wooldridge (2025); REGISTRY.md Sec. WooldridgeDiD (OLS path)"), + } + return { + "name": ( + f"overall ATT (ASF-based average from Wooldridge ETWFE, " f"method={method!r})" + ), + "definition": ( + f"The overall ATT under Wooldridge ETWFE with a nonlinear " + f"link function (``method={method!r}``, typically logit or " + f"Poisson QMLE): the average-structural-function (ASF) " + f"contrast between treated and counterfactual untreated " + f"outcomes averaged across cohort x time cells with " + f"observation-count weights. The ASF handles the " + f"nonlinearity; OLS ETWFE uses the saturated-regression " + f'coefficient path instead. Calling ``.aggregate("event")`` ' + f"populates additional event-study tables but does NOT " + f"change the ``overall_att`` scalar." + ), + "aggregation": "simple", + "headline_attribute": "overall_att", + "reference": ( + "Wooldridge (2023, 2025); REGISTRY.md Sec. WooldridgeDiD " "(nonlinear / ASF path)" + ), + } + + if name == "EfficientDiDResults": + # PR #347 R7 P1: the BR/DR headline ``overall_att`` is the + # library's cohort-size-weighted average over post-treatment + # ``(g, t)`` cells (see ``efficient_did.py`` around line 1274 + # and REGISTRY.md Sec. EfficientDiD). This is distinct from + # the paper's ``ES_avg`` uniform event-time average. + # Disambiguating this in the stakeholder-facing definition + # keeps the user from mistaking one for the other — the + # regime (PT-All vs PT-Post) describes identification, not + # the aggregation choice for the headline scalar. + pt_assumption = getattr(results, "pt_assumption", "all") + library_aggregation_note = ( + " The BR/DR headline ``overall_att`` is the library's " + "cohort-size-weighted average of ATT(g, t) over post-" + "treatment cells, NOT the paper's ``ES_avg`` uniform event-" + "time average (see REGISTRY.md Sec. EfficientDiD for the " + "distinction)." + ) + if pt_assumption == "post": + return { + "name": "overall ATT under PT-Post (single-baseline)", + "definition": ( + "The overall ATT identified under the weaker PT-Post " + "regime (parallel trends hold only in post-treatment " + "periods). The baseline is period ``g - 1`` only; the " + "estimator is just-identified and reduces to standard " + "single-baseline DiD (Corollary 3.2)." + library_aggregation_note + ), + "aggregation": "pt_post_single_baseline", + "headline_attribute": "overall_att", + "reference": "Chen-Sant'Anna-Xie (2025) Cor. 3.2; REGISTRY.md Sec. EfficientDiD", + } + return { + "name": "overall ATT under PT-All (over-identified combined)", + "definition": ( + "The overall ATT identified under the stronger PT-All regime " + "(parallel trends hold for all groups and all periods). The " + "estimator is over-identified (Lemma 2.1) and applies " + "optimal-combination weights to achieve the semiparametric " + "efficiency bound on the no-covariate path." + library_aggregation_note + ), + "aggregation": "pt_all_combined", + "headline_attribute": "overall_att", + "reference": "Chen-Sant'Anna-Xie (2025) Lemma 2.1; REGISTRY.md Sec. EfficientDiD", + } + + if name == "ContinuousDiDResults": + return { + "name": "overall ATT (dose-aggregated)", + "definition": ( + "The overall ATT aggregated across treated dose levels. Under " + "Parallel Trends (PT) this identifies ``ATT^loc`` — the " + "binarized ATT for treated-vs-untreated; under Strong Parallel " + "Trends (SPT) it identifies ``ATT^glob`` — the population " + "average ATT across dose levels. The regime is a user-level " + "assumption, not a library setting; the dose-response curve " + "``ATT(d)`` / marginal effect ``ACRT(d)`` / per-dose " + "``ATT(d|d)`` are available on the result object for " + "dose-specific inference." + ), + "aggregation": "dose_overall", + "headline_attribute": "overall_att", + "reference": "Callaway-Goodman-Bacon-Sant'Anna (2024); REGISTRY.md Sec. ContinuousDiD", + } + + if name == "TripleDifferenceResults": + return { + "name": "ATT (triple-difference)", + "definition": ( + "The ATT identified via the DDD cancellation " + "``DDD = DiD_A + DiD_B - DiD_C`` across the Group x Period x " + "Eligibility cells. Differences out group-specific and period-" + "specific unobservables without requiring separate parallel-" + "trends assumptions between each cell pair." + ), + "aggregation": "ddd", + "headline_attribute": "att", + "reference": "Ortiz-Villavicencio & Sant'Anna (2025); REGISTRY.md Sec. TripleDifference", + } + + if name == "StaggeredTripleDiffResults": + return { + "name": "overall ATT (cohort-weighted staggered triple-difference)", + "definition": ( + "A cohort-weighted aggregate of per-(g, g_c, t) triple-" + "difference ATTs ``DDD(g, g_c, t)`` via the GMM optimal-" + "combination weighting. Extends the DDD cancellation to " + "staggered adoption timing with a third (eligibility) " + "dimension." + ), + "aggregation": "staggered_ddd", + "headline_attribute": "overall_att", + "reference": "Ortiz-Villavicencio & Sant'Anna (2025); REGISTRY.md Sec. StaggeredTripleDifference", + } + + if name == "ChaisemartinDHaultfoeuilleResults": + l_max = getattr(results, "L_max", None) + has_controls = getattr(results, "covariate_residuals", None) is not None + # PR #347 R9 P1: read the persisted ``trends_linear`` flag + # directly rather than inferring from + # ``linear_trends_effects is not None``. The estimator can set + # ``linear_trends_effects = None`` when the cumulated-horizon + # dict is empty (no estimable horizons) while still + # unconditionally NaN-ing ``overall_att`` under + # ``trends_linear=True`` + ``L_max >= 2`` + # (``chaisemartin_dhaultfoeuille.py:2828-2834``). The previous + # inference missed that edge case; the new explicit flag + # (persisted on ``ChaisemartinDHaultfoeuilleResults``) closes + # the gap. Older fits without the persisted flag fall back to + # the legacy inference. + _persisted = getattr(results, "trends_linear", None) + if isinstance(_persisted, bool): + has_trends = _persisted + else: + has_trends = getattr(results, "linear_trends_effects", None) is not None + reference = ( + "de Chaisemartin & D'Haultfoeuille (2020, 2024); " + "REGISTRY.md Sec. ChaisemartinDHaultfoeuille" + ) + # PR #347 R2 P1 review: mirror the exact ``overall_att`` + # contract from ``chaisemartin_dhaultfoeuille.py`` lines + # 2602-2634 (L_max>=2 overrides with cost-benefit ``delta``, + # NaN if non-estimable) and lines 2828-2834 (``trends_linear`` + # + ``L_max>=2`` suppresses the scalar entirely with NaN). + # The result class's own ``_estimand_label`` at + # ``chaisemartin_dhaultfoeuille_results.py:454-490`` is the + # source-of-truth; this branch tracks that logic. + + # Trends + L_max>=2: overall_att is NaN. No scalar aggregate; + # per-horizon effects are on ``linear_trends_effects``. + if has_trends and l_max is not None and l_max >= 2: + # PR #347 R12 P1: distinguish the populated-surface case + # (``linear_trends_effects`` has horizons) from the + # empty-surface subcase (``linear_trends_effects is + # None``: requested ``trends_linear=True`` but no + # horizons survived). Pointing users to a nonexistent + # dict in the latter is dead-end guidance. + horizon_surface_empty = getattr(results, "linear_trends_effects", None) is None + if has_controls: + estimand_label = "DID^{X,fd}_l" + else: + estimand_label = "DID^{fd}_l" + if horizon_surface_empty: + # PR #347 R14 P1: name the estimand label with the + # covariate-adjusted form (``DID^{X,fd}_l``) when + # covariates are active rather than hardcoding + # ``DID^{fd}_l``. The native result label already + # handles this; propagate to the reporting-layer + # prose. + definition_text = ( + "Under ``trends_linear=True`` with ``L_max >= 2``, the " + "estimator intentionally does NOT produce a scalar " + "aggregate in ``overall_att`` (it is NaN by design, " + "matching R's ``did_multiplegt_dyn`` with " + "``trends_lin=TRUE``). On this fit, no cumulated level " + f"effects ``{estimand_label}`` survived estimation " + "(the horizon surface is empty — either no eligible " + "switchers at any positive horizon or all horizons " + "were dropped). There is no scalar headline AND no " + "per-horizon table to fall back on; re-fit with a " + "larger ``L_max`` or with ``trends_linear=False`` if " + "you need a reportable estimand." + ) + else: + estimand_label = estimand_label + " (see linear_trends_effects)" + definition_text = ( + "Under ``trends_linear=True`` with ``L_max >= 2``, the " + "estimator intentionally does NOT produce a scalar " + "aggregate in ``overall_att`` (it is NaN by design, " + "matching R's ``did_multiplegt_dyn`` with " + "``trends_lin=TRUE``). Per-horizon cumulated level " + "effects ``DID^{fd}_l`` (or ``DID^{X,fd}_l`` when " + "covariates are active) live on " + "``results.linear_trends_effects[l]``. Consult those " + "rather than the headline." + ) + return { + "name": estimand_label, + "definition": definition_text, + "aggregation": "no_scalar_headline", + "headline_attribute": None, + "reference": reference, + } + + if l_max is None: + # DID_M — period-aggregated contemporaneous-switch ATT. + base, agg_base, base_label = ( + "DID_M", + "M", + "period-aggregated contemporaneous-switch ATT", + ) + definition_core = ( + "The contemporaneous-switch ATT averaged across switching " + "periods. At each period the estimator contrasts joiners " + "(``D: 0 -> 1``), leavers (``D: 1 -> 0``), and stable-" + "control cells that share the same treatment state across " + "adjacent periods; ``DID_M`` averages these per-period " + "contrasts." + ) + elif l_max == 1: + # DID_1 — single-horizon per-group estimand. + base, agg_base, base_label = ( + "DID_1", + "DID_1", + "single-horizon per-group dynamic ATT", + ) + definition_core = ( + "The per-group dynamic ATT at event horizon ``l = 1`` " + "post-switch (Equation 3 of the dCDH dynamic companion " + "paper). The estimator contrasts joiners and stable " + "controls conditioning on baseline treatment ``D_{g,1}``; " + "the aggregate averages across cohorts with treated-share " + "weights. This is the per-group ``DID_{g,1}`` building " + "block averaged, NOT the per-period ``DID_M`` (the two can " + "differ by O(1%) on mixed-direction panels — see the " + "``Phase 2 DID_1 vs Phase 1 DID_M`` Note in REGISTRY.md)." + ) + else: + # L_max >= 2: cost-benefit delta aggregate. + base, agg_base, base_label = ( + "delta", + "delta", + "cost-benefit cross-horizon aggregate", + ) + definition_core = ( + "The cost-benefit aggregate " + "``delta = sum_l w_l * DID_l`` (Lemma 4 of the dCDH " + "dynamic companion paper), a weighted cross-horizon " + "combination where ``w_l`` reflects the cumulative dose at " + "each horizon. ``overall_att`` holds this delta when " + "``L_max >= 2``; if delta is non-estimable (no eligible " + "switchers at any horizon) it is NaN with the same " + "``overall_se`` / CI surface." + ) + + # Suffix for controls / trends. ``trends_linear`` + ``L_max >= 2`` + # was handled above (no-scalar case); here ``has_trends`` can + # only co-occur with ``L_max in {None, 1}`` (or ``L_max == None`` + # where the no-scalar rule does not apply). + if has_controls and has_trends: + suffix, agg_suffix, suffix_clause = ( + "^{X,fd}", + "_x_fd", + " Identification is conditional on the first-stage " + "covariates and allows group-specific linear pre-trends.", + ) + elif has_controls: + suffix, agg_suffix, suffix_clause = ( + "^X", + "_x", + " Identification is conditional on the first-stage " "covariates.", + ) + elif has_trends: + suffix, agg_suffix, suffix_clause = ( + "^{fd}", + "_fd", + " The identifying restriction allows group-specific linear " "pre-trends.", + ) + else: + suffix, agg_suffix, suffix_clause = "", "", "" + + # Assemble the name label to match the result class's + # ``_estimand_label()``: for ``delta``, suffix follows the base + # (``delta^X``); for DID variants, suffix goes on ``DID`` with + # the subscript preserved (``DID^X_1``, ``DID^{fd}_M``). + if base == "delta": + name_label = f"delta{suffix}" if suffix else "delta" + elif suffix: + did_part = base.split("_")[0] + sub_part = base.split("_")[1] if "_" in base else "" + name_label = f"{did_part}{suffix}_{sub_part}" if sub_part else f"{did_part}{suffix}" + else: + name_label = base + + return { + "name": f"{name_label} ({base_label})", + "definition": definition_core + suffix_clause, + "aggregation": agg_base + agg_suffix, + "headline_attribute": "overall_att", + "reference": reference, + } + + if name == "SyntheticDiDResults": + return { + "name": "synthetic-DiD ATT (time- and unit-weighted synthetic-control comparison)", + "definition": ( + "The treatment-effect contrast ``tau_hat^{sdid} = sum_t " + "lambda_t (Y_bar_{tr, t} - sum_j omega_j Y_{j, t})`` between " + "the treated group and a synthetic-control composed of " + "unit-weighted donors (``omega_j``) aggregated with time-" + "weighted pre-period matching weights (``lambda_t``). " + "Identification is through the weighted parallel-trends " + "analogue rather than unconditional PT." + ), + "aggregation": "synthetic", + "headline_attribute": "att", + "reference": "Arkhangelsky et al. (2021); REGISTRY.md Sec. SyntheticDiD", + } + + if name == "BaconDecompositionResults": + # BaconDecompositionResults is a diagnostic, not an estimator: + # BR refuses it (raises TypeError), but DR accepts it as a + # read-out and the schema needs a target-parameter block. + # The "target parameter" of a Bacon decomposition is the TWFE + # coefficient it is decomposing — named here for schema + # completeness. + return { + "name": "TWFE ATT being decomposed", + "definition": ( + "Goodman-Bacon decomposition is a diagnostic of the TWFE " + "coefficient ``twfe_estimate``: it partitions TWFE weights " + "across treated-vs-never, earlier-vs-later, and " + "later-vs-earlier (forbidden) 2x2 comparisons. The scalar " + "``twfe_estimate`` is the weighted sum of these 2x2 DiDs." + ), + "aggregation": "twfe", + "headline_attribute": "twfe_estimate", + "reference": "Goodman-Bacon (2021); REGISTRY.md Sec. BaconDecomposition", + } + + if name == "TROPResults": + return { + "name": "TROP ATT (factor-model-adjusted weighted average)", + "definition": ( + "A weighted average of per-(unit, time) treatment effects " + "``tau_hat_{it}(lambda_hat)`` estimated under low-rank factor-" + "model identification. Unobserved heterogeneity is captured " + "through latent factor loadings rather than a parallel-trends " + "assumption; the aggregate averages across treated cells with " + "weights ``W_{it}``." + ), + "aggregation": "factor_model", + "headline_attribute": "att", + "reference": "REGISTRY.md Sec. TROP", + } + + # Default: unrecognized result class. Fall through with a neutral + # block — agents / downstream consumers can still dispatch on + # ``aggregation="unknown"`` and fall back to generic ATT narration. + return { + "name": "ATT (unrecognized result class)", + "definition": ( + f"The target parameter for ``{name}`` is not specifically " + "documented in BR/DR. Defaulting to generic ATT narration; see " + "the estimator's own documentation for the canonical estimand." + ), + "aggregation": "unknown", + "headline_attribute": "att", + "reference": "REGISTRY.md", + } diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 36f19780..fb169820 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -42,9 +42,10 @@ import numpy as np +from diff_diff._reporting_helpers import describe_target_parameter from diff_diff.diagnostic_report import DiagnosticReport, DiagnosticReportResults -BUSINESS_REPORT_SCHEMA_VERSION = "1.0" +BUSINESS_REPORT_SCHEMA_VERSION = "2.0" __all__ = [ "BusinessReport", @@ -432,7 +433,67 @@ def _build_schema(self) -> Dict[str, Any]: diagnostics_results.schema if diagnostics_results is not None else None ) - headline = self._extract_headline(dr_schema) + # PR #347 R4 P1: compute target_parameter BEFORE extracting + # the headline so the no-scalar-by-design case + # (``aggregation == "no_scalar_headline"``, e.g., dCDH + # ``trends_linear=True`` with ``L_max >= 2``) can route the + # headline through a dedicated branch that names the intentional + # NaN rather than an estimation-failure path. + target_parameter = describe_target_parameter(self._results) + if target_parameter.get("aggregation") == "no_scalar_headline": + # PR #347 R12 P1: the no-scalar ``reason`` must distinguish + # the populated-surface case (per-horizon table exists) from + # the empty-surface subcase (``linear_trends_effects=None`` + # — no horizons survived estimation). Telling a user with + # an empty surface to "see linear_trends_effects" is + # dead-end guidance. + _surface_empty = getattr(self._results, "linear_trends_effects", None) is None + # PR #347 R14 P1: the empty-surface reason must use the + # covariate-adjusted label when covariates are active. + _has_controls = getattr(self._results, "covariate_residuals", None) is not None + _empty_surface_label = "DID^{X,fd}_l" if _has_controls else "DID^{fd}_l" + if _surface_empty: + no_scalar_reason = ( + "The fitted estimator intentionally does not produce a " + "scalar overall ATT on this configuration " + "(``trends_linear=True`` with ``L_max >= 2``), and on " + f"this fit no cumulated level effects ``{_empty_surface_label}`` " + "survived estimation — the per-horizon surface is " + "empty. Re-fit with a larger ``L_max`` or with " + "``trends_linear=False`` if you need a reportable " + "estimand." + ) + else: + no_scalar_reason = ( + "The fitted estimator intentionally does not produce a " + "scalar overall ATT on this configuration " + "(``trends_linear=True`` with ``L_max >= 2``). Per-horizon " + "cumulated level effects are on " + "``results.linear_trends_effects[l]``." + ) + headline = { + "status": "no_scalar_by_design", + "effect": None, + "se": None, + "ci_lower": None, + "ci_upper": None, + "alpha_was_honored": True, + "alpha_override_caveat": None, + "ci_level": int(round((1.0 - self._context.alpha) * 100)), + "p_value": None, + "is_significant": False, + "near_significance_threshold": False, + "unit": self._context.outcome_unit, + "unit_kind": _UNIT_KINDS.get( + self._context.outcome_unit.lower() if self._context.outcome_unit else "", + "unknown", + ), + "sign": "none", + "breakdown_M": None, + "reason": no_scalar_reason, + } + else: + headline = self._extract_headline(dr_schema) sample = self._extract_sample() heterogeneity = _lift_heterogeneity(dr_schema) pre_trends = _lift_pre_trends(dr_schema) @@ -475,6 +536,7 @@ def _build_schema(self) -> Dict[str, Any]: "alpha": self._context.alpha, }, "headline": headline, + "target_parameter": target_parameter, "assumption": assumption, "pre_trends": pre_trends, "sensitivity": sensitivity, @@ -1167,9 +1229,18 @@ def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, has_controls = ( results is not None and getattr(results, "covariate_residuals", None) is not None ) - has_trends = ( - results is not None and getattr(results, "linear_trends_effects", None) is not None - ) + # PR #347 R10 P1: read the persisted ``trends_linear`` flag + # first — empty-horizon trends-linear fits set + # ``linear_trends_effects=None`` but are still trends-linear + # per the estimator contract. Legacy fit objects predating + # the persisted field fall back to the presence inference. + _trends_persisted = getattr(results, "trends_linear", None) if results is not None else None + if isinstance(_trends_persisted, bool): + has_trends = _trends_persisted + else: + has_trends = ( + results is not None and getattr(results, "linear_trends_effects", None) is not None + ) has_heterogeneity = ( results is not None and getattr(results, "heterogeneity_effects", None) is not None ) @@ -1928,6 +1999,31 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str: """ ctx = schema.get("context", {}) h = schema.get("headline", {}) + # PR #347 R4 P1: the dCDH ``trends_linear=True`` + ``L_max>=2`` + # configuration does not produce a scalar headline by design — + # ``overall_att`` is intentionally NaN (per + # ``chaisemartin_dhaultfoeuille.py:2828-2834``). Render explicit + # "no scalar headline by design" prose instead of routing through + # the non-finite / estimation-failure path. + if h.get("status") == "no_scalar_by_design": + # PR #347 R13 P1: the headline-level ``reason`` field is the + # single source for the no-scalar prose and is already + # branched on populated-vs-empty surface in ``_build_schema``. + # Use it verbatim so the headline sentence never drifts from + # the schema-level message on the empty-surface subcase. + treatment = ctx.get("treatment_label", "the treatment") + outcome_label = ctx.get("outcome_label", "the outcome") + treatment_sentence = _sentence_first_upper(treatment) + reason = h.get("reason") + if isinstance(reason, str) and reason: + return ( + f"{treatment_sentence} does not produce a scalar aggregate " + f"effect on {outcome_label} under this configuration. " + reason + ) + return ( + f"{treatment_sentence} does not produce a scalar aggregate effect " + f"on {outcome_label} under this configuration (by design)." + ) effect = h.get("effect") outcome = ctx.get("outcome_label", "the outcome") treatment = ctx.get("treatment_label", "the treatment") @@ -1993,6 +2089,17 @@ def _render_summary(schema: Dict[str, Any]) -> str: # Headline sentence with significance phrase. sentences.append(_render_headline_sentence(schema)) + # BR/DR gap #6 (target-parameter clarity): name what the headline + # scalar actually represents so the stakeholder can map the number + # to a specific estimand. Rendered immediately after the headline + # and before the significance phrase. The summary surfaces only + # the short ``name`` so the paragraph stays within the + # 6-10-sentence target; ``definition`` lives in the full report + # and in the structured schema for agents that want the long form. + tp = schema.get("target_parameter", {}) or {} + tp_name = tp.get("name") + if tp_name: + sentences.append(f"Target parameter: {tp_name}.") h = schema.get("headline", {}) p = h.get("p_value") alpha = ctx.get("alpha", 0.05) @@ -2314,6 +2421,21 @@ def _render_full_report(schema: Dict[str, Any]) -> str: lines.append(f"Statistically, {_significance_phrase(p, alpha)}.") lines.append("") + # Target parameter (BR/DR gap #6): name what the headline scalar + # represents so the stakeholder can map the number to a specific + # estimand. Rendered between "Headline" and "Identifying Assumption" + # because the target parameter is about what the scalar IS, whereas + # identifying assumption is about what makes it valid. + tp = schema.get("target_parameter", {}) or {} + if tp.get("name") or tp.get("definition"): + lines.append("## Target Parameter") + lines.append("") + if tp.get("name"): + lines.append(f"- **{tp['name']}**") + if tp.get("definition"): + lines.append(f"- {tp['definition']}") + lines.append("") + # Identifying assumption lines.append("## Identifying Assumption") lines.append("") diff --git a/diff_diff/chaisemartin_dhaultfoeuille.py b/diff_diff/chaisemartin_dhaultfoeuille.py index 1416d5ed..fb6695f8 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille.py +++ b/diff_diff/chaisemartin_dhaultfoeuille.py @@ -260,9 +260,7 @@ def _validate_and_aggregate_to_cells( non_constant_mask = cell["d_min"] != cell["d_max"] if non_constant_mask.any(): n_non_constant = int(non_constant_mask.sum()) - example_cells = cell.loc[ - non_constant_mask, [group, time, "d_gt", "d_min", "d_max"] - ].head(5) + example_cells = cell.loc[non_constant_mask, [group, time, "d_gt", "d_min", "d_max"]].head(5) raise ValueError( f"Within-cell-varying treatment detected in {n_non_constant} " f"(group, time) cell(s). dCDH requires treatment to be " @@ -727,8 +725,8 @@ def fit( # ------------------------------------------------------------------ from diff_diff.survey import _resolve_survey_for_fit - resolved_survey, survey_weights, _, survey_metadata = ( - _resolve_survey_for_fit(survey_design, data, "analytical") + resolved_survey, survey_weights, _, survey_metadata = _resolve_survey_for_fit( + survey_design, data, "analytical" ) # dCDH contract: the group is the effective sampling unit for @@ -762,11 +760,12 @@ def fit( # fine — let the auto-inject proceed. Only the # ``nest=False`` + varying-strata + omitted-psu triple # warrants an up-front targeted error. - if resolved_survey.strata is not None and not getattr( - survey_design, "nest", False - ): + if resolved_survey.strata is not None and not getattr(survey_design, "nest", False): _strata_varies_pre, _ = _strata_psu_vary_within_group( - resolved_survey, data, group, survey_weights, + resolved_survey, + data, + group, + survey_weights, ) if _strata_varies_pre: raise ValueError( @@ -821,8 +820,8 @@ def fit( nest=getattr(survey_design, "nest", False), lonely_psu=getattr(survey_design, "lonely_psu", "remove"), ) - resolved_survey, survey_weights, _, survey_metadata = ( - _resolve_survey_for_fit(eff_design, synth_data, "analytical") + resolved_survey, survey_weights, _, survey_metadata = _resolve_survey_for_fit( + eff_design, synth_data, "analytical" ) if resolved_survey is not None: @@ -850,7 +849,11 @@ def fit( # group-constant regimes through the legacy group-level # path for bit-identity with prior releases). _validate_cell_constant_strata_psu( - resolved_survey, data, group, time, survey_weights, + resolved_survey, + data, + group, + time, + survey_weights, ) # Design-2 precondition: requires drop_larger_lower=False @@ -980,9 +983,7 @@ def fit( g_agg[c] = g_agg[f"_wx_{c}"] / w_safe x_cell_agg = g_agg[[group, time] + controls] else: - x_cell_agg = x_agg_input.groupby( - [group, time], as_index=False - )[controls].mean() + x_cell_agg = x_agg_input.groupby([group, time], as_index=False)[controls].mean() cell = cell.merge(x_cell_agg, on=[group, time], how="left") # ------------------------------------------------------------------ @@ -1305,9 +1306,9 @@ def fit( # Pivot covariates to (n_groups, n_periods, n_covariates) X_pivots = [] for c in controls: - x_piv = cell.pivot( - index=group, columns=time, values=c - ).reindex(index=all_groups, columns=all_periods) + x_piv = cell.pivot(index=group, columns=time, values=c).reindex( + index=all_groups, columns=all_periods + ) X_pivots.append(x_piv.to_numpy()) X_cell = np.stack(X_pivots, axis=2) @@ -1377,9 +1378,7 @@ def fit( ) _switch_metadata_computed = True # Count and warn about excluded groups (F_g < 3 -> f_g < 2) - n_excluded_fd = int( - ((first_switch_idx_arr >= 0) & (first_switch_idx_arr < 2)).sum() - ) + n_excluded_fd = int(((first_switch_idx_arr >= 0) & (first_switch_idx_arr < 2)).sum()) if n_excluded_fd > 0: warnings.warn( f"DID^{{fd}} (trends_linear=True): {n_excluded_fd} " @@ -1455,9 +1454,7 @@ def fit( ) # Extract set membership per group aligned with all_groups set_map = data_tnp.groupby(group)[set_col].first() - set_ids_arr = np.array( - [set_map.loc[g] for g in all_groups], dtype=object - ) + set_ids_arr = np.array([set_map.loc[g] for g in all_groups], dtype=object) # ------------------------------------------------------------------ # Step 8-9: Switching-cell counts and per-period DIDs (Theorem 3) @@ -1483,7 +1480,11 @@ def fit( # contract (Web Appendix Section 1.2). # Use raw outcomes for per-period DID when controls or # trends_linear is active (both transform Y_mat). - Y_mat=Y_mat_raw if controls is not None else (y_pivot.to_numpy() if _is_trends_linear else Y_mat), + Y_mat=( + Y_mat_raw + if controls is not None + else (y_pivot.to_numpy() if _is_trends_linear else Y_mat) + ), N_mat=N_mat_orig, periods=all_periods, ) @@ -1524,9 +1525,7 @@ def fit( "differs from the previous period." ) if N_S > 0: - overall_att = float( - (n_10_t_arr @ did_plus_t_arr + n_01_t_arr @ did_minus_t_arr) / N_S - ) + overall_att = float((n_10_t_arr @ did_plus_t_arr + n_01_t_arr @ did_minus_t_arr) / N_S) else: # Non-binary treatment with L_max: per-period DID is not # applicable. The multi-horizon path will provide overall_att @@ -1753,7 +1752,9 @@ def fit( else: U_centered_pp_l = None N_l_h = multi_horizon_dids[l_h]["N_l"] - _elig_groups_l = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g]] + _elig_groups_l = [ + all_groups[g] for g in range(len(all_groups)) if eligible_mask_var[g] + ] se_l, n_valid_l = _compute_se( U_centered=U_centered_l, divisor=N_l_h, @@ -1767,7 +1768,9 @@ def fit( did_l_val = multi_horizon_dids[l_h]["did_l"] _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) - t_l, p_l, ci_l = safe_inference(did_l_val, se_l, alpha=self.alpha, df=_inference_df(_df_s, resolved_survey)) + t_l, p_l, ci_l = safe_inference( + did_l_val, se_l, alpha=self.alpha, df=_inference_df(_df_s, resolved_survey) + ) multi_horizon_inference[l_h] = { "effect": did_l_val, "se": se_l, @@ -1895,7 +1898,9 @@ def fit( _pl_pp_cache[lag_l] = U_centered_pp_pl_l else: U_centered_pp_pl_l = None - _elig_groups_pl = [all_groups[g] for g in range(len(all_groups)) if eligible_mask_pl[g]] + _elig_groups_pl = [ + all_groups[g] for g in range(len(all_groups)) if eligible_mask_pl[g] + ] se_pl_l, n_valid_pl_l = _compute_se( U_centered=U_centered_pl_l, divisor=pl_data["N_pl_l"], @@ -1909,7 +1914,9 @@ def fit( pl_val = pl_data["placebo_l"] _df_s = _effective_df_survey(resolved_survey, _replicate_n_valid_list) t_pl_l, p_pl_l, ci_pl_l = safe_inference( - pl_val, se_pl_l, alpha=self.alpha, + pl_val, + se_pl_l, + alpha=self.alpha, df=_inference_df(_df_s, resolved_survey), ) placebo_horizon_inference[lag_l] = { @@ -1936,13 +1943,13 @@ def fit( # Cost-benefit delta (only meaningful when L_max >= 2) if L_max >= 2: cost_benefit_result = _compute_cost_benefit_delta( - multi_horizon_dids=multi_horizon_dids, - D_mat=D_mat, - baselines=baselines, - first_switch_idx=first_switch_idx_arr, - switch_direction=switch_direction_arr, - L_max=L_max, - ) + multi_horizon_dids=multi_horizon_dids, + D_mat=D_mat, + baselines=baselines, + first_switch_idx=first_switch_idx_arr, + switch_direction=switch_direction_arr, + L_max=L_max, + ) if cost_benefit_result.get("has_leavers", False): warnings.warn( "Assumption 7 (D_{g,t} >= D_{g,1}) is violated: leavers " @@ -1973,7 +1980,11 @@ def fit( D_mat=D_mat, # Phase 1 IF uses per-period structure: use raw outcomes # when controls or trends_linear transform Y_mat. - Y_mat=Y_mat_raw if controls is not None else (y_pivot.to_numpy() if _is_trends_linear else Y_mat), + Y_mat=( + Y_mat_raw + if controls is not None + else (y_pivot.to_numpy() if _is_trends_linear else Y_mat) + ), N_mat=N_mat_orig, n_10_t_arr=n_10_t_arr, n_00_t_arr=n_00_t_arr, @@ -2022,7 +2033,9 @@ def fit( ) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) overall_t, overall_p, overall_ci = safe_inference( - overall_att, overall_se, alpha=self.alpha, + overall_att, + overall_se, + alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey), ) @@ -2039,7 +2052,9 @@ def fit( _replicate_n_valid_list.append(n_valid_joiners) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) joiners_t, joiners_p, joiners_ci = safe_inference( - joiners_att, joiners_se, alpha=self.alpha, + joiners_att, + joiners_se, + alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey), ) else: @@ -2063,7 +2078,9 @@ def fit( _replicate_n_valid_list.append(n_valid_leavers) _df_survey = _effective_df_survey(resolved_survey, _replicate_n_valid_list) leavers_t, leavers_p, leavers_ci = safe_inference( - leavers_att, leavers_se, alpha=self.alpha, + leavers_att, + leavers_se, + alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey), ) else: @@ -2148,9 +2165,7 @@ def fit( psu_varies_within_warn = False else: obs_gids_warn = np.asarray(_obs_survey_info["group_ids"]) - obs_ws_warn = np.asarray( - _obs_survey_info["weights"], dtype=np.float64 - ) + obs_ws_warn = np.asarray(_obs_survey_info["weights"], dtype=np.float64) pos_mask_warn = obs_ws_warn > 0 psu_codes_warn = np.asarray(psu_arr_warn) eligible_gid_set = set(_eligible_group_ids) @@ -2160,18 +2175,18 @@ def fit( ) if elig_obs_mask_warn.any(): elig_psu_labels_arr = psu_codes_warn[elig_obs_mask_warn] - n_psu_eff_warn = int( - len(np.unique(elig_psu_labels_arr)) - ) + n_psu_eff_warn = int(len(np.unique(elig_psu_labels_arr))) n_groups_eff_warn = len(_eligible_group_ids) # Detect within-group-varying PSU on the # eligible subset so we can suppress the # "strictly coarser PSU" warning there. psu_varies_within_warn = bool( - pd.DataFrame({ - "g": obs_gids_warn[elig_obs_mask_warn], - "p": elig_psu_labels_arr, - }) + pd.DataFrame( + { + "g": obs_gids_warn[elig_obs_mask_warn], + "p": elig_psu_labels_arr, + } + ) .groupby("g")["p"] .nunique() .gt(1) @@ -2180,10 +2195,7 @@ def fit( else: n_psu_eff_warn, n_groups_eff_warn = -1, -1 psu_varies_within_warn = False - if ( - 0 <= n_psu_eff_warn < n_groups_eff_warn - and not psu_varies_within_warn - ): + if 0 <= n_psu_eff_warn < n_groups_eff_warn and not psu_varies_within_warn: warnings.warn( f"Bootstrap with survey_design uses Hall-Mammen " f"wild multiplier weights at the PSU level " @@ -2201,11 +2213,13 @@ def fit( ) joiners_inputs = ( (U_centered_joiners, joiner_total, joiners_att, U_centered_pp_joiners) - if joiners_available else None + if joiners_available + else None ) leavers_inputs = ( (U_centered_leavers, leaver_total, leavers_att, U_centered_pp_leavers) - if leavers_available else None + if leavers_available + else None ) # Phase 1 placebo bootstrap: the Phase 1 per-period placebo # DID_M^pl still uses NaN SE (no IF derivation for the @@ -2345,13 +2359,9 @@ def fit( obs_psu_codes = np.asarray(resolved_survey.psu) obs_gids_boot = np.asarray(_obs_survey_info["group_ids"]) obs_tids_boot = np.asarray(_obs_survey_info["time_ids"]) - obs_weights_boot = np.asarray( - _obs_survey_info["weights"], dtype=np.float64 - ) + obs_weights_boot = np.asarray(_obs_survey_info["weights"], dtype=np.float64) pos_mask_boot = obs_weights_boot > 0 - gid_to_idx = { - gid: i for i, gid in enumerate(_eligible_group_ids) - } + gid_to_idx = {gid: i for i, gid in enumerate(_eligible_group_ids)} tid_to_idx = {t: i for i, t in enumerate(all_periods)} n_elig_boot = len(_eligible_group_ids) n_per_boot = len(all_periods) @@ -2372,18 +2382,19 @@ def fit( # is exact: no gaps from singleton-baseline-excluded # groups that would silently trigger the identity # fast path in `_generate_psu_or_group_weights`. - elig_obs_mask = ( - pos_mask_boot & (g_idx_arr >= 0) & (t_idx_arr >= 0) - ) + elig_obs_mask = pos_mask_boot & (g_idx_arr >= 0) & (t_idx_arr >= 0) elig_psu_labels = obs_psu_codes[elig_obs_mask] dense_per_row: Optional[np.ndarray] = None if elig_psu_labels.size > 0: _, elig_dense_codes = np.unique( - elig_psu_labels, return_inverse=True, + elig_psu_labels, + return_inverse=True, ) elig_dense_codes = np.asarray(elig_dense_codes, dtype=np.int64) dense_per_row = np.full( - len(obs_psu_codes), -1, dtype=np.int64, + len(obs_psu_codes), + -1, + dtype=np.int64, ) dense_per_row[elig_obs_mask] = elig_dense_codes @@ -2397,7 +2408,9 @@ def fit( # ignores sentinel entries row-wise. if dense_per_row is not None: psu_codes_per_cell = np.full( - (n_elig_boot, n_per_boot), -1, dtype=np.int64, + (n_elig_boot, n_per_boot), + -1, + dtype=np.int64, ) psu_codes_per_cell[ g_idx_arr[elig_obs_mask], @@ -2425,14 +2438,9 @@ def fit( else: group_psu_labels.append(int(valid[0])) group_id_to_psu_code_bootstrap = { - gid: code - for gid, code in zip( - _eligible_group_ids, group_psu_labels - ) + gid: code for gid, code in zip(_eligible_group_ids, group_psu_labels) } - eligible_group_ids_bootstrap = np.asarray( - _eligible_group_ids - ) + eligible_group_ids_bootstrap = np.asarray(_eligible_group_ids) br = self._compute_dcdh_bootstrap( n_groups_for_overall=n_groups_for_overall_var, @@ -2476,17 +2484,32 @@ def fit( overall_se = br.overall_se overall_p = br.overall_p_value if br.overall_p_value is not None else np.nan overall_ci = br.overall_ci if br.overall_ci is not None else (np.nan, np.nan) - overall_t = safe_inference(overall_att, overall_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] + overall_t = safe_inference( + overall_att, + overall_se, + alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), + )[0] if joiners_available and br.joiners_se is not None and np.isfinite(br.joiners_se): joiners_se = br.joiners_se joiners_p = br.joiners_p_value if br.joiners_p_value is not None else np.nan joiners_ci = br.joiners_ci if br.joiners_ci is not None else (np.nan, np.nan) - joiners_t = safe_inference(joiners_att, joiners_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] + joiners_t = safe_inference( + joiners_att, + joiners_se, + alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), + )[0] if leavers_available and br.leavers_se is not None and np.isfinite(br.leavers_se): leavers_se = br.leavers_se leavers_p = br.leavers_p_value if br.leavers_p_value is not None else np.nan leavers_ci = br.leavers_ci if br.leavers_ci is not None else (np.nan, np.nan) - leavers_t = safe_inference(leavers_att, leavers_se, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey))[0] + leavers_t = safe_inference( + leavers_att, + leavers_se, + alpha=self.alpha, + df=_inference_df(_df_survey, resolved_survey), + )[0] # ------------------------------------------------------------------ # Step 20: Build the results dataclass @@ -2561,11 +2584,7 @@ def fit( # When L_max >= 1 and the per-group path is active, sync # overall_* from event_study_effects[1] AFTER bootstrap propagation # so that bootstrap SE/p/CI flow to the top-level surface. - if ( - L_max is not None - and L_max >= 1 - and 1 in event_study_effects - ): + if L_max is not None and L_max >= 1 and 1 in event_study_effects: es1 = event_study_effects[1] overall_att = es1["effect"] overall_se = es1["se"] @@ -2648,7 +2667,9 @@ def fit( if np.isfinite(delta_se): effective_overall_se = delta_se effective_overall_t, effective_overall_p, effective_overall_ci = safe_inference( - delta_val, delta_se, alpha=self.alpha, + delta_val, + delta_se, + alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey), ) else: @@ -2666,10 +2687,7 @@ def fit( for lag_l, pl_data in multi_horizon_placebos.items(): if pl_data["N_pl_l"] > 0: # Pull analytical SE from placebo IF computation - if ( - placebo_horizon_inference is not None - and lag_l in placebo_horizon_inference - ): + if placebo_horizon_inference is not None and lag_l in placebo_horizon_inference: inf = placebo_horizon_inference[lag_l] placebo_event_study_dict[-lag_l] = { "effect": inf["effect"], @@ -2683,7 +2701,9 @@ def fit( # Fallback: NaN SE (Phase 1 path or missing IF) pl_se = float("nan") pl_t, pl_p, pl_ci = safe_inference( - pl_data["placebo_l"], pl_se, alpha=self.alpha, + pl_data["placebo_l"], + pl_se, + alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey), ) placebo_event_study_dict[-lag_l] = { @@ -2735,7 +2755,9 @@ def fit( bs_ci if bs_ci is not None else (np.nan, np.nan) ) placebo_event_study_dict[neg_key]["t_stat"] = safe_inference( - eff, bs_se, alpha=self.alpha, + eff, + bs_se, + alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey), )[0] @@ -2749,7 +2771,9 @@ def fit( # SE via delta method: SE(DID^n_l) = SE(DID_l) / delta^D_l se_did_l = multi_horizon_se.get(l_h, float("nan")) se_norm = se_did_l / denom if np.isfinite(denom) and denom > 0 else float("nan") - t_n, p_n, ci_n = safe_inference(eff, se_norm, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey)) + t_n, p_n, ci_n = safe_inference( + eff, se_norm, alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey) + ) normalized_effects_out[l_h] = { "effect": eff, "se": se_norm, @@ -2778,8 +2802,8 @@ def fit( if l_h not in multi_horizon_dids: continue mh = multi_horizon_dids[l_h] - did_g_l = mh["did_g_l"] # (n_groups,) per-group DID - eligible = mh["eligible_mask"] # (n_groups,) bool + did_g_l = mh["did_g_l"] # (n_groups,) per-group DID + eligible = mh["eligible_mask"] # (n_groups,) bool N_l = mh["N_l"] if N_l == 0: continue @@ -2790,9 +2814,7 @@ def fit( # Average the cumulated sum over groups eligible at THIS horizon # Weight by S_g (switch direction) and divide by N_l S_arr = switch_direction_arr.astype(float) - cum_effect = float( - np.sum(S_arr[eligible] * running_per_group[eligible]) / N_l - ) + cum_effect = float(np.sum(S_arr[eligible] * running_per_group[eligible]) / N_l) # SE: conservative upper bound (sum of per-horizon SEs). # NaN-consistency: if ANY component SE up to horizon l is # non-finite, the cumulated SE is NaN (not 0.0). @@ -2808,7 +2830,9 @@ def fit( else: running_se_ub = float("nan") cum_t, cum_p, cum_ci = safe_inference( - cum_effect, running_se_ub, alpha=self.alpha, + cum_effect, + running_se_ub, + alpha=self.alpha, df=_inference_df(_df_survey, resolved_survey), ) cumulated[l_h] = { @@ -2845,9 +2869,7 @@ def fit( ) het_col = str(heterogeneity) if het_col not in data.columns: - raise ValueError( - f"heterogeneity column {het_col!r} not found in data." - ) + raise ValueError(f"heterogeneity column {het_col!r} not found in data.") # R's predict_het disallows controls; our partial implementation # follows this restriction to avoid inconsistent behavior. if controls is not None: @@ -2888,9 +2910,7 @@ def fit( f"{len(het_varying)} group(s) have varying values." ) het_map = data_het.groupby(group)[het_col].first() - X_het = np.array( - [float(het_map.loc[g]) for g in all_groups] - ) + X_het = np.array([float(het_map.loc[g]) for g in all_groups]) # Use original Y_mat (not first-differenced) for heterogeneity # test, since it operates on level differences Y[out] - Y[ref]. # When trends_linear, the DID^{fd} second-differences are in @@ -2943,9 +2963,9 @@ def fit( if not is_binary: # For non-binary: count all observations where treatment # differs from baseline - effective_n_treated = int( - N_mat[D_mat != D_mat[:, 0:1]].sum() - ) if D_mat.shape[1] > 1 else 0 + effective_n_treated = ( + int(N_mat[D_mat != D_mat[:, 0:1]].sum()) if D_mat.shape[1] > 1 else 0 + ) if not is_binary: # Suppress binary-only Phase 1 artifacts on non-binary # panels: per_period_effects and single-period placebo @@ -2973,9 +2993,7 @@ def fit( # anti-conservative relative to `survey_metadata.df_survey` # and HonestDiD. Re-run safe_inference with the FINAL # effective df so every surface agrees. - _final_eff_df = _effective_df_survey( - resolved_survey, _replicate_n_valid_list - ) + _final_eff_df = _effective_df_survey(resolved_survey, _replicate_n_valid_list) if _replicate_n_valid_list: _final_inf_df = _inference_df(_final_eff_df, resolved_survey) # Recompute `effective_overall_*` directly — that's what @@ -2984,34 +3002,42 @@ def fit( # (cost-benefit) path at L_max >= 2; recomputing from # `effective_*` ensures both paths get the final df. if np.isfinite(effective_overall_se): - effective_overall_t, effective_overall_p, effective_overall_ci = ( - safe_inference( - effective_overall_att, effective_overall_se, - alpha=self.alpha, df=_final_inf_df, - ) + effective_overall_t, effective_overall_p, effective_overall_ci = safe_inference( + effective_overall_att, + effective_overall_se, + alpha=self.alpha, + df=_final_inf_df, ) # Keep `overall_*` in sync for any downstream code that # reads them directly (e.g., placebo_event_study_dict # construction flows from overall_*). overall_t, overall_p, overall_ci = safe_inference( - overall_att, overall_se, - alpha=self.alpha, df=_final_inf_df, + overall_att, + overall_se, + alpha=self.alpha, + df=_final_inf_df, ) if joiners_available: joiners_t, joiners_p, joiners_ci = safe_inference( - joiners_att, joiners_se, - alpha=self.alpha, df=_final_inf_df, + joiners_att, + joiners_se, + alpha=self.alpha, + df=_final_inf_df, ) if leavers_available: leavers_t, leavers_p, leavers_ci = safe_inference( - leavers_att, leavers_se, - alpha=self.alpha, df=_final_inf_df, + leavers_att, + leavers_se, + alpha=self.alpha, + df=_final_inf_df, ) if multi_horizon_inference is not None: for _lag_r2, _info_r2 in list(multi_horizon_inference.items()): _t_r2, _p_r2, _ci_r2 = safe_inference( - _info_r2["effect"], _info_r2["se"], - alpha=self.alpha, df=_final_inf_df, + _info_r2["effect"], + _info_r2["se"], + alpha=self.alpha, + df=_final_inf_df, ) _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 @@ -3019,8 +3045,10 @@ def fit( if placebo_horizon_inference is not None: for _lag_r2, _info_r2 in list(placebo_horizon_inference.items()): _t_r2, _p_r2, _ci_r2 = safe_inference( - _info_r2["effect"], _info_r2["se"], - alpha=self.alpha, df=_final_inf_df, + _info_r2["effect"], + _info_r2["se"], + alpha=self.alpha, + df=_final_inf_df, ) _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 @@ -3042,8 +3070,10 @@ def fit( for _lag_r2, _info_r2 in list(heterogeneity_effects.items()): if np.isfinite(_info_r2["se"]): _t_r2, _p_r2, _ci_r2 = safe_inference( - _info_r2["beta"], _info_r2["se"], - alpha=self.alpha, df=_final_inf_df, + _info_r2["beta"], + _info_r2["se"], + alpha=self.alpha, + df=_final_inf_df, ) _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 @@ -3055,8 +3085,10 @@ def fit( if normalized_effects_out is not None: for _lag_r2, _info_r2 in list(normalized_effects_out.items()): _t_r2, _p_r2, _ci_r2 = safe_inference( - _info_r2["effect"], _info_r2["se"], - alpha=self.alpha, df=_final_inf_df, + _info_r2["effect"], + _info_r2["se"], + alpha=self.alpha, + df=_final_inf_df, ) _info_r2["t_stat"] = _t_r2 _info_r2["p_value"] = _p_r2 @@ -3082,7 +3114,9 @@ def fit( joiners_se=joiners_se if effective_joiners_available else float("nan"), joiners_t_stat=joiners_t if effective_joiners_available else float("nan"), joiners_p_value=joiners_p if effective_joiners_available else float("nan"), - joiners_conf_int=joiners_ci if effective_joiners_available else (float("nan"), float("nan")), + joiners_conf_int=( + joiners_ci if effective_joiners_available else (float("nan"), float("nan")) + ), n_joiner_cells=n_joiner_cells if effective_joiners_available else 0, n_joiner_obs=n_joiner_obs if effective_joiners_available else 0, joiners_available=effective_joiners_available, @@ -3090,7 +3124,9 @@ def fit( leavers_se=leavers_se if effective_leavers_available else float("nan"), leavers_t_stat=leavers_t if effective_leavers_available else float("nan"), leavers_p_value=leavers_p if effective_leavers_available else float("nan"), - leavers_conf_int=leavers_ci if effective_leavers_available else (float("nan"), float("nan")), + leavers_conf_int=( + leavers_ci if effective_leavers_available else (float("nan"), float("nan")) + ), n_leaver_cells=n_leaver_cells if effective_leavers_available else 0, n_leaver_obs=n_leaver_obs if effective_leavers_available else 0, leavers_available=effective_leavers_available, @@ -3137,6 +3173,7 @@ def fit( else None ), linear_trends_effects=linear_trends_effects, + trends_linear=_is_trends_linear, heterogeneity_effects=heterogeneity_effects, design2_effects=( _compute_design2_effects( @@ -3166,7 +3203,9 @@ def fit( from diff_diff.honest_did import compute_honest_did results.honest_did_results = compute_honest_did( - results, method="relative_magnitude", M=1.0, + results, + method="relative_magnitude", + M=1.0, alpha=self.alpha, ) except (ValueError, np.linalg.LinAlgError) as exc: @@ -3699,7 +3738,7 @@ def _compute_covariate_residualization( # Extract covariate coefficients (indices 1..n_covariates; # index 0 is the intercept) - theta_hat = coefs[1:1 + n_covariates] + theta_hat = coefs[1 : 1 + n_covariates] # R-squared of first-stage regression ss_res = float(np.sum(residuals**2)) @@ -3887,9 +3926,7 @@ def _compute_heterogeneity_test( # Survey setup (once, before horizon loop). When inactive, df_s=None and # the existing plain-OLS path runs unchanged. - use_survey = ( - obs_survey_info is not None and group_ids_order is not None - ) + use_survey = obs_survey_info is not None and group_ids_order is not None if use_survey: from diff_diff.survey import ( compute_replicate_if_variance, @@ -3908,9 +3945,7 @@ def _compute_heterogeneity_test( # surfaces (R2 P1a). `list(... or [])` avoids accidental # mutation of the caller's shared tracker at this site; the # explicit append happens inside the horizon loop below. - df_s = _effective_df_survey( - resolved, list(replicate_n_valid_list or []) - ) + df_s = _effective_df_survey(resolved, list(replicate_n_valid_list or [])) # Contract: only obs whose group is in the canonical post-filter # list contribute. Groups dropped upstream (Step 5b interior gaps, # Step 6 multi-switch) appear in obs_gids_raw but must be @@ -3956,15 +3991,15 @@ def _compute_heterogeneity_test( eligible.append(g) dep_var.append(S_g * y_diff) x_vals.append(X_het[g]) - cohort_keys.append( - (float(baselines[g]), int(f_g), int(switch_direction[g])) - ) + cohort_keys.append((float(baselines[g]), int(f_g), int(switch_direction[g]))) n_obs = len(eligible) if n_obs < 3: results[l_h] = { - "beta": float("nan"), "se": float("nan"), - "t_stat": float("nan"), "p_value": float("nan"), + "beta": float("nan"), + "se": float("nan"), + "t_stat": float("nan"), + "p_value": float("nan"), "conf_int": (float("nan"), float("nan")), "n_obs": n_obs, } @@ -3995,8 +4030,10 @@ def _compute_heterogeneity_test( n_params = design.shape[1] if n_obs <= n_params: results[l_h] = { - "beta": float("nan"), "se": float("nan"), - "t_stat": float("nan"), "p_value": float("nan"), + "beta": float("nan"), + "se": float("nan"), + "t_stat": float("nan"), + "p_value": float("nan"), "conf_int": (float("nan"), float("nan")), "n_obs": n_obs, } @@ -4005,7 +4042,8 @@ def _compute_heterogeneity_test( if not use_survey: # Plain OLS path (unchanged): standard inference per Lemma 7. coefs, _residuals, vcov = solve_ols( - design, dep_arr, + design, + dep_arr, return_vcov=True, rank_deficient_action=rank_deficient_action, ) @@ -4021,8 +4059,10 @@ def _compute_heterogeneity_test( # weight_type='pweight' (linalg.py). Skip vcov — we compute # design-based variance ourselves below. coefs, _residuals, _vcov_ignored = solve_ols( - design, dep_arr, - weights=W_elig, weight_type="pweight", + design, + dep_arr, + weights=W_elig, + weight_type="pweight", return_vcov=False, rank_deficient_action=rank_deficient_action, ) @@ -4031,8 +4071,10 @@ def _compute_heterogeneity_test( # IF would describe different estimands. if not np.all(np.isfinite(coefs)): results[l_h] = { - "beta": float("nan"), "se": float("nan"), - "t_stat": float("nan"), "p_value": float("nan"), + "beta": float("nan"), + "se": float("nan"), + "t_stat": float("nan"), + "p_value": float("nan"), "conf_int": (float("nan"), float("nan")), "n_obs": n_obs, } @@ -4090,12 +4132,8 @@ def _compute_heterogeneity_test( mask_g = (obs_gids_raw == gid) & valid w_sum_g = obs_w_raw[mask_g].sum() if w_sum_g > 0: - psi_obs[mask_g] = psi_g[e_idx] * ( - obs_w_raw[mask_g] / w_sum_g - ) - var_s, n_valid_het = compute_replicate_if_variance( - psi_obs, resolved - ) + psi_obs[mask_g] = psi_g[e_idx] * (obs_w_raw[mask_g] / w_sum_g) + var_s, n_valid_het = compute_replicate_if_variance(psi_obs, resolved) if replicate_n_valid_list is not None: replicate_n_valid_list.append(n_valid_het) # Reduce df_s to reflect this horizon's n_valid. @@ -4118,25 +4156,17 @@ def _compute_heterogeneity_test( gid = gid_list[g_idx] out_idx = first_switch_idx[g_idx] - 1 + l_h t_val_out = periods_arr[out_idx] - mask_cell = ( - (obs_gids_raw == gid) - & (obs_tids == t_val_out) - & valid - ) + mask_cell = (obs_gids_raw == gid) & (obs_tids == t_val_out) & valid w_cell = obs_w_raw[mask_cell].sum() if w_cell > 0: - psi_obs[mask_cell] = psi_g[e_idx] * ( - obs_w_raw[mask_cell] / w_cell - ) + psi_obs[mask_cell] = psi_g[e_idx] * (obs_w_raw[mask_cell] / w_cell) var_s = compute_survey_if_variance(psi_obs, resolved) df_s_local = df_s - se_het = ( - float(np.sqrt(var_s)) - if np.isfinite(var_s) and var_s > 0 - else float("nan") - ) + se_het = float(np.sqrt(var_s)) if np.isfinite(var_s) and var_s > 0 else float("nan") t_stat, p_val, ci = safe_inference( - beta_het, se_het, alpha=alpha, + beta_het, + se_het, + alpha=alpha, df=_inference_df(df_s_local, resolved), ) @@ -4593,9 +4623,7 @@ def _compute_per_group_if_multi_horizon( for l in range(1, L_max + 1): # noqa: E741 U_l = np.zeros(n_groups, dtype=float) U_per_period_l: Optional[np.ndarray] = ( - np.zeros((n_groups, n_periods), dtype=float) - if compute_per_period - else None + np.zeros((n_groups, n_periods), dtype=float) if compute_per_period else None ) for g in range(n_groups): @@ -4709,9 +4737,7 @@ def _compute_per_group_if_placebo_horizon( for l in range(1, L_max + 1): # noqa: E741 U_pl = np.zeros(n_groups, dtype=float) U_per_period_pl: Optional[np.ndarray] = ( - np.zeros((n_groups, n_periods), dtype=float) - if compute_per_period - else None + np.zeros((n_groups, n_periods), dtype=float) if compute_per_period else None ) for g in range(n_groups): @@ -5178,9 +5204,7 @@ def _compute_full_per_group_contributions( n_groups, n_periods = D_mat.shape U = np.zeros(n_groups, dtype=float) U_per_period: Optional[np.ndarray] = ( - np.zeros((n_groups, n_periods), dtype=float) - if compute_per_period - else None + np.zeros((n_groups, n_periods), dtype=float) if compute_per_period else None ) include_joiners_side = side in ("overall", "joiners") @@ -5318,13 +5342,13 @@ def _compute_cohort_recentered_inputs( singleton_baseline_groups: List[Any], compute_per_period: bool = True, ) -> Tuple[ - np.ndarray, # U_centered_overall (n_eligible,) - int, # n_groups_for_overall - int, # n_cohorts - int, # n_groups_dropped_never_switching - np.ndarray, # U_centered_joiners - np.ndarray, # U_centered_leavers - List[Any], # eligible_group_ids + np.ndarray, # U_centered_overall (n_eligible,) + int, # n_groups_for_overall + int, # n_cohorts + int, # n_groups_dropped_never_switching + np.ndarray, # U_centered_joiners + np.ndarray, # U_centered_leavers + List[Any], # eligible_group_ids Optional[np.ndarray], # U_centered_per_period_overall (n_eligible, n_periods) or None Optional[np.ndarray], # U_centered_per_period_joiners or None Optional[np.ndarray], # U_centered_per_period_leavers or None @@ -5775,11 +5799,7 @@ def _compute_se( # compute_survey_if_variance() expects estimator-scale psi. # Scale by 1/divisor to normalize before survey expansion. U_scaled = U_centered / divisor - U_pp_scaled = ( - U_centered_per_period / divisor - if U_centered_per_period is not None - else None - ) + U_pp_scaled = U_centered_per_period / divisor if U_centered_per_period is not None else None return _survey_se_from_group_if( U_centered=U_scaled, eligible_groups=eligible_groups, @@ -5854,7 +5874,7 @@ def _survey_se_from_group_if( # ATT, placebos, normalized/cumulated, heterogeneity). if U_centered.size == 0: return float("nan"), None - if float((U_centered ** 2).sum()) <= 0: + if float((U_centered**2).sum()) <= 0: return float("nan"), None group_ids = obs_survey_info["group_ids"] @@ -5910,24 +5930,26 @@ def _survey_se_from_group_if( # A replicate allocator contract is asserted by # `tests/test_survey_dcdh_replicate_psu.py::TestReplicateClassA:: # test_att_cell_allocator_with_varying_replicate_ratios`. - if use_cell_allocator and not getattr( - resolved, "uses_replicate_variance", False - ): + if use_cell_allocator and not getattr(resolved, "uses_replicate_variance", False): psu_arr = getattr(resolved, "psu", None) if psu_arr is None: use_cell_allocator = False else: psu_eff = np.asarray(psu_arr)[pos_mask] eligible_set = set(eligible_groups) - elig_row_mask = np.array( - [g in eligible_set for g in gids_eff], dtype=bool - ) + elig_row_mask = np.array([g in eligible_set for g in gids_eff], dtype=bool) if elig_row_mask.any(): psu_varies_within = bool( - pd.DataFrame({ - "g": gids_eff[elig_row_mask], - "p": psu_eff[elig_row_mask], - }).groupby("g")["p"].nunique().gt(1).any() + pd.DataFrame( + { + "g": gids_eff[elig_row_mask], + "p": psu_eff[elig_row_mask], + } + ) + .groupby("g")["p"] + .nunique() + .gt(1) + .any() ) if not psu_varies_within: use_cell_allocator = False @@ -5948,9 +5970,7 @@ def _survey_se_from_group_if( # (should be rare in practice — positive-weight rows almost # always survive cell aggregation; if one slips through we # treat it like an ineligible row rather than crash). - col_idx_eff = np.asarray( - pd.Index(periods).get_indexer(tids_eff), dtype=np.int64 - ) + col_idx_eff = np.asarray(pd.Index(periods).get_indexer(tids_eff), dtype=np.int64) valid_cell = (elig_idx_eff >= 0) & (col_idx_eff >= 0) n_eligible = len(eligible_groups) n_periods_pp = U_centered_per_period.shape[1] @@ -5974,17 +5994,13 @@ def _survey_se_from_group_if( missing_cell_mask = W_cell == 0 if missing_cell_mask.any(): leaked = U_centered_per_period[missing_cell_mask] - if leaked.size > 0 and bool( - np.any(np.abs(leaked) > 1e-12) - ): + if leaked.size > 0 and bool(np.any(np.abs(leaked) > 1e-12)): # Branch the wording on replicate-vs-TSL so replicate # ATT users debugging this error are not misdirected # to "within-group-varying PSU" (replicate designs # have no PSU structure). Core mechanics and the # pre-processing workaround are shared. - is_replicate = bool( - getattr(resolved, "uses_replicate_variance", False) - ) + is_replicate = bool(getattr(resolved, "uses_replicate_variance", False)) path_name = ( "Rao-Wu replicate-weight ATT variance" if is_replicate @@ -6022,9 +6038,7 @@ def _survey_se_from_group_if( elig_idx_eff[valid_cell], col_idx_eff[valid_cell] ] w_cell_at_row = np.zeros(w_eff.shape[0], dtype=np.float64) - w_cell_at_row[valid_cell] = W_cell[ - elig_idx_eff[valid_cell], col_idx_eff[valid_cell] - ] + w_cell_at_row[valid_cell] = W_cell[elig_idx_eff[valid_cell], col_idx_eff[valid_cell]] safe_w = np.where(w_cell_at_row > 0, w_cell_at_row, 1.0) psi[pos_mask] = u_obs_cell * (w_eff / safe_w) else: @@ -6196,9 +6210,7 @@ def _compute_twfe_diagnostic( weights_df = cell[[group_col, time_col]].copy() weights_df["weight"] = contribution_weights - fraction_negative = float( - (contribution_weights[treated_mask] < 0).sum() / treated_mask.sum() - ) + fraction_negative = float((contribution_weights[treated_mask] < 0).sum() / treated_mask.sum()) # Step 5: plain TWFE regression of y on (FE + d_gt) X_with_d = np.column_stack([X, d_arr.reshape(-1, 1)]) @@ -6367,9 +6379,7 @@ def twowayfeweights( if survey_design is not None: from diff_diff.survey import _resolve_survey_for_fit - resolved, survey_weights, _, _ = _resolve_survey_for_fit( - survey_design, data, "analytical" - ) + resolved, survey_weights, _, _ = _resolve_survey_for_fit(survey_design, data, "analytical") if resolved is not None and resolved.weight_type != "pweight": raise ValueError( f"twowayfeweights() survey support requires " diff --git a/diff_diff/chaisemartin_dhaultfoeuille_results.py b/diff_diff/chaisemartin_dhaultfoeuille_results.py index 801624fd..eb5cadbc 100644 --- a/diff_diff/chaisemartin_dhaultfoeuille_results.py +++ b/diff_diff/chaisemartin_dhaultfoeuille_results.py @@ -426,6 +426,16 @@ class ChaisemartinDHaultfoeuilleResults: sup_t_bands: Optional[Dict[str, Any]] = field(default=None, repr=False) covariate_residuals: Optional[pd.DataFrame] = field(default=None, repr=False) linear_trends_effects: Optional[Dict[int, Dict[str, Any]]] = field(default=None, repr=False) + # PR #347 R9 P1: persist the fit-time ``trends_linear`` flag + # explicitly. The previous approach of inferring the flag from + # ``linear_trends_effects is not None`` broke on the empty- + # horizon case — the estimator sets ``linear_trends_effects=None`` + # when the cumulated dict is empty but still unconditionally + # NaN-s ``overall_att`` for ``trends_linear=True`` with + # ``L_max >= 2``. BR/DR dispatch on this flag (via + # ``describe_target_parameter``) to route the no-scalar-by-design + # headline correctly even when the horizon surface is empty. + trends_linear: Optional[bool] = None heterogeneity_effects: Optional[Dict[int, Dict[str, Any]]] = field(default=None, repr=False) design2_effects: Optional[Dict[str, Any]] = field(default=None, repr=False) honest_did_results: Optional["HonestDiDResults"] = field(default=None, repr=False) @@ -439,10 +449,26 @@ class ChaisemartinDHaultfoeuilleResults: # Repr / properties # ------------------------------------------------------------------ + def _has_trends_linear(self) -> bool: + """Return whether ``trends_linear=True`` was active on this fit. + + PR #347 R9/R10: prefer the persisted fit-time flag. Fall back + to ``linear_trends_effects is not None`` for legacy result + objects that predate the persisted field. The inference + fallback is correct on populated-surface fits but fails on + empty-surface fits (``trends_linear=True``, ``L_max>=2`` with + no estimable horizons) because ``linear_trends_effects`` is + set to ``None`` in that case; the persisted flag handles it. + """ + persisted = self.trends_linear + if isinstance(persisted, bool): + return persisted + return self.linear_trends_effects is not None + def _horizon_label(self, h) -> str: """Return per-horizon estimand label for event study rows.""" has_controls = self.covariate_residuals is not None - has_trends = self.linear_trends_effects is not None + has_trends = self._has_trends_linear() if has_controls and has_trends: return f"DID^{{X,fd}}_{h}" elif has_controls: @@ -454,14 +480,19 @@ def _horizon_label(self, h) -> str: def _estimand_label(self) -> str: """Return the estimand label based on active features.""" has_controls = self.covariate_residuals is not None - has_trends = self.linear_trends_effects is not None + has_trends = self._has_trends_linear() # When trends_linear + L_max>=2, overall is NaN (no aggregate). - # Label reflects that per-horizon effects are in linear_trends_effects. + # Label reflects that per-horizon effects are in + # linear_trends_effects — UNLESS that surface is also empty + # (empty-horizon subcase; PR #347 R13 P1). In the + # empty-surface subcase we do not direct users to a + # nonexistent dict; we name the empty state instead. if has_trends and self.L_max is not None and self.L_max >= 2: - if has_controls: - return "DID^{X,fd}_l (see linear_trends_effects)" - return "DID^{fd}_l (see linear_trends_effects)" + base_label = "DID^{X,fd}_l" if has_controls else "DID^{fd}_l" + if self.linear_trends_effects is None: + return f"{base_label} (no cumulated level effects survived estimation)" + return f"{base_label} (see linear_trends_effects)" if self.L_max is not None and self.L_max >= 2: base = "delta" @@ -558,9 +589,11 @@ def summary(self, alpha: Optional[float] = None) -> str: "", f"{'Total observations:':<35} {self.n_obs:>10}", f"{'Treated observations:':<35} {self.n_treated_obs:>10}", - f"{'Eligible switchers (N_1):':<35} {self.n_switcher_cells:>10}" - if self.L_max is not None and self.L_max >= 1 - else f"{'Switcher cells (N_S):':<35} {self.n_switcher_cells:>10}", + ( + f"{'Eligible switchers (N_1):':<35} {self.n_switcher_cells:>10}" + if self.L_max is not None and self.L_max >= 1 + else f"{'Switcher cells (N_S):':<35} {self.n_switcher_cells:>10}" + ), f"{'Groups (post-filter):':<35} {len(self.groups):>10}", f"{'Cohorts:':<35} {self.n_cohorts:>10}", f"{'Time periods:':<35} {len(self.time_periods):>10}", @@ -591,7 +624,7 @@ def summary(self, alpha: Optional[float] = None) -> str: # --- Overall --- has_controls = self.covariate_residuals is not None - has_trends = self.linear_trends_effects is not None + has_trends = self._has_trends_linear() adj_tag = "" if has_controls and has_trends: adj_tag = " (Covariate-and-Trend-Adjusted)" @@ -866,17 +899,13 @@ def print_summary(self, alpha: Optional[float] = None) -> None: # Summary section helpers (Phase 3 blocks) # ------------------------------------------------------------------ - def _render_covariate_section( - self, lines: List[str], width: int, thin: str - ) -> None: + def _render_covariate_section(self, lines: List[str], width: int, thin: str) -> None: if self.covariate_residuals is None: return cov_df = self.covariate_residuals control_names = sorted(cov_df["covariate"].unique()) n_baselines = cov_df["baseline_treatment"].nunique() - failed = int( - (cov_df.groupby("baseline_treatment")["theta_hat"].first().isna()).sum() - ) + failed = int((cov_df.groupby("baseline_treatment")["theta_hat"].first().isna()).sum()) lines.extend( [ thin, @@ -917,9 +946,7 @@ def _render_linear_trends_section( ) lines.extend([thin, ""]) - def _render_heterogeneity_section( - self, lines: List[str], width: int, thin: str - ) -> None: + def _render_heterogeneity_section(self, lines: List[str], width: int, thin: str) -> None: if self.heterogeneity_effects is None: return lines.extend( @@ -951,9 +978,7 @@ def _render_heterogeneity_section( ] ) - def _render_design2_section( - self, lines: List[str], width: int, thin: str - ) -> None: + def _render_design2_section(self, lines: List[str], width: int, thin: str) -> None: if self.design2_effects is None: return d2 = self.design2_effects @@ -976,9 +1001,7 @@ def _render_design2_section( ] ) - def _render_honest_did_section( - self, lines: List[str], width: int, thin: str - ) -> None: + def _render_honest_did_section(self, lines: List[str], width: int, thin: str) -> None: if self.honest_did_results is None: return hd = self.honest_did_results @@ -996,22 +1019,16 @@ def _render_honest_did_section( ] ) if hd.post_periods_used is not None: - lines.append( - f"{'Post horizons used:':<35} {hd.post_periods_used}" - ) + lines.append(f"{'Post horizons used:':<35} {hd.post_periods_used}") if hd.pre_periods_used is not None: - lines.append( - f"{'Pre horizons used:':<35} {hd.pre_periods_used}" - ) + lines.append(f"{'Pre horizons used:':<35} {hd.pre_periods_used}") lines.extend( [ f"{'Original estimate:':<35} {_fmt_float(hd.original_estimate):>10}", - f"{'Identified set:':<35} " - f"[{_fmt_float(hd.lb)}, {_fmt_float(hd.ub)}]", + f"{'Identified set:':<35} " f"[{_fmt_float(hd.lb)}, {_fmt_float(hd.ub)}]", f"{'Robust ' + str(conf_pct) + '% CI:':<35} " f"[{_fmt_float(hd.ci_lb)}, {_fmt_float(hd.ci_ub)}]", - f"{'Significant at ' + str(int(hd.alpha * 100)) + '%:':<35} " - f"{sig_label:>10}", + f"{'Significant at ' + str(int(hd.alpha * 100)) + '%:':<35} " f"{sig_label:>10}", thin, "", ] @@ -1255,9 +1272,24 @@ def to_dataframe(self, level: str = "overall") -> pd.DataFrame: elif level == "linear_trends": if self.linear_trends_effects is None: + # PR #347 R12 P1: distinguish the "trends_linear was + # not requested" case from the "trends_linear was + # requested but no horizons survived" case. Telling + # a user who already passed ``trends_linear=True`` + # to pass it again is a dead-end. + if self._has_trends_linear(): + return pd.DataFrame( + columns=[ + "horizon", + "effect", + "se", + "t_stat", + "p_value", + "conf_int", + ] + ) raise ValueError( - "Linear trends effects not available. Pass " - "trends_linear=True to fit()." + "Linear trends effects not available. Pass trends_linear=True to fit()." ) rows = [] for h, data in sorted(self.linear_trends_effects.items()): diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 6ec87b2a..f5162038 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -38,7 +38,9 @@ import numpy as np import pandas as pd -DIAGNOSTIC_REPORT_SCHEMA_VERSION = "1.0" +from diff_diff._reporting_helpers import describe_target_parameter # noqa: E402 (top-level import) + +DIAGNOSTIC_REPORT_SCHEMA_VERSION = "2.0" __all__ = [ "DiagnosticReport", @@ -926,7 +928,57 @@ def _execute(self) -> DiagnosticReportResults: } # Headline metric — best-effort across estimator types. - headline = self._extract_headline_metric() + # PR #347 R4 P1: the dCDH ``trends_linear=True`` + ``L_max>=2`` + # configuration does not produce a scalar headline by design + # (``overall_att`` is intentionally NaN per + # ``chaisemartin_dhaultfoeuille.py:2828-2834``). Route the + # headline through a dedicated no-scalar block when the + # target-parameter helper flags this case so prose does not + # narrate it as an estimation failure. + _tp_agg = describe_target_parameter(self._results).get("aggregation") + if _tp_agg == "no_scalar_headline": + # PR #347 R12 P1: distinguish populated vs empty per-horizon + # surface. Pointing users at ``linear_trends_effects`` is + # dead-end guidance when that dict is ``None``. + _surface_empty = getattr(self._results, "linear_trends_effects", None) is None + # PR #347 R14 P1: control-aware empty-surface label. + _has_controls = getattr(self._results, "covariate_residuals", None) is not None + _empty_surface_label = "DID^{X,fd}_l" if _has_controls else "DID^{fd}_l" + if _surface_empty: + headline_name = "no scalar headline (empty per-horizon surface)" + headline_reason = ( + "The fitted estimator intentionally does not produce a " + "scalar overall ATT on this configuration " + "(``trends_linear=True`` with ``L_max >= 2``), and on " + f"this fit no cumulated level effects ``{_empty_surface_label}`` " + "survived estimation — the per-horizon surface is " + "empty. Re-fit with a larger ``L_max`` or with " + "``trends_linear=False`` if you need a reportable " + "estimand." + ) + else: + headline_name = "no scalar headline (see linear_trends_effects)" + headline_reason = ( + "The fitted estimator intentionally does not produce a " + "scalar overall ATT on this configuration " + "(``trends_linear=True`` with ``L_max >= 2``). Per-horizon " + "cumulated level effects are on " + "``results.linear_trends_effects[l]``." + ) + headline = { + "status": "no_scalar_by_design", + "name": headline_name, + "value": None, + "se": None, + "p_value": None, + "conf_int": (None, None), + "alpha": self._alpha, + "is_significant": False, + "sign": "none", + "reason": headline_reason, + } + else: + headline = self._extract_headline_metric() # Pull suggested next steps from the practitioner workflow. next_steps = self._collect_next_steps(sections) @@ -962,6 +1014,7 @@ def _execute(self) -> DiagnosticReportResults: "schema_version": DIAGNOSTIC_REPORT_SCHEMA_VERSION, "estimator": type(self._results).__name__, "headline_metric": headline, + "target_parameter": describe_target_parameter(self._results), "parallel_trends": sections["parallel_trends"], "pretrends_power": sections["pretrends_power"], "sensitivity": sections["sensitivity"], @@ -2976,7 +3029,25 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str ci = headline.get("conf_int") if isinstance(headline, dict) else None p = headline.get("p_value") if isinstance(headline, dict) else None val_finite = isinstance(val, (int, float)) and np.isfinite(val) - if val is not None and not val_finite: + # PR #347 R4 P1: if the estimator intentionally produces no scalar + # aggregate (dCDH ``trends_linear=True`` + ``L_max>=2``), route + # through explicit no-scalar prose rather than the + # estimation-failure branch below. The headline block carries + # ``status="no_scalar_by_design"`` in that case. + if isinstance(headline, dict) and headline.get("status") == "no_scalar_by_design": + # PR #347 R13 P1: route through the headline-level ``reason`` + # field so the DR interpretation never drifts from the + # schema-level populated-vs-empty branching. + reason = headline.get("reason") + base = ( + f"On {est}, {treatment} does not produce a scalar aggregate " + f"effect on {outcome} under this configuration." + ) + if isinstance(reason, str) and reason: + sentences.append(f"{base} {reason}") + else: + sentences.append(base + " (by design)") + elif val is not None and not val_finite: sentences.append( f"On {est}, {treatment}'s effect on {outcome} is non-finite " "(the estimation did not produce a usable point estimate). " @@ -3003,7 +3074,19 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str f"On {est}, {treatment} {direction} {outcome} by {val:.3g}{ci_str}{p_str}." ) - # Sentence 2: parallel trends + power (method-aware prose per the + # Sentence 2: name the target parameter (BR/DR gap #6). Rendered + # right after the headline so the reader sees what the scalar + # represents before pre-trends / sensitivity context. Only the + # terse ``name`` goes in the interpretation paragraph; the full + # ``definition`` lives in DR's "## Target Parameter" markdown + # section and in the structured ``schema["target_parameter"]`` + # dict for agents that want the long form. + tp = schema.get("target_parameter") or {} + tp_name = tp.get("name") + if tp_name: + sentences.append(f"Target parameter: {tp_name}.") + + # Sentence 3: parallel trends + power (method-aware prose per the # round-8 CI review on PR #318; PT method can be slope_difference # (2x2), joint_wald / bonferroni (event study), hausman (EfficientDiD # PT-All vs PT-Post), synthetic_fit (SDiD), or factor (TROP), and the @@ -3215,12 +3298,45 @@ def _render_dr_full_report(results: "DiagnosticReportResults") -> str: lines.append(f"**Estimator**: `{schema.get('estimator')}`") headline = schema.get("headline_metric") if headline: - lines.append( - f"**Headline**: {headline.get('name')} = " - f"{headline.get('value')} " - f"(SE {headline.get('se')}, p = {headline.get('p_value')})" - ) + # PR #347 R5 P2: the no-scalar-by-design branch + # (``trends_linear=True`` + ``L_max>=2`` on dCDH) has + # ``value`` / ``se`` / ``p_value`` all ``None``. Formatting + # those straight into the ``**Headline**: ... = None`` line + # would produce a malformed top headline even though the + # "## Target Parameter" section below correctly explains + # the design. Route to explicit no-scalar markdown. + if headline.get("status") == "no_scalar_by_design": + lines.append("**Headline**: no scalar aggregate by design.") + reason = headline.get("reason") + if reason: + lines.append("") + lines.append(reason) + else: + lines.append( + f"**Headline**: {headline.get('name')} = " + f"{headline.get('value')} " + f"(SE {headline.get('se')}, p = {headline.get('p_value')})" + ) lines.append("") + + # BR/DR gap #6: target-parameter section between headline metadata + # and the overall-interpretation paragraph. + tp = schema.get("target_parameter") or {} + if tp.get("name") or tp.get("definition"): + lines.append("## Target Parameter") + lines.append("") + if tp.get("name"): + lines.append(f"- **{tp['name']}**") + if tp.get("definition"): + lines.append(f"- {tp['definition']}") + if tp.get("aggregation"): + lines.append(f"- Aggregation tag: `{tp['aggregation']}`") + if tp.get("headline_attribute"): + lines.append(f"- Headline attribute: `{tp['headline_attribute']}`") + if tp.get("reference"): + lines.append(f"- Reference: {tp['reference']}") + lines.append("") + lines.append("## Overall Interpretation") lines.append("") lines.append(schema.get("overall_interpretation", "") or "_No synthesis available._") diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 26b02514..5b648826 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -1776,11 +1776,23 @@ Schema top-level keys (all always present; missing content uses a `{"status": "skipped", "reason": "..."}` shape rather than being absent): - `schema_version`, `estimator`, `context` -- `headline`, `assumption`, `pre_trends`, `sensitivity` +- `headline`, `target_parameter`, `assumption`, `pre_trends`, `sensitivity` - `sample`, `heterogeneity`, `robustness`, `diagnostics` - `next_steps`, `caveats`, `references` +`target_parameter` (experimental) names what the headline scalar +represents for each estimator — overall ATT, DID_M, DID_1, cost- +benefit delta, dose-response aggregate, ASF-based ETWFE, etc. +Fields: `name` (short stakeholder label), `definition` (full prose), +`aggregation` (machine-readable dispatch tag, e.g., `"simple"`, +`"event_study"`, `"delta"`, `"no_scalar_headline"`), +`headline_attribute` (which raw attribute holds the scalar, or +`None` when no scalar exists by design — e.g., dCDH with +`trends_linear=True` and `L_max>=2`), `reference` (citation). + Status enum values: `ran | skipped | error | not_applicable | not_run | computed`. +`headline.status` also supports `"no_scalar_by_design"` on the dCDH +no-scalar branch. ## DiagnosticReport @@ -1830,9 +1842,14 @@ dr.skipped_checks # dict of {check: plain-English reason} ``` Schema top-level keys: `schema_version, estimator, headline_metric, -parallel_trends, pretrends_power, sensitivity, placebo, bacon, -design_effect, heterogeneity, epv, estimator_native_diagnostics, -skipped, warnings, overall_interpretation, next_steps`. +target_parameter, parallel_trends, pretrends_power, sensitivity, +placebo, bacon, design_effect, heterogeneity, epv, +estimator_native_diagnostics, skipped, warnings, +overall_interpretation, next_steps`. The `target_parameter` block +mirrors BR's (same shape and dispatch); see BR's section above for +field semantics including the `headline_attribute=None` / +`aggregation="no_scalar_headline"` case for dCDH +`trends_linear=True, L_max>=2` fits. ### Verdicts and tiers diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst index 3017dbf2..093bc0a9 100644 --- a/docs/api/business_report.rst +++ b/docs/api/business_report.rst @@ -49,6 +49,22 @@ Methodology deviations (no traffic-light gates, pre-trends verdict thresholds, power-aware phrasing, unit-translation policy, schema stability) are documented in :doc:`../methodology/REPORTING`. +The schema carries a top-level ``target_parameter`` block +(experimental) naming what the headline scalar represents per +estimator — simple ATT, event-study average, DID_M, DID_1, +cost-benefit delta, dose-response aggregate, factor-model-adjusted ATT, +etc. For the dCDH dynamic branch with ``trends_linear=True`` and +``L_max>=2``, the scalar is intentionally NaN and +``aggregation`` is ``"no_scalar_headline"`` with +``headline_attribute`` set to ``None``. Agents should dispatch on +this case and inspect the headline ``reason`` field, which +distinguishes the populated-surface subcase (per-horizon table +available on ``linear_trends_effects``) from the empty-surface +subcase (no horizons survived estimation; re-fit with a larger +``L_max`` or with ``trends_linear=False``). See the "Target +parameter" section of :doc:`../methodology/REPORTING` for the +full per-estimator dispatch table and schema shape. + Example ------- diff --git a/docs/api/diagnostic_report.rst b/docs/api/diagnostic_report.rst index 0a72e2ca..3745beb2 100644 --- a/docs/api/diagnostic_report.rst +++ b/docs/api/diagnostic_report.rst @@ -15,6 +15,12 @@ Methodology deviations (no traffic-light gates, opt-in placebo battery, estimator-native diagnostic routing, power-aware phrasing threshold) are documented in :doc:`../methodology/REPORTING`. +The schema carries a top-level ``target_parameter`` block +(experimental) naming what the headline scalar represents per +estimator. See the "Target parameter" section of +:doc:`../methodology/REPORTING` for the per-estimator dispatch and +schema shape. + Data-dependent checks (2x2 parallel trends on simple DiD, Goodman-Bacon decomposition on staggered estimators, the EfficientDiD Hausman PT-All vs PT-Post pretest) require the raw panel + column diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index fd10cfda..0a04b4e6 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -53,6 +53,146 @@ effects, pre-period and reference-marker rows excluded). These are reporting-layer aggregations of inputs already in the result object, not new inference. +## Target parameter + +The BusinessReport and DiagnosticReport schemas both carry a +top-level `target_parameter` block that names what scalar the +headline number actually represents. The 16 result classes have +meaningfully different estimands — a stakeholder reading +`overall_att = -0.0214` on a Callaway-Sant'Anna fit cannot tell +whether that is the simple-weighted average across `ATT(g,t)` +cells, an event-study-weighted aggregate, or a group-weighted +aggregate. Baker et al. (2025) Step 2 is "Define the target +parameter"; BR/DR does that work for the user. + +Schema shape: + +```json +"target_parameter": { + "name": "overall ATT (cohort-size-weighted average of ATT(g,t))", + "definition": "A cohort-size-weighted average of group-time ATTs ...", + "aggregation": "simple", + "headline_attribute": "overall_att", + "reference": "Callaway & Sant'Anna (2021); REGISTRY.md Sec. CallawaySantAnna" +} +``` + +Field semantics: + +- `name` — short stakeholder-facing name. Rendered verbatim in + BR's summary paragraph and DR's overall-interpretation + paragraph. Always non-empty. +- `definition` — plain-English description of what the scalar is + and how it is aggregated. Rendered in BR's and DR's full-report + markdown (under "## Target Parameter") but omitted from the + summary paragraph so stakeholder prose stays within the 6-10- + sentence target. +- `aggregation` — machine-readable tag dispatching agents can + branch on. Complete enumeration per estimator: + - `"did_or_twfe"` (DiDResults / TwoWayFixedEffects both route here — neutral tag; ambiguous at the result-class level until estimator provenance is persisted) + - `"event_study"` (MultiPeriodDiDResults) + - `"simple"` (CallawaySantAnna / Imputation / TwoStage / Wooldridge) + - `"iw"` (SunAbraham) + - `"stacked"` (StackedDiD) + - `"pt_all_combined"` / `"pt_post_single_baseline"` (EfficientDiD + branched on `pt_assumption`) + - `"dose_overall"` (ContinuousDiD) + - `"ddd"` / `"staggered_ddd"` (TripleDifference / StaggeredTripleDiff) + - dCDH dynamic branches follow the exact `overall_att` + contract: `"M"` / `"M_x"` / `"M_fd"` / `"M_x_fd"` for + `L_max=None`; `"DID_1"` / `"DID_1_x"` / `"DID_1_fd"` / + `"DID_1_x_fd"` for `L_max=1`; `"delta"` / `"delta_x"` for + `L_max>=2` without trend suppression; and + `"no_scalar_headline"` when `trends_linear=True` AND + `L_max>=2` (the scalar is intentionally NaN). + - `"synthetic"` (SyntheticDiD) / `"factor_model"` (TROP) / + `"twfe"` (BaconDecomposition read-out) / `"unknown"` (default + fallback). +- `headline_attribute` — the raw result attribute the scalar + comes from (`"overall_att"` / `"att"` / `"avg_att"` / + `"twfe_estimate"`), OR `None` when `aggregation == + "no_scalar_headline"` (the dCDH `trends_linear=True, + L_max>=2` branch where `overall_att` is intentionally NaN by + design). Agents dispatching on this field must handle `None` by + inspecting `headline.reason` (BR) / `headline_metric.reason` + (DR), which distinguishes two subcases: + + - **Populated-surface subcase** (per-horizon + `linear_trends_effects` dict is non-empty): `reason` + directs callers to `results.linear_trends_effects[l]` for + per-horizon cumulated level effects. + - **Empty-surface subcase** (`linear_trends_effects is None` + because no horizons survived estimation): `reason` names + the empty state explicitly and directs callers toward + re-fit remediation (larger `L_max` or + `trends_linear=False`) rather than a nonexistent dict. The + dCDH native estimand label is also branched — on this + subcase `_estimand_label()` returns + `DID^{fd}_l (no cumulated level effects survived estimation)` + (or `DID^{X,fd}_l (...)` when covariates are active). + + Different result classes use different attribute names; agents + that want to re-read the raw value can dispatch on + `headline_attribute`. +- `reference` — one-line citation pointer to the canonical paper + and the REGISTRY.md section. + +Per-estimator dispatch lives in +`diff_diff/_reporting_helpers.py::describe_target_parameter`. Each +branch is sourced from the corresponding estimator's section in +REGISTRY.md; new result classes must add an explicit branch (the +exhaustiveness test `TestTargetParameterCoversEveryResultClass` +locks this in). + +A few branches read fit-time config from the result object: + +- `EfficientDiDResults.pt_assumption`: `"all"` (over-identified + combined) vs `"post"` (just-identified single-baseline) branches + `aggregation` between `"pt_all_combined"` and + `"pt_post_single_baseline"`. +- `StackedDiDResults.clean_control`: `"never_treated"` / + `"strict"` / `"not_yet_treated"` varies the `definition` clause + describing which units qualify as controls. +- `ChaisemartinDHaultfoeuilleResults.L_max` + + `covariate_residuals` + `linear_trends_effects`: branches the + dCDH estimand tag per the exact `overall_att` contract in + `chaisemartin_dhaultfoeuille.py:2602-2634` and + `chaisemartin_dhaultfoeuille.py:2828-2834`: + - `L_max=None` → `DID_M` (Phase 1 per-period aggregate; + `aggregation="M"`). + - `L_max=1` → `DID_1` (single-horizon per-group estimand, + Equation 3 of the dynamic companion paper; + `aggregation="DID_1"`). + - `L_max>=2` → cost-benefit `delta` (Lemma 4 cross-horizon + aggregate; `aggregation="delta"`). + - `trends_linear=True` AND `L_max>=2` → `overall_att` is + intentionally NaN (no scalar aggregate; per-horizon level + effects live on `results.linear_trends_effects[l]`). + `aggregation="no_scalar_headline"` and + `headline_attribute` is `None`. + + Covariates (`has_controls`) and/or linear trends + (`has_trends`, when `L_max < 2`) add `_x` / `_fd` / + `_x_fd` suffixes to the `aggregation` tag and the + corresponding `^X` / `^{fd}` / `^{X,fd}` superscripts to the + `name` (e.g. `DID^X_1`, `delta^X`, `DID^{fd}_M`), matching the + result class's own `_estimand_label()` helper at + `chaisemartin_dhaultfoeuille_results.py:454-490`. + +A few branches emit a fixed tag regardless of fit-time config — +notably `CallawaySantAnna`, `ImputationDiD`, `TwoStageDiD`, and +`WooldridgeDiD`. For these estimators the `overall_att` +(or `att` / `avg_att`) scalar is ALWAYS the simple weighted +aggregation; the fit-time `aggregate` kwarg populates additional +horizon / group tables on the result object but does not change +the headline scalar. Disambiguating those tables in prose is +tracked under BR/DR gap #9 (per-cohort narrative rendering). + +`ContinuousDiDResults` emits a single `"dose_overall"` tag with a +disjunctive definition (`ATT^loc` under PT; `ATT^glob` under +SPT) because the PT-vs-SPT regime is a user-level assumption, not +a library setting. + ## Design deviations - **Note:** No hard pass/fail gates. `DiagnosticReport` does not produce @@ -234,6 +374,17 @@ not new inference. anchor tooling on them prematurely; a formal deprecation policy will land within two subsequent PRs. +- **Note:** Schema version 2.0 (both BR and DR). The BR/DR gap #6 + target-parameter PR adds the `headline.status` / + `headline_metric.status` value `"no_scalar_by_design"` (used for + the dCDH `trends_linear=True, L_max>=2` configuration where + `overall_att` is intentionally NaN). Per the stability policy + above, new enum values are breaking changes, so + `BUSINESS_REPORT_SCHEMA_VERSION` and + `DIAGNOSTIC_REPORT_SCHEMA_VERSION` bumped from `"1.0"` to + `"2.0"`. The schemas remain marked experimental, so the formal + deprecation policy does not yet apply. + ## Reference implementation(s) The phrasing rules follow the guidance in: diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 8ec51c2b..66895513 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -49,6 +49,7 @@ "estimator", "context", "headline", + "target_parameter", "assumption", "pre_trends", "sensitivity", @@ -958,7 +959,7 @@ class DiDResults: from diff_diff.diagnostic_report import DiagnosticReportResults fake_schema = { - "schema_version": "1.0", + "schema_version": "2.0", "estimator": "DiDResults", "headline_metric": {"name": "att", "value": 1.0}, "parallel_trends": { @@ -1057,7 +1058,7 @@ class DiDResults: pt_block["joint_p_value"] = 0.40 fake_schema = { - "schema_version": "1.0", + "schema_version": "2.0", "estimator": "DiDResults", "headline_metric": {"name": "att", "value": 1.0}, "parallel_trends": pt_block, @@ -2320,7 +2321,7 @@ def test_br_schema_tier_is_downgraded(self): from diff_diff.diagnostic_report import DiagnosticReportResults schema = { - "schema_version": "1.0", + "schema_version": "2.0", "estimator": "CallawaySantAnnaResults", "headline_metric": {"name": "overall_att", "value": 1.0}, "parallel_trends": { @@ -4027,7 +4028,7 @@ def _fragile_dr_schema(self, breakdown_m: float, grid=None): for row in grid ] schema = { - "schema_version": "1.0", + "schema_version": "2.0", "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, "headline_metric": {}, "parallel_trends": {"status": "skipped", "reason": "stub"}, @@ -4214,7 +4215,7 @@ def _bacon_schema_with_high_forbidden_weight(): from diff_diff.diagnostic_report import DiagnosticReportResults schema = { - "schema_version": "1.0", + "schema_version": "2.0", "estimator": {"class_name": "Stub", "display_name": "Stub"}, "headline_metric": {}, "parallel_trends": {"status": "skipped", "reason": "stub"}, diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index ba473a56..810eca20 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -48,6 +48,7 @@ "schema_version", "estimator", "headline_metric", + "target_parameter", "parallel_trends", "pretrends_power", "sensitivity", @@ -172,7 +173,7 @@ def test_schema_version_constant(self, multi_period_fit): fit, _ = multi_period_fit schema = DiagnosticReport(fit).to_dict() assert schema["schema_version"] == DIAGNOSTIC_REPORT_SCHEMA_VERSION - assert DIAGNOSTIC_REPORT_SCHEMA_VERSION == "1.0" + assert DIAGNOSTIC_REPORT_SCHEMA_VERSION == "2.0" def test_all_statuses_use_closed_enum(self, cs_fit): fit, sdf = cs_fit @@ -1930,7 +1931,7 @@ def _render(self, sens_block): from diff_diff.diagnostic_report import _render_overall_interpretation schema = { - "schema_version": "1.0", + "schema_version": "2.0", "estimator": {"class_name": "CallawaySantAnnaResults", "display_name": "CS"}, "headline_metric": { "status": "ran", diff --git a/tests/test_target_parameter.py b/tests/test_target_parameter.py new file mode 100644 index 00000000..cbf8ad4c --- /dev/null +++ b/tests/test_target_parameter.py @@ -0,0 +1,1247 @@ +"""Tests for the BR/DR ``target_parameter`` block (gap #6). + +Covers: + +- Per-estimator dispatch in ``describe_target_parameter`` emits the + expected ``aggregation`` tag and non-empty prose for each of the 16 + result classes in ``_APPLICABILITY``. +- Fit-time config reads are honored: + - ``EfficientDiDResults.pt_assumption`` branches the tag between + ``pt_all_combined`` and ``pt_post_single_baseline``. + - ``StackedDiDResults.clean_control`` varies the ``definition`` + clause (never_treated vs strict vs not_yet_treated). + - ``ChaisemartinDHaultfoeuilleResults.L_max`` + ``covariate_residuals`` + + ``linear_trends_effects`` branches the dCDH estimand tag. +- BR and DR emit identical ``target_parameter`` blocks (cross-surface + parity). +- Exhaustiveness: every result-class name in ``_APPLICABILITY`` gets + a non-default, non-empty target-parameter block. +- Prose renders the target-parameter sentence in BR's summary + full + report and DR's overall_interpretation + full report. +""" + +from __future__ import annotations + +from types import SimpleNamespace + +import pytest + +from diff_diff._reporting_helpers import describe_target_parameter +from diff_diff.diagnostic_report import _APPLICABILITY + + +def _minimal_result(class_name: str, **attrs): + """Build a minimal stub result object with the given class name.""" + cls = type(class_name, (), {}) + obj = cls() + for k, v in attrs.items(): + setattr(obj, k, v) + return obj + + +class TestTargetParameterPerEstimator: + """Dispatch table lookup for each of the 16 result classes.""" + + def test_did_results(self): + tp = describe_target_parameter(_minimal_result("DiDResults")) + assert tp["aggregation"] == "did_or_twfe" + assert tp["headline_attribute"] == "att" + assert "ATT" in tp["name"] + + def test_multi_period_did_results(self): + tp = describe_target_parameter(_minimal_result("MultiPeriodDiDResults")) + assert tp["aggregation"] == "event_study" + assert tp["headline_attribute"] == "avg_att" + assert "event-study" in tp["name"].lower() + + def test_did_results_mentions_twfe(self): + """``TwoWayFixedEffects.fit()`` returns ``DiDResults`` (verified in + PR #347 R1 review), so the DiDResults branch must cover both + the 2x2 DiD and TWFE interpretations. There is no separate + ``TwoWayFixedEffectsResults`` class. + """ + tp = describe_target_parameter(_minimal_result("DiDResults")) + assert tp["aggregation"] == "did_or_twfe" + assert "TWFE" in tp["name"] or "TWFE" in tp["definition"] + + def test_callaway_santanna(self): + tp = describe_target_parameter(_minimal_result("CallawaySantAnnaResults")) + assert tp["aggregation"] == "simple" + assert tp["headline_attribute"] == "overall_att" + assert "ATT(g,t)" in tp["name"] or "ATT(g, t)" in tp["definition"] + + def test_sun_abraham(self): + tp = describe_target_parameter(_minimal_result("SunAbrahamResults")) + assert tp["aggregation"] == "iw" + assert "interaction-weighted" in tp["name"].lower() or "IW" in tp["name"] + + def test_imputation(self): + tp = describe_target_parameter(_minimal_result("ImputationDiDResults")) + assert tp["aggregation"] == "simple" + assert tp["headline_attribute"] == "overall_att" + assert ( + "imputation" in tp["name"].lower() + or "BJS" in tp["reference"] + or "Borusyak" in tp["reference"] + ) + + def test_two_stage(self): + tp = describe_target_parameter(_minimal_result("TwoStageDiDResults")) + assert tp["aggregation"] == "simple" + assert tp["headline_attribute"] == "overall_att" + assert "Gardner" in tp["reference"] or "two-stage" in tp["name"].lower() + + def test_stacked(self): + tp = describe_target_parameter( + _minimal_result("StackedDiDResults", clean_control="not_yet_treated") + ) + assert tp["aggregation"] == "stacked" + assert tp["headline_attribute"] == "overall_att" + assert "sub-experiment" in tp["definition"].lower() + + def test_wooldridge_ols(self): + """PR #347 R4 P2: OLS Wooldridge ETWFE must not be labeled with + ASF wording. The OLS path aggregates ATT(g,t) coefficients with + observation-count weights; the ASF path is for nonlinear links. + """ + tp = describe_target_parameter(_minimal_result("WooldridgeDiDResults", method="ols")) + assert tp["aggregation"] == "simple" + assert tp["headline_attribute"] == "overall_att" + # OLS wording: mentions ATT(g,t) aggregation, not ASF. + assert ( + "ATT(g,t)" in tp["name"] or "ATT(g,t)" in tp["definition"] or "OLS ETWFE" in tp["name"] + ) + assert "ASF" not in tp["name"] + + def test_wooldridge_nonlinear(self): + """Nonlinear (logit/Poisson) Wooldridge ETWFE uses the ASF-based + ATT path — different wording, different REGISTRY reference. + """ + for method in ("logit", "poisson"): + tp = describe_target_parameter(_minimal_result("WooldridgeDiDResults", method=method)) + assert tp["aggregation"] == "simple" + assert tp["headline_attribute"] == "overall_att" + assert "ASF" in tp["name"] + assert method in tp["name"] or method in tp["definition"] + + def test_efficient_did_pt_all(self): + tp = describe_target_parameter(_minimal_result("EfficientDiDResults", pt_assumption="all")) + assert tp["aggregation"] == "pt_all_combined" + assert "PT-All" in tp["name"] + # PR #347 R7 P1 regression: the definition must disambiguate + # the library's cohort-size-weighted ``overall_att`` from the + # paper's uniform-event-time ``ES_avg``. + defn = tp["definition"] + assert "cohort-size-weighted" in defn + assert "ES_avg" in defn + assert "post-treatment" in defn.lower() + + def test_efficient_did_pt_post(self): + tp = describe_target_parameter(_minimal_result("EfficientDiDResults", pt_assumption="post")) + assert tp["aggregation"] == "pt_post_single_baseline" + assert "PT-Post" in tp["name"] + defn = tp["definition"] + assert "cohort-size-weighted" in defn + assert "ES_avg" in defn + + def test_continuous_did(self): + tp = describe_target_parameter(_minimal_result("ContinuousDiDResults")) + assert tp["aggregation"] == "dose_overall" + # Definition must name both PT and SPT readings per plan-review CRITICAL #2. + defn = tp["definition"] + assert "PT" in defn and "SPT" in defn + + def test_triple_difference(self): + tp = describe_target_parameter(_minimal_result("TripleDifferenceResults")) + assert tp["aggregation"] == "ddd" + assert tp["headline_attribute"] == "att" + + def test_staggered_triple_difference(self): + tp = describe_target_parameter(_minimal_result("StaggeredTripleDiffResults")) + assert tp["aggregation"] == "staggered_ddd" + assert tp["headline_attribute"] == "overall_att" + + def test_dcdh_m(self): + tp = describe_target_parameter( + _minimal_result("ChaisemartinDHaultfoeuilleResults", L_max=None) + ) + assert tp["aggregation"] == "M" + assert "DID_M" in tp["name"] + # R1 PR #347 review: headline lives in ``overall_att``, not + # ``att``. Machine-readable field must match the raw attribute + # downstream consumers should read. + assert tp["headline_attribute"] == "overall_att" + + def test_dcdh_did_1(self): + """``L_max = 1`` → headline is ``DID_1`` (per-group single- + horizon estimand, Equation 3 of the dCDH dynamic paper), NOT + the generic ``DID_l``. R2 PR #347 review P1 regression: + previously every ``L_max >= 1`` was flattened to ``DID_l``. + """ + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=1, + covariate_residuals=None, + linear_trends_effects=None, + ) + ) + assert tp["aggregation"] == "DID_1" + assert "DID_1" in tp["name"] + assert tp["headline_attribute"] == "overall_att" + + def test_dcdh_delta(self): + """``L_max >= 2`` (no trends) → headline is the cost-benefit + ``delta`` aggregate (Lemma 4), NOT ``DID_l``. Mirrors + ``chaisemartin_dhaultfoeuille.py:2602-2634``. + """ + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=2, + covariate_residuals=None, + linear_trends_effects=None, + ) + ) + assert tp["aggregation"] == "delta" + assert "delta" in tp["name"] + assert tp["headline_attribute"] == "overall_att" + + def test_dcdh_did_1_with_controls(self): + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=1, + covariate_residuals=SimpleNamespace(), + linear_trends_effects=None, + ) + ) + assert tp["aggregation"] == "DID_1_x" + assert "DID^X_1" in tp["name"] + + def test_dcdh_delta_with_controls(self): + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=2, + covariate_residuals=SimpleNamespace(), + linear_trends_effects=None, + ) + ) + assert tp["aggregation"] == "delta_x" + assert "delta^X" in tp["name"] + + def test_dcdh_trends_linear_with_l_max_geq_2_emits_no_scalar_headline(self): + """``trends_linear=True`` with ``L_max >= 2`` intentionally + suppresses the scalar aggregate (``overall_att`` is NaN by + design per ``chaisemartin_dhaultfoeuille.py:2828-2834``). The + target-parameter block must reflect that — ``aggregation = + "no_scalar_headline"`` and ``headline_attribute = None``. + R2 PR #347 review P1 regression. + """ + # PR #347 R9 P1: exercise BOTH routes to the no-scalar branch + # — the explicit persisted flag (primary contract) and the + # legacy ``linear_trends_effects is not None`` inference + # (fallback for older fits). + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=2, + covariate_residuals=None, + linear_trends_effects={"foo": "bar"}, + trends_linear=True, + ) + ) + assert tp["aggregation"] == "no_scalar_headline" + assert tp["headline_attribute"] is None + assert "linear_trends_effects" in tp["definition"] + + def test_dcdh_trends_linear_empty_surface_still_no_scalar(self): + """PR #347 R9 P1 regression: the estimator sets + ``linear_trends_effects=None`` when the cumulated-horizon dict + is empty, but still unconditionally NaN-s ``overall_att`` for + ``trends_linear=True`` with ``L_max >= 2`` (see + ``chaisemartin_dhaultfoeuille.py:2828-2834``). The previous + inference on ``linear_trends_effects is not None`` would have + routed this case to the ``delta`` branch and narrated it as an + estimation failure. With the persisted ``trends_linear`` flag + the no-scalar branch fires correctly. + """ + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=2, + covariate_residuals=None, + linear_trends_effects=None, # empty surface + trends_linear=True, # persisted fit-time flag + ) + ) + assert tp["aggregation"] == "no_scalar_headline" + assert tp["headline_attribute"] is None + + def test_dcdh_legacy_fit_without_persisted_flag_still_routes_correctly(self): + """Older fits that do not carry ``trends_linear`` on the + result still work via the legacy + ``linear_trends_effects is not None`` inference. This + preserves backwards compatibility for any cached fit objects + predating the persisted-flag change. + """ + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=2, + covariate_residuals=None, + linear_trends_effects={"foo": "bar"}, + # trends_linear NOT set — legacy fallback kicks in + ) + ) + assert tp["aggregation"] == "no_scalar_headline" + + def test_dcdh_trends_linear_with_l_max_1_still_has_scalar(self): + """``trends_linear=True`` with ``L_max = 1`` still produces a + scalar headline (``DID^{fd}_1``) — the no-scalar rule only + applies when ``L_max >= 2``. + """ + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=1, + covariate_residuals=None, + linear_trends_effects={"foo": "bar"}, + ) + ) + assert tp["aggregation"] == "DID_1_fd" + assert "DID^{fd}_1" in tp["name"] + assert tp["headline_attribute"] == "overall_att" + + def test_sdid(self): + tp = describe_target_parameter(_minimal_result("SyntheticDiDResults")) + assert tp["aggregation"] == "synthetic" + assert "synthetic" in tp["name"].lower() + + def test_trop(self): + tp = describe_target_parameter(_minimal_result("TROPResults")) + assert tp["aggregation"] == "factor_model" + assert "factor" in tp["name"].lower() + + def test_bacon_decomposition(self): + """PR #347 R8 P3: BaconDecompositionResults is accepted by DR + (as a diagnostic read-out) but is NOT in ``_APPLICABILITY``, + so the exhaustiveness test does not exercise it. Cover the + branch directly. Contract: the target parameter of a Bacon + decomposition is the TWFE coefficient it decomposes. + """ + tp = describe_target_parameter(_minimal_result("BaconDecompositionResults")) + assert tp["aggregation"] == "twfe" + assert tp["headline_attribute"] == "twfe_estimate" + assert "TWFE" in tp["name"] + assert "Goodman-Bacon" in tp["definition"] or "decomposition" in tp["definition"].lower() + assert "Goodman-Bacon" in tp["reference"] + + +class TestTargetParameterBaconDRIntegration: + """PR #347 R8 P3 follow-on: pass a real ``BaconDecompositionResults`` + through DR and assert the ``target_parameter`` block propagates + into the DR schema. BR rejects ``BaconDecompositionResults`` with + a ``TypeError`` (Bacon is a diagnostic, not an estimator), so this + branch is DR-only. + """ + + def test_dr_with_bacon_result_emits_target_parameter(self): + import warnings + + from diff_diff import DiagnosticReport, bacon_decompose, generate_staggered_data + + warnings.filterwarnings("ignore") + df = generate_staggered_data(n_units=40, n_periods=5, seed=21) + bacon = bacon_decompose( + df, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + dr = DiagnosticReport(bacon).to_dict() + tp = dr["target_parameter"] + assert tp["aggregation"] == "twfe" + assert tp["headline_attribute"] == "twfe_estimate" + # Sanity: the named attribute exists on the real result object. + assert hasattr(bacon, "twfe_estimate") + + +class TestTargetParameterFitConfigReads: + """Parameterized fit-config-branching tests.""" + + @pytest.mark.parametrize( + "clean_control, expected_clause", + [ + ("never_treated", "never-treated"), + ("strict", "strictly untreated"), + ("not_yet_treated", "not yet treated"), + ], + ) + def test_stacked_clean_control_branches_definition(self, clean_control, expected_clause): + tp = describe_target_parameter( + _minimal_result("StackedDiDResults", clean_control=clean_control) + ) + assert tp["aggregation"] == "stacked" + assert expected_clause in tp["definition"] + + @pytest.mark.parametrize( + "pt_assumption, expected_tag", + [("all", "pt_all_combined"), ("post", "pt_post_single_baseline")], + ) + def test_efficient_did_pt_assumption_branches_tag(self, pt_assumption, expected_tag): + tp = describe_target_parameter( + _minimal_result("EfficientDiDResults", pt_assumption=pt_assumption) + ) + assert tp["aggregation"] == expected_tag + + @pytest.mark.parametrize( + "L_max, controls, trends, expected_tag", + [ + # L_max=None -> DID_M (Phase 1 per-period aggregate). + (None, None, None, "M"), + (None, SimpleNamespace(), None, "M_x"), + # L_max=1 -> DID_1 (single-horizon per-group). + (1, None, None, "DID_1"), + (1, SimpleNamespace(), None, "DID_1_x"), + (1, None, {"foo": "bar"}, "DID_1_fd"), + (1, SimpleNamespace(), {"foo": "bar"}, "DID_1_x_fd"), + # L_max>=2 (no trends) -> cost-benefit delta aggregate. + (2, None, None, "delta"), + (2, SimpleNamespace(), None, "delta_x"), + # L_max>=2 + trends_linear -> no scalar aggregate. + (2, None, {"foo": "bar"}, "no_scalar_headline"), + (2, SimpleNamespace(), {"foo": "bar"}, "no_scalar_headline"), + ], + ) + def test_dcdh_config_branches_tag(self, L_max, controls, trends, expected_tag): + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=L_max, + covariate_residuals=controls, + linear_trends_effects=trends, + ) + ) + assert tp["aggregation"] == expected_tag + # Contract: ``headline_attribute`` is "overall_att" whenever + # there IS a scalar aggregate, and None when the + # ``trends_linear + L_max>=2`` no-scalar rule fires. + if expected_tag == "no_scalar_headline": + assert tp["headline_attribute"] is None + else: + assert tp["headline_attribute"] == "overall_att" + + +class TestTargetParameterCoversEveryResultClass: + """Exhaustiveness guard (per plan-review MEDIUM #5): every result- + class name in DR's ``_APPLICABILITY`` dict must get a non-default, + non-empty target-parameter block.""" + + def test_every_applicability_class_has_explicit_branch(self): + missing = [] + for class_name in _APPLICABILITY: + tp = describe_target_parameter(_minimal_result(class_name)) + if tp["aggregation"] == "unknown": + missing.append(class_name) + assert not missing, ( + "Every result class in _APPLICABILITY must have an explicit " + f"describe_target_parameter branch. Missing: {missing}" + ) + + def test_every_applicability_class_has_nonempty_fields(self): + for class_name in _APPLICABILITY: + tp = describe_target_parameter(_minimal_result(class_name)) + assert tp["name"], f"{class_name}: name is empty" + assert tp["definition"], f"{class_name}: definition is empty" + assert tp["aggregation"], f"{class_name}: aggregation is empty" + assert tp["headline_attribute"], f"{class_name}: headline_attribute is empty" + assert tp["reference"], f"{class_name}: reference is empty" + + +class TestTargetParameterCrossSurfaceParity: + """BR and DR emit identical target_parameter blocks (the shared + helper is the single source of truth).""" + + def test_br_and_dr_emit_identical_target_parameter(self): + from diff_diff import BusinessReport, DiagnosticReport + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 500 + stub.n_treated = 100 + stub.n_control_units = 400 + stub.survey_metadata = None + stub.event_study_effects = None + stub.base_period = "universal" + stub.inference_method = "analytical" + + br_tp = BusinessReport(stub, auto_diagnostics=False).to_dict()["target_parameter"] + dr_tp = DiagnosticReport(stub).to_dict()["target_parameter"] + assert br_tp == dr_tp + + +class TestTargetParameterProseRendering: + """The short ``name`` must render in BR summary + DR overall_interpretation; + the ``definition`` must render in BR full_report and DR full report.""" + + def _cs_stub(self): + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 500 + stub.n_treated = 100 + stub.n_control_units = 400 + stub.survey_metadata = None + stub.event_study_effects = None + stub.base_period = "universal" + stub.inference_method = "analytical" + return stub + + def test_br_summary_emits_target_parameter_name(self): + from diff_diff import BusinessReport + + summary = BusinessReport(self._cs_stub(), auto_diagnostics=False).summary() + assert "Target parameter:" in summary + assert "overall ATT" in summary + # But summary must not embed the full verbose definition — + # stakeholder paragraph stays within the 6-10 sentence target. + assert "event_study_effects" not in summary + + def test_br_full_report_emits_target_parameter_section(self): + from diff_diff import BusinessReport + + md = BusinessReport(self._cs_stub(), auto_diagnostics=False).full_report() + assert "## Target Parameter" in md + assert "cohort-size-weighted" in md + # Full report DOES carry the definition (structural context for + # readers who scan the markdown). + assert "event_study_effects" in md or "event-study" in md.lower() + + def test_dr_overall_interpretation_emits_target_parameter_sentence(self): + from diff_diff import DiagnosticReport + + dr = DiagnosticReport(self._cs_stub()).run_all() + prose = dr.interpretation + assert "Target parameter:" in prose + assert "overall ATT" in prose + + def test_dr_full_report_emits_target_parameter_section(self): + from diff_diff import DiagnosticReport + + md = DiagnosticReport(self._cs_stub()).full_report() + assert "## Target Parameter" in md + assert "Aggregation tag:" in md + assert "Headline attribute:" in md + + +class TestTargetParameterRealFitIntegration: + """PR #347 R1 review P2: stub-based dispatch tests missed (a) the + TWFE-returns-DiDResults mismatch, (b) the dCDH headline_attribute + bug. Exercise real fits so these contracts are enforced end-to-end. + """ + + def test_twfe_fit_returns_did_results_branch(self): + """``TwoWayFixedEffects.fit()`` returns ``DiDResults``, so the + target-parameter block must be the DiD/TWFE-covering branch. + Guards against reintroducing a dead-code + ``TwoWayFixedEffectsResults`` branch. + """ + import warnings + + from diff_diff import TwoWayFixedEffects, generate_did_data + + warnings.filterwarnings("ignore") + df = generate_did_data(n_units=40, n_periods=4, seed=7) + fit = TwoWayFixedEffects().fit( + df, outcome="outcome", treatment="treated", time="post", unit="unit" + ) + # Real TWFE fit returns DiDResults (no separate TWFE result class). + assert type(fit).__name__ == "DiDResults" + tp = describe_target_parameter(fit) + assert tp["aggregation"] == "did_or_twfe" + # The DiDResults branch must name TWFE explicitly so the + # description is source-faithful for both DiD and TWFE fits. + assert "TWFE" in tp["name"] or "TWFE" in tp["definition"] + + def test_stacked_did_fit_headline_attribute_matches_real_estimand(self): + """``StackedDiDResults.overall_att`` is the average of + post-treatment event-study coefficients ``delta_h`` with + delta-method SE (``stacked_did.py`` around line 541). Real-fit + regression against the reviewer's R1 P1 wording catch. + """ + import warnings + + from diff_diff import StackedDiD, generate_staggered_data + + warnings.filterwarnings("ignore") + df = generate_staggered_data(n_units=60, n_periods=6, seed=13) + fit = StackedDiD(kappa_pre=1, kappa_post=1).fit( + df, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + assert type(fit).__name__ == "StackedDiDResults" + tp = describe_target_parameter(fit) + assert tp["headline_attribute"] == "overall_att" + # R1 P1 fix: the definition must describe the actual estimand — + # the average of post-treatment delta_h event-study coefficients. + assert ( + "event-study" in tp["definition"].lower() + or "delta_h" in tp["definition"] + or "post-treatment" in tp["definition"].lower() + ) + + def _dcdh_reversible_panel(self, seed): + """Build a minimal reversible-treatment panel that dCDH + accepts (group / time / treatment columns, at least one + switcher).""" + import numpy as np + import pandas as pd + + rng = np.random.default_rng(seed) + units = list(range(20)) + periods = list(range(6)) + rows = [] + for u in units: + # Half of units switch from 0 -> 1 at period 3. + for t in periods: + d = 1 if (u < 10 and t >= 3) else 0 + y = d * 2.0 + rng.normal(0.0, 0.5) + rows.append({"unit": u, "period": t, "treated": d, "outcome": y}) + return pd.DataFrame(rows) + + def test_dcdh_did_m_fit_headline_attribute_is_overall_att(self): + """``ChaisemartinDHaultfoeuilleResults.overall_att`` holds the + DID_M headline scalar (``chaisemartin_dhaultfoeuille_results.py`` + line ~357). R1 P1: previously ``headline_attribute="att"`` + pointed at a non-existent attribute. + """ + import warnings + + from diff_diff import ChaisemartinDHaultfoeuille + + warnings.filterwarnings("ignore") + df = self._dcdh_reversible_panel(seed=11) + # DID_M regime: L_max not supplied. + fit = ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="unit", + time="period", + treatment="treated", + ) + assert type(fit).__name__ == "ChaisemartinDHaultfoeuilleResults" + tp = describe_target_parameter(fit) + assert tp["aggregation"] == "M" + assert tp["headline_attribute"] == "overall_att" + # Sanity: the attribute BR/DR points at actually exists on the + # real fit object. + assert hasattr(fit, tp["headline_attribute"]), ( + f"headline_attribute={tp['headline_attribute']!r} must name " + "an attribute that actually exists on the real result object." + ) + + def test_dcdh_did_1_fit_overall_att_real(self): + """Real ``L_max = 1`` fit: headline is ``DID_1`` (single- + horizon per-group estimand), not the generic ``DID_l``. + PR #347 R2 P1 regression on a live fit. + """ + import warnings + + from diff_diff import ChaisemartinDHaultfoeuille + + warnings.filterwarnings("ignore") + df = self._dcdh_reversible_panel(seed=13) + fit = ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="unit", + time="period", + treatment="treated", + L_max=1, + ) + tp = describe_target_parameter(fit) + assert tp["aggregation"] == "DID_1" + assert "DID_1" in tp["name"] + assert tp["headline_attribute"] == "overall_att" + assert hasattr(fit, "overall_att") + + def test_dcdh_delta_fit_real(self): + """Real ``L_max >= 2`` fit: headline is the cost-benefit + ``delta`` aggregate. PR #347 R2 P1 regression on a live fit. + """ + import warnings + + from diff_diff import ChaisemartinDHaultfoeuille + + warnings.filterwarnings("ignore") + df = self._dcdh_reversible_panel(seed=14) + fit = ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="unit", + time="period", + treatment="treated", + L_max=2, + ) + tp = describe_target_parameter(fit) + assert tp["aggregation"] == "delta" + assert "delta" in tp["name"] + assert tp["headline_attribute"] == "overall_att" + assert hasattr(fit, "overall_att") + + def test_dcdh_trends_linear_no_scalar_propagates_through_br(self): + """PR #347 R4 P1 end-to-end: on the dCDH no-scalar + configuration (``trends_linear=True`` + ``L_max>=2``), BR's + ``to_dict()`` headline must carry ``status="no_scalar_by_design"`` + and BR's summary / full report must emit explicit no-scalar + prose — NOT the generic "non-finite effect / inspect the fit + for rank deficiency" estimation-failure messaging. + """ + import warnings + + from diff_diff import BusinessReport, ChaisemartinDHaultfoeuille + + warnings.filterwarnings("ignore") + df = self._dcdh_reversible_panel(seed=16) + fit = ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="unit", + time="period", + treatment="treated", + L_max=2, + trends_linear=True, + ) + br = BusinessReport(fit, outcome_label="the outcome", auto_diagnostics=False) + schema = br.to_dict() + assert schema["headline"]["status"] == "no_scalar_by_design" + assert schema["headline"]["effect"] is None + # BR's summary prose must be explicit no-scalar, not + # "non-finite estimate / inspect rank deficiency". + summary = br.summary() + assert "no scalar" in summary.lower() or "does not produce a scalar" in summary.lower() + assert "rank deficiency" not in summary.lower() + assert "estimation failed" not in summary.lower() + # Must NOT emit the "estimation_failure" caveat either. + caveats = br.caveats() + topics = {c.get("topic") for c in caveats} + assert "estimation_failure" not in topics + + def test_dcdh_trends_linear_no_scalar_propagates_through_dr(self): + """Same contract on the DR side: ``headline_metric`` carries + ``status="no_scalar_by_design"`` and the overall-interpretation + prose is explicit no-scalar, not an estimation-failure sentence. + """ + import warnings + + from diff_diff import ChaisemartinDHaultfoeuille, DiagnosticReport + + warnings.filterwarnings("ignore") + df = self._dcdh_reversible_panel(seed=17) + fit = ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="unit", + time="period", + treatment="treated", + L_max=2, + trends_linear=True, + ) + dr_report = DiagnosticReport(fit) + dr = dr_report.run_all() + schema = dr.schema + assert schema["headline_metric"]["status"] == "no_scalar_by_design" + # DR interpretation must not narrate estimation failure. + prose = dr.interpretation + assert "does not produce a scalar" in prose.lower() or "no scalar" in prose.lower() + assert "rank deficiency" not in prose.lower() + assert "zero effective sample" not in prose.lower() + # PR #347 R5 P2 + P3: the DR markdown full_report must also + # handle the no-scalar case — the top **Headline** line + # previously formatted ``None`` values straight in + # (``**Headline**: ... = None (SE None, p = None)``). The + # fixed renderer should emit explicit no-scalar markdown + # instead. + md = dr_report.full_report() + assert "**Headline**: no scalar aggregate by design" in md, ( + f"DR full_report must emit explicit no-scalar top headline on " + f"the trends_linear + L_max>=2 dCDH branch. Got: {md!r}" + ) + # And must NOT contain the raw `None`-interpolated line from + # the generic headline path. + assert "= None (SE None" not in md + assert "p = None)" not in md + + def test_dcdh_empty_surface_target_parameter_definition_names_empty_state(self): + """PR #347 R12 P1: when ``trends_linear=True, L_max>=2, + linear_trends_effects=None`` (no horizons survived), the + target-parameter ``definition`` must NOT tell users to "see + linear_trends_effects" — that dict is empty. It must name the + empty-surface state explicitly and point at a different + remediation (re-fit). + """ + tp = describe_target_parameter( + _minimal_result( + "ChaisemartinDHaultfoeuilleResults", + L_max=2, + covariate_residuals=None, + linear_trends_effects=None, + trends_linear=True, + ) + ) + defn = tp["definition"] + # Must NOT claim the horizon dict has content. + assert "``results.linear_trends_effects[l]``" not in defn + # Must name the empty-surface state. + assert ( + "no cumulated level effects" in defn.lower() + or "horizon surface is empty" in defn.lower() + or "survived estimation" in defn.lower() + ), f"Empty-surface definition must name the state. Got: {defn!r}" + + def _empty_surface_stub(self): + from diff_diff.chaisemartin_dhaultfoeuille_results import ( + ChaisemartinDHaultfoeuilleResults, + ) + + return ChaisemartinDHaultfoeuilleResults( + overall_att=float("nan"), + overall_se=float("nan"), + overall_t_stat=float("nan"), + overall_p_value=float("nan"), + overall_conf_int=(float("nan"), float("nan")), + joiners_att=float("nan"), + joiners_se=float("nan"), + joiners_t_stat=float("nan"), + joiners_p_value=float("nan"), + joiners_conf_int=(float("nan"), float("nan")), + n_joiner_cells=0, + n_joiner_obs=0, + joiners_available=False, + leavers_att=float("nan"), + leavers_se=float("nan"), + leavers_t_stat=float("nan"), + leavers_p_value=float("nan"), + leavers_conf_int=(float("nan"), float("nan")), + n_leaver_cells=0, + n_leaver_obs=0, + leavers_available=False, + placebo_effect=float("nan"), + placebo_se=float("nan"), + placebo_t_stat=float("nan"), + placebo_p_value=float("nan"), + placebo_conf_int=(float("nan"), float("nan")), + placebo_available=False, + per_period_effects={}, + groups=[1, 2, 3], + time_periods=[1, 2, 3, 4], + n_obs=100, + n_treated_obs=50, + n_switcher_cells=10, + n_cohorts=2, + n_groups_dropped_crossers=0, + n_groups_dropped_singleton_baseline=0, + n_groups_dropped_never_switching=0, + L_max=2, + trends_linear=True, + linear_trends_effects=None, + ) + + def test_dcdh_empty_surface_br_dr_reason_names_empty_state(self): + """Mirror empty-surface contract through BR and DR headline + ``reason``. Pointing users at ``linear_trends_effects`` is + dead-end guidance when that surface is empty. + """ + from diff_diff import BusinessReport, DiagnosticReport + + stub = self._empty_surface_stub() + br_reason = BusinessReport(stub, auto_diagnostics=False).to_dict()["headline"]["reason"] + assert "``results.linear_trends_effects[l]``" not in br_reason + assert ( + "no cumulated level effects" in br_reason.lower() + or "horizon surface is empty" in br_reason.lower() + or "survived estimation" in br_reason.lower() + ) + + dr_reason = DiagnosticReport(stub).to_dict()["headline_metric"]["reason"] + assert "``results.linear_trends_effects[l]``" not in dr_reason + assert ( + "no cumulated level effects" in dr_reason.lower() + or "horizon surface is empty" in dr_reason.lower() + or "survived estimation" in dr_reason.lower() + ) + + def test_dcdh_empty_surface_to_dataframe_linear_trends_returns_empty_frame(self): + """PR #347 R12 P1: ``to_dataframe("linear_trends")`` on an + empty-surface trends-linear fit previously raised the wrong + remediation ("Pass trends_linear=True to fit()") — which is + dead-end when the user already did. Now returns an empty + DataFrame. + """ + stub = self._empty_surface_stub() + df = stub.to_dataframe("linear_trends") + assert df.empty + assert "horizon" in df.columns + assert "effect" in df.columns + + def test_dcdh_empty_surface_br_rendered_prose_no_stale_guidance(self): + """PR #347 R13 P1: BR's rendered prose surfaces + (``headline()``, ``summary()``, ``full_report()``) must not + tell users to "see linear_trends_effects" on the empty- + surface subcase. Previously the schema ``reason`` was + correct but the renderers still hardcoded the populated- + surface phrasing. + """ + from diff_diff import BusinessReport + + stub = self._empty_surface_stub() + br = BusinessReport(stub, auto_diagnostics=False) + + headline = br.headline() + assert "see ``linear_trends_effects``" not in headline + assert ( + "no cumulated level effects" in headline.lower() + or "horizon surface is empty" in headline.lower() + or "survived estimation" in headline.lower() + ) + + summary = br.summary() + assert "see ``linear_trends_effects``" not in summary + + full = br.full_report() + assert "see ``linear_trends_effects``" not in full + assert ( + "no cumulated level effects" in full.lower() + or "horizon surface is empty" in full.lower() + or "survived estimation" in full.lower() + ) + + def test_dcdh_empty_surface_dr_rendered_prose_no_stale_guidance(self): + """PR #347 R13 P1: DR's overall-interpretation prose must not + tell users to "see linear_trends_effects" on the empty- + surface subcase. + """ + from diff_diff import DiagnosticReport + + stub = self._empty_surface_stub() + dr = DiagnosticReport(stub).run_all() + + prose = dr.interpretation + assert "see ``linear_trends_effects``" not in prose + assert ( + "no cumulated level effects" in prose.lower() + or "horizon surface is empty" in prose.lower() + or "survived estimation" in prose.lower() + ) + + def _empty_surface_stub_with_controls(self): + """Mirror of ``_empty_surface_stub`` with ``covariate_residuals`` + populated — the covariate-adjusted empty-surface subcase. + """ + from diff_diff.chaisemartin_dhaultfoeuille_results import ( + ChaisemartinDHaultfoeuilleResults, + ) + + return ChaisemartinDHaultfoeuilleResults( + overall_att=float("nan"), + overall_se=float("nan"), + overall_t_stat=float("nan"), + overall_p_value=float("nan"), + overall_conf_int=(float("nan"), float("nan")), + joiners_att=float("nan"), + joiners_se=float("nan"), + joiners_t_stat=float("nan"), + joiners_p_value=float("nan"), + joiners_conf_int=(float("nan"), float("nan")), + n_joiner_cells=0, + n_joiner_obs=0, + joiners_available=False, + leavers_att=float("nan"), + leavers_se=float("nan"), + leavers_t_stat=float("nan"), + leavers_p_value=float("nan"), + leavers_conf_int=(float("nan"), float("nan")), + n_leaver_cells=0, + n_leaver_obs=0, + leavers_available=False, + placebo_effect=float("nan"), + placebo_se=float("nan"), + placebo_t_stat=float("nan"), + placebo_p_value=float("nan"), + placebo_conf_int=(float("nan"), float("nan")), + placebo_available=False, + per_period_effects={}, + groups=[1, 2, 3], + time_periods=[1, 2, 3, 4], + n_obs=100, + n_treated_obs=50, + n_switcher_cells=10, + n_cohorts=2, + n_groups_dropped_crossers=0, + n_groups_dropped_singleton_baseline=0, + n_groups_dropped_never_switching=0, + L_max=2, + trends_linear=True, + linear_trends_effects=None, + # The load-bearing difference from ``_empty_surface_stub``: + # covariates are active, so every label and reason surface + # must flip to the covariate-adjusted form ``DID^{X,fd}_l``. + covariate_residuals=SimpleNamespace(), + ) + + def test_dcdh_empty_surface_with_controls_target_parameter_uses_x_fd_label(self): + """PR #347 R14 P1 regression: when ``trends_linear=True``, + ``L_max >= 2``, ``covariate_residuals`` is populated, and + ``linear_trends_effects is None`` (empty-surface subcase with + covariates), the helper definition must name + ``DID^{X,fd}_l`` — NOT bare ``DID^{fd}_l``. The R14 review + flagged the hardcoded ``DID^{fd}_l`` label on this exact + branch. + """ + stub = self._empty_surface_stub_with_controls() + tp = describe_target_parameter(stub) + defn = tp["definition"] + assert "DID^{X,fd}_l" in defn, ( + f"Covariate-adjusted empty-surface definition must use " + f"``DID^{{X,fd}}_l``. Got: {defn!r}" + ) + # The helper must NOT also emit the bare ``DID^{fd}_l`` form on + # the covariate path (that label belongs to the no-covariate + # branch only). + assert "DID^{fd}_l" not in defn.replace("DID^{X,fd}_l", ""), ( + f"Covariate-adjusted empty-surface definition must not emit " + f"the bare ``DID^{{fd}}_l`` label. Got: {defn!r}" + ) + + def test_dcdh_empty_surface_with_controls_br_dr_reason_uses_x_fd_label(self): + """PR #347 R14 P1 regression: BR's ``headline.reason`` and + DR's ``headline_metric.reason`` on the covariate-adjusted + empty-surface subcase must both name ``DID^{X,fd}_l``, not + bare ``DID^{fd}_l``. + """ + from diff_diff import BusinessReport, DiagnosticReport + + stub = self._empty_surface_stub_with_controls() + + br_reason = BusinessReport(stub, auto_diagnostics=False).to_dict()["headline"]["reason"] + assert "DID^{X,fd}_l" in br_reason, ( + f"BR headline.reason on covariate-adjusted empty-surface fit " + f"must name ``DID^{{X,fd}}_l``. Got: {br_reason!r}" + ) + assert "DID^{fd}_l" not in br_reason.replace("DID^{X,fd}_l", ""), ( + f"BR headline.reason must not emit the bare ``DID^{{fd}}_l`` " + f"label when covariates are active. Got: {br_reason!r}" + ) + + dr_reason = DiagnosticReport(stub).to_dict()["headline_metric"]["reason"] + assert "DID^{X,fd}_l" in dr_reason, ( + f"DR headline_metric.reason on covariate-adjusted empty-surface " + f"fit must name ``DID^{{X,fd}}_l``. Got: {dr_reason!r}" + ) + assert "DID^{fd}_l" not in dr_reason.replace("DID^{X,fd}_l", ""), ( + f"DR headline_metric.reason must not emit the bare " + f"``DID^{{fd}}_l`` label when covariates are active. " + f"Got: {dr_reason!r}" + ) + + def test_dcdh_empty_surface_with_controls_rendered_prose_uses_x_fd_label(self): + """PR #347 R14 P1 regression: BR's rendered prose + (``headline()``, ``full_report()``) and DR's + ``interpretation`` / full report on the covariate-adjusted + empty-surface subcase must all name ``DID^{X,fd}_l``. + """ + from diff_diff import BusinessReport, DiagnosticReport + + stub = self._empty_surface_stub_with_controls() + br = BusinessReport(stub, auto_diagnostics=False) + br_headline = br.headline() + assert "DID^{X,fd}_l" in br_headline, ( + f"BR headline() prose must name ``DID^{{X,fd}}_l`` on " + f"covariate-adjusted empty-surface fit. Got: {br_headline!r}" + ) + br_full = br.full_report() + assert "DID^{X,fd}_l" in br_full, ( + "BR full_report() prose must name ``DID^{X,fd}_l`` on " + "covariate-adjusted empty-surface fit." + ) + + dr = DiagnosticReport(stub).run_all() + assert "DID^{X,fd}_l" in dr.interpretation, ( + f"DR interpretation prose must name ``DID^{{X,fd}}_l`` on " + f"covariate-adjusted empty-surface fit. Got: " + f"{dr.interpretation!r}" + ) + + def test_dcdh_empty_surface_native_label_no_stale_guidance(self): + """PR #347 R13 P1: the dCDH result's own + ``_estimand_label()`` (surfaced via + ``to_dataframe("overall")``) must not direct users to + ``linear_trends_effects`` on the empty-surface subcase. + Instead it names the empty state. + """ + stub = self._empty_surface_stub() + label = stub._estimand_label() + assert "see linear_trends_effects" not in label + assert "no cumulated level effects survived" in label.lower() + + # to_dataframe("overall") pulls the label too. + row = stub.to_dataframe("overall").iloc[0] + assert "see linear_trends_effects" not in row["estimand"] + + def test_dcdh_empty_surface_propagates_to_assumption_and_native_label(self): + """PR #347 R10 P1 regression: the persisted ``trends_linear`` + flag must drive every downstream consumer that previously + inferred trends-linear from ``linear_trends_effects``. This + test exercises two consumers the R9 patch missed: + + (a) ``ChaisemartinDHaultfoeuilleResults._estimand_label()`` + and its dependencies (``_horizon_label``, + ``to_dataframe("overall")``) — must return a + ``DID^{fd}_l`` / ``DID^{X,fd}_l`` label rather than + ``delta`` on an empty-surface fit. + (b) ``BusinessReport._describe_assumption`` — must emit the + ``DID^{fd}_l`` identification clause rather than the + non-trends default. + + Builds a stub ``ChaisemartinDHaultfoeuilleResults`` with + ``trends_linear=True``, ``L_max=2``, and + ``linear_trends_effects=None`` (the empty-surface case). + """ + from diff_diff import BusinessReport + from diff_diff.chaisemartin_dhaultfoeuille_results import ( + ChaisemartinDHaultfoeuilleResults, + ) + + stub = ChaisemartinDHaultfoeuilleResults( + overall_att=float("nan"), + overall_se=float("nan"), + overall_t_stat=float("nan"), + overall_p_value=float("nan"), + overall_conf_int=(float("nan"), float("nan")), + joiners_att=float("nan"), + joiners_se=float("nan"), + joiners_t_stat=float("nan"), + joiners_p_value=float("nan"), + joiners_conf_int=(float("nan"), float("nan")), + n_joiner_cells=0, + n_joiner_obs=0, + joiners_available=False, + leavers_att=float("nan"), + leavers_se=float("nan"), + leavers_t_stat=float("nan"), + leavers_p_value=float("nan"), + leavers_conf_int=(float("nan"), float("nan")), + n_leaver_cells=0, + n_leaver_obs=0, + leavers_available=False, + placebo_effect=float("nan"), + placebo_se=float("nan"), + placebo_t_stat=float("nan"), + placebo_p_value=float("nan"), + placebo_conf_int=(float("nan"), float("nan")), + placebo_available=False, + per_period_effects={}, + groups=[1, 2, 3], + time_periods=[1, 2, 3, 4], + n_obs=100, + n_treated_obs=50, + n_switcher_cells=10, + n_cohorts=2, + n_groups_dropped_crossers=0, + n_groups_dropped_singleton_baseline=0, + n_groups_dropped_never_switching=0, + L_max=2, + # The load-bearing configuration: trends_linear with empty + # horizon surface. + trends_linear=True, + linear_trends_effects=None, + ) + + # (a) Native result label: must NOT be "delta", must name DID^{fd}_l. + label = stub._estimand_label() + assert "delta" not in label, ( + f"Empty-surface trends_linear fit must not be labeled delta. " + f"Got _estimand_label={label!r}" + ) + assert "DID^{fd}_l" in label or "DID^{X,fd}_l" in label, ( + f"Empty-surface trends_linear fit must name DID^{{fd}}_l. " f"Got: {label!r}" + ) + + # to_dataframe("overall") uses _estimand_label() — same guarantee. + row = stub.to_dataframe("overall").iloc[0] + assert "delta" not in row["estimand"] + + # (b) BR assumption description: must mention linear trends / + # first-differences clause. + br = BusinessReport(stub, auto_diagnostics=False) + desc = br.to_dict()["assumption"]["description"] + assert ( + "first-differenced" in desc.lower() + or "linear pre-trend" in desc.lower() + or "linear trends" in desc.lower() + ), ( + "BR's identifying-assumption description must include the " + f"linear-trends identification clause on an empty-surface " + f"trends_linear fit. Got: {desc!r}" + ) + + def test_dcdh_trends_linear_with_l_max_geq_2_fit_real(self): + """Real ``trends_linear=True`` + ``L_max>=2`` fit: the library + intentionally sets ``overall_att=NaN`` and populates the + per-horizon effects on ``linear_trends_effects`` instead + (``chaisemartin_dhaultfoeuille.py:2828-2834``). The target- + parameter block must reflect that via + ``aggregation="no_scalar_headline"`` and + ``headline_attribute is None``. PR #347 R3 P3 regression on + a live fit. + """ + import math + import warnings + + from diff_diff import ChaisemartinDHaultfoeuille + + warnings.filterwarnings("ignore") + df = self._dcdh_reversible_panel(seed=15) + fit = ChaisemartinDHaultfoeuille().fit( + df, + outcome="outcome", + group="unit", + time="period", + treatment="treated", + L_max=2, + trends_linear=True, + ) + # Real fit must intentionally produce NaN overall_att. + assert math.isnan(fit.overall_att), ( + "trends_linear + L_max>=2 must suppress overall_att (NaN by " + "design). If this test fails, the library contract changed — " + "update the ``no_scalar_headline`` branch of " + "describe_target_parameter accordingly." + ) + # linear_trends_effects must be populated (the per-horizon + # cumulated level effects that replace the scalar aggregate). + assert fit.linear_trends_effects is not None + # Target-parameter block must route through the no-scalar branch. + tp = describe_target_parameter(fit) + assert tp["aggregation"] == "no_scalar_headline" + assert tp["headline_attribute"] is None + assert "linear_trends_effects" in tp["definition"]