From 2b1e76b1164b9b3cedf9e258a82e0b76239dff3c Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 15:06:45 -0400 Subject: [PATCH 01/48] Add BusinessReport and DiagnosticReport for practitioner-ready output Implements the 'practitioner-ready output' roadmap pair: plain-English stakeholder narratives from any of the 16 fitted result types, backed by a stable AI-legible to_dict() schema (single source of truth; prose renders from the dict). BusinessReport: - summary()/full_report()/export_markdown() surface stakeholder text - to_dict()/to_json() expose the structured schema for AI agents - Optional outcome_label/outcome_unit/business_question/treatment_label for context; single-knob alpha drives CI level and phrasing threshold - auto_diagnostics=True (default) constructs an internal DiagnosticReport so the summary mentions pre-trends, sensitivity, and design effects in one call; diagnostics= overrides explicitly - Rejects BaconDecompositionResults with a helpful TypeError DiagnosticReport: - Orchestrates check_parallel_trends, compute_pretrends_power, HonestDiD.sensitivity (grid form yielding breakdown_M), bacon_decompose, compute_deff_diagnostics, results.epv_diagnostics, plus heterogeneity (CV + range + sign consistency) - Estimator-native routing: SDiD uses pre_treatment_fit + in_time_placebo + sensitivity_to_zeta_omega; EfficientDiD uses native hausman_pretest; TROP surfaces factor-model fit (effective_rank / loocv_score / lambdas) - Lazy: construction is free; run_all() triggers compute and caches - precomputed={...} escape hatch for user-supplied diagnostic results - Power-aware phrasing tiers (well/moderately/underpowered) drive the 'no_detected_violation' verdict prose rather than always hedging Docs: - New docs/methodology/REPORTING.md records design deviations via the '- **Note:**' label pattern (no-traffic-light gates, pre-trends verdict thresholds, unit-translation policy, schema stability policy) - docs/methodology/REGISTRY.md cross-links to 
REPORTING.md - New docs/api/business_report.rst and docs/api/diagnostic_report.rst registered under a new 'Reporting' section in docs/api/index.rst - docs/doc-deps.yaml tracks both modules - README adds a stakeholder-report example under 'For Data Scientists' - CHANGELOG marks both schemas experimental for this release - ROADMAP moves BR/DR to Recently Shipped; splits the original bundled bullet so context-aware practitioner_next_steps() remains queued - diff_diff/guides/llms-full.txt documents the public API and both schemas for AI agents; llms-practitioner.txt notes that DR covers Baker steps 3/6/7 in one call Tests: 61 new tests (32 BR + 29 DR) cover schema contract, applicability matrix across all 16 result types, JSON round-trip, precomputed-sensitivity passthrough (no re-compute), error handling, power-tier/verdict boundaries, unit-label behavior, significance-chasing guard, NaN ATT, include_appendix toggle, BaconDecompositionResults TypeError, survey metadata passthrough, and alpha single-knob behavior. All pass in 0.26s under pytest. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 3 + README.md | 29 + ROADMAP.md | 4 +- diff_diff/__init__.py | 16 + diff_diff/business_report.py | 1171 ++++++++++++++++ diff_diff/diagnostic_report.py | 1765 ++++++++++++++++++++++++ diff_diff/guides/llms-full.txt | 112 ++ diff_diff/guides/llms-practitioner.txt | 36 + docs/api/business_report.rst | 52 + docs/api/diagnostic_report.rst | 51 + docs/api/index.rst | 9 + docs/doc-deps.yaml | 30 + docs/methodology/REGISTRY.md | 13 + docs/methodology/REPORTING.md | 141 ++ tests/test_business_report.py | 484 +++++++ tests/test_diagnostic_report.py | 462 +++++++ 16 files changed, 4376 insertions(+), 2 deletions(-) create mode 100644 diff_diff/business_report.py create mode 100644 diff_diff/diagnostic_report.py create mode 100644 docs/api/business_report.rst create mode 100644 docs/api/diagnostic_report.rst create mode 100644 docs/methodology/REPORTING.md create mode 100644 tests/test_business_report.py create mode 100644 tests/test_diagnostic_report.py diff --git a/CHANGELOG.md b/CHANGELOG.md index c24e1b73..3ae8a124 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +### Added +- **`BusinessReport` and `DiagnosticReport` (experimental preview)** - practitioner-ready output layer. `BusinessReport(results, ...)` produces plain-English narrative summaries (`.summary()`, `.full_report()`, `.export_markdown()`, `.to_dict()`) from any of the 16 fitted result types. `DiagnosticReport(results, ...)` orchestrates the existing diagnostic battery (parallel trends, pre-trends power, HonestDiD sensitivity, Goodman-Bacon, heterogeneity, design-effect, EPV) plus estimator-native diagnostics for SyntheticDiD (`pre_treatment_fit`, weight concentration, in-time placebo, zeta sensitivity) and TROP (factor-model fit metrics). 
Both classes expose an AI-legible `to_dict()` schema (single source of truth; prose renders from the dict). BR auto-constructs DR by default so summaries mention pre-trends, robustness, and design-effect findings in one call. See `docs/methodology/REPORTING.md` for methodology deviations including the no-traffic-light-gates decision, pre-trends verdict thresholds (0.05 / 0.30), and power-aware phrasing driven by `compute_pretrends_power`. **Both schemas are marked experimental in this release** - wording, verdict thresholds, and schema shape will change; do not anchor downstream tooling on them yet. + ## [3.1.2] - 2026-04-18 ### Fixed diff --git a/README.md b/README.md index f6b32a66..b9daf68a 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,35 @@ Measuring campaign lift? Evaluating a product launch? diff-diff handles the caus - **[Brand awareness survey tutorial](docs/tutorials/17_brand_awareness_survey.ipynb)** - Full example with complex survey design, brand funnel analysis, and staggered rollouts - **Have BRFSS/ACS/CPS individual records?** Use [`aggregate_survey()`](docs/api/prep.rst) to roll respondent-level microdata into a geographic-period panel with inverse-variance precision weights. 
The returned second-stage design uses analytic weights (`aweight`), so it works directly with `DifferenceInDifferences`, `TwoWayFixedEffects`, `MultiPeriodDiD`, `SunAbraham`, `ContinuousDiD`, and `EfficientDiD` (estimators marked **Full** in the [survey support matrix](docs/choosing_estimator.rst)) +### Stakeholder-ready report from any fit + +Wrap any fitted result in `BusinessReport` for a plain-English stakeholder summary; pair with `DiagnosticReport` for a validity check: + +```python +from diff_diff import CallawaySantAnna, BusinessReport + +cs = CallawaySantAnna().fit( + df, outcome="revenue", unit="store", time="month", + first_treat="first_treat", aggregate="event_study", +) +report = BusinessReport( + cs, + outcome_label="Revenue per store", + outcome_unit="$", + business_question="Did the loyalty program lift revenue?", + treatment_label="the loyalty program", +) +print(report.summary()) +# "The loyalty program increased Revenue per store by $1.78 (95% CI: $1.56 to $2.00). +# Statistically, the direction of the effect is strongly supported by the data. +# Pre-treatment data do not reject parallel trends, but the test has limited +# power — a non-rejection does not prove the assumption. See the sensitivity +# analysis below for a more reliable signal. +# Sample: 600 observations (70 treated, 30 control)." +``` + +`BusinessReport` auto-constructs a `DiagnosticReport` by default so the summary mentions pre-trends, robustness, and design-effect findings in one call. `.to_dict()` returns the same content as a stable AI-legible schema (single source of truth; prose is rendered from the dict). See [docs/methodology/REPORTING.md](docs/methodology/REPORTING.md) for the phrasing rules, verdict thresholds, and schema stability policy. **Schema is experimental in this release.** + Already know DiD? The [academic quickstart](docs/quickstart.rst) and [estimator guide](docs/choosing_estimator.rst) cover the full technical details. 
## Features diff --git a/ROADMAP.md b/ROADMAP.md index c7abe633..84d13139 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -57,6 +57,7 @@ See [Survey Design Support](docs/choosing_estimator.rst#survey-design-support) f Major landings since the prior roadmap revision. See [CHANGELOG.md](CHANGELOG.md) for the full history. +- **`BusinessReport` and `DiagnosticReport`** - practitioner-ready output layer. Plain-English stakeholder summaries + unified diagnostic runner with a stable AI-legible `to_dict()` schema. `BusinessReport` auto-constructs `DiagnosticReport` by default so summaries mention pre-trends, robustness, and design-effect findings in one call. Estimator-native validation surfaces are routed through: SyntheticDiD uses `pre_treatment_fit` / `in_time_placebo` / `sensitivity_to_zeta_omega`; EfficientDiD uses its native `hausman_pretest`; TROP exposes factor-model fit metrics. See `docs/methodology/REPORTING.md` for methodology deviations including no-traffic-light gates, pre-trends verdict thresholds, and power-aware phrasing. - **ChaisemartinDHaultfoeuille (dCDH)** - full feature set: `DID_M` contemporaneous-switch, multi-horizon `DID_l` event study, analytical SE, multiplier bootstrap, TWFE decomposition diagnostic, dynamic placebos, normalized estimator, cost-benefit aggregate, sup-t bands, covariate adjustment (`DID^X`), group-specific linear trends (`DID^{fd}`), state-set-specific trends, heterogeneity testing, non-binary treatment, HonestDiD integration, and survey support (TSL + pweight). - **SyntheticDiD jackknife variance** (`variance_method='jackknife'`) with survey-weighted jackknife. - **SyntheticDiD validation diagnostics**. @@ -78,8 +79,7 @@ Queued work, ordered by expected leverage. Each item is its own PR. Ordering is ### Practitioner-ready output -- **`BusinessReport` class.** Plain-English summaries of any estimator's results with markdown export. Optional rich formatting via a `[reporting]` extra; core remains numpy/pandas/scipy only. 
Turns raw coefficients into stakeholder-ready artifacts. -- **`DiagnosticReport` with context-aware `practitioner_next_steps()`.** Unified diagnostic runner that bundles parallel-trends, placebo, HonestDiD, Bacon decomposition, DEFF, EPV, and power diagnostics into one plain-English report. `practitioner_next_steps()` substitutes actual column names from fitted results instead of generic placeholders. +- **Context-aware `practitioner_next_steps()`.** Substitutes actual column names from fitted results instead of generic placeholders, so next-step guidance is executable rather than illustrative. (Standalone follow-up to the `BusinessReport` / `DiagnosticReport` landing above; tracked under the AI-Agent Track too.) ### Practitioner tutorials diff --git a/diff_diff/__init__.py b/diff_diff/__init__.py index 3ae225b0..6b455c35 100644 --- a/diff_diff/__init__.py +++ b/diff_diff/__init__.py @@ -202,6 +202,16 @@ plot_synth_weights, ) from diff_diff.practitioner import practitioner_next_steps +from diff_diff.business_report import ( + BUSINESS_REPORT_SCHEMA_VERSION, + BusinessContext, + BusinessReport, +) +from diff_diff.diagnostic_report import ( + DIAGNOSTIC_REPORT_SCHEMA_VERSION, + DiagnosticReport, + DiagnosticReportResults, +) from diff_diff._guides_api import get_llm_guide from diff_diff.datasets import ( clear_cache, @@ -405,6 +415,12 @@ "clear_cache", # Practitioner guidance "practitioner_next_steps", + "BusinessReport", + "BusinessContext", + "BUSINESS_REPORT_SCHEMA_VERSION", + "DiagnosticReport", + "DiagnosticReportResults", + "DIAGNOSTIC_REPORT_SCHEMA_VERSION", # LLM guide accessor "get_llm_guide", ] diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py new file mode 100644 index 00000000..bdd072f8 --- /dev/null +++ b/diff_diff/business_report.py @@ -0,0 +1,1171 @@ +""" +BusinessReport — plain-English stakeholder narrative from any diff-diff result.
+ +Wraps any of the 16 fitted result types and produces: + +- ``summary()``: a short paragraph block suitable for an email or Slack message. +- ``full_report()``: a multi-section markdown report with headline, assumptions, + pre-trends, main result, robustness, sample, and an optional academic appendix. +- ``to_dict()``: a stable AI-legible structured schema (single source of truth — + prose is rendered from this dict, not templated alongside it). + +Design principles: + +- Plain English, not academic jargon. The library ships this in addition to, not + in place of, the estimator's existing ``results.summary()`` academic output. +- No new statistical computation. Every reported number is either read from + ``results`` or computed by an existing diff-diff utility function; no p-value + or variance is re-derived here. +- Optional business context via keyword args (``outcome_label``, ``outcome_unit``, + ``business_question``, ``treatment_label``). Without them, BusinessReport uses + generic fallbacks — the zero-config path works. +- Diagnostic integration is implicit by default: ``BusinessReport(results)`` + auto-constructs a ``DiagnosticReport`` so the summary can mention pre-trends, + robustness, and design-effect findings. Pass ``auto_diagnostics=False`` or an + explicit ``diagnostics=`` object to override. + +Methodology deviations (no traffic-light gates, pre-trends verdict thresholds, +power-aware phrasing, unit-translation policy, schema stability) are documented +in ``docs/methodology/REPORTING.md``. The ``to_dict()`` schema is marked +experimental in v3.2. 
+""" + +from __future__ import annotations + +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Union + +import numpy as np + +from diff_diff.diagnostic_report import DiagnosticReport, DiagnosticReportResults + +BUSINESS_REPORT_SCHEMA_VERSION = "1.0" + +__all__ = [ + "BusinessReport", + "BusinessContext", + "BUSINESS_REPORT_SCHEMA_VERSION", +] + +# Recognized ``outcome_unit`` values mapped to a coarse "kind" used by the +# formatter. Unrecognized strings are accepted and rendered verbatim without +# arithmetic translation (``unit_kind = "unknown"``). +_UNIT_KINDS: Dict[str, str] = { + "$": "currency", + "usd": "currency", + "%": "percent", + "pp": "percentage_points", + "percentage_points": "percentage_points", + "percent": "percent", + "log_points": "log_points", + "log": "log_points", + "count": "count", + "users": "count", +} + + +@dataclass(frozen=True) +class BusinessContext: + """Frozen bundle of business-framing metadata used when rendering prose. + + Populated from ``BusinessReport`` constructor kwargs. Falls back to + neutral labels when fields are not supplied. + """ + + outcome_label: str + outcome_unit: Optional[str] + outcome_direction: Optional[str] + business_question: Optional[str] + treatment_label: str + alpha: float + + +class BusinessReport: + """Produce a stakeholder-ready narrative from any diff-diff results object. + + Parameters + ---------- + results : Any + A fitted diff-diff results object. Any of the 16 result types is + accepted. ``BaconDecompositionResults`` is not a valid input — Bacon + is a diagnostic, not an estimator; use ``DiagnosticReport`` for that. + outcome_label : str, optional + Stakeholder-friendly outcome name (e.g. ``"Revenue per user"``). + outcome_unit : str, optional + Unit label: ``"$"`` / ``"%"`` / ``"pp"`` / ``"log_points"`` / ``"count"`` + (recognized for formatting) or any free-form string (used verbatim + without arithmetic translation). 
+ outcome_direction : str, optional + ``"higher_is_better"`` or ``"lower_is_better"``. Drives whether the + effect is described as "lift" / "drag" rather than just "increase" / + "decrease". + business_question : str, optional + Question the analysis answers (prepended to the summary). + treatment_label : str, optional + Stakeholder-friendly treatment name (e.g. ``"the campaign"``). + alpha : float, optional + Significance level. Defaults to ``results.alpha`` when not supplied. + Single knob: drives both CI level and significance phrasing. + honest_did_results : HonestDiDResults or SensitivityResults, optional + Pre-computed sensitivity result. When supplied, this is forwarded to + the internal ``DiagnosticReport`` so sensitivity is not re-computed. + auto_diagnostics : bool, default True + When ``True`` and ``diagnostics`` is ``None``, auto-construct a + ``DiagnosticReport``. Set ``False`` to skip diagnostics entirely. + diagnostics : DiagnosticReport or DiagnosticReportResults, optional + Explicit diagnostics object. Takes precedence over ``auto_diagnostics``. + include_appendix : bool, default True + Whether ``full_report()`` appends the estimator's academic + ``results.summary()`` output under a "Technical Appendix" section. 
+ """ + + def __init__( + self, + results: Any, + *, + outcome_label: Optional[str] = None, + outcome_unit: Optional[str] = None, + outcome_direction: Optional[str] = None, + business_question: Optional[str] = None, + treatment_label: Optional[str] = None, + alpha: Optional[float] = None, + honest_did_results: Optional[Any] = None, + auto_diagnostics: bool = True, + diagnostics: Optional[Union[DiagnosticReport, DiagnosticReportResults]] = None, + include_appendix: bool = True, + ): + if type(results).__name__ == "BaconDecompositionResults": + raise TypeError( + "BaconDecompositionResults is a diagnostic, not an estimator; " + "wrap the underlying estimator with BusinessReport and pass the " + "Bacon object to DiagnosticReport(precomputed={'bacon': ...})." + ) + + if diagnostics is not None and not isinstance( + diagnostics, (DiagnosticReport, DiagnosticReportResults) + ): + raise TypeError( + "diagnostics= must be a DiagnosticReport or " + "DiagnosticReportResults instance; " + f"got {type(diagnostics).__name__}." 
+ ) + + self._results = results + self._honest_did_results = honest_did_results + self._auto_diagnostics = auto_diagnostics + self._diagnostics_arg = diagnostics + self._include_appendix = include_appendix + + resolved_alpha = alpha if alpha is not None else getattr(results, "alpha", 0.05) + self._context = BusinessContext( + outcome_label=outcome_label or "the outcome", + outcome_unit=outcome_unit, + outcome_direction=outcome_direction, + business_question=business_question, + treatment_label=treatment_label or "the treatment", + alpha=float(resolved_alpha), + ) + + self._cached_schema: Optional[Dict[str, Any]] = None + + # -- Public API --------------------------------------------------------- + + def to_dict(self) -> Dict[str, Any]: + """Return the AI-legible structured schema (single source of truth).""" + if self._cached_schema is None: + self._cached_schema = self._build_schema() + return self._cached_schema + + def to_json(self, *, indent: int = 2) -> str: + """Return ``to_dict()`` serialized as JSON.""" + import json + + return json.dumps(self.to_dict(), indent=indent) + + def summary(self) -> str: + """Return a short plain-English paragraph block (6-10 sentences).""" + return _render_summary(self.to_dict()) + + def full_report(self) -> str: + """Return a structured multi-section markdown report.""" + base = _render_full_report(self.to_dict()) + if self._include_appendix: + try: + appendix = self._results.summary() + except Exception: # noqa: BLE001 + appendix = None + if appendix: + base = base + "\n\n## Technical Appendix\n\n```\n" + str(appendix) + "\n```\n" + return base + + def export_markdown(self) -> str: + """Alias for ``full_report()`` (discoverability).""" + return self.full_report() + + def headline(self) -> str: + """Return just the headline sentence.""" + return _render_headline_sentence(self.to_dict()) + + def caveats(self) -> List[Dict[str, str]]: + """Return the list of structured caveats (severity + topic + message).""" + return 
list(self.to_dict().get("caveats", [])) + + def __repr__(self) -> str: + estimator = type(self._results).__name__ + headline = self.to_dict().get("headline") or {} + val = headline.get("effect") + if isinstance(val, (int, float)) and np.isfinite(val): + return f"BusinessReport(results={estimator}, effect={val:.3g})" + return f"BusinessReport(results={estimator})" + + def __str__(self) -> str: + return self.summary() + + # -- Implementation detail --------------------------------------------- + + def _resolve_diagnostics(self) -> Optional[DiagnosticReportResults]: + """Return the DiagnosticReportResults to embed, or ``None`` if skipped.""" + if self._diagnostics_arg is not None: + if isinstance(self._diagnostics_arg, DiagnosticReportResults): + return self._diagnostics_arg + if isinstance(self._diagnostics_arg, DiagnosticReport): + return self._diagnostics_arg.run_all() + raise TypeError("diagnostics= must be a DiagnosticReport or DiagnosticReportResults") + if not self._auto_diagnostics: + return None + precomputed: Dict[str, Any] = {} + if self._honest_did_results is not None: + precomputed["sensitivity"] = self._honest_did_results + dr = DiagnosticReport( + self._results, + alpha=self._context.alpha, + precomputed=precomputed or None, + outcome_label=self._context.outcome_label, + treatment_label=self._context.treatment_label, + ) + return dr.run_all() + + def _build_schema(self) -> Dict[str, Any]: + """Assemble the structured schema. + + Pulls validation content (PT, sensitivity, Bacon, DEFF, EPV, ...) from + the internal ``DiagnosticReport``; extracts the stakeholder-facing + headline and sample metadata from the fitted result itself. 
+ """ + estimator_name = type(self._results).__name__ + diagnostics_results = self._resolve_diagnostics() + dr_schema: Optional[Dict[str, Any]] = ( + diagnostics_results.schema if diagnostics_results is not None else None + ) + + headline = self._extract_headline(dr_schema) + sample = self._extract_sample() + heterogeneity = _lift_heterogeneity(dr_schema) + pre_trends = _lift_pre_trends(dr_schema) + sensitivity = _lift_sensitivity(dr_schema) + robustness = _lift_robustness(dr_schema) + assumption = _describe_assumption(estimator_name) + next_steps = (dr_schema or {}).get("next_steps", []) + caveats = _build_caveats(self._results, headline, sample, dr_schema) + references = _references_for(estimator_name) + + if diagnostics_results is None: + diagnostics_block: Dict[str, Any] = { + "status": "skipped", + "reason": "auto_diagnostics=False", + } + else: + diagnostics_block = { + "status": "ran", + "schema": dr_schema, + "overall_interpretation": ( + dr_schema.get("overall_interpretation", "") if dr_schema is not None else "" + ), + } + + return { + "schema_version": BUSINESS_REPORT_SCHEMA_VERSION, + "estimator": { + "class_name": estimator_name, + "display_name": estimator_name, + }, + "context": { + "outcome_label": self._context.outcome_label, + "outcome_unit": self._context.outcome_unit, + "outcome_direction": self._context.outcome_direction, + "business_question": self._context.business_question, + "treatment_label": self._context.treatment_label, + "alpha": self._context.alpha, + }, + "headline": headline, + "assumption": assumption, + "pre_trends": pre_trends, + "sensitivity": sensitivity, + "sample": sample, + "heterogeneity": heterogeneity, + "robustness": robustness, + "diagnostics": diagnostics_block, + "next_steps": next_steps, + "caveats": caveats, + "references": references, + } + + def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, Any]: + """Extract the headline effect + CI + p-value from the result.""" + r = self._results + 
+ att: Optional[float] = None + se: Optional[float] = None + p: Optional[float] = None + ci: Optional[List[float]] = None + alpha = self._context.alpha + + for name in ("overall_att", "avg_att", "att"): + val = getattr(r, name, None) + if val is None: + continue + att = _safe_float(val) + se = _safe_float( + getattr( + r, + { + "overall_att": "overall_se", + "avg_att": "avg_se", + "att": "se", + }[name], + None, + ) + ) + p = _safe_float( + getattr( + r, + { + "overall_att": "overall_p_value", + "avg_att": "avg_p_value", + "att": "p_value", + }[name], + None, + ) + ) + ci = _safe_ci( + getattr( + r, + { + "overall_att": "overall_conf_int", + "avg_att": "avg_conf_int", + "att": "conf_int", + }[name], + None, + ) + ) + break + + unit = self._context.outcome_unit + unit_kind = _UNIT_KINDS.get(unit.lower() if unit else "", "unknown") + sign = ( + "positive" + if (att is not None and att > 0) + else ( + "negative" + if (att is not None and att < 0) + else ("null" if att == 0 else "undefined") + ) + ) + if att is None or not np.isfinite(att): + sign = "undefined" + ci_level = int(round((1.0 - alpha) * 100)) + is_significant = bool(p is not None and np.isfinite(p) and p < alpha)  # bool(): np.isfinite yields np.bool_, which json.dumps rejects (NaN/inf p-value) + near_threshold = bool(p is not None and np.isfinite(p) and (alpha - 0.01) < p < (alpha + 0.001))  # same coercion keeps to_json() working + # Use DR-computed breakdown_M if available for quick reference.
+ breakdown_M: Optional[float] = None + if dr_schema: + sens_section = dr_schema.get("sensitivity") or {} + if sens_section.get("status") == "ran": + breakdown_M = sens_section.get("breakdown_M") + + return { + "effect": att, + "se": se, + "ci_lower": ci[0] if ci else None, + "ci_upper": ci[1] if ci else None, + "ci_level": ci_level, + "p_value": p, + "is_significant": is_significant, + "near_significance_threshold": near_threshold, + "unit": unit, + "unit_kind": unit_kind, + "sign": sign, + "breakdown_M": breakdown_M, + } + + def _extract_sample(self) -> Dict[str, Any]: + """Extract sample metadata from the fitted result.""" + r = self._results + survey = self._extract_survey_block() + return { + "n_obs": _safe_int(getattr(r, "n_obs", None)), + "n_treated": _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None))), + "n_control": _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None))), + "n_periods": _safe_int(getattr(r, "n_periods", None)), + "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)), + "post_periods": _safe_list_len(getattr(r, "post_periods", None)), + "survey": survey, + } + + def _extract_survey_block(self) -> Optional[Dict[str, Any]]: + sm = getattr(self._results, "survey_metadata", None) + if sm is None: + return None + deff = _safe_float(getattr(sm, "design_effect", None)) + return { + "weight_type": getattr(sm, "weight_type", None), + "effective_n": _safe_float(getattr(sm, "effective_n", None)), + "design_effect": deff, + "is_trivial": deff is not None and 0.95 <= deff <= 1.05, + "n_strata": _safe_int(getattr(sm, "n_strata", None)), + "n_psu": _safe_int(getattr(sm, "n_psu", None)), + "df_survey": _safe_int(getattr(sm, "df_survey", None)), + "replicate_method": getattr(sm, "replicate_method", None), + } + + +# --------------------------------------------------------------------------- +# Schema helpers (module-private) +# --------------------------------------------------------------------------- +def 
_safe_float(val: Any) -> Optional[float]: + if val is None: + return None + try: + return float(val) + except (TypeError, ValueError, OverflowError):  # OverflowError: float() of an int too large for a double + return None + + +def _safe_int(val: Any) -> Optional[int]: + if val is None: + return None + try: + return int(val) + except (TypeError, ValueError, OverflowError):  # ValueError covers NaN; OverflowError covers int(float("inf")) + return None + + +def _safe_ci(ci: Any) -> Optional[List[float]]: + if ci is None: + return None + try: + lo, hi = ci + except (TypeError, ValueError): + return None + lo_f = _safe_float(lo) + hi_f = _safe_float(hi) + if lo_f is None or hi_f is None: + return None + return [lo_f, hi_f] + + +def _safe_list_len(val: Any) -> Optional[int]: + if val is None: + return None + try: + return int(len(val)) + except TypeError: + return None + + +def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: + """Pull pre-trends + power into a single BR-facing block.""" + if dr is None: + return {"status": "skipped", "reason": "auto_diagnostics=False"} + pt = dr.get("parallel_trends") or {} + pp = dr.get("pretrends_power") or {} + if pt.get("status") != "ran": + return { + "status": pt.get("status", "not_run"), + "reason": pt.get("reason"), + } + return { + "status": "computed", + "method": pt.get("method"), + "joint_p_value": pt.get("joint_p_value"), + "verdict": pt.get("verdict"), + "n_pre_periods": pt.get("n_pre_periods"), + "power_status": pp.get("status"), + "power_tier": pp.get("tier"), + "mdv": pp.get("mdv"), + "mdv_share_of_att": pp.get("mdv_share_of_att"), + } + + +def _lift_sensitivity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: + if dr is None: + return {"status": "skipped", "reason": "auto_diagnostics=False"} + sens = dr.get("sensitivity") or {} + if sens.get("status") != "ran": + return { + "status": sens.get("status", "not_run"), + "reason": sens.get("reason"), + } + return { + "status": "computed", + "method": sens.get("method"), + "breakdown_M": sens.get("breakdown_M"), + "conclusion": sens.get("conclusion"), + "grid": sens.get("grid"), + } + + +def
_lift_heterogeneity(dr: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: + if dr is None: + return None + het = dr.get("heterogeneity") or {} + if het.get("status") != "ran": + return None + return { + "source": het.get("source"), + "n_effects": het.get("n_effects"), + "min": het.get("min"), + "max": het.get("max"), + "cv": het.get("cv"), + "sign_consistent": het.get("sign_consistent"), + } + + +def _lift_robustness(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: + if dr is None: + return {"status": "skipped", "reason": "auto_diagnostics=False"} + bacon = dr.get("bacon") or {} + native = dr.get("estimator_native_diagnostics") or {} + return { + "bacon": { + "status": bacon.get("status"), + "forbidden_weight": bacon.get("forbidden_weight"), + "verdict": bacon.get("verdict"), + }, + "estimator_native": { + "status": native.get("status"), + "pre_treatment_fit": native.get("pre_treatment_fit"), + }, + } + + +def _describe_assumption(estimator_name: str) -> Dict[str, Any]: + """Return the identifying-assumption block for an estimator.""" + if estimator_name in { + "SyntheticDiDResults", + }: + return { + "parallel_trends_variant": "weighted_pt", + "no_anticipation": True, + "description": ( + "Synthetic-Difference-in-Differences identifies the ATT under a " + "weighted parallel-trends analogue: the synthetic control is " + "chosen to match the treated group's pre-period trajectory." + ), + } + if estimator_name in {"TROPResults"}: + return { + "parallel_trends_variant": "factor_model", + "no_anticipation": True, + "description": ( + "TROP uses low-rank factor-model identification rather than a " + "parallel-trends assumption; unobserved heterogeneity is " + "captured through latent factor loadings." 
+ ), + } + if estimator_name in { + "CallawaySantAnnaResults", + "SunAbrahamResults", + "ImputationDiDResults", + "TwoStageDiDResults", + "StackedDiDResults", + "EfficientDiDResults", + "WooldridgeDiDResults", + "ChaisemartinDHaultfoeuilleResults", + "StaggeredTripleDiffResults", + }: + return { + "parallel_trends_variant": "conditional_or_group_time", + "no_anticipation": True, + "description": ( + "Identification relies on parallel trends across treatment " + "cohorts and time periods (group-time ATT), plus no " + "anticipation." + ), + } + return { + "parallel_trends_variant": "unconditional", + "no_anticipation": True, + "description": ( + "Identification relies on the standard DiD parallel-trends " + "assumption plus no anticipation of treatment by either group." + ), + } + + +def _build_caveats( + _results: Any, + headline: Dict[str, Any], + sample: Dict[str, Any], + dr_schema: Optional[Dict[str, Any]], +) -> List[Dict[str, Any]]: + """Assemble the plain-English caveats list for the headline schema.""" + caveats: List[Dict[str, Any]] = [] + + # NaN ATT is the highest-severity caveat. + if headline.get("sign") == "undefined": + caveats.append( + { + "severity": "warning", + "topic": "estimation_failure", + "message": ( + "Estimation produced a non-finite effect. Inspect data " + "preparation and model specification before interpreting." + ), + } + ) + + # Near-threshold p-value. + if headline.get("near_significance_threshold"): + caveats.append( + { + "severity": "info", + "topic": "near_significance", + "message": ( + "The p-value is close to the conventional significance " + "threshold; small changes to the sample or specification " + "could move it either way." + ), + } + ) + + # Few treated units. 
+ nt = sample.get("n_treated") + if nt is not None and nt <= 3: + caveats.append( + { + "severity": "warning", + "topic": "few_treated", + "message": ( + f"Only {nt} treated units in this fit; standard errors " + "rely on large-cluster asymptotics and may be unreliable. " + "Consider SyntheticDiD or an exact-permutation inference " + "alternative." + ), + } + ) + + # Non-trivial design effect. + survey = sample.get("survey") + if survey and not survey.get("is_trivial"): + deff = survey.get("design_effect") + eff_n = survey.get("effective_n") + if isinstance(deff, (int, float)) and deff >= 5.0: + caveats.append( + { + "severity": "warning", + "topic": "design_effect", + "message": ( + f"Very large survey design effect (DEFF = {deff:.2g}). " + "Inspect the weight distribution and consider weight " + "trimming if driven by outlier weights." + ), + } + ) + elif isinstance(deff, (int, float)) and deff >= 1.5: + if isinstance(eff_n, (int, float)): + caveats.append( + { + "severity": "info", + "topic": "design_effect", + "message": ( + f"Survey design reduces effective sample size: " + f"DEFF = {deff:.2g}; effective n = {eff_n:.0f}." + ), + } + ) + + # Bacon forbidden comparisons. + if dr_schema: + bacon = dr_schema.get("bacon") or {} + if bacon.get("status") == "ran": + fw = bacon.get("forbidden_weight") + if isinstance(fw, (int, float)) and fw > 0.10: + caveats.append( + { + "severity": "warning", + "topic": "bacon_contamination", + "message": ( + f"Goodman-Bacon decomposition places {fw:.0%} " + "of implicit TWFE weight on 'forbidden' " + "later-vs-earlier comparisons. TWFE may be " + "materially biased under heterogeneous effects. " + "Re-estimate with a heterogeneity-robust " + "estimator (CS / SA / BJS / Gardner)." + ), + } + ) + + # Fragile sensitivity. 
+ sens = dr_schema.get("sensitivity") or {} + if sens.get("status") == "ran": + bkd = sens.get("breakdown_M") + if isinstance(bkd, (int, float)) and bkd < 0.5: + caveats.append( + { + "severity": "warning", + "topic": "sensitivity_fragility", + "message": ( + f"HonestDiD breakdown value is {bkd:.2g}: the " + "result's confidence interval includes zero " + "once parallel-trends violations reach less than " + "half the observed pre-period variation. Treat " + "the headline as tentative." + ), + } + ) + + # Unit mismatch caveat (log_points + unit override). + unit_kind = headline.get("unit_kind") + if unit_kind == "log_points": + caveats.append( + { + "severity": "info", + "topic": "unit_policy", + "message": ( + "The effect is reported in log-points as estimated; " + "BusinessReport does not arithmetically translate log-points " + "to percent or level changes. For small effects, log-points " + "approximate percentage changes." + ), + } + ) + return caveats + + +def _references_for(estimator_name: str) -> List[Dict[str, str]]: + """Map the estimator to the appropriate citation references.""" + base = [ + { + "role": "sensitivity", + "citation": ( + "Rambachan, A., & Roth, J. (2023). A More Credible Approach " + "to Parallel Trends. Review of Economic Studies." + ), + }, + { + "role": "workflow", + "citation": ( + "Baker, A. C., Callaway, B., Cunningham, S., Goodman-Bacon, A., " + "& Sant'Anna, P. H. C. (2025). Difference-in-Differences " + "Designs: A Practitioner's Guide." + ), + }, + ] + estimator_refs = { + "CallawaySantAnnaResults": { + "role": "estimator", + "citation": ( + "Callaway, B., & Sant'Anna, P. H. C. (2021). " + "Difference-in-Differences with multiple time periods. " + "Journal of Econometrics." + ), + }, + "SyntheticDiDResults": { + "role": "estimator", + "citation": ( + "Arkhangelsky, D., Athey, S., Hirshberg, D. A., Imbens, G. W., " + "& Wager, S. (2021). Synthetic Difference in Differences." 
+ ), + }, + "SunAbrahamResults": { + "role": "estimator", + "citation": ( + "Sun, L., & Abraham, S. (2021). Estimating dynamic treatment " + "effects in event studies. Journal of Econometrics." + ), + }, + "ImputationDiDResults": { + "role": "estimator", + "citation": ( + "Borusyak, K., Jaravel, X., & Spiess, J. (2024). " "Revisiting event-study designs." + ), + }, + "EfficientDiDResults": { + "role": "estimator", + "citation": ( + "Chen, X., Sant'Anna, P. H. C., & Xie, H. (2025). " + "Efficient Estimation of Treatment Effects in Staggered " + "DiD Designs." + ), + }, + "ChaisemartinDHaultfoeuilleResults": { + "role": "estimator", + "citation": ( + "de Chaisemartin, C., & D'Haultfœuille, X. (2020). " + "Two-way fixed effects estimators with heterogeneous " + "treatment effects. American Economic Review." + ), + }, + } + if estimator_name in estimator_refs: + return [estimator_refs[estimator_name]] + base + return base + + +# --------------------------------------------------------------------------- +# Prose rendering +# --------------------------------------------------------------------------- +def _format_value(value: Optional[float], unit: Optional[str], unit_kind: str) -> str: + """Format a numeric effect with its unit. No arithmetic translation.""" + if value is None or not np.isfinite(value): + return "undefined" + if unit_kind == "currency": + sign = "-" if value < 0 else "" + return f"{sign}${abs(value):,.2f}" + if unit_kind == "percent": + return f"{value:.2f}%" + if unit_kind == "percentage_points": + return f"{value:.2f} pp" + if unit_kind == "log_points": + return f"{value:.3g} log-points" + if unit_kind == "count": + return f"{value:,.0f}" + # unknown / free-form + if unit: + return f"{value:.3g} {unit}" + return f"{value:.3g}" + + +def _significance_phrase(p: Optional[float], alpha: float) -> str: + """Return a plain-English significance phrase. 
+ + Tiers per ``docs/methodology/REPORTING.md``: + * p < 0.001: "strongly supported by the data" + * 0.001 <= p < 0.01: "well-supported" + * 0.01 <= p < alpha: "statistically significant at the X% level" + * alpha <= p < 0.10: CI-includes-zero language + * p >= 0.10: consistent-with-no-effect language + """ + if p is None or not np.isfinite(p): + return "statistical significance cannot be assessed (p-value unavailable)" + ci_level = int(round((1.0 - alpha) * 100)) + if p < 0.001: + return "the direction of the effect is strongly supported by the data" + if p < 0.01: + return "the direction of the effect is well-supported by the data" + if p < alpha: + return f"the effect is statistically significant at the {ci_level}% level" + if p < 0.10: + return ( + "the confidence interval includes zero; the direction is suggestive " + "but not statistically significant" + ) + return "the confidence interval includes zero; the data are consistent with no effect" + + +def _render_headline_sentence(schema: Dict[str, Any]) -> str: + """Render the headline sentence from the schema. + + Uses the absolute value in the magnitude slot when the verb already + conveys direction ("decreased ... by $0.14" rather than "decreased ... + by -$0.14"). CI bounds are rendered at their natural signed values. + """ + ctx = schema.get("context", {}) + h = schema.get("headline", {}) + effect = h.get("effect") + outcome = ctx.get("outcome_label", "the outcome") + treatment = ctx.get("treatment_label", "the treatment") + unit = h.get("unit") + unit_kind = h.get("unit_kind", "unknown") + + if effect is None or not np.isfinite(effect): + return ( + f"We were unable to produce a finite estimate of {treatment}'s " + f"effect on {outcome}. Inspect the data and model specification." 
def _render_summary(schema: Dict[str, Any]) -> str:
    """Render the short-form stakeholder summary paragraph.

    Sentences are emitted in a fixed order: optional business question,
    headline, significance, pre-trends, sensitivity, sample, survey design,
    and the first ``"warning"``-severity caveat. Sections whose ``status`` is
    not ``"computed"`` contribute nothing.

    Parameters
    ----------
    schema : dict
        The BusinessReport schema. Reads the ``context``, ``headline``,
        ``pre_trends``, ``sensitivity``, ``sample`` and ``caveats`` sections;
        a section that is missing or ``None`` is treated as empty.

    Returns
    -------
    str
        The sentences joined with single spaces.
    """

    def _is_finite_number(value: Any) -> bool:
        # NaN/inf would render as "nan"/"inf" inside prose, so they are
        # treated as unavailable. bool is excluded even though it subclasses
        # int.
        return (
            isinstance(value, (int, float, np.integer, np.floating))
            and not isinstance(value, bool)
            and bool(np.isfinite(value))
        )

    def _is_count(value: Any) -> bool:
        # NumPy integers (e.g. from a column sum) are not ``int`` instances
        # but are valid counts.
        return isinstance(value, (int, np.integer)) and not isinstance(value, bool)

    sentences: List[str] = []
    ctx = schema.get("context") or {}
    question = ctx.get("business_question")
    if question:
        sentences.append(f"Question: {question}")

    # Headline sentence with significance phrase.
    sentences.append(_render_headline_sentence(schema))
    h = schema.get("headline") or {}
    p = h.get("p_value")
    alpha = ctx.get("alpha", 0.05)
    if p is not None and np.isfinite(p):
        sig = _significance_phrase(p, alpha)
        sentences.append(f"Statistically, {sig}.")
        if h.get("near_significance_threshold"):
            sentences.append(
                "The p-value is close to the conventional threshold; "
                "small changes to the sample could move it either way."
            )

    # Pre-trends + power-aware phrasing.
    pt = schema.get("pre_trends") or {}
    if pt.get("status") == "computed":
        jp = pt.get("joint_p_value")
        jp_ok = _is_finite_number(jp)
        verdict = pt.get("verdict")
        tier = pt.get("power_tier")
        if verdict == "clear_violation":
            if jp_ok:
                sentences.append(
                    f"Pre-treatment data clearly reject parallel trends (joint "
                    f"p = {jp:.3g}); the headline should be treated as tentative "
                    f"pending the sensitivity analysis below."
                )
            else:
                sentences.append(
                    "Pre-treatment data clearly reject parallel trends; the "
                    "headline should be treated as tentative."
                )
        elif verdict == "some_evidence_against":
            if jp_ok:
                sentences.append(
                    f"Pre-treatment data show some evidence of diverging trends "
                    f"(joint p = {jp:.3g}); interpret the headline alongside the "
                    f"sensitivity analysis below."
                )
            else:
                sentences.append("Pre-treatment data show some evidence of diverging trends.")
        elif verdict == "no_detected_violation":
            if tier == "well_powered":
                sentences.append(
                    "Pre-treatment data are consistent with parallel trends, "
                    "and the test is well-powered (the minimum-detectable "
                    "violation is small relative to the estimated effect)."
                )
            elif tier == "moderately_powered":
                sentences.append(
                    "Pre-treatment data do not reject parallel trends; the "
                    "test is moderately informative. See the sensitivity "
                    "analysis below for bounded-violation guarantees."
                )
            else:
                # Unknown or missing tier falls back to the most hedged wording.
                sentences.append(
                    "Pre-treatment data do not reject parallel trends, but "
                    "the test has limited power — a non-rejection does not "
                    "prove the assumption. See the sensitivity analysis below."
                )
        elif verdict == "design_enforced_pt":
            sentences.append(
                "The synthetic control is designed to match the treated "
                "group's pre-period trajectory (SDiD's weighted-parallel-"
                "trends analogue)."
            )

    # Sensitivity.
    sens = schema.get("sensitivity") or {}
    if sens.get("status") == "computed":
        bkd = sens.get("breakdown_M")
        if bkd is None:
            sentences.append(
                "HonestDiD: the result remains significant across the "
                "full grid — robust to plausible parallel-trends violations."
            )
        elif _is_finite_number(bkd) and bkd >= 1.0:
            sentences.append(
                f"HonestDiD: the result remains significant under "
                f"parallel-trends violations up to {bkd:.2g}x the observed "
                f"pre-period variation."
            )
        elif _is_finite_number(bkd):
            sentences.append(
                f"HonestDiD: the result is fragile — the confidence interval "
                f"includes zero once violations reach {bkd:.2g}x the "
                f"pre-period variation."
            )

    # Sample sentence.
    sample = schema.get("sample") or {}
    n_obs = sample.get("n_obs")
    n_t = sample.get("n_treated")
    n_c = sample.get("n_control")
    if _is_count(n_obs):
        split = ""
        if _is_count(n_t) and _is_count(n_c):
            split = f" ({int(n_t):,} treated, {int(n_c):,} control)"
        sentences.append(f"Sample: {int(n_obs):,} observations{split}.")
    survey = sample.get("survey")
    if survey and not survey.get("is_trivial"):
        deff = survey.get("design_effect")
        eff_n = survey.get("effective_n")
        if _is_finite_number(deff) and _is_finite_number(eff_n):
            sentences.append(
                f"Survey design reduces effective sample size to "
                f"~{eff_n:,.0f} (DEFF = {deff:.2g})."
            )

    # First warning-severity caveat (if any).
    caveats = schema.get("caveats") or []
    warning_caveats = [c for c in caveats if c.get("severity") == "warning"]
    if warning_caveats:
        top = warning_caveats[0]
        sentences.append(f"Caveat: {top.get('message')}")

    return " ".join(s for s in sentences if s)
h.get("p_value") + alpha = ctx.get("alpha", 0.05) + if isinstance(p, (int, float)): + lines.append("") + lines.append(f"Statistically, {_significance_phrase(p, alpha)}.") + lines.append("") + + # Identifying assumption + lines.append("## Identifying Assumption") + lines.append("") + lines.append(assumption.get("description", "") or "Standard DiD parallel-trends assumption.") + lines.append("") + + # Pre-trends + lines.append("## Pre-Trends") + lines.append("") + if pt.get("status") == "computed": + jp = pt.get("joint_p_value") + verdict = pt.get("verdict") + tier = pt.get("power_tier") + jp_str = f"joint p = {jp:.3g}" if isinstance(jp, (int, float)) else "joint p unavailable" + lines.append(f"- Verdict: `{verdict}` ({jp_str})") + if tier: + lines.append(f"- Power tier: `{tier}`") + mdv = pt.get("mdv") + ratio = pt.get("mdv_share_of_att") + if isinstance(mdv, (int, float)): + lines.append(f"- Minimum detectable violation (MDV): {mdv:.3g}") + if isinstance(ratio, (int, float)): + lines.append(f"- MDV / |ATT|: {ratio:.2g}") + else: + lines.append(f"- Pre-trends not computed: {pt.get('reason', 'unavailable')}") + lines.append("") + + # Sensitivity + lines.append("## Sensitivity (HonestDiD)") + lines.append("") + if sens.get("status") == "computed": + bkd = sens.get("breakdown_M") + concl = sens.get("conclusion") + lines.append(f"- Method: `{sens.get('method')}`") + lines.append( + f"- Breakdown M: {bkd:.3g}" + if isinstance(bkd, (int, float)) + else "- Breakdown M: robust across full grid (no breakdown)" + ) + lines.append(f"- Conclusion: `{concl}`") + else: + lines.append(f"- Sensitivity not computed: {sens.get('reason', 'unavailable')}") + lines.append("") + + # Sample + lines.append("## Sample") + lines.append("") + if isinstance(sample.get("n_obs"), int): + lines.append(f"- Observations: {sample['n_obs']:,}") + if isinstance(sample.get("n_treated"), int): + lines.append(f"- Treated: {sample['n_treated']:,}") + if isinstance(sample.get("n_control"), int): + 
lines.append(f"- Control: {sample['n_control']:,}") + survey = sample.get("survey") + if survey: + if survey.get("is_trivial"): + lines.append("- Survey design: trivial DEFF (~1.0)") + else: + deff = survey.get("design_effect") + eff_n = survey.get("effective_n") + if isinstance(deff, (int, float)): + lines.append(f"- Survey DEFF: {deff:.2g}") + if isinstance(eff_n, (int, float)): + lines.append(f"- Effective N: {eff_n:,.0f}") + lines.append("") + + # Heterogeneity + if het: + lines.append("## Heterogeneity") + lines.append("") + lines.append(f"- Source: `{het.get('source')}`") + lines.append(f"- N effects: {het.get('n_effects')}") + mn = het.get("min") + mx = het.get("max") + if isinstance(mn, (int, float)) and isinstance(mx, (int, float)): + lines.append(f"- Range: {mn:.3g} to {mx:.3g}") + cv = het.get("cv") + if isinstance(cv, (int, float)): + lines.append(f"- CV: {cv:.3g}") + lines.append(f"- Sign consistent: {het.get('sign_consistent')}") + lines.append("") + + # Caveats + if caveats: + lines.append("## Caveats") + lines.append("") + for c in caveats: + sev = c.get("severity", "info") + lines.append(f"- **{sev.upper()}** — {c.get('message')}") + lines.append("") + + # Next steps + if next_steps: + lines.append("## Next Steps") + lines.append("") + for s in next_steps: + if s.get("label"): + lines.append(f"- {s['label']}") + if s.get("why"): + lines.append(f" - _why_: {s['why']}") + lines.append("") + + # References + if references: + lines.append("## References") + lines.append("") + for ref in references: + lines.append(f"- {ref.get('citation')}") + lines.append("") + + return "\n".join(lines) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py new file mode 100644 index 00000000..4ca24c79 --- /dev/null +++ b/diff_diff/diagnostic_report.py @@ -0,0 +1,1765 @@ +""" +DiagnosticReport — unified, plain-English validity assessment for diff-diff results. 
+ +Orchestrates the library's existing diagnostic functions (parallel trends, +pre-trends power, HonestDiD sensitivity, Goodman-Bacon, design-effect +diagnostics, EPV, heterogeneity, and estimator-native checks for SDiD/TROP) +into a single report with a stable AI-legible schema. + +Design principles: + +- No hard pass/fail gates. Severity is conveyed by natural-language phrasing, + not a traffic-light enum. See ``docs/methodology/REPORTING.md``. +- No new statistical computation. Every reported number is either read from + ``results`` or computed by an existing diff-diff utility function. +- Lazy evaluation. ``DiagnosticReport(results, ...)`` is free; ``run_all()`` + triggers compute and caches. +- Never prove a null. Pre-trends phrasing uses power information from + ``compute_pretrends_power`` to distinguish well-powered from underpowered + non-violations. + +The ``to_dict()`` surface is an AI-legible contract. See the schema reference +in ``docs/methodology/REPORTING.md`` and the ``DIAGNOSTIC_REPORT_SCHEMA_VERSION`` +constant below. The schema is marked experimental in v3.2. +""" + +from __future__ import annotations + +from dataclasses import dataclass, field +from typing import Any, Dict, FrozenSet, List, Optional, Tuple + +import numpy as np +import pandas as pd + +DIAGNOSTIC_REPORT_SCHEMA_VERSION = "1.0" + +__all__ = [ + "DiagnosticReport", + "DiagnosticReportResults", + "DIAGNOSTIC_REPORT_SCHEMA_VERSION", +] + + +# --------------------------------------------------------------------------- +# Canonical check names and per-type applicability +# --------------------------------------------------------------------------- +# The set of check names that ``DiagnosticReport`` supports. +_CHECK_NAMES: Tuple[str, ...] 
= ( + "parallel_trends", + "pretrends_power", + "sensitivity", + "bacon", + "design_effect", + "heterogeneity", + "epv", + "estimator_native", + "placebo", +) + +# Type-level applicability: which checks are *ever* applicable for each of the +# 16 result types. Instance-level applicability further filters by whether +# required attributes are present (e.g. ``survey_metadata`` for DEFF) and by +# whether the user disabled a check via ``run_*=False``. +# See ``docs/methodology/REPORTING.md`` for the full matrix and rationale. +_APPLICABILITY: Dict[str, FrozenSet[str]] = { + "DiDResults": frozenset({"parallel_trends", "design_effect"}), + "MultiPeriodDiDResults": frozenset( + {"parallel_trends", "pretrends_power", "sensitivity", "bacon", "design_effect"} + ), + "CallawaySantAnnaResults": frozenset( + { + "parallel_trends", + "pretrends_power", + "sensitivity", + "bacon", + "design_effect", + "heterogeneity", + "epv", + } + ), + "SunAbrahamResults": frozenset( + { + "parallel_trends", + "pretrends_power", + "sensitivity", + "bacon", + "design_effect", + "heterogeneity", + } + ), + "ImputationDiDResults": frozenset( + { + "parallel_trends", + "pretrends_power", + "sensitivity", + "bacon", + "design_effect", + "heterogeneity", + } + ), + "TwoStageDiDResults": frozenset( + { + "parallel_trends", + "pretrends_power", + "bacon", + "design_effect", + "heterogeneity", + } + ), + "StackedDiDResults": frozenset( + { + "parallel_trends", + "pretrends_power", + "sensitivity", + "bacon", + "design_effect", + "heterogeneity", + } + ), + "SyntheticDiDResults": frozenset( + {"parallel_trends", "sensitivity", "design_effect", "estimator_native"} + ), + "TROPResults": frozenset( + { + "parallel_trends", + "sensitivity", + "design_effect", + "heterogeneity", + "estimator_native", + } + ), + "EfficientDiDResults": frozenset( + { + "parallel_trends", + "pretrends_power", + "sensitivity", + "bacon", + "design_effect", + "heterogeneity", + "epv", + } + ), + "ContinuousDiDResults": 
@dataclass(frozen=True)
class DiagnosticReportResults:
    """Immutable record of one ``DiagnosticReport.run_all()`` execution.

    Attributes
    ----------
    schema : dict
        Structured, AI-legible payload; the same object ``to_dict()`` returns.
    interpretation : str
        The ``overall_interpretation`` paragraph that synthesizes findings
        across checks.
    applicable_checks : tuple of str
        Names of the checks that applied to this estimator and option set.
    skipped_checks : dict of str -> str
        Skipped-check name mapped to a plain-English reason. Defaults to a
        fresh empty dict per instance.
    warnings : tuple of str
        Warnings captured while running the underlying diagnostic functions.
        Defaults to an empty tuple.
    """

    # Required fields.
    schema: Dict[str, Any]
    interpretation: str
    applicable_checks: Tuple[str, ...]
    # Optional fields; the factory gives every instance its own dict.
    skipped_checks: Dict[str, str] = field(default_factory=dict)
    warnings: Tuple[str, ...] = tuple()
Yields a + ``SensitivityResults`` object with ``breakdown_M`` populated. + sensitivity_method : str, default "relative_magnitude" + HonestDiD restriction type. + alpha : float, default 0.05 + Significance level used across checks. + precomputed : dict, optional + Map of check name to a pre-computed result object. Accepted keys: + ``"parallel_trends"``, ``"sensitivity"``, ``"placebo"``, ``"bacon"``, + ``"design_effect"``, ``"heterogeneity"``, ``"epv"``, + ``"pretrends_power"``. Supplied values are used verbatim and the + corresponding underlying function is not called. + outcome_label, treatment_label : str, optional + Plain-English labels used in prose rendering. + """ + + def __init__( + self, + results: Any, + *, + data: Optional[pd.DataFrame] = None, + outcome: Optional[str] = None, + treatment: Optional[str] = None, + time: Optional[str] = None, + unit: Optional[str] = None, + first_treat: Optional[str] = None, + pre_periods: Optional[List[Any]] = None, + post_periods: Optional[List[Any]] = None, + run_parallel_trends: bool = True, + run_sensitivity: bool = True, + run_placebo: bool = False, + run_bacon: bool = True, + run_design_effect: bool = True, + run_heterogeneity: bool = True, + run_epv: bool = True, + run_pretrends_power: bool = True, + sensitivity_M_grid: Tuple[float, ...] 
= (0.5, 1.0, 1.5, 2.0), + sensitivity_method: str = "relative_magnitude", + alpha: float = 0.05, + precomputed: Optional[Dict[str, Any]] = None, + outcome_label: Optional[str] = None, + treatment_label: Optional[str] = None, + ): + self._results = results + self._data = data + self._outcome = outcome + self._treatment = treatment + self._time = time + self._unit = unit + self._first_treat = first_treat + self._pre_periods = pre_periods + self._post_periods = post_periods + self._run_flags: Dict[str, bool] = { + "parallel_trends": run_parallel_trends, + "pretrends_power": run_pretrends_power, + "sensitivity": run_sensitivity, + "bacon": run_bacon, + "design_effect": run_design_effect, + "heterogeneity": run_heterogeneity, + "epv": run_epv, + "placebo": run_placebo, + "estimator_native": True, + } + self._sensitivity_M_grid = tuple(sensitivity_M_grid) + self._sensitivity_method = sensitivity_method + self._alpha = float(alpha) + self._precomputed = dict(precomputed or {}) + self._outcome_label = outcome_label + self._treatment_label = treatment_label + self._cached: Optional[DiagnosticReportResults] = None + + # -- Public API --------------------------------------------------------- + + def run_all(self) -> DiagnosticReportResults: + """Run all applicable diagnostics. 
Idempotent; caches on first call.""" + if self._cached is None: + self._cached = self._execute() + return self._cached + + def to_dict(self) -> Dict[str, Any]: + """Return the AI-legible structured schema.""" + return self.run_all().schema + + def summary(self) -> str: + """Return a short plain-English paragraph.""" + return self.run_all().interpretation + + def full_report(self) -> str: + """Return the multi-section markdown report.""" + return _render_dr_full_report(self.run_all()) + + def export_markdown(self) -> str: + """Alias for ``full_report()``.""" + return self.full_report() + + def to_dataframe(self) -> pd.DataFrame: + """Return one row per check with status and headline metric.""" + schema = self.to_dict() + rows = [] + for check in _CHECK_NAMES: + section_key = "estimator_native_diagnostics" if check == "estimator_native" else check + section = schema.get(section_key, {}) + rows.append( + { + "check": check, + "status": section.get("status"), + "headline": _check_headline(check, section), + "reason": section.get("reason"), + } + ) + return pd.DataFrame(rows) + + @property + def applicable_checks(self) -> Tuple[str, ...]: + """Names of checks that will run, given estimator + instance + options. + + No compute is triggered; this reflects only the applicability matrix + filtered by instance state (survey_metadata, epv_diagnostics, vcov) + and the user's ``run_*`` flags. + """ + return tuple(sorted(self._compute_applicable_checks()[0])) + + @property + def skipped_checks(self) -> Dict[str, str]: + """Mapping of skipped check -> plain-English reason. Requires ``run_all()``.""" + return dict(self.run_all().skipped_checks) + + # -- Implementation detail --------------------------------------------- + + def _compute_applicable_checks(self) -> Tuple[set, Dict[str, str]]: + """Compute the applicable-check set + per-check skipped reasons. + + Returns + ------- + applicable : set of str + Checks that will run. 
def _instance_skip_reason(self, check: str) -> Optional[str]:
    """Return a plain-English reason ``check`` cannot run on this instance.

    Parameters
    ----------
    check : str
        Name of the diagnostic check being gated.

    Returns
    -------
    str or None
        A human-readable explanation when the check must be skipped for
        the current results object / supplied inputs, or ``None`` when
        nothing visible here prevents it from running. Unrecognized check
        names return ``None``.
    """
    r = self._results
    name = type(r).__name__

    if check == "placebo":
        # The placebo battery has no runner yet. Returning a reason here
        # makes the caller record it as "skipped" instead of treating it
        # as applicable and leaving a "pending implementation" placeholder.
        return (
            "Placebo battery runs on opt-in only; not yet implemented in MVP. "
            "Reserved in the schema for forward compatibility."
        )

    if check == "design_effect":
        if getattr(r, "survey_metadata", None) is None:
            return "No survey design attached to results.survey_metadata."
        return None

    if check == "epv":
        if getattr(r, "epv_diagnostics", None) is None:
            return "Estimator did not produce results.epv_diagnostics for this fit."
        return None

    if check == "parallel_trends":
        method = _PT_METHOD.get(name)
        if method == "two_x_two" and self._data is None:
            return (
                "2x2 parallel-trends check needs raw panel data; "
                "pass data= with outcome / time / treatment columns."
            )
        if method == "event_study":
            pre_coefs = _collect_pre_period_coefs(r)
            if not pre_coefs:
                return (
                    "No pre-period event-study coefficients are exposed on "
                    "this fit. For staggered estimators, re-fit with "
                    "aggregate='event_study' to populate event-study output."
                )
            # vcov is optional for the Bonferroni fallback.
        return None

    if check == "pretrends_power":
        if getattr(r, "vcov", None) is None:
            return "Pre-trends power requires results.vcov; not available."
        pre_coefs = _collect_pre_period_coefs(r)
        if len(pre_coefs) < 2:
            return "Pre-trends power needs >= 2 pre-treatment periods."
        return None

    if check == "sensitivity":
        # Native SDiD/TROP paths substitute for HonestDiD.
        if name in {"SyntheticDiDResults", "TROPResults"}:
            return None
        # Standard HonestDiD path.
        if getattr(r, "vcov", None) is None:
            return "HonestDiD requires results.vcov for the pre-period coefficients."
        pre_coefs = _collect_pre_period_coefs(r)
        if len(pre_coefs) < 1:
            return "HonestDiD requires at least one pre-period coefficient."
        return None

    if check == "bacon":
        # Can run if results is itself Bacon, or if data + first_treat supplied.
        if name == "BaconDecompositionResults":
            return None
        if self._data is None or self._first_treat is None:
            return (
                "Bacon decomposition needs panel data + first_treat column; "
                "pass data= and first_treat=."
            )
        return None

    if check == "heterogeneity":
        # Needs multiple group or event-study effects. Use len() rather than
        # truthiness because some estimators expose these as DataFrames,
        # which raise on bool() conversion.
        for attr in (
            "group_effects",
            "event_study_effects",
            "treatment_effects",  # TROP per-(unit, time)
            "group_time_effects",  # CS default aggregation
            "period_effects",  # MultiPeriod
        ):
            val = getattr(r, attr, None)
            if val is None:
                continue
            try:
                if len(val) > 0:
                    return None
            except TypeError:
                # Container without a length: try the next candidate.
                continue
        return "No group/event-study effects available to compute heterogeneity."

    if check == "estimator_native":
        if name not in {"SyntheticDiDResults", "TROPResults"}:
            return f"{name} does not expose native validation methods."
        return None

    return None
+ if "parallel_trends" in applicable: + sections["parallel_trends"] = self._check_parallel_trends() + if "pretrends_power" in applicable: + sections["pretrends_power"] = self._check_pretrends_power() + if "sensitivity" in applicable: + sections["sensitivity"] = self._check_sensitivity() + if "bacon" in applicable: + sections["bacon"] = self._check_bacon() + if "design_effect" in applicable: + sections["design_effect"] = self._check_design_effect() + if "heterogeneity" in applicable: + sections["heterogeneity"] = self._check_heterogeneity() + if "epv" in applicable: + sections["epv"] = self._check_epv() + if "estimator_native" in applicable: + sections["estimator_native"] = self._check_estimator_native() + + # Estimator-native placeholder: SDiD/TROP diagnostics come in a later task. + if "estimator_native" not in applicable and "estimator_native" not in skipped: + sections["estimator_native"] = { + "status": "not_applicable", + "reason": f"{type(self._results).__name__} does not expose native " + "validation methods beyond what's captured above.", + } + + # Headline metric — best-effort across estimator types. + headline = self._extract_headline_metric() + + # Pull suggested next steps from the practitioner workflow. 
def _context_labels(self) -> Dict[str, str]:
    """Build the label lookup handed to the prose renderer.

    A label that was not supplied (or is otherwise falsy) is replaced by a
    generic phrase so rendered sentences always have something to name.
    """
    outcome = self._outcome_label if self._outcome_label else "the outcome"
    treatment = self._treatment_label if self._treatment_label else "the treatment"
    labels: Dict[str, str] = {}
    labels["outcome_label"] = outcome
    labels["treatment_label"] = treatment
    return labels
def _check_parallel_trends(self) -> Dict[str, Any]:
    """Produce the parallel-trends schema section.

    A user-supplied ``precomputed["parallel_trends"]`` entry takes priority.
    Otherwise the method registered in ``_PT_METHOD`` for the results type
    selects which ``_pt_*`` runner is invoked; an unregistered type yields a
    ``not_applicable`` section.
    """
    supplied = self._precomputed
    if "parallel_trends" in supplied:
        return self._format_precomputed_pt(supplied["parallel_trends"])

    estimator_name = type(self._results).__name__
    # Runner names are resolved lazily so only the selected one is touched.
    runner_by_method = {
        "two_x_two": "_pt_two_x_two",
        "event_study": "_pt_event_study",
        "hausman": "_pt_hausman",
        "synthetic_fit": "_pt_synthetic_fit",
        "factor": "_pt_factor",
    }
    runner_name = runner_by_method.get(_PT_METHOD.get(estimator_name))
    if runner_name is None:
        return {
            "status": "not_applicable",
            "reason": f"No parallel-trends method registered for "
            f"{type(self._results).__name__}.",
        }
    return getattr(self, runner_name)()
def _pt_event_study(self) -> Dict[str, Any]:
    """Event-study joint Wald (or Bonferroni fallback) on pre-period coefficients.

    Works with either ``pre_period_effects`` (``MultiPeriodDiDResults`` style,
    dict of ``PeriodEffect`` objects) or ``event_study_effects`` (CS / SA /
    ImputationDiD style, dict of dicts with ``effect``/``se``/``p_value`` keys).

    Returns
    -------
    dict
        ``status="skipped"`` when no pre-period coefficients are available;
        otherwise a ``status="ran"`` section carrying the joint p-value, the
        method used (``"joint_wald"`` or ``"bonferroni"``), the per-period
        rows, and the verdict derived from the joint p-value.
    """
    r = self._results
    pre_coefs = _collect_pre_period_coefs(r)
    if not pre_coefs:
        return {
            "status": "skipped",
            "reason": "No pre-period event-study coefficients available.",
        }
    interaction_indices = getattr(r, "interaction_indices", None)
    vcov = getattr(r, "vcov", None)

    # pre_coefs is a sorted list of (key, effect, se, p_value) tuples.
    per_period = [
        {
            "period": _to_python_scalar(k),
            "coef": _to_python_float(eff),
            "se": _to_python_float(se),
            "p_value": _to_python_float(p),
        }
        for (k, eff, se, p) in pre_coefs
    ]

    joint_p: Optional[float] = None
    test_statistic: Optional[float] = None
    df = len(pre_coefs)
    method = "bonferroni"
    if vcov is not None and interaction_indices is not None:
        try:
            keys = [k for (k, _, _, _) in pre_coefs]
            # The Wald test is only attempted when every pre-period
            # coefficient can be located in the covariance matrix.
            if all(k in interaction_indices for k in keys):
                idx = [interaction_indices[k] for k in keys]
                beta = np.array([eff for (_, eff, _, _) in pre_coefs], dtype=float)
                v_sub = np.asarray(vcov)[np.ix_(idx, idx)]
                stat = float(beta @ np.linalg.solve(v_sub, beta))
                # A negative or non-finite quadratic form is not a usable
                # chi-square statistic, so leave the Bonferroni fallback
                # in charge rather than reporting a spurious p-value.
                if np.isfinite(stat) and stat >= 0.0:
                    from scipy.stats import chi2

                    # sf() keeps precision in the upper tail where
                    # 1 - cdf() rounds to exactly 0.
                    joint_p = float(chi2.sf(stat, df=df))
                    test_statistic = stat
                    method = "joint_wald"
        except Exception:  # noqa: BLE001
            # Best-effort: any failure falls back to Bonferroni.
            joint_p = None
            test_statistic = None
            method = "bonferroni"

    if joint_p is None:
        # Bonferroni: min per-period p-value scaled by count, capped at 1.
        # Non-finite p-values are dropped; a NaN would otherwise make the
        # result depend on the order of the periods.
        ps = [
            row["p_value"]
            for row in per_period
            if row["p_value"] is not None and np.isfinite(row["p_value"])
        ]
        if ps:
            joint_p = min(1.0, min(ps) * len(ps))

    return {
        "status": "ran",
        "method": method,
        "joint_p_value": joint_p,
        "test_statistic": test_statistic,
        "df": df,
        "n_pre_periods": df,
        "per_period": per_period,
        "verdict": _pt_verdict(joint_p),
    }
def _format_precomputed_pretrends_power(self, obj: Any) -> Dict[str, Any]:
    """Adapt a pre-computed ``PreTrendsPowerResults`` to the schema shape.

    Parameters
    ----------
    obj : Any
        User-supplied power result. Fields are read with ``getattr`` and
        default to ``None`` / the report's own settings when absent.

    Returns
    -------
    dict
        A ``status="ran"`` section flagged ``precomputed=True``.
        ``mdv_share_of_att`` is ``mdv / |att|`` when both values are finite
        and the headline estimate is non-zero, otherwise ``None``.
    """
    mdv = _to_python_float(getattr(obj, "mdv", None))
    hm = self._extract_headline_metric()
    att = hm.get("value") if hm else None
    ratio: Optional[float] = None
    # Same guards as the computed path: a non-finite MDV must not leak a
    # NaN/inf ratio into the schema.
    if (
        mdv is not None
        and att is not None
        and np.isfinite(att)
        and abs(att) > 0
        and np.isfinite(mdv)
    ):
        ratio = mdv / abs(att)
    return {
        "status": "ran",
        "method": "precomputed",
        "violation_type": getattr(obj, "violation_type", "linear"),
        "alpha": _to_python_float(getattr(obj, "alpha", self._alpha)),
        "target_power": _to_python_float(getattr(obj, "target_power", 0.80)),
        "mdv": mdv,
        "mdv_share_of_att": ratio,
        "power_at_M_1": _to_python_float(getattr(obj, "power", None)),
        "n_pre_periods": int(getattr(obj, "n_pre_periods", 0) or 0),
        "tier": _power_tier(ratio),
        "precomputed": True,
    }
def _format_sensitivity_results(self, sens: Any) -> Dict[str, Any]:
    """Convert a grid-style sensitivity result into the schema section.

    Reads ``M_values``, ``robust_cis`` and ``bounds`` from ``sens`` (each
    treated as empty when missing) and emits one grid row per M value. A row
    whose CI or bounds entry is missing gets ``None`` endpoints. The
    ``conclusion`` is derived from ``breakdown_M``: absent means robust over
    the grid, ``>= 1.0`` is reported as robust up to that value, anything
    else is ``"fragile"``.
    """
    raw_m = getattr(sens, "M_values", None)
    raw_cis = getattr(sens, "robust_cis", None)
    raw_bounds = getattr(sens, "bounds", None)
    m_values: List[Any] = [] if raw_m is None else list(raw_m)
    ci_pairs: List[Any] = [] if raw_cis is None else list(raw_cis)
    bound_pairs: List[Any] = [] if raw_bounds is None else list(raw_bounds)

    absent = (None, None)
    rows = []
    for pos, m_val in enumerate(m_values):
        ci_pair = ci_pairs[pos] if pos < len(ci_pairs) else absent
        bound_pair = bound_pairs[pos] if pos < len(bound_pairs) else absent
        lower = _to_python_float(ci_pair[0])
        upper = _to_python_float(ci_pair[1])
        # The robust CI excludes zero only when both endpoints are known.
        if lower is None or upper is None:
            excludes_zero = False
        else:
            excludes_zero = lower > 0 or upper < 0
        rows.append(
            {
                "M": _to_python_float(m_val),
                "ci_lower": lower,
                "ci_upper": upper,
                "bound_lower": _to_python_float(bound_pair[0]),
                "bound_upper": _to_python_float(bound_pair[1]),
                "robust_to_zero": excludes_zero,
            }
        )

    breakdown = _to_python_float(getattr(sens, "breakdown_M", None))
    if breakdown is None:
        conclusion = "robust_over_grid"
    elif breakdown >= 1.0:
        conclusion = f"robust_to_M_{breakdown:.2f}"
    else:
        conclusion = "fragile"

    return {
        "status": "ran",
        "method": getattr(sens, "method", self._sensitivity_method),
        "grid": rows,
        "breakdown_M": breakdown,
        "original_estimate": _to_python_float(getattr(sens, "original_estimate", None)),
        "original_se": _to_python_float(getattr(sens, "original_se", None)),
        "conclusion": conclusion,
    }
def _format_bacon(self, bacon: Any) -> Dict[str, Any]:
    """Shape a Bacon decomposition result into the schema section.

    The later-vs-earlier comparison weight is reported as
    ``forbidden_weight`` and drives the verdict: above 0.10 is
    ``materially_contaminated``, above 0.01 is ``minor_forbidden_weight``,
    otherwise ``clean``. A missing weight is treated as zero for the verdict
    but still reported as ``None``.
    """

    def _read(field: str) -> Optional[float]:
        return _to_python_float(getattr(bacon, field, None))

    weights = {
        "treated_vs_never": _read("total_weight_treated_vs_never"),
        "earlier_vs_later": _read("total_weight_earlier_vs_later"),
        "later_vs_earlier": _read("total_weight_later_vs_earlier"),
    }
    twfe_estimate = _read("twfe_estimate")

    forbidden_weight = weights["later_vs_earlier"]
    magnitude = 0.0 if forbidden_weight is None else forbidden_weight
    if magnitude > 0.10:
        verdict = "materially_contaminated"
    elif magnitude > 0.01:
        verdict = "minor_forbidden_weight"
    else:
        verdict = "clean"

    return {
        "status": "ran",
        "twfe_estimate": twfe_estimate,
        "weight_by_type": weights,
        "forbidden_weight": forbidden_weight,
        "verdict": verdict,
        "n_timing_groups": _to_python_scalar(getattr(bacon, "n_timing_groups", None)),
    }
def _check_heterogeneity(self) -> Dict[str, Any]:
    """Summarize how stable the collected effects are.

    Reports min / max / mean / sample SD / range over the finite effect
    values, a coefficient of variation (omitted as ``None`` when the mean is
    zero or small relative to the SD), and whether all effects share a sign.
    The section is ``skipped`` when there are no effects or none are finite.
    """
    collected = self._collect_effect_scalars()
    if not collected:
        return {
            "status": "skipped",
            "reason": "No group / event-study / period effects available.",
        }

    as_array = np.array(collected, dtype=float)
    usable = as_array[np.isfinite(as_array)]
    count = usable.size
    if count == 0:
        return {
            "status": "skipped",
            "reason": "All effect values are non-finite.",
        }

    center = float(np.mean(usable))
    # A single effect has no sample spread.
    spread = 0.0 if count <= 1 else float(np.std(usable, ddof=1))
    lowest = float(np.min(usable))
    highest = float(np.max(usable))

    magnitude = abs(center)
    if magnitude > 0.1 * spread and magnitude > 0:
        cv = spread / magnitude
    else:
        cv = None

    same_sign = bool(np.all(usable >= 0) or np.all(usable <= 0))
    return {
        "status": "ran",
        "source": self._heterogeneity_source(),
        "n_effects": int(count),
        "min": lowest,
        "max": highest,
        "mean": center,
        "sd": spread,
        "range": highest - lowest,
        "cv": cv,
        "sign_consistent": same_sign,
    }
"status": "skipped", + "reason": "Estimator did not produce results.epv_diagnostics for " "this fit.", + } + threshold = 10 + low_cells = getattr(epv, "low_epv_cells", None) or [] + min_epv = _to_python_float(getattr(epv, "min_epv", None)) + return { + "status": "ran", + "threshold": threshold, + "n_cells_low": int(len(low_cells)), + "min_epv": min_epv, + "affected_cohorts": [_to_python_scalar(c) for c in low_cells], + } + + def _check_estimator_native(self) -> Dict[str, Any]: + """SDiD / TROP native validation surfaces. + + SDiD: ``pre_treatment_fit`` (weighted-PT analogue), weight + concentration (``get_weight_concentration``), ``in_time_placebo`` + (placebo-timing sweep), and ``sensitivity_to_zeta_omega`` + (regularization sensitivity). + + TROP: factor-model fit metrics (``effective_rank``, ``loocv_score``, + selected ``lambda_*``). + """ + r = self._results + name = type(r).__name__ + if name == "SyntheticDiDResults": + return self._sdid_native(r) + if name == "TROPResults": + return self._trop_native(r) + return { + "status": "not_applicable", + "reason": f"{name} does not expose native validation methods.", + } + + def _sdid_native(self, r: Any) -> Dict[str, Any]: + """Populate SDiD-native diagnostics section.""" + out: Dict[str, Any] = {"status": "ran", "estimator": "SyntheticDiD"} + out["pre_treatment_fit"] = _to_python_float(getattr(r, "pre_treatment_fit", None)) + # Weight concentration via the public method on SyntheticDiDResults. 
+ try: + wc = r.get_weight_concentration(top_k=5) + out["weight_concentration"] = { + "effective_n": _to_python_float(wc.get("effective_n")), + "herfindahl": _to_python_float(wc.get("herfindahl")), + "top_k": _to_python_scalar(wc.get("top_k")), + "top_k_share": _to_python_float(wc.get("top_k_share")), + } + except Exception as exc: # noqa: BLE001 + out["weight_concentration"] = { + "status": "error", + "reason": f"get_weight_concentration raised " f"{type(exc).__name__}: {exc}", + } + # In-time placebo — runs only when the fit snapshot is available. + try: + placebo_df = r.in_time_placebo() + out["in_time_placebo"] = { + "n_placebos": int(len(placebo_df)), + "max_abs_effect": _to_python_float( + placebo_df["att"].abs().max() if len(placebo_df) > 0 else None + ), + "mean_abs_effect": _to_python_float( + placebo_df["att"].abs().mean() if len(placebo_df) > 0 else None + ), + } + except Exception as exc: # noqa: BLE001 + out["in_time_placebo"] = { + "status": "skipped", + "reason": f"in_time_placebo unavailable: " f"{type(exc).__name__}: {exc}", + } + # Zeta-omega sensitivity. 
def _resolve_effect_source(self) -> Tuple[Optional[str], List[float]]:
    """Find the effect container that heterogeneity metrics are built from.

    Containers are tried in priority order: ``group_effects``,
    ``event_study_effects``, ``treatment_effects``, ``group_time_effects``,
    ``period_effects``. The first one that yields at least one scalar wins.
    A container that is present but empty (or holds nothing convertible) no
    longer hides a populated container further down the list.

    Returns
    -------
    (str or None, list of float)
        The winning attribute name and its scalars. When no container
        yields scalars, the name of the first non-None container (or
        ``None`` if there is none) together with an empty list.
    """
    first_present: Optional[str] = None
    for attr in (
        "group_effects",  # dict keyed by cohort
        "event_study_effects",  # dict keyed by relative time
        "treatment_effects",  # TROP: keyed by (unit, time)
        "group_time_effects",  # CS default: keyed by (g, t)
        "period_effects",  # MultiPeriod: keyed by period
    ):
        container = getattr(self._results, attr, None)
        if container is None:
            continue
        if first_present is None:
            first_present = attr
        scalars = self._scalars_from_mapping(container)
        if scalars:
            return attr, scalars
    return first_present, []

def _collect_effect_scalars(self) -> List[float]:
    """Collect scalar effect values across group / event-study / TROP sources.

    Returns an empty list if no recognized effect container yields any
    scalar. Never raises on unexpected shapes; unrecognized entries are
    skipped.
    """
    return self._resolve_effect_source()[1]

@staticmethod
def _scalars_from_mapping(mapping: Any) -> List[float]:
    """Extract scalar effect values from various result-mapping shapes.

    Uses ``mapping.values()`` when that is callable, otherwise iterates the
    object directly. Any failure while materializing the values yields an
    empty list, and entries that do not convert to a float are dropped.
    """
    values_fn = getattr(mapping, "values", None)
    try:
        values = list(values_fn()) if callable(values_fn) else list(mapping)
    except Exception:  # noqa: BLE001
        return []
    converted = (_extract_scalar_effect(val) for val in values)
    return [eff for eff in converted if eff is not None]

def _heterogeneity_source(self) -> str:
    """Name the attribute that produced the scalars (for the schema).

    Follows the same resolution as ``_collect_effect_scalars`` so the label
    always matches the container the metrics were computed from. Returns
    ``"unknown"`` when no recognized container is present.
    """
    attr, _ = self._resolve_effect_source()
    return attr if attr is not None else "unknown"
+ """ + data = self._data + outcome = self._outcome + unit = self._unit + time = self._time + first_treat = self._first_treat + missing = [ + name + for name, val in ( + ("data", data), + ("outcome", outcome), + ("unit", unit), + ("time", time), + ("first_treat", first_treat), + ) + if val is None + ] + if ( + missing + or data is None + or outcome is None + or unit is None + or time is None + or first_treat is None + ): + return { + "status": "skipped", + "method": "hausman", + "reason": ( + "EfficientDiD.hausman_pretest requires data + outcome + unit + " + f"time + first_treat kwargs on DiagnosticReport; missing: " + f"{', '.join(missing)}." + ), + } + + try: + from diff_diff.efficient_did import EfficientDiD + + pt = EfficientDiD.hausman_pretest( + data, + outcome=outcome, + unit=unit, + time=time, + first_treat=first_treat, + alpha=self._alpha, + ) + except Exception as exc: # noqa: BLE001 + return { + "status": "error", + "method": "hausman", + "reason": f"hausman_pretest raised {type(exc).__name__}: {exc}", + } + + p_value = _to_python_float(getattr(pt, "p_value", None)) + return { + "status": "ran", + "method": "hausman", + "joint_p_value": p_value, + "test_statistic": _to_python_float(getattr(pt, "test_statistic", None)), + "df": _to_python_scalar(getattr(pt, "df", None)), + "verdict": _pt_verdict(p_value), + } + + def _pt_synthetic_fit(self) -> Dict[str, Any]: + """SDiD weighted pre-treatment-fit PT analogue. + + SDiD's design-enforced fit quality substitutes for a standard PT test: + the synthetic control is explicitly constructed to match the treated + group's pre-period trajectory, so small ``pre_treatment_fit`` RMSE + means the weighted-PT analogue is satisfied. 
def _format_precomputed_pt(self, obj: Any) -> Dict[str, Any]:
    """Shape a user-supplied parallel-trends result into the schema section.

    Only ``dict`` inputs are accepted. The ``p_value`` entry becomes the
    section's ``joint_p_value`` and determines the verdict; ``method``
    defaults to ``"precomputed"`` when the dict does not name one. Anything
    that is not a dict produces an ``error`` section.
    """
    if isinstance(obj, dict):
        joint_p = _to_python_float(obj.get("p_value"))
        section: Dict[str, Any] = {"status": "ran"}
        section["method"] = obj.get("method", "precomputed")
        section["joint_p_value"] = joint_p
        section["verdict"] = _pt_verdict(joint_p)
        section["precomputed"] = True
        return section
    return {
        "status": "error",
        "reason": "precomputed['parallel_trends'] must be a dict returned by "
        "check_parallel_trends or compatible shape.",
    }
def _extract_scalar_effect(val: Any) -> Optional[float]:
    """Reduce one effect entry to a plain float, or ``None`` if impossible.

    Three shapes are understood: a dict carrying an ``"effect"`` key, an
    object exposing a non-None ``.effect`` attribute, and a bare value that
    ``float()`` accepts. A dict without a usable ``"effect"`` entry is never
    converted as a whole.
    """
    if isinstance(val, dict):
        candidate = val.get("effect")
        if candidate is None:
            return None
    else:
        candidate = getattr(val, "effect", None)
        if candidate is None:
            # No effect attribute: treat the value itself as the scalar.
            candidate = val
    try:
        return float(candidate)
    except (TypeError, ValueError):
        return None
def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Optional[float]]]:
    """Return a sorted list of ``(key, effect, se, p_value)`` for pre-period coefficients.

    Handles two shapes:
      * ``pre_period_effects``: dict-of-``PeriodEffect`` on ``MultiPeriodDiDResults``.
      * ``event_study_effects``: dict-of-dict (with ``effect`` / ``se`` / ``p_value`` keys)
        on the staggered estimators (CS / SA / ImputationDiD / Stacked / EDiD / etc.).
        Pre-period entries are those with negative relative-time keys.

    ``event_study_effects`` is consulted only when ``pre_period_effects`` is
    missing or empty. Entries lacking an effect or a standard error are
    dropped.

    Ordering: numeric keys (Python or NumPy integers / floats) come first in
    ascending numeric order, followed by all other keys ordered by their
    string form.

    Returns an empty list when neither source provides pre-period entries.
    """
    results_list: List[Tuple[Any, float, float, Optional[float]]] = []
    pre = getattr(results, "pre_period_effects", None)
    if pre:
        for k, pe in pre.items():
            eff = getattr(pe, "effect", None)
            se = getattr(pe, "se", None)
            p = getattr(pe, "p_value", None)
            if eff is not None and se is not None:
                results_list.append((k, float(eff), float(se), _to_python_float(p)))
    else:
        es = getattr(results, "event_study_effects", None) or {}
        for k, entry in es.items():
            # Pre-period relative-time keys are negative (convention: e=-1, -2, ...).
            try:
                rel = int(k)
            except (TypeError, ValueError):
                continue
            if rel >= 0:
                continue
            if not isinstance(entry, dict):
                continue
            eff = entry.get("effect")
            se = entry.get("se")
            p = entry.get("p_value")
            if eff is None or se is None:
                continue
            results_list.append((k, float(eff), float(se), _to_python_float(p)))

    def _order_key(item: Tuple[Any, float, float, Optional[float]]) -> Tuple[int, float, str]:
        key = item[0]
        # NumPy integer scalars are not ``int`` subclasses; without the
        # explicit np types they would be ordered as strings ("-1" before
        # "-10" before "-2"). The leading rank also keeps numeric and
        # non-numeric keys comparable instead of raising TypeError.
        if isinstance(key, (int, float, np.integer, np.floating)):
            return (0, float(key), "")
        return (1, 0.0, str(key))

    results_list.sort(key=_order_key)
    return results_list
# rendering helpers
# ---------------------------------------------------------------------------
# Schema field reported as the headline scalar for each check. The full
# report addresses the estimator-native section by its schema key
# (``estimator_native_diagnostics``), so that spelling is accepted as well.
_CHECK_HEADLINE_FIELDS: Dict[str, str] = {
    "parallel_trends": "joint_p_value",
    "pretrends_power": "mdv_share_of_att",
    "sensitivity": "breakdown_M",
    "bacon": "forbidden_weight",
    "design_effect": "deff",
    "heterogeneity": "cv",
    "epv": "min_epv",
    "estimator_native": "pre_treatment_fit",
    "estimator_native_diagnostics": "pre_treatment_fit",
}


def _check_headline(check: str, section: Dict[str, Any]) -> Optional[Any]:
    """Return the most descriptive scalar for the per-check row in to_dataframe().

    Returns ``None`` when the section did not run or the check name is unknown.
    """
    if section.get("status") != "ran":
        return None
    field = _CHECK_HEADLINE_FIELDS.get(check)
    return None if field is None else section.get(field)


def _is_finite_number(value: Any) -> bool:
    """Return True for an int/float that is neither NaN nor infinite."""
    return (
        isinstance(value, (int, float))
        and value == value  # NaN is the only value unequal to itself
        and value not in (float("inf"), float("-inf"))
    )


def _ci_level_label(alpha: Any) -> str:
    """Return the CI level implied by ``alpha`` as a percentage string (e.g. "90").

    Falls back to "95" when ``alpha`` is missing or outside the open interval (0, 1).
    """
    if _is_finite_number(alpha) and not isinstance(alpha, bool) and 0.0 < alpha < 1.0:
        return f"{(1.0 - alpha) * 100:g}"
    return "95"


def _headline_sentence(headline: Any, est: str, treatment: str, outcome: str) -> Optional[str]:
    """Build the sentence naming the headline effect, its CI, and its p-value."""
    if not isinstance(headline, dict):
        return None
    val = headline.get("value")
    if val is None or not isinstance(val, (int, float)):
        return None
    if not _is_finite_number(val):
        # A NaN/inf estimate is neither an increase nor "no change"; say so plainly.
        return (
            f"On {est}, the effect of {treatment} on {outcome} could not be "
            f"estimated (the point estimate is not finite)."
        )
    if val > 0:
        direction = "increased"
    elif val < 0:
        direction = "decreased"
    else:
        direction = "did not change"
    ci = headline.get("conf_int")
    ci_str = ""
    if isinstance(ci, (list, tuple)) and len(ci) == 2 and all(_is_finite_number(b) for b in ci):
        level = _ci_level_label(headline.get("alpha"))
        ci_str = f" ({level}% CI: {ci[0]:.3g} to {ci[1]:.3g})"
    p = headline.get("p_value")
    p_str = f", p = {p:.3g}" if _is_finite_number(p) else ""
    return f"On {est}, {treatment} {direction} {outcome} by {val:.3g}{ci_str}{p_str}."


def _parallel_trends_sentence(pt: Dict[str, Any], pp: Dict[str, Any]) -> Optional[str]:
    """Build the parallel-trends sentence, hedged according to the power tier."""
    if pt.get("status") != "ran":
        return None
    verdict = pt.get("verdict")
    jp = pt.get("joint_p_value")
    jp_str = f" (joint p = {jp:.3g})" if _is_finite_number(jp) else ""
    if verdict == "clear_violation":
        return (
            f"Pre-treatment event-study coefficients clearly reject parallel "
            f"trends{jp_str}. The headline estimate should be treated as "
            f"tentative pending sensitivity analysis."
        )
    if verdict == "some_evidence_against":
        return (
            f"Pre-treatment data show some evidence of diverging trends"
            f"{jp_str}. Interpret the headline alongside the sensitivity "
            f"analysis below."
        )
    if verdict == "no_detected_violation":
        # Without a completed power analysis, fall back to the hedged phrasing.
        tier = pp.get("tier") if pp.get("status") == "ran" else "unknown"
        if tier == "well_powered":
            return (
                f"Pre-treatment data are consistent with parallel trends"
                f"{jp_str} and the test is well-powered (MDV is a small "
                f"share of the estimated effect), so a material pre-trend "
                f"would likely have been detected."
            )
        if tier == "moderately_powered":
            return (
                f"Pre-treatment data do not reject parallel trends"
                f"{jp_str}; the test is moderately informative. See the "
                f"sensitivity analysis below for bounded-violation "
                f"guarantees."
            )
        return (
            f"Pre-treatment data do not reject parallel trends"
            f"{jp_str}, but the test has limited power — a non-rejection "
            f"does not prove the assumption. See the HonestDiD "
            f"sensitivity analysis below for a more reliable signal."
        )
    if verdict == "design_enforced_pt":
        rmse = pt.get("pre_treatment_fit_rmse")
        if isinstance(rmse, (int, float)):
            return (
                f"The synthetic control matches the treated group's "
                f"pre-period trajectory with RMSE = "
                f"{rmse:.3g} (SDiD's design-enforced analogue of parallel "
                f"trends)."
            )
        return (
            "SDiD's synthetic control is designed to satisfy the "
            "weighted parallel-trends analogue."
        )
    return None


def _sensitivity_sentence(sens: Dict[str, Any]) -> Optional[str]:
    """Build the HonestDiD sentence from the breakdown value, if sensitivity ran."""
    if sens.get("status") != "ran":
        return None
    bkd = sens.get("breakdown_M")
    if bkd is None:
        return (
            "The effect remains significant across the entire HonestDiD "
            "grid — robust to plausible parallel-trends violations."
        )
    if not isinstance(bkd, (int, float)) or bkd != bkd:
        # Non-numeric or NaN breakdown: nothing meaningful to report.
        return None
    if bkd >= 1.0:
        return (
            f"HonestDiD sensitivity: the result remains significant under "
            f"parallel-trends violations up to {bkd:.2g}x the observed "
            f"pre-period variation."
        )
    return (
        f"HonestDiD sensitivity: the result is fragile — the "
        f"confidence interval includes zero once violations reach "
        f"{bkd:.2g}x the pre-period variation."
    )


def _secondary_caveat_sentences(schema: Dict[str, Any]) -> List[str]:
    """Build the Goodman-Bacon and survey design-effect caveats that apply."""
    caveats: List[str] = []
    bacon = schema.get("bacon") or {}
    if bacon.get("status") == "ran" and bacon.get("verdict") == "materially_contaminated":
        fw = bacon.get("forbidden_weight")
        if isinstance(fw, (int, float)):
            caveats.append(
                f"Goodman-Bacon decomposition flags {fw:.0%} of TWFE weight on "
                f"'forbidden' later-vs-earlier comparisons — consider a "
                f"heterogeneity-robust estimator (CS / SA / BJS / Gardner) if "
                f"not already in use."
            )
    deff = schema.get("design_effect") or {}
    if deff.get("status") == "ran" and not deff.get("is_trivial"):
        d = deff.get("deff")
        eff_n = deff.get("effective_n")
        if isinstance(d, (int, float)) and d >= 1.05:
            eff_str = f", effective n = {eff_n:.0f}" if isinstance(eff_n, (int, float)) else ""
            caveats.append(
                f"Survey design effect is {d:.2g} (variance inflation relative "
                f"to simple random sampling{eff_str})."
            )
    return caveats


def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str]) -> str:
    """Synthesize a plain-English paragraph across DR checks.

    The paragraph names the headline effect, the dominant validity concern
    (typically parallel trends or sensitivity), secondary caveats
    (design effect, Bacon), and one concrete next action.
    Never produces traffic-light verdicts — severity is conveyed by natural
    language per ``docs/methodology/REPORTING.md``.

    The CI level in the headline sentence is derived from the headline's
    ``alpha`` (95% when absent). Non-finite estimates, p-values, and CI bounds
    are never formatted into the prose.

    Parameters
    ----------
    schema : dict
        The DiagnosticReport schema; sections are read by key and may be absent.
    labels : dict
        Optional ``outcome_label`` / ``treatment_label`` overrides.

    Returns
    -------
    str
        Space-joined sentences, or ``""`` when nothing can be said.
    """
    est = schema.get("estimator", "the estimator")
    outcome = labels.get("outcome_label", "the outcome")
    treatment = labels.get("treatment_label", "the treatment")

    sentences: List[Optional[str]] = [
        # Sentence 1: headline
        _headline_sentence(schema.get("headline_metric") or {}, est, treatment, outcome),
        # Sentence 2: parallel trends + power
        _parallel_trends_sentence(
            schema.get("parallel_trends") or {}, schema.get("pretrends_power") or {}
        ),
        # Sentence 3: sensitivity
        _sensitivity_sentence(schema.get("sensitivity") or {}),
    ]
    # Sentence 4: secondary caveats, when present.
    sentences.extend(_secondary_caveat_sentences(schema))

    # Sentence 5: next step
    next_steps = schema.get("next_steps") or []
    if next_steps:
        top = next_steps[0]
        if top.get("label"):
            sentences.append(f"Next step: {top['label']}.")

    return " ".join(s for s in sentences if s)


def _render_dr_full_report(results: "DiagnosticReportResults") -> str:
    """Render a markdown report from a populated ``DiagnosticReportResults``.

    Emits a title, the estimator and headline, the overall interpretation, one
    ``##`` section per diagnostic, and the next steps. A section's ``reason``
    is printed whenever it is present, so failed (``error``) checks explain
    themselves too. Scalar fields are listed for every status except
    ``skipped`` / ``not_applicable``; nested dict/list values are omitted.
    """
    schema = results.schema
    lines: List[str] = [
        "# Diagnostic Report",
        "",
        f"**Estimator**: `{schema.get('estimator')}`",
    ]
    headline = schema.get("headline_metric")
    if headline:
        lines.append(
            f"**Headline**: {headline.get('name')} = "
            f"{headline.get('value')} "
            f"(SE {headline.get('se')}, p = {headline.get('p_value')})"
        )
    lines.extend(
        [
            "",
            "## Overall Interpretation",
            "",
            schema.get("overall_interpretation", "") or "_No synthesis available._",
            "",
        ]
    )

    section_order = [
        ("Parallel trends", "parallel_trends"),
        ("Pre-trends power", "pretrends_power"),
        ("HonestDiD sensitivity", "sensitivity"),
        ("Goodman-Bacon decomposition", "bacon"),
        ("Effect-stability / heterogeneity", "heterogeneity"),
        ("Survey design effect", "design_effect"),
        ("Propensity-score EPV", "epv"),
        ("Estimator-native diagnostics", "estimator_native_diagnostics"),
        ("Placebo battery", "placebo"),
    ]
    for title, key in section_order:
        section = schema.get(key) or {}
        status = section.get("status", "not_run")
        lines.append(f"## {title}")
        lines.append(f"- status: `{status}`")
        reason = section.get("reason")
        if reason:
            lines.append(f"- reason: {reason}")
        if status not in ("skipped", "not_applicable"):
            for field, value in section.items():
                if field in ("status", "reason") or isinstance(value, (dict, list)):
                    continue
                lines.append(f"- {field}: `{value}`")
        lines.append("")

    next_steps = schema.get("next_steps")
    if next_steps:
        lines.append("## Next Steps")
        for step in next_steps:
            if step.get("label"):
                lines.append(f"- {step['label']}")
                if step.get("why"):
                    # Two-space indent so markdown nests this under the label bullet.
                    lines.append(f"  - why: {step['why']}")
    return "\n".join(lines)
+ +Schema top-level keys (all always present; missing content uses a +`{"status": "skipped", "reason": "..."}` shape rather than being absent): + +- `schema_version`, `estimator`, `context` +- `headline`, `assumption`, `pre_trends`, `sensitivity` +- `sample`, `heterogeneity`, `robustness`, `diagnostics` +- `next_steps`, `caveats`, `references` + +Status enum values: `ran | skipped | error | not_applicable | not_run | computed`. + +## DiagnosticReport + +Unified diagnostic runner orchestrating `check_parallel_trends`, +`compute_pretrends_power`, `HonestDiD.sensitivity`, `bacon_decompose`, +`compute_deff_diagnostics`, `results.epv_diagnostics`, plus +estimator-native surfaces for SyntheticDiD (`pre_treatment_fit`, +`get_weight_concentration`, `in_time_placebo`, `sensitivity_to_zeta_omega`) +and TROP (factor-model metrics). EfficientDiD PT uses the native +`hausman_pretest`. + +```python +from diff_diff import DiagnosticReport + +dr = DiagnosticReport( + results, + data=df, # optional; needed for 2x2 PT, Bacon-from-scratch + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + alpha=0.05, + # Opt-outs (all default True except placebo) + run_parallel_trends=True, + run_sensitivity=True, + run_placebo=False, # opt-in; not implemented in MVP + run_bacon=True, + run_design_effect=True, + run_heterogeneity=True, + run_epv=True, + run_pretrends_power=True, # drives power-aware PT phrasing + sensitivity_M_grid=(0.5, 1.0, 1.5, 2.0), + sensitivity_method="relative_magnitude", + # Escape hatch for users who ran a diagnostic with custom args: + precomputed={"sensitivity": my_honest_did_results}, +) + +dr.run_all() # triggers compute, caches +print(dr.summary()) # overall-interpretation paragraph +dr.to_dict() # AI-legible schema +dr.to_dataframe() # one row per check +dr.applicable_checks # tuple of checks that will run for this estimator +dr.skipped_checks # dict of {check: plain-English reason} +``` + +Schema top-level keys: `schema_version, 
estimator, headline_metric, +parallel_trends, pretrends_power, sensitivity, placebo, bacon, +design_effect, heterogeneity, epv, estimator_native_diagnostics, +skipped, warnings, overall_interpretation, next_steps`. + +### Verdicts and tiers + +Pre-trends verdict (three bins, documented in `docs/methodology/REPORTING.md`): + +- `joint_p >= 0.30` -> `no_detected_violation` +- `0.05 <= joint_p < 0.30` -> `some_evidence_against` +- `joint_p < 0.05` -> `clear_violation` + +Power tier (drives BR phrasing for the `no_detected_violation` verdict): + +- `mdv / |att| < 0.25` -> `well_powered` +- `0.25 <= mdv / |att| < 1.0` -> `moderately_powered` +- `mdv / |att| >= 1.0` -> `underpowered` +- power not runnable -> `unknown` (BR falls back to underpowered phrasing) + +### Methodology notes + +BR and DR perform no new statistical computation — every reported number +is read from the fitted result or computed by an existing diff-diff +utility. Both schemas are experimental in the current release; see +`docs/methodology/REPORTING.md` for phrasing rules, the no-traffic-light +decision, unit-translation policy, and schema stability policy. diff --git a/diff_diff/guides/llms-practitioner.txt b/diff_diff/guides/llms-practitioner.txt index 6680d800..2a6688d8 100644 --- a/diff_diff/guides/llms-practitioner.txt +++ b/diff_diff/guides/llms-practitioner.txt @@ -439,6 +439,42 @@ Your analysis report MUST include all of the following: - [ ] Comparison across at least 2-3 estimators - [ ] Estimates with and without covariates (REQUIRED) +### One-call reporting via BusinessReport + DiagnosticReport + +The `DiagnosticReport` class orchestrates Steps 3 (parallel trends), 6 +(sensitivity), and 7 (heterogeneity) in a single call and produces +plain-English output. Pair with `BusinessReport` for a +stakeholder-ready narrative: + +```python +from diff_diff import BusinessReport, DiagnosticReport + +# Optional: run diagnostics explicitly so you can inspect the structure. 
+dr = DiagnosticReport(cs_result, data=data, outcome='y', unit='id', + time='t', first_treat='g') +dr.run_all() +print(dr.summary()) # overall interpretation paragraph +dr.to_dict() # AI-legible structured schema + +# Or let BusinessReport auto-construct a DiagnosticReport and render the +# full stakeholder narrative in one call: +br = BusinessReport( + cs_result, + outcome_label='Revenue per user', + outcome_unit='$', + business_question='Did the campaign lift revenue?', + treatment_label='the campaign', +) +print(br.summary()) # short paragraph block +print(br.full_report()) # structured markdown +``` + +`DiagnosticReport` uses power-aware phrasing: when a pre-trends test +does not reject, the summary reflects whether the test is well-powered +(via `compute_pretrends_power`), rather than defaulting to "parallel +trends hold". See `docs/methodology/REPORTING.md` for the full verdict +and tier rules. + ### Runtime guidance ```python from diff_diff import practitioner_next_steps diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst new file mode 100644 index 00000000..59a4306c --- /dev/null +++ b/docs/api/business_report.rst @@ -0,0 +1,52 @@ +BusinessReport +============== + +``BusinessReport`` wraps any fitted diff-diff result object and produces +stakeholder-ready output: + +- ``summary()`` — a short paragraph block suitable for an email or Slack. +- ``full_report()`` — a structured multi-section markdown report. +- ``to_dict()`` — a stable AI-legible structured schema (single source + of truth; prose renders from this dict). + +By default, BusinessReport constructs an internal ``DiagnosticReport`` +to surface pre-trends, sensitivity, and other validity checks as part +of the narrative. Pass ``auto_diagnostics=False`` to skip this, or +``diagnostics=`` to supply an explicit one. 
+ +Methodology deviations (no traffic-light gates, pre-trends verdict +thresholds, power-aware phrasing, unit-translation policy, schema +stability) are documented in :doc:`../methodology/REPORTING`. + +Example +------- + +.. code-block:: python + + from diff_diff import CallawaySantAnna, BusinessReport + + cs = CallawaySantAnna().fit( + df, outcome="revenue", unit="store", time="period", + first_treat="first_treat", aggregate="event_study", + ) + report = BusinessReport( + cs, + outcome_label="Revenue per store", + outcome_unit="$", + business_question="Did the loyalty program lift revenue?", + treatment_label="the loyalty program", + ) + print(report.summary()) + +API +--- + +.. autoclass:: diff_diff.BusinessReport + :members: + :show-inheritance: + +.. autoclass:: diff_diff.BusinessContext + :members: + :show-inheritance: + +.. autodata:: diff_diff.BUSINESS_REPORT_SCHEMA_VERSION diff --git a/docs/api/diagnostic_report.rst b/docs/api/diagnostic_report.rst new file mode 100644 index 00000000..fc66d110 --- /dev/null +++ b/docs/api/diagnostic_report.rst @@ -0,0 +1,51 @@ +DiagnosticReport +================ + +``DiagnosticReport`` orchestrates the library's existing diagnostic +functions (parallel trends, pre-trends power, HonestDiD sensitivity, +Goodman-Bacon, design-effect, EPV, heterogeneity, and estimator-native +checks for SyntheticDiD and TROP) into a single report with a stable +AI-legible schema. + +Construction is free; ``run_all()`` triggers the compute and caches. +A second call to ``to_dict()`` or ``summary()`` reuses the cached +result. + +Methodology deviations (no traffic-light gates, opt-in placebo +battery, estimator-native diagnostic routing, power-aware phrasing +threshold) are documented in :doc:`../methodology/REPORTING`. + +Example +------- + +.. 
code-block:: python + + from diff_diff import CallawaySantAnna, DiagnosticReport + + cs = CallawaySantAnna().fit( + df, outcome="outcome", unit="unit", time="period", + first_treat="first_treat", aggregate="event_study", + ) + dr = DiagnosticReport( + cs, + data=df, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + print(dr.summary()) + dr.to_dataframe() # one row per check + +API +--- + +.. autoclass:: diff_diff.DiagnosticReport + :members: + :show-inheritance: + +.. autoclass:: diff_diff.DiagnosticReportResults + :members: + :show-inheritance: + +.. autodata:: diff_diff.DIAGNOSTIC_REPORT_SCHEMA_VERSION diff --git a/docs/api/index.rst b/docs/api/index.rst index 3d08dc98..da128317 100644 --- a/docs/api/index.rst +++ b/docs/api/index.rst @@ -253,6 +253,15 @@ Diagnostics & Inference power pretrends +Reporting +~~~~~~~~~ + +.. toctree:: + :maxdepth: 2 + + business_report + diagnostic_report + Results & Visualization ~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/doc-deps.yaml b/docs/doc-deps.yaml index aee0b9d0..dfb1181b 100644 --- a/docs/doc-deps.yaml +++ b/docs/doc-deps.yaml @@ -482,6 +482,36 @@ sources: - path: docs/tutorials/07_pretrends_power.ipynb type: tutorial + diff_diff/business_report.py: + drift_risk: medium + docs: + - path: docs/methodology/REPORTING.md + type: methodology + note: "Phrasing rules, pre-trends verdict thresholds, unit-translation policy, schema stability." + - path: docs/api/business_report.rst + type: api_reference + - path: README.md + section: "BusinessReport" + type: user_guide + - path: diff_diff/guides/llms-full.txt + section: "BusinessReport" + type: user_guide + + diff_diff/diagnostic_report.py: + drift_risk: medium + docs: + - path: docs/methodology/REPORTING.md + type: methodology + note: "Applicability matrix, opt-in placebo rationale, native-diagnostic routing, no-traffic-lights decision." 
+ - path: docs/api/diagnostic_report.rst + type: api_reference + - path: README.md + section: "DiagnosticReport" + type: user_guide + - path: diff_diff/guides/llms-full.txt + section: "DiagnosticReport" + type: user_guide + diff_diff/power.py: drift_risk: low docs: diff --git a/docs/methodology/REGISTRY.md b/docs/methodology/REGISTRY.md index f1c575cf..99e1b181 100644 --- a/docs/methodology/REGISTRY.md +++ b/docs/methodology/REGISTRY.md @@ -2842,6 +2842,19 @@ The 8-step workflow in `diff_diff/guides/llms-practitioner.txt` is adapted from --- +# Reporting + +BusinessReport and DiagnosticReport are the practitioner-ready output +layer. Their methodology (phrasing rules, pre-trends verdict +thresholds, power-aware phrasing, unit-translation policy, schema +stability, no-traffic-light-gates decision, estimator-native diagnostic +routing) is recorded in a dedicated file to keep this registry +estimator-focused: + +- See [`REPORTING.md`](./REPORTING.md). + +--- + # Version History - **v1.3** (2026-03-26): Added Replicate Weight Variance, DEFF Diagnostics, diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md new file mode 100644 index 00000000..f6cee0b5 --- /dev/null +++ b/docs/methodology/REPORTING.md @@ -0,0 +1,141 @@ +# Reporting + +This document records the methodology choices embedded in +`BusinessReport` and `DiagnosticReport` — the convenience layer that +produces plain-English stakeholder narratives from any diff-diff result. + +Methodology for estimators lives in `REGISTRY.md`. This file is the +single source for reporting-layer decisions; `REGISTRY.md` cross-links +here rather than duplicating content. + +## Module + +- `diff_diff/business_report.py` — `BusinessReport`, `BusinessContext`. +- `diff_diff/diagnostic_report.py` — `DiagnosticReport`, + `DiagnosticReportResults`. + +Both modules dispatch by `type(results).__name__` lookup to avoid +circular imports across the 16 result classes. 
They perform no new +statistical computation; every reported number is read from the fitted +result or computed by an existing diff-diff utility +(`compute_honest_did`, `HonestDiD.sensitivity`, `bacon_decompose`, +`check_parallel_trends`, `compute_deff_diagnostics`, +`compute_pretrends_power`). + +## Design deviations + +- **Note:** No hard pass/fail gates. `DiagnosticReport` does not produce + a traffic-light verdict. Severity is conveyed through natural-language + phrasing ("robust", "fragile", "material share"). This is an explicit + deviation from the strategy document's Gap 4 ("traffic-light + assessment (green/yellow/red)"); the choice is motivated by the + well-known risk of naive thresholds producing false confidence. A + `ConservativeThresholds` opt-in layer remains available as a future + addition if practitioner demand materialises. + +- **Note:** Placebo battery is opt-in (`run_placebo=False` by default). + `run_all_placebo_tests` on a typical panel (500 permutations times one + DiD fit per permutation) adds tens of seconds of latency, which would + be surprising as the default on a convenience wrapper. The schema + reserves the `"placebo"` key; it is always rendered with + `{"status": "skipped", "reason": "..."}` in MVP so agents parsing the + schema see a stable shape. + +- **Note:** `DiagnosticReport` does not call `check_parallel_trends` on + event-study or staggered result objects. `check_parallel_trends` in + `diff_diff/utils.py` assumes a single binary treatment with universal + pre-periods; for staggered and event-study designs, DR reads the + pre-period event-study coefficients directly and constructs a joint + Wald statistic (or Bonferroni fallback when `vcov` is missing). This + mirrors the guidance in `practitioner._parallel_trends_step(staggered=True)`. + +- **Note:** Estimator-native validation surfaces are surfaced rather + than duplicated. 
`SyntheticDiDResults` routes parallel-trends to + `pre_treatment_fit` (the RMSE of the synthetic-control fit on the + pre-period), and routes sensitivity to `in_time_placebo()` + + `sensitivity_to_zeta_omega()`. `TROPResults` surfaces factor-model + diagnostics (`effective_rank`, `loocv_score`, selected `lambda_*`) + under `estimator_native_diagnostics`. `EfficientDiDResults` PT runs + through `EfficientDiD.hausman_pretest` (the estimator's native + PT-All vs PT-Post check). + +- **Note:** Pre-trends verdict is a three-bin heuristic, not a field + convention. DR maps the joint p-value as follows: + + - `joint_p >= 0.30` → `no_detected_violation`. + - `0.05 <= joint_p < 0.30` → `some_evidence_against`. + - `joint_p < 0.05` → `clear_violation`. + + These thresholds are diff-diff heuristics. The 0.30 upper bound draws + on equivalence-testing intuition (Rambachan & Roth 2023 discuss the + limitations of pre-tests). The `no_detected_violation` label + deliberately avoids "parallel trends hold" language — the test did + not detect a violation, but pre-trends tests are commonly + underpowered. See the power-aware phrasing rule below. + +- **Note:** Power-aware phrasing for `no_detected_violation`. DR calls + `compute_pretrends_power(results, violation_type='linear', + alpha=alpha, target_power=0.80)` whenever the result has an + event-study surface with a `vcov`. BusinessReport then reads + `mdv_share_of_att = mdv / abs(att)` and selects a tier: + + - `< 0.25` → `well_powered` — "the test has 80% power to + detect a violation of magnitude M, which is only X% of the + estimated effect; if a material pre-trend existed, this test would + likely have caught it." + - `>= 0.25 and < 1.0` → `moderately_powered` — "the test + is informative but not definitive; see the sensitivity analysis + below for bounded-violation guarantees." + - `>= 1.0` → `underpowered` — "the test has limited + power — a non-rejection does not prove the assumption. 
See + the HonestDiD sensitivity analysis below for a more reliable + signal." + - Power analysis not runnable → fall back to `underpowered` + phrasing; the fallback reason is recorded in + `schema["pre_trends"]["power_status"]`. + + Rationale: always-hedging phrasing under-sells well-designed + studies; always-confident phrasing over-sells underpowered ones. + The library already ships `compute_pretrends_power()`, so using it + is the honest default rather than hedging every non-violation. + +- **Note:** Unit-translation policy. BusinessReport does not + arithmetically translate log-points to percents or level effects to + log-points. The estimate is rendered in the scale the estimator + produced; `outcome_unit="log_points"` emits an informational + caveat. The policy avoids guessing the underlying model (no + estimator in the library currently exports both log and level + coefficients), which would be unsafe in the presence of non-linear + link functions (Poisson QMLE, logit). + +- **Note:** Single-knob `alpha`. BusinessReport exposes only `alpha` + (defaults to `results.alpha`); there is no separate + `significance_threshold` parameter. `alpha` drives both the CI level + (`(1 - alpha) * 100`% interval) and the phrasing tier threshold + ("statistically significant at the (1 - alpha) * 100% level"). + +- **Note:** Schema stability policy for the AI-legible `to_dict()` + surface. New top-level keys count as additive (no version bump); new + values in any `status` enum count as breaking (agents doing + exhaustive pattern match will break on unknown enums); renames and + removals count as breaking. The `BUSINESS_REPORT_SCHEMA_VERSION` + and `DIAGNOSTIC_REPORT_SCHEMA_VERSION` constants bump independently. + The v3.2 CHANGELOG marks both schemas experimental so users do not + anchor tooling on them prematurely; a formal deprecation policy will + land within two subsequent PRs. + +## Reference implementation(s) + +The phrasing rules follow the guidance in: + +- Baker, A. 
C., Callaway, B., Cunningham, S., Goodman-Bacon, A., & + Sant'Anna, P. H. C. (2025). *Difference-in-Differences Designs: A + Practitioner's Guide.* (The 8-step workflow enforced through + `diff_diff/practitioner.py`.) +- Rambachan, A., & Roth, J. (2023). *A More Credible Approach to + Parallel Trends.* Review of Economic Studies. (HonestDiD sensitivity; + the pre-test power caveat directly shaped the three-tier power + phrasing.) +- Roth, J. (2022). *Pretest with Caution: Event-study Estimates after + Testing for Parallel Trends.* American Economic Review: Insights. + (Motivates the power-aware phrasing tiers.) diff --git a/tests/test_business_report.py b/tests/test_business_report.py new file mode 100644 index 00000000..67ba2819 --- /dev/null +++ b/tests/test_business_report.py @@ -0,0 +1,484 @@ +"""Tests for ``diff_diff.business_report.BusinessReport``. + +Covers the expanded test list from the approved plan: +- Schema contract across result types. +- JSON round-trip. +- BR-DR integration (auto, explicit, False). +- ``honest_did_results=`` passthrough (no re-computation). +- Unit-label behavior (pp vs $ differ; column-name fallback). +- Log-points unit policy (no arithmetic translation; informational caveat). +- Significance-chasing guard boundary. +- Pre-trends verdict thresholds (three bins routed through BR phrasing). +- Power-aware phrasing (three tiers + underpowered fallback). +- NaN ATT surfaces a caveat and does not crash. +- ``include_appendix`` toggle. +- ``BusinessReport(BaconDecompositionResults)`` raises TypeError. +- Survey metadata passthrough to schema + phrasing. +- Single-knob alpha drives both CI level and phrasing. 
+""" + +from __future__ import annotations + +import json +import warnings +from unittest.mock import patch + +import numpy as np +import pytest + +import diff_diff as dd +from diff_diff import ( + BusinessReport, + BusinessContext, + CallawaySantAnna, + DiagnosticReport, + DifferenceInDifferences, + MultiPeriodDiD, + SyntheticDiD, + bacon_decompose, + generate_did_data, + generate_factor_data, + generate_staggered_data, +) +from diff_diff.business_report import BUSINESS_REPORT_SCHEMA_VERSION + +warnings.filterwarnings("ignore") + +_BR_TOP_LEVEL_KEYS = { + "schema_version", + "estimator", + "context", + "headline", + "assumption", + "pre_trends", + "sensitivity", + "sample", + "heterogeneity", + "robustness", + "diagnostics", + "next_steps", + "caveats", + "references", +} + + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- +@pytest.fixture(scope="module") +def did_fit(): + df = generate_did_data(n_units=80, n_periods=4, treatment_effect=1.5, seed=7) + did = DifferenceInDifferences().fit(df, outcome="outcome", treatment="treated", time="post") + return did, df + + +@pytest.fixture(scope="module") +def event_study_fit(): + df = generate_did_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + es = MultiPeriodDiD().fit( + df, + outcome="outcome", + treatment="treated", + time="period", + unit="unit", + reference_period=3, + ) + return es, df + + +@pytest.fixture(scope="module") +def cs_fit(): + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + cs = CallawaySantAnna().fit( + sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + aggregate="event_study", + ) + return cs, sdf + + +@pytest.fixture(scope="module") +def sdid_fit(): + fdf = generate_factor_data(n_units=25, n_pre=8, n_post=4, n_treated=4, seed=11) + sdid = SyntheticDiD().fit(fdf, outcome="outcome", 
unit="unit", time="period", treatment="treat") + return sdid, fdf + + +# --------------------------------------------------------------------------- +# Schema contract +# --------------------------------------------------------------------------- +class TestSchemaContract: + def test_top_level_keys(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, auto_diagnostics=False) + assert set(br.to_dict().keys()) == _BR_TOP_LEVEL_KEYS + + def test_schema_version(self, event_study_fit): + fit, _ = event_study_fit + assert ( + BusinessReport(fit, auto_diagnostics=False).to_dict()["schema_version"] + == BUSINESS_REPORT_SCHEMA_VERSION + ) + + def test_json_round_trip(self, cs_fit): + fit, _ = cs_fit + br = BusinessReport( + fit, + outcome_label="sales", + outcome_unit="$", + treatment_label="the policy", + ) + dumped = json.dumps(br.to_dict()) + assert len(dumped) > 0 + assert json.loads(dumped)["schema_version"] == BUSINESS_REPORT_SCHEMA_VERSION + + def test_json_round_trip_sdid(self, sdid_fit): + fit, _ = sdid_fit + br = BusinessReport(fit, outcome_label="revenue", outcome_unit="$") + dumped = json.dumps(br.to_dict()) + assert len(dumped) > 0 + + +# --------------------------------------------------------------------------- +# BR ↔ DR integration +# --------------------------------------------------------------------------- +class TestDiagnosticsIntegration: + def test_auto_diagnostics_true_populates_diagnostics_block(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, auto_diagnostics=True) + d = br.to_dict() + assert d["diagnostics"]["status"] == "ran" + assert "schema" in d["diagnostics"] + + def test_auto_diagnostics_false_skips(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, auto_diagnostics=False) + d = br.to_dict() + assert d["diagnostics"]["status"] == "skipped" + assert "auto_diagnostics=False" in d["diagnostics"]["reason"] + + def 
test_explicit_diagnostics_results_takes_precedence(self, event_study_fit): + fit, _ = event_study_fit + dr = DiagnosticReport(fit) + dr_results = dr.run_all() + br = BusinessReport(fit, diagnostics=dr_results) + d = br.to_dict() + assert d["diagnostics"]["status"] == "ran" + # Same dict identity shows the supplied results were used verbatim. + assert d["diagnostics"]["schema"] is dr_results.schema + + def test_explicit_diagnostics_report_runs(self, event_study_fit): + fit, _ = event_study_fit + dr = DiagnosticReport(fit) + br = BusinessReport(fit, diagnostics=dr) + assert br.to_dict()["diagnostics"]["status"] == "ran" + + def test_diagnostics_wrong_type_raises(self, event_study_fit): + fit, _ = event_study_fit + with pytest.raises(TypeError): + BusinessReport(fit, diagnostics="not a DR") # type: ignore[arg-type] + + +# --------------------------------------------------------------------------- +# HonestDiD passthrough +# --------------------------------------------------------------------------- +class TestHonestDiDPassthrough: + def test_supplied_sensitivity_is_not_recomputed(self, event_study_fit): + fit, _ = event_study_fit + + class _FakeSens: + M_values = np.array([0.5, 1.0]) + bounds = [(0.1, 2.0), (-0.2, 2.5)] + robust_cis = [(0.05, 2.1), (-0.3, 2.6)] + breakdown_M = 1.5 + method = "relative_magnitude" + original_estimate = 1.0 + original_se = 0.2 + alpha = 0.05 + + fake = _FakeSens() + with patch("diff_diff.honest_did.HonestDiD.sensitivity_analysis") as mock: + br = BusinessReport(fit, honest_did_results=fake) + schema = br.to_dict() + mock.assert_not_called() + sens = schema["sensitivity"] + assert sens["status"] == "computed" + assert sens["breakdown_M"] == 1.5 + + +# --------------------------------------------------------------------------- +# Unit labels and policy +# --------------------------------------------------------------------------- +class TestUnitLabels: + def test_dollar_unit_formats_currency(self, cs_fit): + fit, _ = cs_fit + br = 
BusinessReport(fit, outcome_label="sales", outcome_unit="$", auto_diagnostics=False) + headline = br.headline() + assert "$" in headline + + def test_pp_unit_formats_percentage_points(self, cs_fit): + fit, _ = cs_fit + br = BusinessReport( + fit, outcome_label="awareness", outcome_unit="pp", auto_diagnostics=False + ) + headline = br.headline() + assert "pp" in headline + + def test_zero_config_falls_back_to_generic_label(self, cs_fit): + fit, _ = cs_fit + br = BusinessReport(fit, auto_diagnostics=False) + d = br.to_dict() + assert d["context"]["outcome_label"] == "the outcome" + assert d["context"]["treatment_label"] == "the treatment" + + def test_log_points_emits_unit_policy_caveat(self, cs_fit): + fit, _ = cs_fit + br = BusinessReport(fit, outcome_unit="log_points", auto_diagnostics=False) + caveats = br.caveats() + topics = {c.get("topic") for c in caveats} + assert "unit_policy" in topics + + +# --------------------------------------------------------------------------- +# Significance phrasing +# --------------------------------------------------------------------------- +class TestSignificancePhrasing: + def test_high_significance_produces_strong_language(self, cs_fit): + """CS on this seed has p ~ 1e-56 (very strong) -> 'strongly supported'.""" + fit, _ = cs_fit + br = BusinessReport(fit, outcome_label="sales", outcome_unit="$") + summary = br.summary() + assert "strongly supported" in summary + + def test_near_threshold_caveat(self, event_study_fit): + """Fabricate a p-value near 0.05 to exercise the significance-chasing guard.""" + fit, _ = event_study_fit + # Monkey-patch the result to land p_value in (0.04, 0.051). 
+ original = fit.avg_p_value + try: + fit.avg_p_value = 0.045 + br = BusinessReport(fit, auto_diagnostics=False) + caveats = br.caveats() + topics = {c.get("topic") for c in caveats} + assert "near_significance" in topics + finally: + fit.avg_p_value = original + + def test_far_from_threshold_no_near_caveat(self, event_study_fit): + fit, _ = event_study_fit + original = fit.avg_p_value + try: + fit.avg_p_value = 0.010 + br = BusinessReport(fit, auto_diagnostics=False) + topics = {c.get("topic") for c in br.caveats()} + assert "near_significance" not in topics + finally: + fit.avg_p_value = original + + +# --------------------------------------------------------------------------- +# Pre-trends verdict + power tier phrasing +# --------------------------------------------------------------------------- +class TestPreTrendsVerdictPhrasing: + """Verdict and tier should flow through into schema AND phrasing.""" + + def test_verdict_and_tier_surface_in_schema(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, auto_diagnostics=True) + pt = br.to_dict()["pre_trends"] + # This fixture has a clear violation and an underpowered test — both set. + assert pt["status"] == "computed" + assert pt["verdict"] in { + "no_detected_violation", + "some_evidence_against", + "clear_violation", + } + + def test_clear_violation_phrased_tentatively(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, auto_diagnostics=True) + if br.to_dict()["pre_trends"].get("verdict") == "clear_violation": + summary = br.summary() + assert "tentative" in summary or "reject parallel trends" in summary + + def test_underpowered_phrasing_uses_hedge_language(self, cs_fit): + """CS fit on this seed typically produces 'no_detected_violation' + underpowered.""" + fit, sdf = cs_fit + # Force the CS fit through our BR pipeline. 
+ br = BusinessReport( + fit, + outcome_label="sales", + outcome_unit="$", + diagnostics=DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ), + ) + pt = br.to_dict()["pre_trends"] + if pt.get("verdict") == "no_detected_violation": + summary = br.summary() + # One of the three tier-specific phrases should appear. + assert ( + "limited power" in summary + or "moderately informative" in summary + or "well-powered" in summary + or "likely have been detected" in summary + ) + + +# --------------------------------------------------------------------------- +# NaN ATT +# --------------------------------------------------------------------------- +class TestNaNATT: + def test_nan_att_produces_caveat_and_does_not_crash(self, event_study_fit): + fit, _ = event_study_fit + original = fit.avg_att + try: + fit.avg_att = float("nan") + br = BusinessReport(fit, auto_diagnostics=False) + summary = br.summary() + caveats = br.caveats() + assert isinstance(summary, str) + assert any(c.get("topic") == "estimation_failure" for c in caveats) + assert br.to_dict()["headline"]["sign"] == "undefined" + finally: + fit.avg_att = original + + +# --------------------------------------------------------------------------- +# include_appendix toggle +# --------------------------------------------------------------------------- +class TestAppendix: + def test_include_appendix_true_embeds_summary(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, auto_diagnostics=False, include_appendix=True) + md = br.full_report() + assert "## Technical Appendix" in md + + def test_include_appendix_false_omits(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, auto_diagnostics=False, include_appendix=False) + md = br.full_report() + assert "## Technical Appendix" not in md + + +# --------------------------------------------------------------------------- +# BaconDecompositionResults +# 
--------------------------------------------------------------------------- +class TestBaconTypeError: + def test_br_on_bacon_raises(self): + sdf = generate_staggered_data(n_units=30, n_periods=6, treatment_effect=1.5, seed=7) + bacon = bacon_decompose( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + with pytest.raises(TypeError, match="BaconDecompositionResults is a diagnostic"): + BusinessReport(bacon) + + +# --------------------------------------------------------------------------- +# Survey metadata passthrough +# --------------------------------------------------------------------------- +class TestSurveyPassthrough: + def test_survey_absent_yields_null_survey_block(self, cs_fit): + fit, _ = cs_fit + br = BusinessReport(fit, auto_diagnostics=False) + d = br.to_dict() + assert d["sample"]["survey"] is None + + def test_survey_present_populates_block(self, event_study_fit): + """Synthetically attach a survey_metadata shim and verify BR surfaces it.""" + fit, _ = event_study_fit + + class _ShimMeta: + weight_type = "pweight" + effective_n = 120.0 + design_effect = 2.5 + sum_weights = 200.0 + n_strata = 8 + n_psu = 20 + df_survey = 18 + replicate_method = None + + original = fit.survey_metadata + try: + fit.survey_metadata = _ShimMeta() + br = BusinessReport(fit, auto_diagnostics=False) + survey = br.to_dict()["sample"]["survey"] + assert survey is not None + assert survey["weight_type"] == "pweight" + assert survey["design_effect"] == 2.5 + assert survey["is_trivial"] is False + + summary = br.summary() + # When DEFF >= 1.5 we inject a caveat or a summary sentence. 
+ assert ( + "design effect" in summary.lower() + or "effective sample size" in summary.lower() + or any(c.get("topic") == "design_effect" for c in br.caveats()) + ) + finally: + fit.survey_metadata = original + + +# --------------------------------------------------------------------------- +# Single-knob alpha +# --------------------------------------------------------------------------- +class TestAlphaKnob: + def test_alpha_drives_ci_level(self, event_study_fit): + fit, _ = event_study_fit + br90 = BusinessReport(fit, alpha=0.10, auto_diagnostics=False) + br95 = BusinessReport(fit, alpha=0.05, auto_diagnostics=False) + assert br90.to_dict()["headline"]["ci_level"] == 90 + assert br95.to_dict()["headline"]["ci_level"] == 95 + + +# --------------------------------------------------------------------------- +# Summary + full_report work across estimators +# --------------------------------------------------------------------------- +class TestAcrossEstimators: + def test_summary_nonempty_for_all(self, did_fit, event_study_fit, cs_fit, sdid_fit): + for fit, _ in (did_fit, event_study_fit, cs_fit, sdid_fit): + br = BusinessReport(fit, auto_diagnostics=False) + s = br.summary() + assert isinstance(s, str) + assert len(s) > 0 + + +# --------------------------------------------------------------------------- +# Public API exposure +# --------------------------------------------------------------------------- +def test_public_api_exports(): + for name in ("BusinessReport", "BusinessContext", "BUSINESS_REPORT_SCHEMA_VERSION"): + assert hasattr(dd, name) + + +def test_repr_includes_estimator_and_effect(cs_fit): + fit, _ = cs_fit + r = repr(BusinessReport(fit, auto_diagnostics=False)) + assert "CallawaySantAnnaResults" in r + + +def test_str_equals_summary(cs_fit): + fit, _ = cs_fit + br = BusinessReport(fit, auto_diagnostics=False) + assert str(br) == br.summary() + + +def test_business_context_is_frozen_dataclass(): + ctx = BusinessContext( + outcome_label="x", + 
outcome_unit=None, + outcome_direction=None, + business_question=None, + treatment_label="y", + alpha=0.05, + ) + with pytest.raises(AttributeError): # FrozenInstanceError subclasses AttributeError + ctx.alpha = 0.10 # type: ignore[misc] diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py new file mode 100644 index 00000000..da00d3ac --- /dev/null +++ b/tests/test_diagnostic_report.py @@ -0,0 +1,462 @@ +"""Tests for ``diff_diff.diagnostic_report.DiagnosticReport``. + +Covers: +- Schema contract: every top-level key always present, stable enum values. +- Applicability matrix: per-estimator ``applicable_checks`` property. +- JSON round-trip. +- ``precomputed=`` passthrough (sensitivity). +- Pre-trends verdict thresholds (three bins). +- Power-aware tier thresholds (three bins + fallback). +- DEFF reads from ``survey_metadata`` when present. +- EfficientDiD ``hausman_pretest`` pathway. +- SDiD / TROP native diagnostics. +- Error-doesn't-break-report (diagnostic raises -> section records error). +""" + +from __future__ import annotations + +import json +import warnings +from unittest.mock import patch + +import numpy as np +import pytest + +import diff_diff as dd +from diff_diff import ( + CallawaySantAnna, + DiagnosticReport, + DiagnosticReportResults, + DifferenceInDifferences, + EfficientDiD, + MultiPeriodDiD, + SyntheticDiD, + generate_did_data, + generate_factor_data, + generate_staggered_data, +) +from diff_diff.diagnostic_report import ( + DIAGNOSTIC_REPORT_SCHEMA_VERSION, + _power_tier, + _pt_verdict, +) + +# --------------------------------------------------------------------------- +# Fixtures +# --------------------------------------------------------------------------- +_TOP_LEVEL_KEYS = { + "schema_version", + "estimator", + "headline_metric", + "parallel_trends", + "pretrends_power", + "sensitivity", + "placebo", + "bacon", + "design_effect", + "heterogeneity", + "epv", + "estimator_native_diagnostics", + "skipped", + "warnings", + "overall_interpretation", +
"next_steps", +} + +_STATUS_ENUM = { + "ran", + "skipped", + "error", + "not_applicable", + "not_run", + "computed", +} + + +@pytest.fixture(scope="module") +def did_fit(): + warnings.filterwarnings("ignore") + df = generate_did_data(n_units=80, n_periods=4, treatment_effect=1.5, seed=7) + did = DifferenceInDifferences().fit(df, outcome="outcome", treatment="treated", time="post") + return did, df + + +@pytest.fixture(scope="module") +def multi_period_fit(): + warnings.filterwarnings("ignore") + df = generate_did_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + es = MultiPeriodDiD().fit( + df, + outcome="outcome", + treatment="treated", + time="period", + unit="unit", + reference_period=3, + ) + return es, df + + +@pytest.fixture(scope="module") +def cs_fit(): + warnings.filterwarnings("ignore") + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + cs = CallawaySantAnna().fit( + sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + aggregate="event_study", + ) + return cs, sdf + + +@pytest.fixture(scope="module") +def edid_fit(): + warnings.filterwarnings("ignore") + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + edid = EfficientDiD().fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + return edid, sdf + + +@pytest.fixture(scope="module") +def sdid_fit(): + warnings.filterwarnings("ignore") + fdf = generate_factor_data(n_units=25, n_pre=8, n_post=4, n_treated=4, seed=11) + sdid = SyntheticDiD().fit(fdf, outcome="outcome", unit="unit", time="period", treatment="treat") + return sdid, fdf + + +# --------------------------------------------------------------------------- +# Schema contract +# --------------------------------------------------------------------------- +class TestSchemaContract: + """The AI-legible schema is the public promise. 
These tests lock it down.""" + + def test_every_top_level_key_present_did(self, did_fit): + fit, df = did_fit + dr = DiagnosticReport(fit, data=df, outcome="outcome", treatment="treated", time="post") + schema = dr.to_dict() + assert set(schema.keys()) == _TOP_LEVEL_KEYS + + def test_every_top_level_key_present_multiperiod(self, multi_period_fit): + fit, _ = multi_period_fit + schema = DiagnosticReport(fit).to_dict() + assert set(schema.keys()) == _TOP_LEVEL_KEYS + + def test_every_top_level_key_present_cs(self, cs_fit): + fit, sdf = cs_fit + schema = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ).to_dict() + assert set(schema.keys()) == _TOP_LEVEL_KEYS + + def test_every_top_level_key_present_sdid(self, sdid_fit): + fit, _ = sdid_fit + schema = DiagnosticReport(fit).to_dict() + assert set(schema.keys()) == _TOP_LEVEL_KEYS + + def test_schema_version_constant(self, multi_period_fit): + fit, _ = multi_period_fit + schema = DiagnosticReport(fit).to_dict() + assert schema["schema_version"] == DIAGNOSTIC_REPORT_SCHEMA_VERSION + assert DIAGNOSTIC_REPORT_SCHEMA_VERSION == "1.0" + + def test_all_statuses_use_closed_enum(self, cs_fit): + fit, sdf = cs_fit + schema = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ).to_dict() + for key in [ + "parallel_trends", + "pretrends_power", + "sensitivity", + "placebo", + "bacon", + "design_effect", + "heterogeneity", + "epv", + "estimator_native_diagnostics", + ]: + section = schema.get(key) + assert isinstance(section, dict), f"{key} missing" + assert ( + section.get("status") in _STATUS_ENUM + ), f"{key}.status = {section.get('status')!r} not in {_STATUS_ENUM}" + + def test_json_round_trip_multiperiod(self, multi_period_fit): + fit, _ = multi_period_fit + dr = DiagnosticReport(fit) + dumped = json.dumps(dr.to_dict()) + assert len(dumped) > 0 + round = json.loads(dumped) + assert 
round["schema_version"] == DIAGNOSTIC_REPORT_SCHEMA_VERSION + + def test_json_round_trip_sdid(self, sdid_fit): + fit, _ = sdid_fit + dumped = json.dumps(DiagnosticReport(fit).to_dict()) + assert len(dumped) > 0 + + +# --------------------------------------------------------------------------- +# Applicability matrix +# --------------------------------------------------------------------------- +class TestApplicabilityMatrix: + """Per-estimator applicability set filtered by instance state + options.""" + + def test_did_without_data_skips_pt(self, did_fit): + fit, _ = did_fit + dr = DiagnosticReport(fit) # no data + assert "parallel_trends" not in dr.applicable_checks + assert "parallel_trends" in dr.skipped_checks + reason = dr.skipped_checks["parallel_trends"] + assert "data" in reason.lower() + + def test_did_with_data_runs_pt(self, did_fit): + fit, df = did_fit + dr = DiagnosticReport(fit, data=df, outcome="outcome", treatment="treated", time="post") + assert "parallel_trends" in dr.applicable_checks + + def test_multiperiod_runs_pt_and_power_and_sensitivity(self, multi_period_fit): + fit, _ = multi_period_fit + dr = DiagnosticReport(fit) + applicable = set(dr.applicable_checks) + assert "parallel_trends" in applicable + assert "pretrends_power" in applicable + assert "sensitivity" in applicable + + def test_cs_runs_heterogeneity(self, cs_fit): + fit, sdf = cs_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + applicable = set(dr.applicable_checks) + assert "heterogeneity" in applicable + assert "bacon" in applicable + assert "parallel_trends" in applicable + + def test_sdid_has_estimator_native(self, sdid_fit): + fit, _ = sdid_fit + dr = DiagnosticReport(fit) + assert "estimator_native" in dr.applicable_checks + + def test_run_opt_outs_move_checks_to_skipped(self, multi_period_fit): + fit, _ = multi_period_fit + dr = DiagnosticReport(fit, run_sensitivity=False) + assert 
"sensitivity" not in dr.applicable_checks + assert dr.skipped_checks["sensitivity"].startswith("run_sensitivity=False") + + def test_placebo_is_reserved_and_skipped(self, did_fit): + """Placebo is always in _CHECK_NAMES, always skipped in MVP.""" + fit, df = did_fit + dr = DiagnosticReport(fit, data=df, outcome="outcome", treatment="treated", time="post") + placebo_section = dr.to_dict()["placebo"] + assert placebo_section["status"] in {"skipped", "not_applicable"} + + +# --------------------------------------------------------------------------- +# Precomputed passthrough +# --------------------------------------------------------------------------- +class TestPrecomputed: + def test_precomputed_sensitivity_is_used_verbatim(self, multi_period_fit): + fit, _ = multi_period_fit + + # Construct a minimal SensitivityResults-shaped object the formatter recognizes. + class _FakeSens: + M_values = np.array([0.5, 1.0]) + bounds = [(0.1, 2.0), (-0.2, 2.5)] + robust_cis = [(0.05, 2.1), (-0.3, 2.6)] + breakdown_M = 0.75 + method = "relative_magnitude" + original_estimate = 1.0 + original_se = 0.2 + alpha = 0.05 + + fake = _FakeSens() + with patch("diff_diff.honest_did.HonestDiD.sensitivity_analysis") as mock: + dr = DiagnosticReport(fit, precomputed={"sensitivity": fake}) + dr.to_dict() + mock.assert_not_called() + schema = dr.to_dict() + assert schema["sensitivity"]["status"] == "ran" + assert schema["sensitivity"]["breakdown_M"] == 0.75 + + +# --------------------------------------------------------------------------- +# Verdict / tier helpers +# --------------------------------------------------------------------------- +class TestVerdictsAndTiers: + def test_pt_verdict_three_bins(self): + assert _pt_verdict(0.001) == "clear_violation" + assert _pt_verdict(0.049) == "clear_violation" + assert _pt_verdict(0.10) == "some_evidence_against" + assert _pt_verdict(0.29) == "some_evidence_against" + assert _pt_verdict(0.30) == "no_detected_violation" + assert _pt_verdict(0.99) == 
"no_detected_violation" + assert _pt_verdict(None) == "inconclusive" + assert _pt_verdict(float("nan")) == "inconclusive" + + def test_power_tier_three_bins_plus_unknown(self): + assert _power_tier(0.1) == "well_powered" + assert _power_tier(0.24) == "well_powered" + assert _power_tier(0.25) == "moderately_powered" + assert _power_tier(0.99) == "moderately_powered" + assert _power_tier(1.0) == "underpowered" + assert _power_tier(5.0) == "underpowered" + assert _power_tier(None) == "unknown" + assert _power_tier(float("nan")) == "unknown" + + +# --------------------------------------------------------------------------- +# EfficientDiD hausman pathway +# --------------------------------------------------------------------------- +class TestEfficientDiDHausman: + def test_hausman_pretest_runs_with_data_kwargs(self, edid_fit): + fit, sdf = edid_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert pt["method"] == "hausman" + + def test_hausman_skipped_without_data_kwargs(self, edid_fit): + fit, _ = edid_fit + dr = DiagnosticReport(fit) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "skipped" + assert pt["method"] == "hausman" + + +# --------------------------------------------------------------------------- +# SDiD native +# --------------------------------------------------------------------------- +class TestSDiDNative: + def test_sdid_pt_uses_synthetic_fit_method(self, sdid_fit): + fit, _ = sdid_fit + pt = DiagnosticReport(fit).to_dict()["parallel_trends"] + assert pt["method"] == "synthetic_fit" + assert pt["verdict"] == "design_enforced_pt" + assert isinstance(pt.get("pre_treatment_fit_rmse"), float) + + def test_sdid_native_section_populated(self, sdid_fit): + fit, _ = sdid_fit + native = DiagnosticReport(fit).to_dict()["estimator_native_diagnostics"] + assert native["status"] == "ran" + 
assert native["estimator"] == "SyntheticDiD" + assert "weight_concentration" in native + assert "in_time_placebo" in native + assert "zeta_sensitivity" in native + + def test_sdid_does_not_call_honest_did(self, sdid_fit): + """HonestDiD sensitivity should NOT run on SDiD (native path used instead).""" + fit, _ = sdid_fit + with patch("diff_diff.honest_did.HonestDiD.sensitivity_analysis") as mock: + DiagnosticReport(fit).to_dict() + mock.assert_not_called() + + +# --------------------------------------------------------------------------- +# Error handling +# --------------------------------------------------------------------------- +class TestErrorHandling: + def test_sensitivity_error_does_not_break_report(self, multi_period_fit): + """A failing diagnostic records its error in the section; the report still renders.""" + fit, _ = multi_period_fit + + def _raise(*args, **kwargs): + raise RuntimeError("synthetic test failure") + + with patch("diff_diff.honest_did.HonestDiD.sensitivity_analysis", side_effect=_raise): + dr = DiagnosticReport(fit) + schema = dr.to_dict() + sens = schema["sensitivity"] + assert sens["status"] == "error" + assert "synthetic test failure" in sens["reason"] + # Other sections still ran. 
+ assert schema["parallel_trends"]["status"] == "ran" + + +# --------------------------------------------------------------------------- +# Overall prose +# --------------------------------------------------------------------------- +class TestOverallInterpretation: + def test_overall_interpretation_nonempty_for_fit(self, cs_fit): + fit, sdf = cs_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + prose = dr.summary() + assert isinstance(prose, str) + assert len(prose) > 50 # a real paragraph + + def test_full_report_has_headers(self, cs_fit): + fit, sdf = cs_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + md = dr.full_report() + assert "# Diagnostic Report" in md + assert "## Overall Interpretation" in md + assert "## Parallel trends" in md + assert "## HonestDiD sensitivity" in md + + +# --------------------------------------------------------------------------- +# Public result class +# --------------------------------------------------------------------------- +class TestDiagnosticReportResults: + def test_run_all_returns_dataclass(self, multi_period_fit): + fit, _ = multi_period_fit + dr = DiagnosticReport(fit) + results = dr.run_all() + assert isinstance(results, DiagnosticReportResults) + assert isinstance(results.applicable_checks, tuple) + assert isinstance(results.schema, dict) + + def test_run_all_is_idempotent(self, multi_period_fit): + fit, _ = multi_period_fit + dr = DiagnosticReport(fit) + a = dr.run_all() + b = dr.run_all() + assert a is b # cached + + +# --------------------------------------------------------------------------- +# Public API exposure +# --------------------------------------------------------------------------- +def test_public_api_exports(): + for name in ("DiagnosticReport", "DiagnosticReportResults", "DIAGNOSTIC_REPORT_SCHEMA_VERSION"): + assert hasattr(dd, 
name), f"diff_diff must export {name}" From 321218779661cfd8db67f7a8830809f1cf3f1c4d Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 15:16:40 -0400 Subject: [PATCH 02/48] Address review findings: Wald-path tests, outcome_direction, warning provenance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follows up on review findings on the prior commit: - **P1 Wald test coverage** — add targeted tests for ``_pt_event_study`` (``TestJointWaldAlignment``): * joint_wald runs when pre-period keys align with ``interaction_indices`` * computed chi-squared statistic matches a closed-form expectation * Bonferroni fallback when ``interaction_indices`` is missing * Bonferroni fallback when the key namespace is misaligned * Bonferroni fallback when ``vcov`` is missing Also document the alignment contract and fallback rule inline near the Wald codepath so the invariant is discoverable without reading tests. - **P2 outcome_direction** — implement direction-aware verbs in the headline sentence via ``_direction_verb``: * ``higher_is_better`` + positive effect -> "lifted" * ``higher_is_better`` + negative effect -> "reduced" * ``lower_is_better`` + positive effect -> "worsened" * ``lower_is_better`` + negative effect -> "improved" * ``None`` -> neutral "increased" / "decreased" Covered by ``TestOutcomeDirection`` with three scenarios. - **P2 warning provenance** — populate top-level ``schema["warnings"]`` from every section that ended in ``status="error"`` so agents do not have to scan each section dict to discover diagnostic failures. ``DiagnosticReportResults.warnings`` now mirrors the top-level list. Covered by ``TestWarningsPassthrough``. - **P2 string dispatch** — add an inline note above ``_APPLICABILITY`` explaining the ``type(results).__name__`` convention (mirrors ``practitioner._HANDLERS`` to avoid circular imports) and pointing at the applicability-matrix test as the regression guard. 
No behavioral changes outside the review items; existing tests remain unchanged. 121 tests pass, black / ruff / mypy clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 26 +++++++- diff_diff/diagnostic_report.py | 31 ++++++++- tests/test_business_report.py | 63 ++++++++++++++++++ tests/test_diagnostic_report.py | 113 ++++++++++++++++++++++++++++++++ 4 files changed, 230 insertions(+), 3 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index bdd072f8..973eed25 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -863,18 +863,42 @@ def _significance_phrase(p: Optional[float], alpha: float) -> str: return "the confidence interval includes zero; the data are consistent with no effect" +def _direction_verb(effect: float, outcome_direction: Optional[str]) -> str: + """Return a direction-aware verb for the headline sentence. + + When ``outcome_direction`` is unset we use neutral change verbs + (``increased`` / ``decreased``). When it is supplied, we additionally + flavor the verb with a value-laden connotation so the stakeholder can + read off whether the estimated effect points in the desired direction: + + - ``higher_is_better``: positive effect -> "lifted"; negative -> "reduced" + - ``lower_is_better``: positive effect -> "worsened"; negative -> "improved" + - None: positive -> "increased"; negative -> "decreased" + """ + if effect == 0: + return "did not change" + if outcome_direction == "higher_is_better": + return "lifted" if effect > 0 else "reduced" + if outcome_direction == "lower_is_better": + return "worsened" if effect > 0 else "improved" + return "increased" if effect > 0 else "decreased" + + def _render_headline_sentence(schema: Dict[str, Any]) -> str: """Render the headline sentence from the schema. Uses the absolute value in the magnitude slot when the verb already conveys direction ("decreased ... by $0.14" rather than "decreased ... by -$0.14"). 
CI bounds are rendered at their natural signed values. + When ``outcome_direction`` is supplied, the verb picks up a value-laden + connotation ("lifted" / "reduced" vs neutral "increased" / "decreased"). """ ctx = schema.get("context", {}) h = schema.get("headline", {}) effect = h.get("effect") outcome = ctx.get("outcome_label", "the outcome") treatment = ctx.get("treatment_label", "the treatment") + outcome_direction = ctx.get("outcome_direction") unit = h.get("unit") unit_kind = h.get("unit_kind", "unknown") @@ -884,7 +908,7 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str: f"effect on {outcome}. Inspect the data and model specification." ) - verb = "increased" if effect > 0 else "decreased" if effect < 0 else "did not change" + verb = _direction_verb(effect, outcome_direction) magnitude = _format_value(abs(effect), unit, unit_kind) lo = h.get("ci_lower") hi = h.get("ci_upper") diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 4ca24c79..0577ed50 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -61,6 +61,14 @@ # required attributes are present (e.g. ``survey_metadata`` for DEFF) and by # whether the user disabled a check via ``run_*=False``. # See ``docs/methodology/REPORTING.md`` for the full matrix and rationale. +# +# Implementation note: The keys are result-class names looked up via +# ``type(results).__name__``. This string-based dispatch mirrors the +# ``_HANDLERS`` pattern in ``diff_diff/practitioner.py`` and avoids circular +# imports across the 16 result modules. Renaming or aliasing any result class +# requires updating both this table and ``_PT_METHOD`` below; the +# applicability-matrix test parametrized over all result types serves as the +# regression guard. 
_APPLICABILITY: Dict[str, FrozenSet[str]] = { "DiDResults": frozenset({"parallel_trends", "design_effect"}), "MultiPeriodDiDResults": frozenset( @@ -544,6 +552,18 @@ def _execute(self) -> DiagnosticReportResults: # Pull suggested next steps from the practitioner workflow. next_steps = self._collect_next_steps(applicable) + # Populate schema-level warnings for every section that ended in "error", + # so users and agents do not have to scan each section dict to discover + # that a diagnostic failed. Preserves provenance per the "no silent + # failures" convention. + top_warnings: List[str] = [] + for check in _CHECK_NAMES: + section_key = "estimator_native" if check == "estimator_native" else check + section = sections.get(section_key, {}) + if section.get("status") == "error": + reason = section.get("reason") or "diagnostic raised an exception" + top_warnings.append(f"{check}: {reason}") + schema: Dict[str, Any] = { "schema_version": DIAGNOSTIC_REPORT_SCHEMA_VERSION, "estimator": type(self._results).__name__, @@ -558,7 +578,7 @@ def _execute(self) -> DiagnosticReportResults: "epv": sections["epv"], "estimator_native_diagnostics": sections["estimator_native"], "skipped": {k: v for k, v in skipped.items()}, - "warnings": [], + "warnings": top_warnings, "overall_interpretation": "", "next_steps": next_steps, } @@ -570,7 +590,7 @@ def _execute(self) -> DiagnosticReportResults: interpretation=interpretation, applicable_checks=tuple(sorted(applicable)), skipped_checks=skipped, - warnings=(), + warnings=tuple(top_warnings), ) def _context_labels(self) -> Dict[str, str]: @@ -708,6 +728,13 @@ def _pt_event_study(self) -> Dict[str, Any]: test_statistic: Optional[float] = None df = len(pre_coefs) method = "bonferroni" + # Joint-Wald pathway is taken only when EVERY pre-period key is present + # in ``interaction_indices`` (required len == df guard below). 
This + # protects against estimators whose event-study keys use a different + # namespace than the vcov indexing: if any key is missing, we fall back + # to Bonferroni rather than risk indexing into the wrong vcov rows. + # The schema's ``method`` field exposes which path ran so agents and + # tests can distinguish the two unambiguously. if vcov is not None and interaction_indices is not None and df > 0: try: keys_in_vcov = [k for (k, _, _, _) in pre_coefs if k in interaction_indices] diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 67ba2819..c78b57e3 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -245,6 +245,69 @@ def test_log_points_emits_unit_policy_caveat(self, cs_fit): # --------------------------------------------------------------------------- # Significance phrasing # --------------------------------------------------------------------------- +class TestOutcomeDirection: + """outcome_direction selects value-laden vs neutral verbs.""" + + def test_higher_is_better_positive_effect_uses_lifted(self, cs_fit): + fit, _ = cs_fit + br = BusinessReport( + fit, + outcome_label="sales", + outcome_unit="$", + outcome_direction="higher_is_better", + treatment_label="the policy", + auto_diagnostics=False, + ) + headline = br.headline() + assert "lifted" in headline + assert "increased" not in headline + + def test_lower_is_better_positive_effect_uses_worsened(self, cs_fit): + fit, _ = cs_fit # CS has a positive effect on this seed + br = BusinessReport( + fit, + outcome_label="churn", + outcome_unit="%", + outcome_direction="lower_is_better", + treatment_label="the change", + auto_diagnostics=False, + ) + headline = br.headline() + assert "worsened" in headline + + def test_direction_none_uses_neutral_verb(self, cs_fit): + fit, _ = cs_fit + br = BusinessReport( + fit, + outcome_label="sales", + outcome_unit="$", + auto_diagnostics=False, + ) + headline = br.headline() + assert "increased" in headline + 
assert "lifted" not in headline + + +class TestWarningsPassthrough: + """Broad exception handling still records provenance in schema.warnings.""" + + def test_diagnostic_error_surfaces_as_top_level_warning(self, event_study_fit): + fit, _ = event_study_fit + + def _raise(*args, **kwargs): + raise RuntimeError("synthetic test failure") + + with patch("diff_diff.honest_did.HonestDiD.sensitivity_analysis", side_effect=_raise): + br = BusinessReport(fit, auto_diagnostics=True) + schema = br.to_dict() + inner = schema["diagnostics"]["schema"] + # The error is recorded at the section level... + assert inner["sensitivity"]["status"] == "error" + # ...AND surfaced at the top level for quick scanning. + assert any("sensitivity:" in w for w in inner["warnings"]) + assert any("synthetic test failure" in w for w in inner["warnings"]) + + class TestSignificancePhrasing: def test_high_significance_produces_strong_language(self, cs_fit): """CS on this seed has p ~ 1e-56 (very strong) -> 'strongly supported'.""" diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index da00d3ac..9e0bbac0 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -302,6 +302,119 @@ class _FakeSens: # --------------------------------------------------------------------------- # Verdict / tier helpers # --------------------------------------------------------------------------- +class TestJointWaldAlignment: + """Cover the event-study PT joint-Wald vs Bonferroni fallback paths. + + These tests address the correctness-sensitive codepath in + ``_pt_event_study`` where pre-period coefficient keys must align with + ``interaction_indices`` before the joint Wald statistic can be indexed + into the right vcov rows/columns. When alignment fails, the code must + fall back to Bonferroni rather than compute a Wald statistic on the + wrong rows. 
+ """ + + @staticmethod + def _stub_result(pre_effects, interaction_indices, vcov, **extra): + """Build a minimal MultiPeriodDiDResults-shaped stub for PT tests. + + ``pre_effects`` is an iterable of ``(period_key, effect, se, p_value)`` + tuples. Returns an object whose class name is ``MultiPeriodDiDResults`` + so DR's name-keyed dispatch routes it to the event-study PT path. + """ + from types import SimpleNamespace + + pre_map = { + k: SimpleNamespace(effect=eff, se=se, p_value=p) for (k, eff, se, p) in pre_effects + } + + class MultiPeriodDiDResults: # noqa: D401 — test stub that mimics the real class name + pass + + obj = MultiPeriodDiDResults() + obj.pre_period_effects = pre_map + obj.interaction_indices = interaction_indices + obj.vcov = np.asarray(vcov, dtype=float) if vcov is not None else None + obj.avg_att = 1.0 + obj.avg_se = 0.1 + obj.avg_p_value = 0.001 + obj.avg_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 50 + obj.n_control = 50 + obj.survey_metadata = None + for k, v in extra.items(): + setattr(obj, k, v) + return obj + + def test_joint_wald_runs_when_keys_align(self): + """With aligned pre_effects + interaction_indices + vcov, Wald runs + and the computed chi-squared statistic matches the closed form.""" + pre = [(-3, 0.0, 0.5, 0.99), (-2, 0.0, 0.5, 0.99), (-1, 0.0, 0.5, 0.99)] + interaction_indices = {-3: 0, -2: 1, -1: 2, 0: 3} # maps period -> vcov row + vcov = np.diag([0.25, 0.25, 0.25, 0.25]) # SE = 0.5 for each pre-period + stub = self._stub_result(pre, interaction_indices, vcov) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert ( + pt["method"] == "joint_wald" + ), f"Expected joint_wald with aligned keys; got {pt.get('method')}" + # beta=0 across all periods -> test_statistic = 0 -> p = 1.0 + assert pt["test_statistic"] == pytest.approx(0.0) + assert pt["joint_p_value"] == pytest.approx(1.0) + assert pt["df"] == 
3 + + def test_joint_wald_computes_expected_statistic(self): + """Verify the Wald statistic matches a known closed-form value.""" + # beta = [1.0, -0.5, 0.2]; vcov diagonal with variances [0.25, 0.25, 0.16] + # -> test_statistic = 1.0^2/0.25 + 0.5^2/0.25 + 0.2^2/0.16 + # = 4.0 + 1.0 + 0.25 = 5.25 + pre = [(-3, 1.0, 0.5, 0.04), (-2, -0.5, 0.5, 0.30), (-1, 0.2, 0.4, 0.61)] + interaction_indices = {-3: 0, -2: 1, -1: 2} + vcov = np.diag([0.25, 0.25, 0.16]) + stub = self._stub_result(pre, interaction_indices, vcov) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + assert pt["method"] == "joint_wald" + assert pt["test_statistic"] == pytest.approx(5.25, rel=1e-6) + + def test_falls_back_to_bonferroni_without_interaction_indices(self): + pre = [(-2, 1.0, 0.5, 0.04), (-1, 0.2, 0.5, 0.69)] + stub = self._stub_result(pre, interaction_indices=None, vcov=np.diag([0.25, 0.25])) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert pt["method"] == "bonferroni", ( + "Missing interaction_indices must force Bonferroni fallback, " + "never attempt a Wald statistic on misaligned rows." 
+ ) + # Bonferroni: min(per-period p) * n = 0.04 * 2 = 0.08 (< 1) + assert pt["joint_p_value"] == pytest.approx(0.08, rel=1e-6) + + def test_falls_back_to_bonferroni_when_keys_misaligned(self): + """pre_effects has keys [-2, -1] but interaction_indices uses [2019, 2020].""" + pre = [(-2, 1.0, 0.5, 0.04), (-1, 0.2, 0.5, 0.69)] + interaction_indices = {2019: 0, 2020: 1} # deliberately different namespace + vcov = np.diag([0.25, 0.25]) + stub = self._stub_result(pre, interaction_indices, vcov) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert pt["method"] == "bonferroni", ( + "Misaligned interaction_indices must force Bonferroni fallback — " + "the len(keys_in_vcov) == df guard should prevent the Wald path." + ) + + def test_falls_back_to_bonferroni_when_vcov_missing(self): + pre = [(-2, 1.0, 0.5, 0.04), (-1, 0.2, 0.5, 0.69)] + interaction_indices = {-2: 0, -1: 1} + stub = self._stub_result(pre, interaction_indices, vcov=None) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + assert pt["method"] == "bonferroni" + + class TestVerdictsAndTiers: def test_pt_verdict_three_bins(self): assert _pt_verdict(0.001) == "clear_violation" From 75026768f63459fb771fa738c33ad4b526bde853 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 15:31:47 -0400 Subject: [PATCH 03/48] Tighten README: frame BR/DR as experimental preview Replace the "Stakeholder-ready report from any fit" subsection framing with "Experimental preview: BusinessReport and DiagnosticReport" and reword the introductory paragraph to emphasize that wording, verdict thresholds, and schema shape will change. Drop the expected-output comment from the example (the prose will evolve) and invite feedback. 
This matches the foundation-not-shipped-feature posture: the schema and narrative prototype are worth validating in isolation, but the library still lacks several items a methodologically-rigorous practitioner (covariate comparison, event-study plot embedding, 2x2 placebo battery, real-dataset validation, target-parameter clarity, tutorial integration) would expect. Keeping external framing conservative until those gaps close. No functional changes; only README prose. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index b9daf68a..fa04d2a7 100644 --- a/README.md +++ b/README.md @@ -92,9 +92,9 @@ Measuring campaign lift? Evaluating a product launch? diff-diff handles the caus - **[Brand awareness survey tutorial](docs/tutorials/17_brand_awareness_survey.ipynb)** - Full example with complex survey design, brand funnel analysis, and staggered rollouts - **Have BRFSS/ACS/CPS individual records?** Use [`aggregate_survey()`](docs/api/prep.rst) to roll respondent-level microdata into a geographic-period panel with inverse-variance precision weights. The returned second-stage design uses analytic weights (`aweight`), so it works directly with `DifferenceInDifferences`, `TwoWayFixedEffects`, `MultiPeriodDiD`, `SunAbraham`, `ContinuousDiD`, and `EfficientDiD` (estimators marked **Full** in the [survey support matrix](docs/choosing_estimator.rst)) -### Stakeholder-ready report from any fit +### Experimental preview: `BusinessReport` and `DiagnosticReport` -Wrap any fitted result in `BusinessReport` for a plain-English stakeholder summary; pair with `DiagnosticReport` for a validity check: +diff-diff ships two preview classes, `BusinessReport` and `DiagnosticReport`, that produce plain-English output and a structured `to_dict()` schema from any fitted result. 
**Both are experimental in this release** — wording, verdict thresholds, and schema shape will change as the library learns from real practitioner usage. Do not anchor downstream tooling on the schema yet; the experimental flag is noted in the CHANGELOG. ```python from diff_diff import CallawaySantAnna, BusinessReport @@ -111,15 +111,9 @@ report = BusinessReport( treatment_label="the loyalty program", ) print(report.summary()) -# "The loyalty program increased Revenue per store by $1.78 (95% CI: $1.56 to $2.00). -# Statistically, the direction of the effect is strongly supported by the data. -# Pre-treatment data do not reject parallel trends, but the test has limited -# power — a non-rejection does not prove the assumption. See the sensitivity -# analysis below for a more reliable signal. -# Sample: 600 observations (70 treated, 30 control)." ``` -`BusinessReport` auto-constructs a `DiagnosticReport` by default so the summary mentions pre-trends, robustness, and design-effect findings in one call. `.to_dict()` returns the same content as a stable AI-legible schema (single source of truth; prose is rendered from the dict). See [docs/methodology/REPORTING.md](docs/methodology/REPORTING.md) for the phrasing rules, verdict thresholds, and schema stability policy. **Schema is experimental in this release.** +`BusinessReport` auto-constructs a `DiagnosticReport` so the summary mentions pre-trends, sensitivity, and design-effect findings in one call. Methodology (phrasing rules, verdict thresholds, schema stability) is documented in [docs/methodology/REPORTING.md](docs/methodology/REPORTING.md). Feedback on wording, applicability, and missing diagnostics is welcome — this is the part of the library most likely to evolve in the next few releases. Already know DiD? The [academic quickstart](docs/quickstart.rst) and [estimator guide](docs/choosing_estimator.rst) cover the full technical details. 
From 2577d6c7e6e88a716fc4e4979572f4677f301637 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 16:09:22 -0400 Subject: [PATCH 04/48] Address CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the five issues the CI reviewer flagged against the initial BusinessReport / DiagnosticReport PR: P0 — Single-M HonestDiDResults passthrough was being narrated as "robust across the full grid" because both renderers checked ``breakdown_M is None`` and fell through to the grid-wide phrasing. Preserves the ``conclusion="single_M_precomputed"`` state through both BR._render_summary and DR._render_overall_interpretation; the point check is now rendered as "at M={M}, the robust CI (excludes|includes) zero — run HonestDiD.sensitivity() across a grid for a breakdown value." Regression tests in ``TestSingleMSensitivityPrecomputed`` cover both DR.summary() and BR(honest_did_results=...).summary(). P0 — EPV diagnostics were silently reporting 0 low cells and min_epv=None on every fit because _check_epv() expected ``low_epv_cells`` / ``min_epv`` attributes but the library's ``epv_diagnostics`` convention is a dict keyed by cell identifier with per-cell ``{"is_low": bool, "epv": float}`` entries. Rewrites _check_epv() to handle the dict shape, counts low cells via ``v.get("is_low")``, derives min_epv from the ``epv`` values, and reads ``results.epv_threshold`` instead of hardcoding 10. Legacy object-shape fallback retained for custom subclasses. Regression tests in ``TestEPVDictBacked`` cover low-cell detection, no-low clean case, and the configurable threshold. P1 — CallawaySantAnnaResults sensitivity + pretrends_power were skipped entirely because the applicability gate required ``results.vcov``, but CS exposes ``event_study_vcov`` / ``event_study_vcov_index`` alongside a populated ``event_study_effects`` surface. 
``HonestDiD.sensitivity_analysis`` and ``compute_pretrends_power`` already handle CS via those attributes, so the gate now accepts any of the three covariance sources. Also honors precomputed overrides regardless of gate. Regression tests in ``TestCSEventStudyVCovSupport`` confirm both checks are applicable on an aggregated CS fit. P1 — _pt_event_study() was forcing Bonferroni on CS even though event_study_vcov + event_study_vcov_index were available. Added a second covariance source branch that builds an index map from ``event_study_vcov_index`` and reports ``method="joint_wald_event_study"`` on the Wald path. Regression test in ``TestCSJointWaldViaEventStudyVCov`` verifies the closed-form chi-squared statistic on a CS stub with known coefficients and diagonal vcov. P1 — ContinuousDiDResults headline extraction was broken: BR and DR both assumed ``overall_se`` / ``overall_p_value`` / ``overall_conf_int`` but ContinuousDiDResults stores them as ``overall_att_se`` / ``overall_att_p_value`` / ``overall_att_conf_int``. Centralized the attribute-alias lookup in a new module-level ``_extract_scalar_headline`` helper that both report classes call; the helper accepts multiple SE / p / CI alias names per point-estimate row and tries them in order. Regression test in ``TestContinuousDiDHeadline`` confirms the helper resolves the ``overall_att_*`` aliases. P2 — Headline extraction was duplicated in BR and DR. Eliminated by the shared helper above. 131 targeted tests pass (BR + DR + guides + practitioner); black, ruff, and mypy clean on the new modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 78 +++++---- diff_diff/diagnostic_report.py | 269 ++++++++++++++++++++++++++------ tests/test_diagnostic_report.py | 229 +++++++++++++++++++++++++++ 3 files changed, 482 insertions(+), 94 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 973eed25..67174fab 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -313,51 +313,21 @@ def _build_schema(self) -> Dict[str, Any]: def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, Any]: """Extract the headline effect + CI + p-value from the result.""" r = self._results + # Delegate the attribute-alias lookup to the shared helper in the + # diagnostic_report module so BR and DR agree on which fields a + # result class exposes for its headline (including + # ``ContinuousDiDResults`` which uses ``overall_att_se`` / + # ``overall_att_p_value`` / ``overall_att_conf_int``). + from diff_diff.diagnostic_report import _extract_scalar_headline + + extracted = _extract_scalar_headline(r, fallback_alpha=self._context.alpha) att: Optional[float] = None se: Optional[float] = None p: Optional[float] = None ci: Optional[List[float]] = None alpha = self._context.alpha - - for name in ("overall_att", "avg_att", "att"): - val = getattr(r, name, None) - if val is None: - continue - att = _safe_float(val) - se = _safe_float( - getattr( - r, - { - "overall_att": "overall_se", - "avg_att": "avg_se", - "att": "se", - }[name], - None, - ) - ) - p = _safe_float( - getattr( - r, - { - "overall_att": "overall_p_value", - "avg_att": "avg_p_value", - "att": "p_value", - }[name], - None, - ) - ) - ci = _safe_ci( - getattr( - r, - { - "overall_att": "overall_conf_int", - "avg_att": "avg_conf_int", - "att": "conf_int", - }[name], - None, - ) - ) - break + if extracted is not None: + _name, att, se, p, ci, _alpha = extracted unit = self._context.outcome_unit unit_kind = 
_UNIT_KINDS.get(unit.lower() if unit else "", "unknown") @@ -992,11 +962,35 @@ def _render_summary(schema: Dict[str, Any]) -> str: "trends analogue)." ) - # Sensitivity. + # Sensitivity. A ``single_M_precomputed`` sensitivity block has + # ``breakdown_M=None`` by construction because only one M was evaluated; + # narrate it as a point check, NOT as grid-wide robustness. sens = schema.get("sensitivity", {}) or {} if sens.get("status") == "computed": bkd = sens.get("breakdown_M") - if bkd is None: + conclusion = sens.get("conclusion") + if conclusion == "single_M_precomputed": + grid_points = sens.get("grid") or [] + point = grid_points[0] if grid_points else {} + m_val = point.get("M") + robust = point.get("robust_to_zero") + if isinstance(m_val, (int, float)): + if robust: + sentences.append( + f"HonestDiD (single point checked): at M = {m_val:.2g}, " + f"the robust confidence interval excludes zero. This is " + f"a point check, not a breakdown analysis — run " + f"HonestDiD.sensitivity() across a grid of M values " + f"for a full robustness claim." + ) + else: + sentences.append( + f"HonestDiD (single point checked): at M = {m_val:.2g}, " + f"the robust confidence interval includes zero. Run " + f"HonestDiD.sensitivity() across a grid to find the " + f"breakdown value." + ) + elif bkd is None: sentences.append( "HonestDiD: the result remains significant across the " "full grid — robust to plausible parallel-trends violations." diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 0577ed50..3be3b766 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -449,8 +449,22 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: # vcov is optional for the Bonferroni fallback. return None if check == "pretrends_power": - if getattr(r, "vcov", None) is None: - return "Pre-trends power requires results.vcov; not available." 
+ # ``compute_pretrends_power`` handles CS / SA / ImputationDiD + # event-study results by reading ``event_study_effects`` + # directly, so we accept either a top-level ``vcov`` OR a + # populated event-study surface. Precomputed overrides also + # bypass this gate. + if "pretrends_power" in self._precomputed: + return None + has_vcov = getattr(r, "vcov", None) is not None + has_event_vcov = getattr(r, "event_study_vcov", None) is not None + has_event_es = getattr(r, "event_study_effects", None) is not None + if not (has_vcov or has_event_vcov or has_event_es): + return ( + "Pre-trends power needs either results.vcov or " + "event_study_effects (from aggregate='event_study' on " + "staggered estimators); neither available." + ) pre_coefs = _collect_pre_period_coefs(r) if len(pre_coefs) < 2: return "Pre-trends power needs >= 2 pre-treatment periods." @@ -459,9 +473,22 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: # Native SDiD/TROP paths substitute for HonestDiD. if name in {"SyntheticDiDResults", "TROPResults"}: return None - # Standard HonestDiD path. - if getattr(r, "vcov", None) is None: - return "HonestDiD requires results.vcov for the pre-period coefficients." + # Precomputed sensitivity always unlocks this check. + if "sensitivity" in self._precomputed: + return None + # ``HonestDiD.sensitivity_analysis`` handles CS / SA / + # ImputationDiD internally via ``event_study_effects`` + + # ``event_study_vcov`` (or per-SE diagonal fallback), so we + # accept any of: top-level vcov, event_study_vcov, or a + # populated event_study_effects surface. + has_vcov = getattr(r, "vcov", None) is not None + has_event_vcov = getattr(r, "event_study_vcov", None) is not None + has_event_es = getattr(r, "event_study_effects", None) is not None + if not (has_vcov or has_event_vcov or has_event_es): + return ( + "HonestDiD needs either results.vcov, event_study_vcov, " + "or event_study_effects; none available." 
+ ) pre_coefs = _collect_pre_period_coefs(r) if len(pre_coefs) < 1: return "HonestDiD requires at least one pre-period coefficient." @@ -729,26 +756,52 @@ def _pt_event_study(self) -> Dict[str, Any]: df = len(pre_coefs) method = "bonferroni" # Joint-Wald pathway is taken only when EVERY pre-period key is present - # in ``interaction_indices`` (required len == df guard below). This + # in the relevant index mapping (required len == df guard below). This # protects against estimators whose event-study keys use a different # namespace than the vcov indexing: if any key is missing, we fall back # to Bonferroni rather than risk indexing into the wrong vcov rows. # The schema's ``method`` field exposes which path ran so agents and # tests can distinguish the two unambiguously. - if vcov is not None and interaction_indices is not None and df > 0: + # + # Two covariance sources are supported: + # 1. ``interaction_indices`` + ``vcov`` — the MultiPeriodDiDResults + # convention, where ``vcov`` is the full regression covariance + # matrix and ``interaction_indices`` maps period labels to rows. + # 2. ``event_study_vcov_index`` + ``event_study_vcov`` — the + # CallawaySantAnnaResults convention, where the event-study + # covariance is stored separately from the full regression vcov. + vcov_for_wald: Optional[Any] = None + idx_map_for_wald: Optional[Any] = None + vcov_method_tag = "joint_wald" + if vcov is not None and interaction_indices is not None: + vcov_for_wald = vcov + idx_map_for_wald = interaction_indices + else: + es_vcov = getattr(r, "event_study_vcov", None) + es_vcov_index = getattr(r, "event_study_vcov_index", None) + if es_vcov is not None and es_vcov_index is not None: + vcov_for_wald = es_vcov + # ``event_study_vcov_index`` is an ordered list of relative-time + # keys; convert it into a dict mapping key -> position. 
+ try: + idx_map_for_wald = {k: i for i, k in enumerate(es_vcov_index)} + vcov_method_tag = "joint_wald_event_study" + except TypeError: + idx_map_for_wald = None + if vcov_for_wald is not None and idx_map_for_wald is not None and df > 0: try: - keys_in_vcov = [k for (k, _, _, _) in pre_coefs if k in interaction_indices] + keys_in_vcov = [k for (k, _, _, _) in pre_coefs if k in idx_map_for_wald] if len(keys_in_vcov) == df: - idx = [interaction_indices[k] for k in keys_in_vcov] + idx = [idx_map_for_wald[k] for k in keys_in_vcov] beta_map = {k: eff for (k, eff, _, _) in pre_coefs} beta = np.array([beta_map[k] for k in keys_in_vcov], dtype=float) - v_sub = np.asarray(vcov)[np.ix_(idx, idx)] + v_sub = np.asarray(vcov_for_wald)[np.ix_(idx, idx)] stat = float(beta @ np.linalg.solve(v_sub, beta)) from scipy.stats import chi2 joint_p = float(1.0 - chi2.cdf(stat, df=df)) test_statistic = stat - method = "joint_wald" + method = vcov_method_tag except Exception: # noqa: BLE001 joint_p = None test_statistic = None @@ -1085,22 +1138,56 @@ def _check_heterogeneity(self) -> Dict[str, Any]: } def _check_epv(self) -> Dict[str, Any]: - """Read EPV diagnostics from ``results.epv_diagnostics``.""" - epv = getattr(self._results, "epv_diagnostics", None) + """Read EPV diagnostics from ``results.epv_diagnostics``. + + The diff-diff convention (see ``diff_diff/staggered.py`` around the + low-EPV summary warning) is that ``epv_diagnostics`` is a dict keyed + by cell identifier (e.g. ``(g, t)`` for staggered) whose values are + per-cell dicts with ``is_low`` (bool) and ``epv`` (float). The + threshold lives on ``results.epv_threshold`` (default 10) rather + than being hardcoded. 
+ """ + r = self._results + epv = getattr(r, "epv_diagnostics", None) if epv is None: return { "status": "skipped", - "reason": "Estimator did not produce results.epv_diagnostics for " "this fit.", + "reason": "Estimator did not produce results.epv_diagnostics for this fit.", } - threshold = 10 - low_cells = getattr(epv, "low_epv_cells", None) or [] - min_epv = _to_python_float(getattr(epv, "min_epv", None)) + threshold = _to_python_float(getattr(r, "epv_threshold", 10)) or 10.0 + + if isinstance(epv, dict): + low_cells = [k for k, v in epv.items() if isinstance(v, dict) and v.get("is_low")] + epv_floats: List[float] = [] + for v in epv.values(): + if not isinstance(v, dict): + continue + raw = v.get("epv") + if raw is None: + continue + converted = _to_python_float(raw) + if converted is not None: + epv_floats.append(converted) + min_epv: Optional[float] = min(epv_floats) if epv_floats else None + return { + "status": "ran", + "threshold": threshold, + "n_cells_low": len(low_cells), + "n_cells_total": len(epv), + "min_epv": min_epv, + "affected_cohorts": [_to_python_scalar(c) for c in low_cells], + } + + # Legacy object-shaped fallback (not currently emitted by the library + # but kept so custom subclasses that mirror the old shape still work). 
+ low_cells_attr = getattr(epv, "low_epv_cells", None) or [] return { "status": "ran", "threshold": threshold, - "n_cells_low": int(len(low_cells)), - "min_epv": min_epv, - "affected_cohorts": [_to_python_scalar(c) for c in low_cells], + "n_cells_low": int(len(low_cells_attr)), + "n_cells_total": _to_python_scalar(getattr(epv, "n_cells_total", None)), + "min_epv": _to_python_float(getattr(epv, "min_epv", None)), + "affected_cohorts": [_to_python_scalar(c) for c in low_cells_attr], } def _check_estimator_native(self) -> Dict[str, Any]: @@ -1391,41 +1478,96 @@ def _format_precomputed_pt(self, obj: Any) -> Dict[str, Any]: def _extract_headline_metric(self) -> Optional[Dict[str, Any]]: """Best-effort extraction of the scalar headline metric from the result.""" - r = self._results - # Try the usual attribute names in priority order. - for name in ("overall_att", "avg_att", "att"): - val = getattr(r, name, None) - if val is None: - continue - se_name = { - "overall_att": "overall_se", - "avg_att": "avg_se", - "att": "se", - }[name] - p_name = { - "overall_att": "overall_p_value", - "avg_att": "avg_p_value", - "att": "p_value", - }[name] - ci_name = { - "overall_att": "overall_conf_int", - "avg_att": "avg_conf_int", - "att": "conf_int", - }[name] - return { - "name": name, - "value": _to_python_float(val), - "se": _to_python_float(getattr(r, se_name, None)), - "p_value": _to_python_float(getattr(r, p_name, None)), - "conf_int": _to_python_ci(getattr(r, ci_name, None)), - "alpha": _to_python_float(getattr(r, "alpha", self._alpha)), - } - return None + extracted = _extract_scalar_headline(self._results, fallback_alpha=self._alpha) + if extracted is None: + return None + name, value, se, p, ci, alpha = extracted + return { + "name": name, + "value": value, + "se": se, + "p_value": p, + "conf_int": ci, + "alpha": alpha, + } # --------------------------------------------------------------------------- # Helpers (module-private) # 
--------------------------------------------------------------------------- +def _extract_scalar_headline( + results: Any, + fallback_alpha: float = 0.05, +) -> Optional[ + Tuple[ + str, + Optional[float], + Optional[float], + Optional[float], + Optional[List[float]], + Optional[float], + ] +]: + """Extract ``(name, value, se, p_value, conf_int, alpha)`` from a fitted result. + + Centralizes the scalar-headline mapping shared by both ``BusinessReport`` + and ``DiagnosticReport`` so schema drift (e.g. ``ContinuousDiDResults`` + using ``overall_att_se`` / ``overall_att_p_value`` / + ``overall_att_conf_int`` instead of the ``overall_att`` stem) is handled + in one place. + + Each row in the attribute-alias table below is tried in priority order. + The first point-estimate attribute that resolves to a non-None value + wins; the companion SE / p-value / CI attributes are then resolved from + the same row, taking the first alias that exists on the result object. + """ + # (name, [se aliases], [p-value aliases], [ci aliases]) + alias_table: List[Tuple[str, List[str], List[str], List[str]]] = [ + # Staggered / multi-period aggregations + ( + "overall_att", + ["overall_se", "overall_att_se"], + ["overall_p_value", "overall_att_p_value"], + ["overall_conf_int", "overall_att_conf_int"], + ), + # MultiPeriodDiDResults + ("avg_att", ["avg_se"], ["avg_p_value"], ["avg_conf_int"]), + # Simple DiDResults / SyntheticDiDResults / TROPResults / TripleDifferenceResults + ("att", ["se"], ["p_value"], ["conf_int"]), + ] + for name, se_aliases, p_aliases, ci_aliases in alias_table: + val = getattr(results, name, None) + if val is None: + continue + se = next( + ( + _to_python_float(getattr(results, a, None)) + for a in se_aliases + if getattr(results, a, None) is not None + ), + None, + ) + p = next( + ( + _to_python_float(getattr(results, a, None)) + for a in p_aliases + if getattr(results, a, None) is not None + ), + None, + ) + ci = next( + ( + _to_python_ci(getattr(results, a, None)) 
+ for a in ci_aliases + if getattr(results, a, None) is not None + ), + None, + ) + alpha = _to_python_float(getattr(results, "alpha", fallback_alpha)) + return (name, _to_python_float(val), se, p, ci, alpha) + return None + + def _extract_scalar_effect(val: Any) -> Optional[float]: """Pull a scalar ``effect`` out of the many shapes results expose. @@ -1675,11 +1817,34 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str "weighted parallel-trends analogue." ) - # Sentence 3: sensitivity + # Sentence 3: sensitivity. The "robust across the grid" phrasing is reserved + # for genuine SensitivityResults grids; a precomputed single-M HonestDiDResults + # is narrated as a point check ("at M=") even though breakdown_M is None. sens = schema.get("sensitivity") or {} if sens.get("status") == "ran": bkd = sens.get("breakdown_M") - if bkd is None: + conclusion = sens.get("conclusion") + if conclusion == "single_M_precomputed": + grid = sens.get("grid") or [] + point = grid[0] if grid else {} + m_val = point.get("M") + robust = point.get("robust_to_zero") + if isinstance(m_val, (int, float)): + if robust: + sentences.append( + f"HonestDiD sensitivity (single point checked): " + f"at M = {m_val:.2g}, the robust CI excludes zero. " + f"This is a point check, not a grid — use " + f"HonestDiD.sensitivity() for a breakdown value." + ) + else: + sentences.append( + f"HonestDiD sensitivity (single point checked): " + f"at M = {m_val:.2g}, the robust CI includes zero. " + f"Run HonestDiD.sensitivity() across a grid to find " + f"the breakdown value." + ) + elif bkd is None: sentences.append( "The effect remains significant across the entire HonestDiD " "grid — robust to plausible parallel-trends violations." 
diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 9e0bbac0..38e6c240 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -415,6 +415,235 @@ def test_falls_back_to_bonferroni_when_vcov_missing(self): assert pt["method"] == "bonferroni" +class TestSingleMSensitivityPrecomputed: + """Single-M HonestDiDResults must NOT be narrated as full-grid robustness. + + Regression for the P0 CI-review finding that ``conclusion='single_M_precomputed'`` + was being swallowed because both renderers checked ``breakdown_M is None`` and + fell through to the "robust across the full grid" phrasing. + """ + + def _fake_single_m(self, M=1.5, ci_lb=1.0, ci_ub=3.0): + from types import SimpleNamespace + + return SimpleNamespace( + M=M, + lb=ci_lb, + ub=ci_ub, + ci_lb=ci_lb, + ci_ub=ci_ub, + method="relative_magnitude", + alpha=0.05, + ) + + def test_dr_schema_preserves_single_m_marker(self, multi_period_fit): + fit, _ = multi_period_fit + dr = DiagnosticReport(fit, precomputed={"sensitivity": self._fake_single_m()}) + sens = dr.to_dict()["sensitivity"] + assert sens["status"] == "ran" + assert sens["conclusion"] == "single_M_precomputed" + assert sens["breakdown_M"] is None + assert len(sens["grid"]) == 1 + + def test_dr_summary_does_not_claim_full_grid_robustness(self, multi_period_fit): + fit, _ = multi_period_fit + dr = DiagnosticReport(fit, precomputed={"sensitivity": self._fake_single_m()}) + summary = dr.summary() + assert "across the entire HonestDiD grid" not in summary + assert "robust across the grid" not in summary + # It should narrate the single-M check honestly. 
+ assert "single point checked" in summary + assert "not a breakdown" in summary or "not a grid" in summary + + def test_br_summary_does_not_claim_full_grid_robustness(self, multi_period_fit): + """BR via honest_did_results= passthrough must not oversell a point check.""" + from diff_diff import BusinessReport + + fit, _ = multi_period_fit + br = BusinessReport(fit, honest_did_results=self._fake_single_m()) + summary = br.summary() + assert "full grid" not in summary + assert "single point checked" in summary + + +class TestEPVDictBacked: + """EPV diagnostics on fits that use the dict-of-dicts convention. + + Regression for the P0 CI-review finding that ``_check_epv`` assumed + ``low_epv_cells`` / ``min_epv`` attributes but the library stores + ``epv_diagnostics`` as ``{(g, t): {"is_low": ..., "epv": ...}}``. + """ + + def _make_cs_stub(self, epv_diag, threshold=10.0): + class CallawaySantAnnaResults: + pass + + obj = CallawaySantAnnaResults() + obj.overall_att = 1.0 + obj.overall_se = 0.1 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 200 + obj.n_treated = 40 + obj.n_control = 160 + obj.survey_metadata = None + obj.event_study_effects = None + obj.epv_diagnostics = epv_diag + obj.epv_threshold = threshold + return obj + + def test_low_epv_cells_counted_from_is_low_flag(self): + epv = { + (2020, 1): {"is_low": True, "epv": 4.5}, + (2020, 2): {"is_low": False, "epv": 18.0}, + (2021, 1): {"is_low": True, "epv": 2.0}, + (2021, 2): {"is_low": False, "epv": 22.0}, + } + stub = self._make_cs_stub(epv, threshold=10.0) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + section = dr.to_dict()["epv"] + assert section["status"] == "ran" + assert section["n_cells_low"] == 2 + assert section["n_cells_total"] == 4 + assert section["min_epv"] == pytest.approx(2.0) + assert section["threshold"] == pytest.approx(10.0) + + def test_no_low_cells_reports_clean(self): + epv = {(2020, 1): {"is_low": False, "epv": 
15.0}} + stub = self._make_cs_stub(epv, threshold=10.0) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + section = dr.to_dict()["epv"] + assert section["n_cells_low"] == 0 + assert section["min_epv"] == pytest.approx(15.0) + + def test_threshold_read_from_results_not_hardcoded(self): + """Pass a non-default epv_threshold and confirm DR echoes it.""" + epv = {(2020, 1): {"is_low": True, "epv": 7.0}} + stub = self._make_cs_stub(epv, threshold=8.5) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + assert dr.to_dict()["epv"]["threshold"] == pytest.approx(8.5) + + +class TestCSEventStudyVCovSupport: + """CS sensitivity + pretrends_power must not be skipped for absence of results.vcov. + + Regression for the P1 CI-review finding that the applicability gate required + ``results.vcov`` but CS exposes ``event_study_vcov`` / ``event_study_vcov_index``. + """ + + def test_cs_sensitivity_runs_on_aggregated_fit(self, cs_fit): + fit, sdf = cs_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + assert ( + "sensitivity" in dr.applicable_checks + ), "CS fit with event_study aggregation must not skip sensitivity" + sens = dr.to_dict()["sensitivity"] + # It may run successfully or emit an error depending on data shape, + # but it must NOT be skipped for "results.vcov not available". + assert sens["status"] in {"ran", "error"}, sens + + def test_cs_pretrends_power_runs_on_aggregated_fit(self, cs_fit): + fit, sdf = cs_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + assert ( + "pretrends_power" in dr.applicable_checks + ), "CS fit with event_study aggregation must not skip pretrends_power" + + +class TestCSJointWaldViaEventStudyVCov: + """CS PT should use joint_wald via event_study_vcov when interaction_indices is absent. 
+ + Regression for the P1 CI-review finding that CS always fell back to Bonferroni + even though ``event_study_vcov`` + ``event_study_vcov_index`` were available. + """ + + def _make_cs_stub_with_es_vcov(self): + class CallawaySantAnnaResults: + pass + + obj = CallawaySantAnnaResults() + obj.overall_att = 1.0 + obj.overall_se = 0.1 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 200 + obj.n_treated = 40 + obj.n_control = 160 + obj.survey_metadata = None + # Pre-period event-study entries with known coefficients + vcov. + obj.event_study_effects = { + -3: {"effect": 0.5, "se": 0.5, "p_value": 0.32}, + -2: {"effect": -0.5, "se": 0.5, "p_value": 0.32}, + -1: {"effect": 0.2, "se": 0.4, "p_value": 0.62}, + 0: {"effect": 2.0, "se": 0.3, "p_value": 0.0001}, + 1: {"effect": 2.5, "se": 0.3, "p_value": 0.0001}, + } + obj.event_study_vcov = np.diag([0.25, 0.25, 0.16, 0.09, 0.09]) + obj.event_study_vcov_index = [-3, -2, -1, 0, 1] + obj.vcov = None # CS convention + obj.interaction_indices = None + return obj + + def test_cs_pt_uses_event_study_vcov_wald(self): + stub = self._make_cs_stub_with_es_vcov() + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert ( + pt["method"] == "joint_wald_event_study" + ), f"Expected event-study-backed Wald; got method={pt.get('method')!r}" + # Closed-form: 0.5^2/0.25 + (-0.5)^2/0.25 + 0.2^2/0.16 = 1 + 1 + 0.25 = 2.25 + assert pt["test_statistic"] == pytest.approx(2.25, rel=1e-6) + assert pt["df"] == 3 + + +class TestContinuousDiDHeadline: + """ContinuousDiDResults exposes overall_att_se/p_value/conf_int, not overall_se/… + + Regression for the P1 CI-review finding that both report classes missed + ContinuousDiDResults inference fields. 
+ """ + + def test_extract_scalar_headline_resolves_continuous_did_aliases(self): + from diff_diff.diagnostic_report import _extract_scalar_headline + + class ContinuousDiDResults: + pass + + obj = ContinuousDiDResults() + obj.overall_att = 2.5 + obj.overall_att_se = 0.4 + obj.overall_att_p_value = 0.00001 + obj.overall_att_conf_int = (1.7, 3.3) + obj.alpha = 0.05 + + result = _extract_scalar_headline(obj) + assert result is not None + name, value, se, p, ci, alpha = result + assert name == "overall_att" + assert value == pytest.approx(2.5) + assert se == pytest.approx(0.4) + assert p == pytest.approx(0.00001) + assert ci == [pytest.approx(1.7), pytest.approx(3.3)] + assert alpha == pytest.approx(0.05) + + class TestVerdictsAndTiers: def test_pt_verdict_three_bins(self): assert _pt_verdict(0.001) == "clear_violation" From 989f71ab04b41bd8c94796904afd23e6265f66e5 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 16:31:43 -0400 Subject: [PATCH 05/48] Address second round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 fixes: 1. **CI-level mislabeling across BR and DR.** BR's ``_extract_headline`` was reading the stored CI from the fitted result and relabeling it with the caller's ``alpha``, so ``BusinessReport(results, alpha=0.10)`` would print a stored 95% interval under a "90% CI" label. DR's ``_render_overall_interpretation`` hardcoded "95% CI" in prose, inverting the same bug when the caller used a non-default alpha. BR now recomputes the interval via ``safe_inference`` when the caller's alpha differs from the fit's; DR prose reads the headline's alpha to derive the CI level string. Regression in ``TestAlphaKnob.test_ci_bounds_recomputed_when_alpha_differs_from_result``. 2. 
**``full_report()`` single-M HonestDiD rendering.** The summary path was fixed earlier, but the structured-markdown path still emitted "Breakdown M: robust across full grid (no breakdown)" for a single-M passthrough (which has ``breakdown_M=None`` by construction, not because it's grid-wide robust). Added the ``conclusion == "single_M_precomputed"`` branch in ``_render_full_report``. Regression in ``TestFullReportSingleM.test_full_report_does_not_claim_full_grid_for_single_m``. 3. **Reference-marker / NaN pre-period filtering.** ``_collect_pre_period_coefs`` was accepting any negative-time event-study row with non-None effect and SE, which pulled in the universal-base reference marker (``effect=0, se=NaN, n_groups=0``) emitted by CS / SA / ImputationDiD / Stacked event-study output as a real pre-period coefficient. ``_pt_event_study`` Bonferroni also treated ``NaN`` p-values as valid by checking ``is not None`` rather than ``np.isfinite``. The combination could produce a false-clean ``no_detected_violation`` verdict on fits whose only "evidence" was synthetic. Now drop rows with ``n_groups == 0`` and any row whose effect, SE, or p-value is non-finite before both applicability and PT computation; if no valid entries remain, the check returns ``skipped`` rather than a clean p-value. Regressions in ``TestReferenceMarkerAndNaNFiltering``. P1 fixes: 4. **Power-tier covariance source annotation.** ``compute_pretrends_power`` currently drops to ``np.diag(ses**2)`` for CS / SA / ImputationDiD / Stacked / etc. even when the full ``event_study_vcov`` is attached on the result. The diagonal-only MDV can be optimistic because it ignores correlations across pre-periods; promoting that to ``well_powered`` would overstate the evidence. 
The ``pretrends_power`` schema section now records ``covariance_source`` (one of ``full_pre_period_vcov`` / ``diag_fallback_available_full_vcov_unused`` / ``diag_fallback``), BR downgrades ``well_powered`` → ``moderately_powered`` when we know the diagonal approximation was the only input, and ``docs/methodology/REPORTING.md`` documents this as a known conservative deviation pending the right long-term fix in ``pretrends.py``. 5. **``precomputed=`` contract validation.** The docstring advertised passthrough for ``placebo``, ``design_effect``, ``heterogeneity``, and ``epv`` but only four checks actually respected it (``parallel_trends``, ``sensitivity``, ``pretrends_power``, ``bacon``). Narrowed the docstring to match reality and added a ``ValueError`` that rejects unsupported ``precomputed=`` keys at construction. Regressions in ``TestPrecomputedValidation``. The remaining sections (``design_effect``, ``heterogeneity``, ``epv``) are read-outs from the fitted result with no expensive call to bypass; there is no scenario where a user-supplied override helps. 139 targeted tests pass; black, ruff, and mypy clean on the new modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 71 ++++++++++++-- diff_diff/diagnostic_report.py | 127 ++++++++++++++++++++++--- docs/methodology/REPORTING.md | 15 +++ tests/test_business_report.py | 45 +++++++++ tests/test_diagnostic_report.py | 159 ++++++++++++++++++++++++++++++++ 5 files changed, 399 insertions(+), 18 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 67174fab..91db2fb3 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -326,8 +326,33 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An p: Optional[float] = None ci: Optional[List[float]] = None alpha = self._context.alpha + result_alpha: Optional[float] = None if extracted is not None: - _name, att, se, p, ci, _alpha = extracted + _name, att, se, p, ci, result_alpha = extracted + + # If the caller asked for a different alpha than the result was fit + # at, recompute the CI from (att, se) using ``safe_inference`` so the + # labeled CI level matches the interval actually shown. Without this + # the stored interval (e.g. 95%) would be relabeled to the caller's + # level (e.g. 90%) — the documented single-knob contract requires + # them to agree. SE is a scale parameter independent of alpha, so + # recomputation is safe; the result's original t-stat / p-value do + # not change either. 
+ if ( + result_alpha is not None + and not np.isclose(alpha, result_alpha) + and att is not None + and se is not None + and np.isfinite(att) + and np.isfinite(se) + ): + from diff_diff.utils import safe_inference + + _t, _p, recomputed_ci = safe_inference(att, se, alpha=alpha) + if recomputed_ci is not None and all( + x is not None and np.isfinite(x) for x in recomputed_ci + ): + ci = [float(recomputed_ci[0]), float(recomputed_ci[1])] unit = self._context.outcome_unit unit_kind = _UNIT_KINDS.get(unit.lower() if unit else "", "unknown") @@ -463,6 +488,10 @@ def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: "power_tier": pp.get("tier"), "mdv": pp.get("mdv"), "mdv_share_of_att": pp.get("mdv_share_of_att"), + # Carry the covariance-source annotation through so BR can hedge the + # power-tier phrasing when compute_pretrends_power silently used a + # diagonal fallback despite event_study_vcov being available. + "power_covariance_source": pp.get("covariance_source"), } @@ -919,6 +948,15 @@ def _render_summary(schema: Dict[str, Any]) -> str: jp = pt.get("joint_p_value") verdict = pt.get("verdict") tier = pt.get("power_tier") + # ``compute_pretrends_power`` currently falls back to ``np.diag(ses**2)`` + # for CS / SA / ImputationDiD / Stacked / etc., even when the full + # ``event_study_vcov`` is available. Downgrade any "well_powered" tier + # to "moderately_powered" when we know the diagonal approximation was + # the only input — a diagonal-only MDV can be optimistic because it + # ignores correlations across pre-periods. 
+ cov_source = pt.get("power_covariance_source") + if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused": + tier = "moderately_powered" if verdict == "clear_violation": sentences.append( f"Pre-treatment data clearly reject parallel trends (joint " @@ -1103,18 +1141,37 @@ def _render_full_report(schema: Dict[str, Any]) -> str: lines.append(f"- Pre-trends not computed: {pt.get('reason', 'unavailable')}") lines.append("") - # Sensitivity + # Sensitivity. A single-M HonestDiDResults passthrough has + # breakdown_M=None by construction because only one M was evaluated; + # the "robust across full grid" phrasing is reserved for genuine + # grid-over-M SensitivityResults. lines.append("## Sensitivity (HonestDiD)") lines.append("") if sens.get("status") == "computed": bkd = sens.get("breakdown_M") concl = sens.get("conclusion") lines.append(f"- Method: `{sens.get('method')}`") - lines.append( - f"- Breakdown M: {bkd:.3g}" - if isinstance(bkd, (int, float)) - else "- Breakdown M: robust across full grid (no breakdown)" - ) + if concl == "single_M_precomputed": + grid_points = sens.get("grid") or [] + point = grid_points[0] if grid_points else {} + m_val = point.get("M") + robust = point.get("robust_to_zero") + if isinstance(m_val, (int, float)): + lines.append(f"- Single point checked: M = {m_val:.3g}") + lines.append( + f"- Robust CI at M = {m_val:.3g}: " + f"{'excludes zero' if robust else 'includes zero'}" + ) + lines.append( + "- Run `HonestDiD.sensitivity()` across a grid of M " + "values to find the breakdown value." 
+ ) + else: + lines.append("- Single-M passthrough (breakdown not available)") + elif isinstance(bkd, (int, float)): + lines.append(f"- Breakdown M: {bkd:.3g}") + else: + lines.append("- Breakdown M: robust across full grid (no breakdown)") lines.append(f"- Conclusion: `{concl}`") else: lines.append(f"- Sensitivity not computed: {sens.get('reason', 'unavailable')}") diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 3be3b766..3e41b9ea 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -257,11 +257,22 @@ class DiagnosticReport: alpha : float, default 0.05 Significance level used across checks. precomputed : dict, optional - Map of check name to a pre-computed result object. Accepted keys: - ``"parallel_trends"``, ``"sensitivity"``, ``"placebo"``, ``"bacon"``, - ``"design_effect"``, ``"heterogeneity"``, ``"epv"``, - ``"pretrends_power"``. Supplied values are used verbatim and the - corresponding underlying function is not called. + Map of check name to a pre-computed result object. Accepted keys + (this is the full implemented list; unsupported keys raise + ``ValueError``): + + - ``"parallel_trends"`` — a dict returned by + ``utils.check_parallel_trends`` (adapted into the schema shape). + - ``"sensitivity"`` — a ``SensitivityResults`` (grid) or + ``HonestDiDResults`` (single-M) object; used verbatim and no + ``HonestDiD.sensitivity_analysis`` call is made. + - ``"pretrends_power"`` — a ``PreTrendsPowerResults`` object. + - ``"bacon"`` — a ``BaconDecompositionResults`` object. + + Other sections (``design_effect``, ``heterogeneity``, ``epv``) are + read directly from the fitted result object and do not currently + accept precomputed values — there is no expensive call to bypass. + ``placebo`` is reserved in the schema but opt-in / deferred in MVP. outcome_label, treatment_label : str, optional Plain-English labels used in prose rendering. 
""" @@ -317,6 +328,18 @@ def __init__( self._sensitivity_method = sensitivity_method self._alpha = float(alpha) self._precomputed = dict(precomputed or {}) + # Validate precomputed keys against the actually-implemented passthrough + # set so advertised contracts do not silently diverge from behavior. + _supported_precomputed = {"parallel_trends", "sensitivity", "pretrends_power", "bacon"} + _unsupported = set(self._precomputed) - _supported_precomputed + if _unsupported: + raise ValueError( + "precomputed= contains keys that are not implemented: " + f"{sorted(_unsupported)}. Supported keys: " + f"{sorted(_supported_precomputed)}. ``design_effect``, " + "``heterogeneity``, and ``epv`` are read directly from the " + "fitted result and do not accept precomputed overrides." + ) self._outcome_label = outcome_label self._treatment_label = treatment_label self._cached: Optional[DiagnosticReportResults] = None @@ -809,7 +832,16 @@ def _pt_event_study(self) -> Dict[str, Any]: if joint_p is None: # Bonferroni: min per-period p-value scaled by count, capped at 1. - ps = [p["p_value"] for p in per_period if p["p_value"] is not None] + # NaN p-values are excluded — a non-finite p-value means the + # per-period test was undefined (zero SE, reference marker that + # slipped through, etc.) and must not be treated as clean + # evidence. If no valid p-values remain, joint_p stays None and + # the verdict will be ``inconclusive``. + ps = [ + p["p_value"] + for p in per_period + if isinstance(p["p_value"], (int, float)) and np.isfinite(p["p_value"]) + ] if ps: joint_p = min(1.0, min(ps) * len(ps)) @@ -862,6 +894,38 @@ def _check_pretrends_power(self) -> Dict[str, Any]: ): ratio = mdv / abs(att) + # Annotate whether ``compute_pretrends_power`` had access to the full + # pre-period covariance (CS / SA / ImputationDiD currently fall back to + # ``np.diag(ses**2)`` inside ``pretrends.py``, even when + # ``event_study_vcov`` is available). 
BR uses this field to downgrade + # power-tier prose when only the diagonal approximation was used. + r = self._results + has_full_es_vcov = ( + getattr(r, "event_study_vcov", None) is not None + and getattr(r, "event_study_vcov_index", None) is not None + ) + is_event_study_type = type(r).__name__ in { + "CallawaySantAnnaResults", + "SunAbrahamResults", + "ImputationDiDResults", + "StackedDiDResults", + "StaggeredTripleDiffResults", + "WooldridgeDiDResults", + "ChaisemartinDHaultfoeuilleResults", + "EfficientDiDResults", + "TwoStageDiDResults", + } + if is_event_study_type and has_full_es_vcov: + # ``compute_pretrends_power`` does not currently consume + # ``event_study_vcov`` for these result types (see the reviewer's + # note on pretrends.py). Flag the diagonal fallback explicitly so + # the prose layer can hedge. + cov_source = "diag_fallback_available_full_vcov_unused" + elif is_event_study_type: + cov_source = "diag_fallback" + else: + cov_source = "full_pre_period_vcov" + tier = _power_tier(ratio) return { "status": "ran", @@ -874,6 +938,7 @@ def _check_pretrends_power(self) -> Dict[str, Any]: "power_at_M_1": _to_python_float(getattr(pp, "power", None)), "n_pre_periods": int(getattr(pp, "n_pre_periods", 0) or 0), "tier": tier, + "covariance_source": cov_source, } def _format_precomputed_pretrends_power(self, obj: Any) -> Dict[str, Any]: @@ -1621,7 +1686,18 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt on the staggered estimators (CS / SA / ImputationDiD / Stacked / EDiD / etc.). Pre-period entries are those with negative relative-time keys. - Returns an empty list when neither source provides pre-period entries. + Filtering rules (critical for methodology-safe PT tests): + + * Entries marked as reference markers (``n_groups == 0`` on the CS / SA / + ImputationDiD / Stacked event-study shape) are excluded. 
These are + synthetic ``effect=0, se=NaN`` rows injected for universal-base + normalization; treating them as real pre-period evidence would inflate + the Bonferroni denominator and produce bogus zero-deviation entries. + * Entries whose ``effect`` or ``se`` is non-finite (NaN / inf) are + excluded. A NaN SE means inference is undefined — feeding it into + Bonferroni or Wald would produce a false-clean PT verdict. + + Returns an empty list when neither source provides valid pre-period entries. """ results_list: List[Tuple[Any, float, float, Optional[float]]] = [] pre = getattr(results, "pre_period_effects", None) @@ -1630,8 +1706,16 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt eff = getattr(pe, "effect", None) se = getattr(pe, "se", None) p = getattr(pe, "p_value", None) - if eff is not None and se is not None: - results_list.append((k, float(eff), float(se), _to_python_float(p))) + if eff is None or se is None: + continue + try: + eff_f = float(eff) + se_f = float(se) + except (TypeError, ValueError): + continue + if not (np.isfinite(eff_f) and np.isfinite(se_f)): + continue + results_list.append((k, eff_f, se_f, _to_python_float(p))) else: es = getattr(results, "event_study_effects", None) or {} for k, entry in es.items(): @@ -1644,12 +1728,26 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt continue if not isinstance(entry, dict): continue + # Drop universal-base reference markers. See + # ``staggered_aggregation.py`` around the reference-period + # injection: ``n_groups == 0`` flags the synthetic marker row + # with NaN SE and p-value. 
+ n_groups = entry.get("n_groups") + if n_groups is not None and n_groups == 0: + continue eff = entry.get("effect") se = entry.get("se") p = entry.get("p_value") if eff is None or se is None: continue - results_list.append((k, float(eff), float(se), _to_python_float(p))) + try: + eff_f = float(eff) + se_f = float(se) + except (TypeError, ValueError): + continue + if not (np.isfinite(eff_f) and np.isfinite(se_f)): + continue + results_list.append((k, eff_f, se_f, _to_python_float(p))) results_list.sort(key=lambda t: t[0] if isinstance(t[0], (int, float)) else str(t[0])) return results_list @@ -1753,8 +1851,15 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str p = headline.get("p_value") if isinstance(headline, dict) else None if val is not None: direction = "increased" if val > 0 else "decreased" if val < 0 else "did not change" + # Use the headline's own alpha rather than hardcoding 95 so prose + # stays consistent with the rendered interval when alpha != 0.05. + headline_alpha = headline.get("alpha") if isinstance(headline, dict) else None + if isinstance(headline_alpha, (int, float)) and 0 < headline_alpha < 1: + ci_level = int(round((1.0 - headline_alpha) * 100)) + else: + ci_level = 95 ci_str = ( - f" (95% CI: {ci[0]:.3g} to {ci[1]:.3g})" + f" ({ci_level}% CI: {ci[0]:.3g} to {ci[1]:.3g})" if isinstance(ci, (list, tuple)) and len(ci) == 2 and None not in ci else "" ) diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index f6cee0b5..d095969c 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -99,6 +99,21 @@ result or computed by an existing diff-diff utility The library already ships `compute_pretrends_power()`, so using it is the honest default rather than hedging every non-violation. +- **Note:** Diagonal-covariance fallback for staggered-estimator power. + `compute_pretrends_power()` currently drops to `np.diag(ses**2)` for + CS / SA / ImputationDiD / Stacked / etc. 
even when the full + `event_study_vcov` is attached on the result. The + `DiagnosticReport.pretrends_power` block records + `covariance_source: "diag_fallback_available_full_vcov_unused"` in + that case, and `BusinessReport` downgrades a `well_powered` tier to + `moderately_powered` before rendering prose. This is a known + conservative deviation from the documented "use the full pre-period + covariance" position — it prevents the diagonal approximation from + producing an overly optimistic "well-powered" claim when correlated + pre-period errors could tighten the MDV. The right long-term fix is + to teach `compute_pretrends_power()` to consume `event_study_vcov` + and `event_study_vcov_index`; until that lands this downgrade stays. + - **Note:** Unit-translation policy. BusinessReport does not arithmetically translate log-points to percents or level effects to log-points. The estimate is rendered in the scale the estimator diff --git a/tests/test_business_report.py b/tests/test_business_report.py index c78b57e3..ff8d9f0a 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -501,6 +501,51 @@ def test_alpha_drives_ci_level(self, event_study_fit): assert br90.to_dict()["headline"]["ci_level"] == 90 assert br95.to_dict()["headline"]["ci_level"] == 95 + def test_ci_bounds_recomputed_when_alpha_differs_from_result(self, event_study_fit): + """Regression for the P0 CI-label bug: when alpha != results.alpha, + the displayed interval must be recomputed from (att, se) rather than + the stored interval being relabeled to the caller's alpha.""" + import math + + fit, _ = event_study_fit + br95 = BusinessReport(fit, alpha=0.05, auto_diagnostics=False) + br90 = BusinessReport(fit, alpha=0.10, auto_diagnostics=False) + h95 = br95.to_dict()["headline"] + h90 = br90.to_dict()["headline"] + if h95["effect"] is not None and math.isfinite(h95["effect"]): + # 90% bounds must be strictly inside 95% bounds. 
+ assert h90["ci_lower"] > h95["ci_lower"] + 1e-9 + assert h90["ci_upper"] < h95["ci_upper"] - 1e-9 + assert h95["ci_level"] == 95 + assert h90["ci_level"] == 90 + + +class TestFullReportSingleM: + """Regression: ``full_report()`` must not claim full-grid robustness for a + single-M HonestDiDResults passthrough. The summary path was fixed earlier; + the structured-markdown path had the same bug and now mirrors it.""" + + @staticmethod + def _fake_single_m(M=1.5, ci_lb=1.0, ci_ub=3.0): + from types import SimpleNamespace + + return SimpleNamespace( + M=M, + lb=ci_lb, + ub=ci_ub, + ci_lb=ci_lb, + ci_ub=ci_ub, + method="relative_magnitude", + alpha=0.05, + ) + + def test_full_report_does_not_claim_full_grid_for_single_m(self, event_study_fit): + fit, _ = event_study_fit + br = BusinessReport(fit, honest_did_results=self._fake_single_m()) + md = br.full_report() + assert "robust across full grid" not in md + assert "Single point checked" in md or "single point" in md.lower() + # --------------------------------------------------------------------------- # Summary + full_report work across estimators diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 38e6c240..48c97431 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -415,6 +415,165 @@ def test_falls_back_to_bonferroni_when_vcov_missing(self): assert pt["method"] == "bonferroni" +class TestReferenceMarkerAndNaNFiltering: + """Regression for the P0 finding that reference markers + NaN pre-periods + were being swept into Bonferroni / Wald PT as real evidence. + + Universal-base CS / SA / ImputationDiD / Stacked event-study output + injects a synthetic reference-period row (``effect=0``, ``se=NaN``, + ``p_value=NaN``, ``n_groups=0``). Treating that row as valid + pre-period evidence would inflate the Bonferroni denominator and + collapse all-NaN fallbacks to a false-clean verdict. 
+ """ + + @staticmethod + def _cs_stub_with_reference_marker(): + import numpy as np + + class CallawaySantAnnaResults: + pass + + obj = CallawaySantAnnaResults() + obj.overall_att = 1.0 + obj.overall_se = 0.1 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 200 + obj.n_treated = 40 + obj.n_control = 160 + obj.survey_metadata = None + # Two real pre-period rows + one universal-base reference marker (n_groups=0). + obj.event_study_effects = { + -3: {"effect": 0.1, "se": 0.3, "p_value": 0.74, "n_groups": 5}, + -2: {"effect": -0.2, "se": 0.3, "p_value": 0.51, "n_groups": 5}, + -1: { + "effect": 0.0, + "se": np.nan, + "p_value": np.nan, + "conf_int": (np.nan, np.nan), + "n_groups": 0, + }, + 0: {"effect": 1.5, "se": 0.2, "p_value": 0.0001, "n_groups": 5}, + } + obj.vcov = None + obj.interaction_indices = None + obj.event_study_vcov = None + obj.event_study_vcov_index = None + return obj + + def test_reference_marker_excluded_from_pt_collection(self): + from diff_diff.diagnostic_report import _collect_pre_period_coefs + + obj = self._cs_stub_with_reference_marker() + coefs = _collect_pre_period_coefs(obj) + keys = [k for (k, _, _, _) in coefs] + assert -1 not in keys, ( + "Universal-base reference marker (n_groups=0) must not appear " + "as a valid pre-period coefficient" + ) + assert -3 in keys and -2 in keys + # Every returned SE must be finite. + for _k, _eff, se, _p in coefs: + assert np.isfinite(se), f"Non-finite SE leaked through: {se}" + + def test_all_nan_pre_periods_do_not_produce_clean_verdict(self): + """If *every* pre-period row is a reference marker / NaN, the PT + check must return inconclusive / skipped — never a clean p_value=1.0. 
+ """ + import numpy as np + + class CallawaySantAnnaResults: + pass + + obj = CallawaySantAnnaResults() + obj.overall_att = 1.0 + obj.overall_se = 0.1 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 200 + obj.n_treated = 40 + obj.n_control = 160 + obj.survey_metadata = None + obj.event_study_effects = { + -1: { + "effect": 0.0, + "se": np.nan, + "p_value": np.nan, + "n_groups": 0, + }, + 0: {"effect": 1.5, "se": 0.2, "p_value": 0.0001, "n_groups": 5}, + } + obj.vcov = None + obj.interaction_indices = None + obj.event_study_vcov = None + obj.event_study_vcov_index = None + dr = DiagnosticReport(obj, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + # All pre-period rows were reference markers → no valid data → skipped. + assert pt["status"] == "skipped" + # Verdict must not falsely say "no detected violation" when the only + # "data" was a reference marker. + assert pt.get("verdict") != "no_detected_violation" + + def test_bonferroni_excludes_nan_p_values(self): + """If a pre-period row has a finite effect/SE but NaN p-value (edge + case on some exotic fits), Bonferroni must skip it, not feed it in.""" + import numpy as np + + class MultiPeriodDiDResults: + pass + + from types import SimpleNamespace + + obj = MultiPeriodDiDResults() + obj.pre_period_effects = { + -2: SimpleNamespace(effect=1.0, se=0.5, p_value=0.04), + -1: SimpleNamespace(effect=0.5, se=0.5, p_value=np.nan), + } + obj.vcov = None + obj.interaction_indices = None + obj.event_study_vcov = None + obj.event_study_vcov_index = None + obj.avg_att = 1.0 + obj.avg_se = 0.1 + obj.avg_p_value = 0.001 + obj.avg_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 50 + obj.n_control = 50 + obj.survey_metadata = None + + dr = DiagnosticReport(obj, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + # With only one valid p-value (0.04), Bonferroni should be min(1.0, 0.04*1) = 
0.04. + # If the NaN were naively included the test would either error or coerce to 1.0. + assert pt["method"] == "bonferroni" + assert pt["joint_p_value"] == pytest.approx(0.04, abs=1e-9) + + +class TestPrecomputedValidation: + """Regression for the P1 finding that ``precomputed=`` silently accepted + keys that were never implemented. Unsupported keys now raise.""" + + def test_unsupported_precomputed_key_raises(self, multi_period_fit): + fit, _ = multi_period_fit + with pytest.raises(ValueError, match="not implemented"): + DiagnosticReport(fit, precomputed={"design_effect": object()}) + + def test_supported_precomputed_keys_accepted(self, multi_period_fit): + fit, _ = multi_period_fit + # The four implemented keys should not raise at construction. + DiagnosticReport(fit, precomputed={"parallel_trends": {"p_value": 0.5}}) + + def test_mixed_supported_and_unsupported_raises(self, multi_period_fit): + fit, _ = multi_period_fit + with pytest.raises(ValueError, match="epv"): + DiagnosticReport(fit, precomputed={"sensitivity": None, "epv": object()}) + + class TestSingleMSensitivityPrecomputed: """Single-M HonestDiDResults must NOT be narrated as full-grid robustness. From 959f84e80d60c8b9f8a785d0177aa99aa83179aa Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 16:47:04 -0400 Subject: [PATCH 06/48] Address third round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 fix: * **Alpha override was inference-contract-blind.** Previously, whenever the caller's ``alpha`` differed from the result's, BR recomputed the displayed CI via ``safe_inference(att, se, alpha=alpha)`` with no ``df`` and no bootstrap handling — silently discarding the ``bootstrap_distribution`` / finite-df inference contracts used by TROP, ContinuousDiD, dCDH-bootstrap, survey fits, SDiD jackknife, etc. 
BR now detects bootstrap-backed (``inference_method='bootstrap'`` or non-None ``bootstrap_distribution`` or ``variance_method in {bootstrap, jackknife, placebo}``) and finite-df (``df_survey > 0``) inference paths and preserves the fitted CI at its native level in those cases, recording an informational caveat noting that the caller's alpha still drives phrasing but the native interval is shown. Regressions in ``TestAlphaOverrideBootstrapAndFiniteDF`` cover both the bootstrap and finite-df survey paths. P1 fixes: * **``pretrends_power`` over-broad applicability.** The matrix had marked the check applicable for ImputationDiD, TwoStage, Stacked, EfficientDiD, StaggeredTripleDiff, Wooldridge, and dCDH, but ``compute_pretrends_power`` only has adapters for MultiPeriod, CS, and SA; the other families were landing in ``error``. Narrowed the applicability matrix to match the real helper support. * **``sensitivity`` over-broad applicability.** HonestDiD only adapts MultiPeriod, CS, and dCDH (via ``placebo_event_study``). The matrix had also included SA / Imputation / Stacked / EfficientDiD / StaggeredTripleDiff / Wooldridge. Narrowed to the supported set. The dCDH-specific instance gate now checks ``placebo_event_study`` rather than the generic ``event_study_effects`` so HonestDiD's dCDH branch is reached instead of the generic event-study collector. * **``n_obs == 0`` reference-marker filter.** Stacked / TwoStage / Imputation emit synthetic reference-period markers using ``n_obs=0`` rather than CS / SA's ``n_groups=0`` flag. ``_collect_pre_period_coefs`` now drops rows with either sentinel so the Bonferroni denominator and joint-Wald index are not inflated by non-informative rows. P2 fix: * **``placebo`` schema inconsistency.** ``REPORTING.md`` said ``placebo`` is always rendered as ``{"status": "skipped"}`` in MVP, but no result type had ``placebo`` in its applicability frozenset, so implementation fell through to ``"not_applicable"``. 
Now every DiagnosticReport.to_dict() returns ``placebo`` with ``status="skipped"`` regardless of estimator, matching the stated contract. Regression tests for each finding added in ``TestNarrowedApplicabilityAndPlaceboSchema`` and ``TestAlphaOverrideBootstrapAndFiniteDF``. 146 targeted tests pass; black, ruff, mypy clean on the new modules. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 77 ++++++++++++++++++++++++----- diff_diff/diagnostic_report.py | 78 ++++++++++++++++++------------ tests/test_business_report.py | 86 +++++++++++++++++++++++++++++++++ tests/test_diagnostic_report.py | 79 ++++++++++++++++++++++++++++++ 4 files changed, 275 insertions(+), 45 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 91db2fb3..f1954143 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -331,13 +331,24 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An _name, att, se, p, ci, result_alpha = extracted # If the caller asked for a different alpha than the result was fit - # at, recompute the CI from (att, se) using ``safe_inference`` so the - # labeled CI level matches the interval actually shown. Without this - # the stored interval (e.g. 95%) would be relabeled to the caller's - # level (e.g. 90%) — the documented single-knob contract requires - # them to agree. SE is a scale parameter independent of alpha, so - # recomputation is safe; the result's original t-stat / p-value do - # not change either. + # at, the displayed CI needs to match the label. Naive recomputation + # via ``safe_inference(att, se, alpha=alpha)`` would use a normal + # distribution with no df, which silently discards finite-df / + # bootstrap / percentile inference contracts used by TROP, + # ContinuousDiD, dCDH-bootstrap, survey fits, etc. Rules: + # 1. 
If the result has an analytic inference contract we can + # reproduce (no bootstrap distribution, no finite df we don't + # know about), recompute via ``safe_inference`` — this covers + # the common case of normal-approximation CIs. + # 2. Otherwise (bootstrap / percentile / finite-df / survey d.f.), + # preserve the fitted CI and its native level so the displayed + # interval keeps matching the stored p-value and inference + # contract. The ``ci_level`` field will reflect the result's + # own alpha, and a caveat is appended below noting that the + # caller's alpha drives phrasing but the native interval is + # shown. + alpha_was_honored = True + alpha_override_caveat: Optional[str] = None if ( result_alpha is not None and not np.isclose(alpha, result_alpha) @@ -346,13 +357,40 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An and np.isfinite(att) and np.isfinite(se) ): - from diff_diff.utils import safe_inference + inference_method = getattr(r, "inference_method", "analytical") + has_bootstrap_dist = getattr(r, "bootstrap_distribution", None) is not None + df_survey = getattr( + r, "df_survey", getattr(getattr(r, "survey_metadata", None), "df_survey", None) + ) + variance_method = getattr(r, "variance_method", None) + + bootstrap_like = ( + inference_method == "bootstrap" + or has_bootstrap_dist + or variance_method in {"bootstrap", "jackknife", "placebo"} + ) + finite_df = isinstance(df_survey, (int, float)) and df_survey > 0 + + if bootstrap_like or finite_df: + # Preserve the fitted CI at its native level. + alpha_was_honored = False + alpha = float(result_alpha) + alpha_override_caveat = ( + f"Requested alpha was not honored for the confidence " + f"interval because this fit uses " + f"{'bootstrap' if bootstrap_like else 'finite-df'} " + f"inference; the displayed CI remains at the fit's " + f"native level ({int(round((1.0 - result_alpha) * 100))}%). " + f"The significance phrasing still uses the requested alpha." 
+ ) + else: + from diff_diff.utils import safe_inference - _t, _p, recomputed_ci = safe_inference(att, se, alpha=alpha) - if recomputed_ci is not None and all( - x is not None and np.isfinite(x) for x in recomputed_ci - ): - ci = [float(recomputed_ci[0]), float(recomputed_ci[1])] + _t, _p, recomputed_ci = safe_inference(att, se, alpha=alpha) + if recomputed_ci is not None and all( + x is not None and np.isfinite(x) for x in recomputed_ci + ): + ci = [float(recomputed_ci[0]), float(recomputed_ci[1])] unit = self._context.outcome_unit unit_kind = _UNIT_KINDS.get(unit.lower() if unit else "", "unknown") @@ -382,6 +420,8 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An "se": se, "ci_lower": ci[0] if ci else None, "ci_upper": ci[1] if ci else None, + "alpha_was_honored": alpha_was_honored, + "alpha_override_caveat": alpha_override_caveat, "ci_level": ci_level, "p_value": p, "is_significant": is_significant, @@ -623,6 +663,17 @@ def _build_caveats( } ) + # Alpha override could not be honored (bootstrap / finite-df inference). + alpha_override_msg = headline.get("alpha_override_caveat") + if isinstance(alpha_override_msg, str) and alpha_override_msg: + caveats.append( + { + "severity": "info", + "topic": "alpha_override_preserved", + "message": alpha_override_msg, + } + ) + # Near-threshold p-value. if headline.get("near_significance_threshold"): caveats.append( diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 3e41b9ea..1215f9df 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -69,6 +69,18 @@ # requires updating both this table and ``_PT_METHOD`` below; the # applicability-matrix test parametrized over all result types serves as the # regression guard. +# ``pretrends_power`` is restricted to the result families for which +# ``compute_pretrends_power`` has an explicit adapter — see +# ``diff_diff/pretrends.py`` around the result-type dispatch. 
Expanding +# beyond this set (Imputation / Stacked / TwoStage / EfficientDiD / +# StaggeredTripleDiff / Wooldridge / dCDH) would cause the helper to +# raise ``TypeError("Unsupported results type ...")`` and mark the check +# as ``error``, so the narrower set is the right contract. +# +# ``sensitivity`` is restricted to families with a ``HonestDiD`` +# adapter: MultiPeriod, CS, dCDH (via ``placebo_event_study``). SDiD +# and TROP use their own native paths (``estimator_native``) instead +# of HonestDiD. _APPLICABILITY: Dict[str, FrozenSet[str]] = { "DiDResults": frozenset({"parallel_trends", "design_effect"}), "MultiPeriodDiDResults": frozenset( @@ -89,7 +101,6 @@ { "parallel_trends", "pretrends_power", - "sensitivity", "bacon", "design_effect", "heterogeneity", @@ -98,8 +109,6 @@ "ImputationDiDResults": frozenset( { "parallel_trends", - "pretrends_power", - "sensitivity", "bacon", "design_effect", "heterogeneity", @@ -108,7 +117,6 @@ "TwoStageDiDResults": frozenset( { "parallel_trends", - "pretrends_power", "bacon", "design_effect", "heterogeneity", @@ -117,8 +125,6 @@ "StackedDiDResults": frozenset( { "parallel_trends", - "pretrends_power", - "sensitivity", "bacon", "design_effect", "heterogeneity", @@ -139,8 +145,6 @@ "EfficientDiDResults": frozenset( { "parallel_trends", - "pretrends_power", - "sensitivity", "bacon", "design_effect", "heterogeneity", @@ -149,14 +153,10 @@ ), "ContinuousDiDResults": frozenset({"design_effect", "heterogeneity"}), "TripleDifferenceResults": frozenset({"design_effect", "epv"}), - "StaggeredTripleDiffResults": frozenset( - {"parallel_trends", "pretrends_power", "sensitivity", "design_effect"} - ), + "StaggeredTripleDiffResults": frozenset({"parallel_trends", "design_effect"}), "WooldridgeDiDResults": frozenset( { "parallel_trends", - "pretrends_power", - "sensitivity", "bacon", "design_effect", "heterogeneity", @@ -165,7 +165,6 @@ "ChaisemartinDHaultfoeuilleResults": frozenset( { "parallel_trends", - "pretrends_power", 
"sensitivity", "bacon", "design_effect", @@ -432,13 +431,15 @@ def _compute_applicable_checks(self) -> Tuple[set, Dict[str, str]]: continue applicable.add(check) - # Placebo is always skipped in MVP (opt-in path deferred) - if "placebo" in type_level and "placebo" not in applicable: - skipped.setdefault( - "placebo", - "Placebo battery runs on opt-in only; not yet implemented in MVP. " - "Reserved in the schema for forward compatibility.", - ) + # Placebo is reserved for every result type in MVP so the schema + # shape is stable: ``schema["placebo"]["status"] == "skipped"`` + # always holds regardless of estimator. The opt-in execution path + # is deferred to a follow-up; ``REPORTING.md`` documents this. + skipped.setdefault( + "placebo", + "Placebo battery runs on opt-in only; not yet implemented in MVP. " + "Reserved in the schema for forward compatibility.", + ) return applicable, skipped @@ -499,11 +500,22 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: # Precomputed sensitivity always unlocks this check. if "sensitivity" in self._precomputed: return None - # ``HonestDiD.sensitivity_analysis`` handles CS / SA / - # ImputationDiD internally via ``event_study_effects`` + - # ``event_study_vcov`` (or per-SE diagonal fallback), so we - # accept any of: top-level vcov, event_study_vcov, or a - # populated event_study_effects surface. + # dCDH uses ``placebo_event_study`` as its pre-period surface, + # which HonestDiD consumes via a dedicated branch. Accept the + # fit when that attribute is populated. + if name == "ChaisemartinDHaultfoeuilleResults": + pes = getattr(r, "placebo_event_study", None) + if pes is None: + return ( + "HonestDiD on dCDH requires results.placebo_event_study " + "(re-fit with a placebo-producing configuration)." 
+ ) + return None + # MultiPeriod / CS path: ``HonestDiD.sensitivity_analysis`` + # consumes ``event_study_effects`` plus either ``vcov`` + + # ``interaction_indices`` (MultiPeriod) or ``event_study_vcov`` + # + ``event_study_vcov_index`` (CS), with a per-SE diagonal + # fallback otherwise. has_vcov = getattr(r, "vcov", None) is not None has_event_vcov = getattr(r, "event_study_vcov", None) is not None has_event_es = getattr(r, "event_study_effects", None) is not None @@ -1728,12 +1740,14 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt continue if not isinstance(entry, dict): continue - # Drop universal-base reference markers. See - # ``staggered_aggregation.py`` around the reference-period - # injection: ``n_groups == 0`` flags the synthetic marker row - # with NaN SE and p-value. - n_groups = entry.get("n_groups") - if n_groups is not None and n_groups == 0: + # Drop universal-base reference markers. Different estimator + # aggregations use different flags for the synthetic marker row + # (all of which carry NaN SE and p-value): + # * CS / SA: ``n_groups == 0`` + # * Stacked / TwoStage / Imputation: ``n_obs == 0`` + # Treat either as a disqualifier so the Bonferroni denominator + # and joint-Wald index are not inflated by non-informative rows. + if entry.get("n_groups") == 0 or entry.get("n_obs") == 0: continue eff = entry.get("effect") se = entry.get("se") diff --git a/tests/test_business_report.py b/tests/test_business_report.py index ff8d9f0a..622938aa 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -520,6 +520,92 @@ def test_ci_bounds_recomputed_when_alpha_differs_from_result(self, event_study_f assert h90["ci_level"] == 90 +class TestAlphaOverrideBootstrapAndFiniteDF: + """Regression for the P0 finding that ``safe_inference(att, se, alpha)`` + silently discards bootstrap / finite-df inference contracts on results + that use them (TROP, ContinuousDiD, dCDH-bootstrap, survey fits). 
+ + Rule: when the caller's alpha differs from the fit's alpha AND the + result's inference contract is bootstrap-backed or uses finite df, + BR preserves the fitted CI at the fit's native level rather than + recomputing with a normal approximation. The override is recorded as + an informational caveat. + """ + + class _BootstrapResultStub: + """Minimal stub shaped like a bootstrap-inferred result.""" + + def __init__(self): + self.att = 1.0 + self.se = 0.5 + self.p_value = 0.04 + # Original 95% CI from the bootstrap distribution. + self.conf_int = (0.05, 1.95) + self.alpha = 0.05 + self.n_obs = 100 + self.n_treated = 40 + self.n_control = 60 + self.inference_method = "bootstrap" + self.survey_metadata = None + # Presence of a bootstrap distribution triggers the preserve path. + import numpy as np + + self.bootstrap_distribution = np.random.default_rng(0).normal(1.0, 0.5, 200) + + def test_bootstrap_fit_preserves_fitted_ci_on_alpha_mismatch(self): + stub = self._BootstrapResultStub() + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + # Native fit was at 95%; requested 90% should NOT be reflected in the label. + assert h["ci_level"] == 95, ( + "Bootstrap fit must preserve fitted CI level (95) when caller " + f"requests a different alpha; got {h['ci_level']}" + ) + # Bounds should match the stored bootstrap interval, not a normal-z + # recomputation at 90%. + assert h["ci_lower"] == pytest.approx(0.05) + assert h["ci_upper"] == pytest.approx(1.95) + # A caveat records the override. 
+ caveat_topics = {c.get("topic") for c in br.caveats()} + assert "alpha_override_preserved" in caveat_topics + + class _FiniteDfSurveyStub: + def __init__(self): + from types import SimpleNamespace + + self.att = 2.0 + self.se = 0.4 + self.p_value = 0.001 + self.conf_int = (1.22, 2.78) # 95% via survey t-quantile + self.alpha = 0.05 + self.n_obs = 120 + self.n_treated = 50 + self.n_control = 70 + self.inference_method = "analytical" + # Finite survey d.f. triggers the preserve path — normal approx + # would widen / narrow incorrectly. + self.survey_metadata = SimpleNamespace( + weight_type="pweight", + effective_n=110.0, + design_effect=1.2, + sum_weights=120.0, + n_strata=4, + n_psu=12, + df_survey=8, + replicate_method=None, + ) + + def test_finite_df_fit_preserves_fitted_ci_on_alpha_mismatch(self): + stub = self._FiniteDfSurveyStub() + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + assert h["ci_level"] == 95 + assert h["ci_lower"] == pytest.approx(1.22) + assert h["ci_upper"] == pytest.approx(2.78) + caveat_topics = {c.get("topic") for c in br.caveats()} + assert "alpha_override_preserved" in caveat_topics + + class TestFullReportSingleM: """Regression: ``full_report()`` must not claim full-grid robustness for a single-M HonestDiDResults passthrough. The summary path was fixed earlier; diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 48c97431..99089e01 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -415,6 +415,85 @@ def test_falls_back_to_bonferroni_when_vcov_missing(self): assert pt["method"] == "bonferroni" +class TestNarrowedApplicabilityAndPlaceboSchema: + """Regressions for the round-3 CI-review findings. 
+ + * ``pretrends_power`` and ``sensitivity`` are now restricted to the + result families that their backing helpers actually support, so + default reports no longer land in ``error`` for SA / Imputation / + Stacked / EfficientDiD / StaggeredTripleDiff / Wooldridge. + * ``placebo`` is always ``status="skipped"`` in MVP regardless of + estimator, matching the ``REPORTING.md`` contract. + """ + + def test_placebo_is_always_skipped_not_not_applicable(self, did_fit): + fit, df = did_fit + dr = DiagnosticReport(fit, data=df, outcome="outcome", treatment="treated", time="post") + placebo = dr.to_dict()["placebo"] + assert placebo["status"] == "skipped", ( + f"placebo must always be status='skipped' per REPORTING.md; " + f"got {placebo['status']!r}" + ) + + def test_placebo_skipped_for_multiperiod_fit(self, multi_period_fit): + fit, _ = multi_period_fit + placebo = DiagnosticReport(fit).to_dict()["placebo"] + assert placebo["status"] == "skipped" + + def test_placebo_skipped_for_sdid_fit(self, sdid_fit): + fit, _ = sdid_fit + placebo = DiagnosticReport(fit).to_dict()["placebo"] + assert placebo["status"] == "skipped" + + def test_sun_abraham_sensitivity_not_applicable(self): + """SA is not in HonestDiD's adapter list; DR must not try to run it.""" + import warnings + + import pandas as pd + + from diff_diff import SunAbraham, generate_staggered_data + + warnings.filterwarnings("ignore") + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + fit = SunAbraham().fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + dr = DiagnosticReport(fit) + applicable = set(dr.applicable_checks) + sensitivity = dr.to_dict()["sensitivity"] + assert "sensitivity" not in applicable, ( + "SunAbrahamResults has no HonestDiD adapter; sensitivity must not " + "be marked applicable" + ) + assert sensitivity["status"] == "not_applicable" + + def test_n_obs_zero_reference_marker_filtered(self): + """Stacked / TwoStage / 
Imputation reference markers use n_obs=0 + (not n_groups=0). ``_collect_pre_period_coefs`` must filter both.""" + import numpy as np + + from diff_diff.diagnostic_report import _collect_pre_period_coefs + + class StackedDiDResults: + pass + + obj = StackedDiDResults() + obj.event_study_effects = { + -2: {"effect": 0.1, "se": 0.3, "p_value": 0.74, "n_obs": 50}, + -1: { + "effect": 0.0, + "se": np.nan, + "p_value": np.nan, + "n_obs": 0, # synthetic reference marker + }, + 0: {"effect": 1.5, "se": 0.2, "p_value": 0.0001, "n_obs": 50}, + } + coefs = _collect_pre_period_coefs(obj) + keys = [k for (k, _, _, _) in coefs] + assert -1 not in keys, "n_obs==0 row must be filtered out" + assert -2 in keys + + class TestReferenceMarkerAndNaNFiltering: """Regression for the P0 finding that reference markers + NaN pre-periods were being swept into Bonferroni / Wald PT as real evidence. From 345f65cc492306a995fc89423aec85ae5c4eb340 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 16:58:25 -0400 Subject: [PATCH 07/48] Address fourth round of CI review findings on PR #318 P0 fix: * **``inference_method == 'wild_bootstrap'`` was not detected as bootstrap-like.** My prior bootstrap check caught ``'bootstrap'`` and ``variance_method in {bootstrap, jackknife, placebo}`` plus an attached ``bootstrap_distribution``, but ``DifferenceInDifferences( inference='wild_bootstrap')`` returns ``inference_method='wild_bootstrap'`` and a percentile-bootstrap CI without necessarily attaching the raw distribution. The override path silently replaced that CI with a normal-approximation one. Fixed by matching both ``'bootstrap'`` and ``'wild_bootstrap'``; the preserved-CI caveat now calls out "wild cluster bootstrap" specifically when that path triggered. Regression: ``TestWildBootstrapAlphaOverride``. 
P1 fix: * **``_describe_assumption()`` emitted generic DiD PT text for ContinuousDiD / TripleDifference / StaggeredTripleDiff**, all of which have identifying logic different from ordinary group-time PT per ``docs/methodology/REGISTRY.md``. Replaced the generic fallback with source-backed branches: - ``ContinuousDiDResults``: two-level parallel trends (PT vs Strong PT) per Callaway, Goodman-Bacon & Sant'Anna (2024), with explicit mention of ATT(d|d), ATT(d), ACRT identification sets. - ``TripleDifferenceResults`` / ``StaggeredTripleDiffResults``: triple-difference cancellation across the 2x2x2 cells per Ortiz-Villavicencio & Sant'Anna (2025); notes that identification is weaker than ordinary DiD PT and depends on additive separability across the three dimensions. The ``parallel_trends_variant`` schema field gains two new values: ``"dose_pt_or_strong_pt"`` and ``"triple_difference_cancellation"``. Direct regressions in ``TestAssumptionBlockSourceFaithful`` assert registry-backed language (attribution phrases + method names) is present and generic group-time PT text is absent. 150 targeted tests pass; black, ruff, mypy clean on the new modules. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 59 +++++++++++++++-- tests/test_business_report.py | 121 ++++++++++++++++++++++++++++++++++ 2 files changed, 176 insertions(+), 4 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index f1954143..656a119c 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -364,8 +364,11 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An ) variance_method = getattr(r, "variance_method", None) + # Any non-analytic inference surface that stores a sampling / + # resampling distribution (wild cluster bootstrap, percentile + # bootstrap, jackknife, placebo) should preserve its native CI. 
bootstrap_like = ( - inference_method == "bootstrap" + inference_method in {"bootstrap", "wild_bootstrap"} or has_bootstrap_dist or variance_method in {"bootstrap", "jackknife", "placebo"} ) @@ -375,10 +378,15 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An # Preserve the fitted CI at its native level. alpha_was_honored = False alpha = float(result_alpha) + if inference_method == "wild_bootstrap": + inference_label = "wild cluster bootstrap" + elif bootstrap_like: + inference_label = "bootstrap" + else: + inference_label = "finite-df" alpha_override_caveat = ( f"Requested alpha was not honored for the confidence " - f"interval because this fit uses " - f"{'bootstrap' if bootstrap_like else 'finite-df'} " + f"interval because this fit uses {inference_label} " f"inference; the displayed CI remains at the fit's " f"native level ({int(round((1.0 - result_alpha) * 100))}%). " f"The significance phrasing still uses the requested alpha." @@ -611,6 +619,50 @@ def _describe_assumption(estimator_name: str) -> Dict[str, Any]: "captured through latent factor loadings." ), } + if estimator_name == "ContinuousDiDResults": + # Callaway, Goodman-Bacon & Sant'Anna (2024), two-level PT: + # REGISTRY.md §ContinuousDiD > Identification. + return { + "parallel_trends_variant": "dose_pt_or_strong_pt", + "no_anticipation": True, + "description": ( + "ContinuousDiD identifies dose-specific treatment effects " + "under two possible parallel-trends conditions (Callaway, " + "Goodman-Bacon & Sant'Anna 2024). Parallel Trends (PT) " + "assumes untreated potential outcome paths are equal across " + "all dose groups and the untreated group (conditional on " + "dose), identifying ATT(d|d) and the binarized ATT^loc but " + "NOT ATT(d), ACRT, or cross-dose comparisons. 
Strong " + "Parallel Trends (SPT) additionally rules out selection " + "into dose on the basis of treatment effects and is " + "required to identify the dose-response curve ATT(d), " + "marginal effect ACRT(d), and cross-dose contrasts." + ), + } + if estimator_name in {"TripleDifferenceResults", "StaggeredTripleDiffResults"}: + # Ortiz-Villavicencio & Sant'Anna (2025) — identification is the + # triple-difference cancellation across the 2x2x2 cells, not + # ordinary DiD parallel trends; see REGISTRY.md §TripleDifference + # and §StaggeredTripleDifference. + return { + "parallel_trends_variant": "triple_difference_cancellation", + "no_anticipation": True, + "description": ( + "Triple-difference identification relies on the DDD " + "decomposition (Ortiz-Villavicencio & Sant'Anna 2025): " + "the ATT is recovered from `DDD = DiD_A + DiD_B - DiD_C` " + "across the Group x Period x Eligibility (or Treatment) " + "cells, which differences out group-specific and " + "period-specific unobservables without requiring separate " + "parallel trends to hold between each cell pair. The " + "identifying restriction is therefore weaker than ordinary " + "DiD parallel trends but assumes that the residual " + "unobservable component is additively separable across the " + "three dimensions; practical overlap and common-support " + "conditions still apply on the propensity score when " + "covariates are used." 
+ ), + } if estimator_name in { "CallawaySantAnnaResults", "SunAbrahamResults", @@ -620,7 +672,6 @@ def _describe_assumption(estimator_name: str) -> Dict[str, Any]: "EfficientDiDResults", "WooldridgeDiDResults", "ChaisemartinDHaultfoeuilleResults", - "StaggeredTripleDiffResults", }: return { "parallel_trends_variant": "conditional_or_group_time", diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 622938aa..bdd63e83 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -606,6 +606,127 @@ def test_finite_df_fit_preserves_fitted_ci_on_alpha_mismatch(self): assert "alpha_override_preserved" in caveat_topics +class TestWildBootstrapAlphaOverride: + """Regression for the round-4 P0 finding that ``inference='wild_bootstrap'`` + results were falling through to a normal-approximation recomputation.""" + + def test_wild_bootstrap_preserves_fitted_ci(self): + class _WildBootstrapStub: + def __init__(self): + self.att = 1.0 + self.se = 0.5 + self.p_value = 0.04 + # 95% CI produced by the wild cluster bootstrap surface. + self.conf_int = (0.10, 1.90) + self.alpha = 0.05 + self.n_obs = 100 + self.n_treated = 40 + self.n_control = 60 + self.inference_method = "wild_bootstrap" + self.survey_metadata = None + # Wild-boot fits don't necessarily carry a raw distribution; + # the inference_method string alone must be enough. + self.bootstrap_distribution = None + + stub = _WildBootstrapStub() + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + assert h["ci_level"] == 95, ( + "Wild cluster bootstrap must preserve fitted CI level on alpha " + f"mismatch; got {h['ci_level']}" + ) + assert h["ci_lower"] == pytest.approx(0.10) + assert h["ci_upper"] == pytest.approx(1.90) + caveats = br.caveats() + assert any(c.get("topic") == "alpha_override_preserved" for c in caveats) + # Caveat message should call out wild cluster bootstrap specifically. 
+ preserved_msg = next( + c["message"] for c in caveats if c.get("topic") == "alpha_override_preserved" + ) + assert "wild cluster bootstrap" in preserved_msg + + +class TestAssumptionBlockSourceFaithful: + """Regression for the round-4 P1 finding that ``_describe_assumption`` + was producing generic DiD PT text for ContinuousDiD, TripleDifference, + and StaggeredTripleDifference — all of which have different identifying + logic per the Methodology Registry.""" + + def _stub(self, class_name): + cls = type(class_name, (), {}) + obj = cls() + obj.att = 1.0 + obj.se = 0.1 + obj.p_value = 0.001 + obj.conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 40 + obj.n_control = 60 + obj.survey_metadata = None + obj.event_study_effects = None + obj.inference_method = "analytical" + return obj + + def test_continuous_did_assumption_uses_two_level_pt(self): + br = BusinessReport(self._stub("ContinuousDiDResults"), auto_diagnostics=False) + assumption = br.to_dict()["assumption"] + assert assumption["parallel_trends_variant"] == "dose_pt_or_strong_pt" + desc = assumption["description"] + # Registry-backed language: PT vs Strong PT + ACRT mention. 
+ assert "Strong Parallel Trends" in desc or "SPT" in desc + assert "ATT(d" in desc or "ACRT" in desc + assert "Callaway" in desc # attribution to CGBS 2024 + + def test_triple_difference_assumption_uses_ddd_decomposition(self): + class TripleDifferenceResults: + pass + + obj = TripleDifferenceResults() + obj.att = 1.0 + obj.se = 0.1 + obj.p_value = 0.001 + obj.conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 40 + obj.n_control = 60 + obj.survey_metadata = None + obj.inference_method = "analytical" + + br = BusinessReport(obj, auto_diagnostics=False) + assumption = br.to_dict()["assumption"] + assert assumption["parallel_trends_variant"] == "triple_difference_cancellation" + desc = assumption["description"] + assert "DDD" in desc + assert "Ortiz-Villavicencio" in desc or "2025" in desc + + def test_staggered_triple_diff_assumption_uses_ddd_not_generic_pt(self): + class StaggeredTripleDiffResults: + pass + + obj = StaggeredTripleDiffResults() + obj.overall_att = 1.0 + obj.overall_se = 0.1 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 40 + obj.n_control = 60 + obj.survey_metadata = None + obj.event_study_effects = None + obj.inference_method = "analytical" + + br = BusinessReport(obj, auto_diagnostics=False) + assumption = br.to_dict()["assumption"] + assert assumption["parallel_trends_variant"] == "triple_difference_cancellation" + desc = assumption["description"] + assert "triple-difference" in desc.lower() or "DDD" in desc + # Must NOT be the generic group-time PT text. + assert "group-time ATT" not in desc + + class TestFullReportSingleM: """Regression: ``full_report()`` must not claim full-grid robustness for a single-M HonestDiDResults passthrough. 
The summary path was fixed earlier; From 311a7bec980510beb3dd315ff2c1881a396c8079 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 17:40:18 -0400 Subject: [PATCH 08/48] Address fifth round of CI review findings on PR #318 - P0: ``_extract_headline`` now detects ``bootstrap_results is not None`` and ``n_bootstrap > 0`` in addition to ``inference_method`` / ``bootstrap_distribution`` / ``variance_method`` / ``df_survey``. Many staggered / continuous / dCDH result classes copy bootstrap- derived se/p/conf_int into their top-level fields without advertising ``inference_method``; alpha override must preserve their fitted CI rather than silently swapping in a normal-approximation interval. - P1: ``DiagnosticReport._check_sensitivity`` wraps the HonestDiD call in ``warnings.catch_warnings(record=True)`` and propagates captured messages onto the returned section dict. ``run_all`` aggregates per-section warnings into the top-level ``warnings`` list so both DR and BR surface them. CallawaySantAnna fits with ``base_period='varying'`` are preemptively skipped at the applicability gate with a methodology-critical reason, since HonestDiD explicitly warns those bounds are not valid for interpretation. BR renders the skip as a warning-severity caveat under a new ``sensitivity_skipped`` topic. - P1: ``_describe_assumption`` now gives ``ChaisemartinDHaultfoeuilleResults`` a source-backed description of transition-based identification (joiners / leavers / stable-control transitions, DID_M / DID_l building blocks, non-binary dose matching, reversible treatment) rather than generic group-time ATT PT text. - P2: README example now uses ``CallawaySantAnna(base_period='universal')`` so the advertised one-call sensitivity path actually runs. Both ``cs_fit`` fixtures updated likewise. - Regressions: ``TestBootstrapResultsAndNBootstrapDetection`` (four cases incl. 
dCDH-shaped stub and the analytic zero-bootstrap guard), ``TestDCDHAssumptionTransitionBased`` (source-faithful language assertions), ``TestCSVaryingBaseSensitivitySkipped`` (DR schema reason + BR caveat surfacing). 150 -> 115 targeted tests passing; black / ruff / mypy clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 2 +- diff_diff/business_report.py | 87 ++++++++++++++- diff_diff/diagnostic_report.py | 75 +++++++++++-- tests/test_business_report.py | 184 +++++++++++++++++++++++++++++++- tests/test_diagnostic_report.py | 9 +- 5 files changed, 340 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index fa04d2a7..5515c78b 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ diff-diff ships two preview classes, `BusinessReport` and `DiagnosticReport`, th ```python from diff_diff import CallawaySantAnna, BusinessReport -cs = CallawaySantAnna().fit( +cs = CallawaySantAnna(base_period="universal").fit( df, outcome="revenue", unit="store", time="month", first_treat="first_treat", aggregate="event_study", ) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 656a119c..e460cfaa 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -364,12 +364,32 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An ) variance_method = getattr(r, "variance_method", None) + # Many staggered / continuous / dCDH result classes copy + # bootstrap-derived se/p/conf_int directly into their top-level + # fields and do not advertise ``inference_method`` or + # ``bootstrap_distribution``. Instead they expose either a + # populated ``bootstrap_results`` sub-object (CS, SA, Imputation, + # TwoStage, EfficientDiD, StaggeredTripleDiff, dCDH) or an + # ``n_bootstrap`` field set > 0 (ContinuousDiD, plus the above + # when applicable). 
Treat both as bootstrap markers so an + # ``alpha`` override does not silently swap a percentile / + # multiplier-bootstrap CI for a normal-approximation one. + has_bootstrap_results = getattr(r, "bootstrap_results", None) is not None + raw_n_bootstrap = getattr(r, "n_bootstrap", 0) + has_n_bootstrap = ( + isinstance(raw_n_bootstrap, (int, float)) + and np.isfinite(raw_n_bootstrap) + and raw_n_bootstrap > 0 + ) + # Any non-analytic inference surface that stores a sampling / # resampling distribution (wild cluster bootstrap, percentile # bootstrap, jackknife, placebo) should preserve its native CI. bootstrap_like = ( inference_method in {"bootstrap", "wild_bootstrap"} or has_bootstrap_dist + or has_bootstrap_results + or has_n_bootstrap or variance_method in {"bootstrap", "jackknife", "placebo"} ) finite_df = isinstance(df_survey, (int, float)) and df_survey > 0 @@ -663,6 +683,36 @@ def _describe_assumption(estimator_name: str) -> Dict[str, Any]: "covariates are used." ), } + if estimator_name == "ChaisemartinDHaultfoeuilleResults": + # de Chaisemartin & D'Haultfoeuille (2020, 2024) — identification is + # transition-based across (joiner, leaver, stable-control) cells + # around each switching period, not a group-time ATT parallel- + # trends restriction. Writing up dCDH as "parallel trends across + # treatment cohorts" was flagged as a source-faithfulness bug in + # PR #318 review; REGISTRY.md §ChaisemartinDHaultfoeuille is + # explicit about the transition-set construction. + return { + "parallel_trends_variant": "transition_based", + "no_anticipation": True, + "description": ( + "Identification is transition-based (de Chaisemartin & " + "D'Haultfoeuille 2020; dynamic companion 2024). 
At each " + "switching period, the estimator contrasts joiners " + "(D:0->1), leavers (D:1->0), and stable-treated / " + "stable-untreated control cells that share the same " + "treatment state across adjacent periods, yielding the " + "contemporaneous ``DID_M`` and per-horizon ``DID_l`` / " + "``DID_{g,l}`` building blocks. The identifying " + "restriction is parallel trends within each transition's " + "stable-control cell (not a single group-time ATT PT " + "condition across all cohorts) plus no anticipation; " + "with non-binary treatment the stable-control match is " + "additionally on exact baseline dose ``D_{g,1}``. " + "Reversible treatment is natively supported, unlike the " + "absorbing-treatment designs that rely on a fixed " + "treatment-onset cohort." + ), + } if estimator_name in { "CallawaySantAnnaResults", "SunAbrahamResults", @@ -671,7 +721,6 @@ def _describe_assumption(estimator_name: str) -> Dict[str, Any]: "StackedDiDResults", "EfficientDiDResults", "WooldridgeDiDResults", - "ChaisemartinDHaultfoeuilleResults", }: return { "parallel_trends_variant": "conditional_or_group_time", @@ -825,6 +874,42 @@ def _build_caveats( } ) + # Sensitivity was skipped for methodology reasons (e.g., CS fit with + # ``base_period='varying'`` — HonestDiD bounds are not interpretable + # there). Surface the reason as a warning-severity caveat so readers + # do not assume the headline is robust across the R-R grid. + if sens.get("status") == "skipped": + reason = sens.get("reason") + if isinstance(reason, str) and reason: + caveats.append( + { + "severity": "warning", + "topic": "sensitivity_skipped", + "message": ("HonestDiD sensitivity was not run on this fit. " + reason), + } + ) + + # Non-fatal warnings captured from delegated diagnostics + # (e.g., HonestDiD's bootstrap diag-covariance fallback, dropped + # non-consecutive horizons on dCDH). 
DR already records these in + # ``schema["warnings"]``; mirror the methodology-critical ones + # into BR's caveat list so summary/full-report prose can surface + # them without readers having to inspect the DR schema. + for msg in dr_schema.get("warnings", []) or []: + if not isinstance(msg, str) or not msg: + continue + # Mirror only ``sensitivity:`` / ``pretrends_power:`` messages; alpha-override + # and design-effect ones are already covered by dedicated caveats above. + lower = msg.lower() + if "sensitivity:" in lower or "pretrends_power:" in lower: + caveats.append( + { + "severity": "info", + "topic": "diagnostic_warning", + "message": msg, + } + ) + # Unit mismatch caveat (log_points + unit override). unit_kind = headline.get("unit_kind") if unit_kind == "log_points": diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 1215f9df..e44bd6ab 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -500,6 +500,25 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: # Precomputed sensitivity always unlocks this check. if "sensitivity" in self._precomputed: return None + # CallawaySantAnna with ``base_period='varying'`` (the default) + # produces consecutive-comparison pre-period coefficients; + # HonestDiD explicitly warns those bounds are not valid for + # interpreted sensitivity. Skip at the applicability gate so + # BR/DR do not narrate the grid as robustness. Users opting + # in can pass ``precomputed={'sensitivity': ...}`` or re-fit + # with ``base_period='universal'``. + if name == "CallawaySantAnnaResults": + base_period = getattr(r, "base_period", "universal") + if base_period != "universal": + return ( + "HonestDiD on CallawaySantAnna requires " + "``base_period='universal'`` for valid interpretation " + "(Rambachan-Roth bounds are not comparable across the " + "consecutive pre-period comparisons produced by " + f"``base_period={base_period!r}``).
Re-fit with " + "``CallawaySantAnna(base_period='universal')`` or pass " + "``precomputed={'sensitivity': ...}`` to opt in." + ) # dCDH uses ``placebo_event_study`` as its pre-period surface, # which HonestDiD consumes via a dedicated branch. Accept the # fit when that attribute is populated. @@ -625,6 +644,21 @@ def _execute(self) -> DiagnosticReportResults: if section.get("status") == "error": reason = section.get("reason") or "diagnostic raised an exception" top_warnings.append(f"{check}: {reason}") + # Surface non-fatal warnings captured by delegated diagnostics + # (e.g., HonestDiD's "base_period='varying' is not valid for + # interpretation" on CallawaySantAnna, or the diag-covariance + # fallback on bootstrap-fitted CS). These rode up on each + # section's ``warnings`` field and must not be swallowed. + section_warnings = section.get("warnings") + if isinstance(section_warnings, (list, tuple)): + for msg in section_warnings: + if msg is None: + continue + top_warnings.append(f"{check}: {msg}") + # Some sections (e.g., sensitivity skipped for varying-base CS) + # also surface methodology-critical context via ``reason`` even + # though ``status != "error"``. We do not duplicate those here + # — the section's own status/reason is the authoritative record. schema: Dict[str, Any] = { "schema_version": DIAGNOSTIC_REPORT_SCHEMA_VERSION, @@ -994,21 +1028,36 @@ def _check_sensitivity(self) -> Dict[str, Any]: "method": "estimator_native", } + # Varying-base CS gate: handled at ``_instance_skip_reason``, so + # this code path is not reached for a varying-base CS fit unless + # the user passed ``precomputed={'sensitivity': ...}`` (handled + # above). Kept here as a comment anchor; see _instance_skip_reason. + + import warnings as _warnings + try: from typing import cast from diff_diff.honest_did import HonestDiD - # The sensitivity_method string is validated at runtime by - # HonestDiD; the Literal annotation is for static typing only. 
- honest = HonestDiD( - method=cast(Any, self._sensitivity_method), - alpha=self._alpha, - ) - sens = honest.sensitivity_analysis( - self._results, - M_grid=list(self._sensitivity_M_grid), - ) + # Capture any non-fatal UserWarnings HonestDiD emits (bootstrap + # diag-covariance fallback on CS, library-extension note on + # dCDH, dropped non-consecutive horizons, etc.) so BR/DR do not + # silently narrate sensitivity as clean when the helper + # flagged caveats. The try/except below still handles fatal + # errors; captured warnings ride on the returned dict. + with _warnings.catch_warnings(record=True) as caught: + _warnings.simplefilter("always") + # The sensitivity_method string is validated at runtime by + # HonestDiD; the Literal annotation is for static typing only. + honest = HonestDiD( + method=cast(Any, self._sensitivity_method), + alpha=self._alpha, + ) + sens = honest.sensitivity_analysis( + self._results, + M_grid=list(self._sensitivity_M_grid), + ) except Exception as exc: # noqa: BLE001 return { "status": "error", @@ -1016,7 +1065,11 @@ def _check_sensitivity(self) -> Dict[str, Any]: "reason": f"HonestDiD.sensitivity_analysis raised " f"{type(exc).__name__}: {exc}", } - return self._format_sensitivity_results(sens) + captured = [str(w.message) for w in caught if issubclass(w.category, Warning)] + formatted = self._format_sensitivity_results(sens) + if captured: + formatted["warnings"] = captured + return formatted def _format_sensitivity_results(self, sens: Any) -> Dict[str, Any]: grid = [] diff --git a/tests/test_business_report.py b/tests/test_business_report.py index bdd63e83..75a2328c 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -89,7 +89,10 @@ def event_study_fit(): @pytest.fixture(scope="module") def cs_fit(): sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) - cs = CallawaySantAnna().fit( + # base_period='universal' so DR's sensitivity check can run without + # hitting the 
round-5 methodology-critical skip (Rambachan-Roth bounds + # are not interpretable on consecutive-comparison pre-periods). + cs = CallawaySantAnna(base_period="universal").fit( sdf, outcome="outcome", unit="unit", @@ -797,3 +800,182 @@ def test_business_context_is_frozen_dataclass(): ) with pytest.raises((AttributeError, Exception)): ctx.alpha = 0.10 # type: ignore[misc] + + +class TestBootstrapResultsAndNBootstrapDetection: + """Regression for the round-5 P0 finding that ``_extract_headline`` + only preserved native CI surfaces when a result advertised + ``inference_method`` / ``bootstrap_distribution`` / ``variance_method`` + / ``df_survey``. + + Several staggered / continuous / dCDH result classes copy bootstrap- + derived se/p/conf_int into their top-level fields at fit time and + expose the bootstrap only via a ``bootstrap_results`` sub-object or + an ``n_bootstrap > 0`` attribute. An ``alpha`` override on such a + fit would silently swap a percentile/multiplier bootstrap CI for a + normal-approximation one. BR must now detect either marker and + preserve the fitted CI at its native level. + """ + + def _base_stub(self): + stub = type("Stub", (), {})() + stub.att = 1.0 + stub.se = 0.5 + stub.p_value = 0.04 + stub.conf_int = (0.05, 1.95) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + # Crucially NOT exposing inference_method / bootstrap_distribution + # / variance_method / df_survey: exactly the surface the reviewer + # flagged as silently falling through. 
+ return stub + + def test_bootstrap_results_object_alone_preserves_fit_ci(self): + stub = self._base_stub() + stub.bootstrap_results = type("BootSub", (), {"n_bootstrap": 199})() + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + assert h["ci_level"] == 95, ( + "Result carrying bootstrap_results must preserve fitted CI " + "level on alpha mismatch; got " + str(h["ci_level"]) + ) + assert h["ci_lower"] == pytest.approx(0.05) + assert h["ci_upper"] == pytest.approx(1.95) + topics = {c.get("topic") for c in br.caveats()} + assert "alpha_override_preserved" in topics + + def test_n_bootstrap_positive_alone_preserves_fit_ci(self): + """ContinuousDiDResults-style: ``n_bootstrap`` field, no bootstrap_results.""" + stub = self._base_stub() + stub.n_bootstrap = 499 + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + assert h["ci_level"] == 95 + assert h["ci_lower"] == pytest.approx(0.05) + assert h["ci_upper"] == pytest.approx(1.95) + topics = {c.get("topic") for c in br.caveats()} + assert "alpha_override_preserved" in topics + + def test_n_bootstrap_zero_does_not_trigger_preserve_path(self): + """Analytic fits with ``n_bootstrap = 0`` must still honor alpha.""" + stub = self._base_stub() + stub.n_bootstrap = 0 + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + # Analytic — alpha honored, CI recomputed to 90%. + assert h["ci_level"] == 90 + + def test_dcdh_shaped_bootstrap_stub_preserves_fit_ci(self): + """dCDH copies bootstrap se/p/conf_int into top-level fields without + ``inference_method``. 
The reviewer called this out specifically.""" + + class ChaisemartinDHaultfoeuilleResults: # name-keyed dispatch + pass + + stub = ChaisemartinDHaultfoeuilleResults() + stub.att = 1.5 + stub.se = 0.4 + stub.p_value = 0.02 + stub.conf_int = (0.72, 2.28) + stub.alpha = 0.05 + stub.n_obs = 200 + stub.n_treated = 80 + stub.n_control = 120 + stub.survey_metadata = None + stub.event_study_effects = None + stub.placebo_event_study = None + # dCDH carries bootstrap via a sub-object; top-level fields are + # the bootstrap-derived values, not analytic. + stub.bootstrap_results = type("DCDHBoot", (), {"n_bootstrap": 499})() + + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + assert h["ci_level"] == 95 + assert h["ci_lower"] == pytest.approx(0.72) + assert h["ci_upper"] == pytest.approx(2.28) + + +class TestDCDHAssumptionTransitionBased: + """Regression for the round-5 P1 finding that + ``ChaisemartinDHaultfoeuilleResults`` was narrated with generic group- + time PT text instead of source-backed transition-based identification. + """ + + def test_dcdh_uses_transition_based_language(self): + class ChaisemartinDHaultfoeuilleResults: + pass + + obj = ChaisemartinDHaultfoeuilleResults() + obj.att = 1.0 + obj.se = 0.1 + obj.p_value = 0.001 + obj.conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 40 + obj.n_control = 60 + obj.survey_metadata = None + obj.event_study_effects = None + obj.placebo_event_study = None + obj.inference_method = "analytical" + + br = BusinessReport(obj, auto_diagnostics=False) + assumption = br.to_dict()["assumption"] + assert assumption["parallel_trends_variant"] == "transition_based" + desc = assumption["description"] + # Source-faithful: joiners/leavers/stable-control, dCDH paper attribution. + assert "joiner" in desc.lower() + assert "leaver" in desc.lower() + assert "Chaisemartin" in desc or "D'Haultfoeuille" in desc + # Must NOT open with the generic group-time PT framing. 
The text + # may reference it inside a contrast clause ("not a single + # group-time ATT PT"), which is fine and intended. + assert not desc.startswith("Identification relies on parallel trends") + + +class TestCSVaryingBaseSensitivitySkipped: + """Regression for the round-5 P1 finding that DR would narrate HonestDiD + bounds as robust sensitivity for a CallawaySantAnna fit with + ``base_period='varying'`` (the CS default). The HonestDiD helper + explicitly warns that those bounds are not valid for interpretation; + DR must preemptively skip and surface the reason.""" + + def test_cs_varying_base_skips_sensitivity_with_reason(self): + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.3 + stub.overall_p_value = 0.01 + stub.overall_conf_int = (0.4, 1.6) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.event_study_effects = None + stub.event_study_vcov = None + stub.event_study_vcov_index = None + stub.vcov = None + stub.interaction_indices = None + stub.base_period = "varying" + stub.inference_method = "analytical" + + from diff_diff import DiagnosticReport + + dr = DiagnosticReport(stub).run_all() + sens = dr.schema["sensitivity"] + assert sens["status"] == "skipped" + reason = sens["reason"] + assert "base_period" in reason and "universal" in reason + # And BR must surface this as a warning-severity caveat. 
+ br = BusinessReport(stub, diagnostics=dr) + caveats = br.caveats() + topics = {c.get("topic") for c in caveats} + assert "sensitivity_skipped" in topics, ( + "BR must surface varying-base sensitivity skip as a caveat; " f"got topics {topics}" + ) diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 99089e01..e0b19507 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -100,7 +100,12 @@ def multi_period_fit(): def cs_fit(): warnings.filterwarnings("ignore") sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) - cs = CallawaySantAnna().fit( + # Use base_period='universal' so HonestDiD sensitivity can run on this + # fixture. CS's default is 'varying', which DR now skips with a + # methodology-critical reason (Rambachan-Roth bounds are not valid for + # interpretation on consecutive-comparison pre-periods). See the + # round-5 CI review on PR #318. + cs = CallawaySantAnna(base_period="universal").fit( sdf, outcome="outcome", unit="unit", @@ -449,8 +454,6 @@ def test_sun_abraham_sensitivity_not_applicable(self): """SA is not in HonestDiD's adapter list; DR must not try to run it.""" import warnings - import pandas as pd - from diff_diff import SunAbraham, generate_staggered_data warnings.filterwarnings("ignore") From df2636bde8210935f1946f33eab78233773a8b2c Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 17:54:06 -0400 Subject: [PATCH 09/48] Address sixth round of CI review findings on PR #318 - P1 heterogeneity: ``_collect_effect_scalars`` no longer mixes pre- and post-treatment coefficients into the CV / range / sign- consistency summary. ``MultiPeriodDiDResults`` now routes through ``post_period_effects``; staggered event-study fits filter to ``rel_time >= 0`` AND exclude reference markers (``n_groups == 0`` / ``n_obs == 0``) AND exclude non-finite rows; CS ``group_time_effects`` filters to ``t >= g`` post-treatment cells. 
``_heterogeneity_source`` now names the post-only surface (e.g., ``post_period_effects`` / ``event_study_effects_post`` / ``group_time_effects_post``) so downstream tooling can verify the estimand being summarized. - P1 dCDH parallel trends: ``_collect_pre_period_coefs`` now reads ``placebo_event_study`` as the pre-period surface for ``ChaisemartinDHaultfoeuilleResults``. dCDH is advertised as PT-applicable in ``_APPLICABILITY`` but the extractor previously only looked at ``pre_period_effects`` / negative-horizon ``event_study_effects``, silently skipping the PT check on valid placebo fits. - P2: API RST examples (``docs/api/business_report.rst``, ``docs/api/diagnostic_report.rst``) updated to construct ``CallawaySantAnna(base_period="universal")`` so the advertised auto-diagnostics path runs sensitivity instead of being skipped. ``docs/methodology/REPORTING.md`` pretrends-power routing text now matches the implemented applicability matrix ({MultiPeriod, CS, SA}) rather than claiming general "event-study with vcov" applicability. - Regressions: ``TestDCDHParallelTrendsViaPlaceboEventStudy`` (two cases: runs when ``placebo_event_study`` populated, skips when missing) and ``TestHeterogeneityPostTreatmentOnly`` (extractor returns post-only scalars for MultiPeriod; event-study filter drops pre-period and reference-marker rows). 115 -> 109 targeted tests passing; black / ruff / mypy clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 163 +++++++++++++++++++++++++++----- docs/api/business_report.rst | 2 +- docs/api/diagnostic_report.rst | 2 +- docs/methodology/REPORTING.md | 14 ++- tests/test_business_report.py | 2 +- tests/test_diagnostic_report.py | 162 +++++++++++++++++++++++++++++++ 6 files changed, 315 insertions(+), 30 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index e44bd6ab..52e7698f 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1419,32 +1419,106 @@ def _trop_native(self, r: Any) -> Dict[str, Any]: # -- Heterogeneity helpers -------------------------------------------- def _collect_effect_scalars(self) -> List[float]: - """Collect scalar effect values across group / event-study / TROP sources. + """Collect scalar **post-treatment** effect values across group / event- + study / TROP sources. - Returns an empty list if no recognized effect container is present. - Never raises on unexpected shapes; unrecognized entries are skipped. + Pre-period coefficients (placebos and normalization constraints) + and synthetic reference-marker rows are explicitly excluded — + mixing them into the heterogeneity dispersion / sign-consistency + summary silently redefines the estimand, which the round-6 CI + review flagged on PR #318. + + Returns an empty list if no recognized effect container yields + any post-treatment entries. """ r = self._results - # 1. group_effects: dict keyed by cohort -> dict with 'effect' or float + # 1. group_effects: per-cohort post-treatment ATT(g) by construction. ge = getattr(r, "group_effects", None) if ge is not None: return self._scalars_from_mapping(ge) - # 2. event_study_effects: dict keyed by relative time -> dict with 'effect' + # 2. 
MultiPeriodDiDResults: use the ``post_period_effects`` property + # (post-treatment only) instead of ``period_effects`` (which mixes + # pre- and post-treatment coefficients). + ppe = getattr(r, "post_period_effects", None) + if ppe is not None: + return self._scalars_from_mapping(ppe) + # 3. event_study_effects: dict keyed by relative time -> dict with + # 'effect'. Filter to **post-treatment** horizons (rel_time >= 0), + # exclude reference markers (``n_groups == 0`` on CS/SA; + # ``n_obs == 0`` on Stacked/TwoStage/Imputation/EfficientDiD), and + # exclude entries with non-finite effect. es = getattr(r, "event_study_effects", None) if es is not None: - return self._scalars_from_mapping(es) - # 3. TROP: treatment_effects dict keyed by (unit, time) -> float + post_only: List[float] = [] + try: + items = list(es.items()) + except Exception: # noqa: BLE001 + items = [] + for key, entry in items: + try: + rel = int(key) + except (TypeError, ValueError): + # Non-integer keys — unknown shape; skip conservatively + # rather than mixing into the dispersion summary. + continue + if rel < 0: + continue + if isinstance(entry, dict): + if entry.get("n_groups") == 0 or entry.get("n_obs") == 0: + continue + eff = _extract_scalar_effect(entry) + if eff is None or not np.isfinite(eff): + continue + post_only.append(eff) + return post_only + # 4. TROP: treatment_effects dict keyed by (unit, time) -> float. + # TROP produces counterfactual deltas only at observed points for + # treated units (the factor-model construction), so these are + # post-treatment by design. te = getattr(r, "treatment_effects", None) if te is not None: return self._scalars_from_mapping(te) - # 4. CS default: group_time_effects dict keyed by (g, t) -> dict + # 5. CS default aggregation: group_time_effects dict keyed by + # (g, t) -> dict. Filter to t >= g (post-treatment cells); the + # pre-treatment cells (t < g) are identification-deviation + # placebos, not effect heterogeneity. 
gte = getattr(r, "group_time_effects", None) if gte is not None: - return self._scalars_from_mapping(gte) - # 5. MultiPeriod: period_effects dict keyed by period -> PeriodEffect - pe = getattr(r, "period_effects", None) - if pe is not None: - return self._scalars_from_mapping(pe) + post_cells: List[float] = [] + try: + items = list(gte.items()) + except Exception: # noqa: BLE001 + items = [] + for key, entry in items: + g_t = None + if isinstance(key, tuple) and len(key) == 2: + g_t = key + else: + g_val = ( + getattr(entry, "group", None) + if not isinstance(entry, dict) + else entry.get("group") + ) + t_val = ( + getattr(entry, "time", None) + if not isinstance(entry, dict) + else entry.get("time") + ) + if g_val is not None and t_val is not None: + g_t = (g_val, t_val) + if g_t is not None: + try: + g_num = float(g_t[0]) + t_num = float(g_t[1]) + if t_num < g_num: + continue + except (TypeError, ValueError): + pass + eff = _extract_scalar_effect(entry) + if eff is None or not np.isfinite(eff): + continue + post_cells.append(eff) + return post_cells return [] @staticmethod @@ -1470,16 +1544,25 @@ def _scalars_from_mapping(mapping: Any) -> List[float]: return out def _heterogeneity_source(self) -> str: - """Name the attribute that produced the scalars (for the schema).""" - for attr in ( - "group_effects", - "event_study_effects", - "treatment_effects", - "group_time_effects", - "period_effects", - ): - if getattr(self._results, attr, None) is not None: - return attr + """Name the attribute that produced the scalars (for the schema). + + Mirrors the dispatch order in ``_collect_effect_scalars`` and + reports the actual post-treatment surface consumed (e.g., + ``post_period_effects`` rather than ``period_effects`` on + ``MultiPeriodDiDResults``, and ``event_study_effects_post`` to + make it clear pre-period / reference-marker rows were filtered). 
+ """ + r = self._results + if getattr(r, "group_effects", None) is not None: + return "group_effects" + if getattr(r, "post_period_effects", None) is not None: + return "post_period_effects" + if getattr(r, "event_study_effects", None) is not None: + return "event_study_effects_post" + if getattr(r, "treatment_effects", None) is not None: + return "treatment_effects" + if getattr(r, "group_time_effects", None) is not None: + return "group_time_effects_post" return "unknown" def _pt_hausman(self) -> Dict[str, Any]: @@ -1745,11 +1828,17 @@ def _power_tier(ratio: Optional[float]) -> str: def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Optional[float]]]: """Return a sorted list of ``(key, effect, se, p_value)`` for pre-period coefficients. - Handles two shapes: + Handles three shapes: * ``pre_period_effects``: dict-of-``PeriodEffect`` on ``MultiPeriodDiDResults``. * ``event_study_effects``: dict-of-dict (with ``effect`` / ``se`` / ``p_value`` keys) on the staggered estimators (CS / SA / ImputationDiD / Stacked / EDiD / etc.). Pre-period entries are those with negative relative-time keys. + * ``placebo_event_study``: dict-of-dict on + ``ChaisemartinDHaultfoeuilleResults`` — dCDH's dynamic placebos + ``DID^{pl}_l`` are the estimator's pre-period analogue (the + Rambachan-Roth machinery in ``honest_did.py`` consumes them via a + dedicated branch, and this diagnostic must match). Keys are + negative horizons; entries share the event-study dict shape. Filtering rules (critical for methodology-safe PT tests): @@ -1762,10 +1851,16 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt excluded. A NaN SE means inference is undefined — feeding it into Bonferroni or Wald would produce a false-clean PT verdict. - Returns an empty list when neither source provides valid pre-period entries. + Returns an empty list when none of the three sources provides valid + pre-period entries. 
""" results_list: List[Tuple[Any, float, float, Optional[float]]] = [] pre = getattr(results, "pre_period_effects", None) + # dCDH exposes pre-period placebos via ``placebo_event_study``; the + # round-6 CI review flagged that routing dCDH through the generic + # ``event_study_effects`` path produced empty pre-coef lists and + # silently skipped the PT check. + dcdh_placebo = getattr(results, "placebo_event_study", None) if pre: for k, pe in pre.items(): eff = getattr(pe, "effect", None) @@ -1781,6 +1876,24 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt if not (np.isfinite(eff_f) and np.isfinite(se_f)): continue results_list.append((k, eff_f, se_f, _to_python_float(p))) + elif dcdh_placebo: + # dCDH placebo horizons are the pre-period surface. + for k, entry in dcdh_placebo.items(): + if not isinstance(entry, dict): + continue + eff = entry.get("effect") + se = entry.get("se") + p = entry.get("p_value") + if eff is None or se is None: + continue + try: + eff_f = float(eff) + se_f = float(se) + except (TypeError, ValueError): + continue + if not (np.isfinite(eff_f) and np.isfinite(se_f)): + continue + results_list.append((k, eff_f, se_f, _to_python_float(p))) else: es = getattr(results, "event_study_effects", None) or {} for k, entry in es.items(): diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst index 59a4306c..aa12f5f4 100644 --- a/docs/api/business_report.rst +++ b/docs/api/business_report.rst @@ -25,7 +25,7 @@ Example from diff_diff import CallawaySantAnna, BusinessReport - cs = CallawaySantAnna().fit( + cs = CallawaySantAnna(base_period="universal").fit( df, outcome="revenue", unit="store", time="period", first_treat="first_treat", aggregate="event_study", ) diff --git a/docs/api/diagnostic_report.rst b/docs/api/diagnostic_report.rst index fc66d110..c795c0c1 100644 --- a/docs/api/diagnostic_report.rst +++ b/docs/api/diagnostic_report.rst @@ -22,7 +22,7 @@ Example from diff_diff import 
CallawaySantAnna, DiagnosticReport - cs = CallawaySantAnna().fit( + cs = CallawaySantAnna(base_period="universal").fit( df, outcome="outcome", unit="unit", time="period", first_treat="first_treat", aggregate="event_study", ) diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index d095969c..d9294970 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -75,8 +75,18 @@ result or computed by an existing diff-diff utility - **Note:** Power-aware phrasing for `no_detected_violation`. DR calls `compute_pretrends_power(results, violation_type='linear', - alpha=alpha, target_power=0.80)` whenever the result has an - event-study surface with a `vcov`. BusinessReport then reads + alpha=alpha, target_power=0.80)` for the estimator families that + ship a `compute_pretrends_power` adapter: `MultiPeriodDiDResults`, + `CallawaySantAnnaResults`, and `SunAbrahamResults` (see + `_APPLICABILITY["pretrends_power"]` in + `diff_diff/diagnostic_report.py`). Other staggered families with + event-study output (`ImputationDiDResults`, `TwoStageDiDResults`, + `StackedDiDResults`, `EfficientDiDResults`, + `StaggeredTripleDiffResults`, `WooldridgeDiDResults`, + `ChaisemartinDHaultfoeuilleResults`) do not yet have a power + adapter and therefore render the `no_detected_violation` tier as + `underpowered` with the fallback reason recorded in + `schema["pre_trends"]["power_status"]`. 
BusinessReport then reads `mdv_share_of_att = mdv / abs(att)` and selects a tier: - `< 0.25` → `well_powered` — "the test has 80% power to diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 75a2328c..9add99ef 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -28,8 +28,8 @@ import diff_diff as dd from diff_diff import ( - BusinessReport, BusinessContext, + BusinessReport, CallawaySantAnna, DiagnosticReport, DifferenceInDifferences, diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index e0b19507..a22812ac 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1037,6 +1037,168 @@ def test_run_all_is_idempotent(self, multi_period_fit): assert a is b # cached +class TestDCDHParallelTrendsViaPlaceboEventStudy: + """Regression for the round-6 P1 finding that dCDH was advertised as + PT-applicable but ``_collect_pre_period_coefs`` never read + ``placebo_event_study``, so the PT check was silently skipped even + on fits with valid placebo horizons. 
+ """ + + def _stub(self, with_placebo: bool): + class ChaisemartinDHaultfoeuilleResults: + pass + + stub = ChaisemartinDHaultfoeuilleResults() + stub.att = 1.0 + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.event_study_effects = None + if with_placebo: + stub.placebo_event_study = { + -3: { + "effect": 0.05, + "se": 0.1, + "p_value": 0.62, + "conf_int": (-0.15, 0.25), + "n_obs": 40, + }, + -2: { + "effect": -0.08, + "se": 0.09, + "p_value": 0.38, + "conf_int": (-0.26, 0.10), + "n_obs": 45, + }, + -1: { + "effect": 0.04, + "se": 0.10, + "p_value": 0.69, + "conf_int": (-0.16, 0.24), + "n_obs": 50, + }, + } + else: + stub.placebo_event_study = None + return stub + + def test_pt_check_reads_placebo_event_study(self): + stub = self._stub(with_placebo=True) + dr = DiagnosticReport(stub).run_all() + pt = dr.schema["parallel_trends"] + assert ( + pt["status"] == "ran" + ), f"dCDH PT check must run on a fit with placebo_event_study; got {pt}" + # Per-period rows should come from the placebo keys (negative horizons). 
+ per_period = pt.get("per_period") or pt.get("periods") or [] + assert per_period, "PT output must include per-period rows" + periods = [row.get("period") for row in per_period] + assert all( + isinstance(p, int) and p < 0 for p in periods + ), f"dCDH PT must use negative placebo horizons; got {periods}" + + def test_pt_check_skips_when_no_placebo_event_study(self): + stub = self._stub(with_placebo=False) + dr = DiagnosticReport(stub).run_all() + pt = dr.schema["parallel_trends"] + assert ( + pt["status"] == "skipped" + ), f"dCDH PT must skip when placebo_event_study is missing; got {pt}" + + +class TestHeterogeneityPostTreatmentOnly: + """Regression for the round-6 P1 finding that ``_check_heterogeneity`` + was mixing pre- and post-treatment coefficients into the CV / range / + sign-consistency summary. + """ + + def test_collector_prefers_post_period_effects_over_period_effects(self): + """On a MultiPeriod-shaped stub, ``_collect_effect_scalars`` must read + ``post_period_effects`` (post-treatment only), not ``period_effects`` + (which mixes pre- and post-treatment coefficients). 
If the pre-period + value leaked in, sign_consistency would flip and the range would span + a much larger interval.""" + from diff_diff.diagnostic_report import DiagnosticReport + + class MultiPeriodDiDResults: + pass + + stub = MultiPeriodDiDResults() + pe_pre = type("PeriodEffect", (), {"effect": -1.0, "se": 0.2})() + pe_post_1 = type("PeriodEffect", (), {"effect": 1.0, "se": 0.2})() + pe_post_2 = type("PeriodEffect", (), {"effect": 3.0, "se": 0.2})() + stub.period_effects = {-1: pe_pre, 0: pe_post_1, 1: pe_post_2} + stub.post_period_effects = {0: pe_post_1, 1: pe_post_2} + stub.pre_period_effects = {-1: pe_pre} + stub.avg_att = 2.0 + stub.avg_se = 0.1 + stub.avg_p_value = 0.001 + stub.avg_conf_int = (1.8, 2.2) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + + # Bypass the applicability-matrix gate by constructing the report + # object and calling the extractor directly: the fix is in the + # extractor, and MultiPeriod's applicability matrix may or may + # not include heterogeneity at any given release. + dr = DiagnosticReport(stub) + effects = sorted(dr._collect_effect_scalars()) + assert effects == [1.0, 3.0], ( + f"Extractor must return only post-treatment effects " + f"(no pre-period -1.0); got {effects}" + ) + assert dr._heterogeneity_source() == "post_period_effects" + + def test_event_study_filters_pre_period_and_reference_markers(self): + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + # Event study: pre horizons (rel<0), reference marker (n_groups=0), + # non-finite row, and two valid post rows. 
+ stub.event_study_effects = { + -2: {"effect": -3.0, "se": 0.2, "n_groups": 15}, + -1: {"effect": 0.0, "se": float("nan"), "n_groups": 0}, # reference marker + 0: {"effect": 1.0, "se": 0.2, "n_groups": 15}, + 1: {"effect": 2.0, "se": 0.2, "n_groups": 12}, + 2: {"effect": float("nan"), "se": 0.2, "n_groups": 5}, # non-finite + } + stub.overall_att = 1.5 + stub.overall_se = 0.1 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (1.3, 1.7) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.base_period = "universal" + + dr = DiagnosticReport( + stub, + run_parallel_trends=False, + run_sensitivity=False, + run_bacon=False, + ).run_all() + het = dr.schema["heterogeneity"] + assert het["status"] == "ran" + assert het["source"] == "event_study_effects_post" + # Only rel>=0, finite, non-reference rows: {1.0, 2.0}. + assert het["n_effects"] == 2 + assert het["min"] == pytest.approx(1.0) + assert het["max"] == pytest.approx(2.0) + assert het["sign_consistent"] is True + + # --------------------------------------------------------------------------- # Public API exposure # --------------------------------------------------------------------------- From d8fa66d094ec8a193f67fc3907b84f282842a28d Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:19:03 -0400 Subject: [PATCH 10/48] Address seventh round of CI review findings on PR #318 - P0 alpha override: ``BusinessReport._extract_headline`` no longer recomputes a normal-z CI when the caller's ``alpha`` differs from the fit's native alpha. Recomputing via ``safe_inference(att, se, alpha)`` silently swapped t-based inference (``DifferenceInDifferences`` via ``LinearRegression.get_inference()``, ``MultiPeriodDiD`` via ``safe_inference(..., df=df)``, TROP's ``df_trop``) for a normal-z CI, and could invent a finite CI on undefined-df (replicate- survey ``df_survey=0``) fits whose native inference was NaN. 
The fitted CI is now preserved at its native level on any alpha mismatch; a ``display_alpha`` / ``phrasing_alpha`` split keeps the CI level at the fit's native level while significance phrasing (``is_significant``, ``near_significance_threshold``) uses the caller's requested alpha. The inference label in the override caveat now distinguishes bootstrap, wild bootstrap, jackknife / placebo, finite-df survey, undefined-df replicate, and ordinary analytical (native degrees of freedom). - P2 schema mislabel: ``DiagnosticReport`` pretrends-power section renames ``power_at_M_1`` to ``power_at_violation_magnitude`` and adds an explicit ``violation_magnitude`` field. The underlying ``PreTrendsPowerResults.power`` is power at ``violation_magnitude`` (which defaults to the MDV when the caller passes ``M=None``), not power at ``M=1.0`` as the prior label implied. - Test updates: the round-2 ``test_ci_bounds_recomputed_when_alpha_differs_from_result`` assumed recomputation was the correct behavior; renamed to ``test_alpha_mismatch_preserves_fitted_ci_at_native_level`` and inverted the bounds expectations. ``test_alpha_drives_ci_level`` narrowed to the equal-alpha case. ``test_n_bootstrap_zero_does_not_trigger_preserve_path`` replaced by ``test_n_bootstrap_zero_still_preserves_on_alpha_mismatch``. New ``TestAnalyticalFiniteDfAlphaOverride`` pins the P0 fix on real ``DifferenceInDifferences`` / ``MultiPeriodDiD`` fits and on a ``df_survey=0`` stub (NaN CI must propagate). 112 targeted tests passing; black / ruff / mypy clean.
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 154 +++++++++++++++------------------ diff_diff/diagnostic_report.py | 11 ++- tests/test_business_report.py | 139 +++++++++++++++++++++++++---- 3 files changed, 202 insertions(+), 102 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index e460cfaa..65bb2914 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -330,23 +330,30 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An if extracted is not None: _name, att, se, p, ci, result_alpha = extracted - # If the caller asked for a different alpha than the result was fit - # at, the displayed CI needs to match the label. Naive recomputation - # via ``safe_inference(att, se, alpha=alpha)`` would use a normal - # distribution with no df, which silently discards finite-df / - # bootstrap / percentile inference contracts used by TROP, - # ContinuousDiD, dCDH-bootstrap, survey fits, etc. Rules: - # 1. If the result has an analytic inference contract we can - # reproduce (no bootstrap distribution, no finite df we don't - # know about), recompute via ``safe_inference`` — this covers - # the common case of normal-approximation CIs. - # 2. Otherwise (bootstrap / percentile / finite-df / survey d.f.), - # preserve the fitted CI and its native level so the displayed - # interval keeps matching the stored p-value and inference - # contract. The ``ci_level`` field will reflect the result's - # own alpha, and a caveat is appended below noting that the - # caller's alpha drives phrasing but the native interval is - # shown. 
+ # If the caller asked for a different alpha than the result was + # fit at, we cannot generally recompute a faithful CI from + # ``(att, se)`` alone: that would require reproducing the fit's + # exact inference contract (t-quantile with the fit's ``df``, + # bootstrap percentile, wild cluster bootstrap, survey replicate + # quantile, rank-deficient d.f.-undefined, etc.), none of which + # are exposed as a uniform descriptor across the 16 result + # classes. Recomputing via ``safe_inference(att, se, alpha)`` + # silently substitutes a normal-z CI even for analytical t-based + # fits (``DifferenceInDifferences`` via + # ``LinearRegression.get_inference()`` with a finite ``df``, + # ``MultiPeriodDiD`` via ``safe_inference(..., df=df)``, TROP's + # ``df_trop``) and can invent a finite interval where the native + # fit deliberately returned NaN (replicate-survey fits with + # ``df=0``). Both were flagged as P0 by the round-7 CI review. + # + # Rule: always preserve the fitted CI on an alpha mismatch. + # ``display_alpha`` drives ``ci_level`` (so the displayed CI + # label matches the preserved bounds) while + # ``self._context.alpha`` — the caller's requested alpha — drives + # the significance phrasing (``is_significant`` / + # ``near_threshold``). A caveat records the override. 
+ display_alpha = alpha + phrasing_alpha = alpha alpha_was_honored = True alpha_override_caveat: Optional[str] = None if ( @@ -354,71 +361,48 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An and not np.isclose(alpha, result_alpha) and att is not None and se is not None - and np.isfinite(att) - and np.isfinite(se) ): inference_method = getattr(r, "inference_method", "analytical") - has_bootstrap_dist = getattr(r, "bootstrap_distribution", None) is not None - df_survey = getattr( - r, "df_survey", getattr(getattr(r, "survey_metadata", None), "df_survey", None) - ) - variance_method = getattr(r, "variance_method", None) - - # Many staggered / continuous / dCDH result classes copy - # bootstrap-derived se/p/conf_int directly into their top-level - # fields and do not advertise ``inference_method`` or - # ``bootstrap_distribution``. Instead they expose either a - # populated ``bootstrap_results`` sub-object (CS, SA, Imputation, - # TwoStage, EfficientDiD, StaggeredTripleDiff, dCDH) or an - # ``n_bootstrap`` field set > 0 (ContinuousDiD, plus the above - # when applicable). Treat both as bootstrap markers so an - # ``alpha`` override does not silently swap a percentile / - # multiplier-bootstrap CI for a normal-approximation one. - has_bootstrap_results = getattr(r, "bootstrap_results", None) is not None - raw_n_bootstrap = getattr(r, "n_bootstrap", 0) - has_n_bootstrap = ( - isinstance(raw_n_bootstrap, (int, float)) - and np.isfinite(raw_n_bootstrap) - and raw_n_bootstrap > 0 - ) - - # Any non-analytic inference surface that stores a sampling / - # resampling distribution (wild cluster bootstrap, percentile - # bootstrap, jackknife, placebo) should preserve its native CI. 
- bootstrap_like = ( - inference_method in {"bootstrap", "wild_bootstrap"} - or has_bootstrap_dist - or has_bootstrap_results - or has_n_bootstrap - or variance_method in {"bootstrap", "jackknife", "placebo"} - ) - finite_df = isinstance(df_survey, (int, float)) and df_survey > 0 - - if bootstrap_like or finite_df: - # Preserve the fitted CI at its native level. - alpha_was_honored = False - alpha = float(result_alpha) - if inference_method == "wild_bootstrap": - inference_label = "wild cluster bootstrap" - elif bootstrap_like: - inference_label = "bootstrap" - else: - inference_label = "finite-df" - alpha_override_caveat = ( - f"Requested alpha was not honored for the confidence " - f"interval because this fit uses {inference_label} " - f"inference; the displayed CI remains at the fit's " - f"native level ({int(round((1.0 - result_alpha) * 100))}%). " - f"The significance phrasing still uses the requested alpha." - ) + if inference_method == "wild_bootstrap": + inference_label = "wild cluster bootstrap" + elif ( + inference_method == "bootstrap" or getattr(r, "bootstrap_results", None) is not None + ): + inference_label = "bootstrap" + elif getattr(r, "bootstrap_distribution", None) is not None: + inference_label = "bootstrap" + elif getattr(r, "variance_method", None) in {"bootstrap", "jackknife", "placebo"}: + variance_method = getattr(r, "variance_method", None) + inference_label = f"{variance_method} variance" else: - from diff_diff.utils import safe_inference - - _t, _p, recomputed_ci = safe_inference(att, se, alpha=alpha) - if recomputed_ci is not None and all( - x is not None and np.isfinite(x) for x in recomputed_ci - ): - ci = [float(recomputed_ci[0]), float(recomputed_ci[1])] + df_survey = getattr( + r, + "df_survey", + getattr(getattr(r, "survey_metadata", None), "df_survey", None), + ) + if isinstance(df_survey, (int, float)) and df_survey > 0: + inference_label = "finite-df survey" + elif isinstance(df_survey, (int, float)) and df_survey == 0: + # 
Rank-deficient replicate design: the fit deliberately + # left inference undefined. Preserve (NaN bounds remain NaN). + inference_label = "undefined-df (replicate-weight)" + else: + # Ordinary analytical fit with a finite but unexposed + # ``df`` (``DifferenceInDifferences`` / ``MultiPeriodDiD`` + # / most staggered estimators / TROP). We cannot + # reproduce the t-quantile without the fit's ``df``. + inference_label = "analytical (native degrees of freedom)" + + display_alpha = float(result_alpha) + alpha_was_honored = False + alpha_override_caveat = ( + f"Requested alpha ({phrasing_alpha:.2f}) was not honored " + f"for the confidence interval because this fit uses " + f"{inference_label} inference; the displayed CI remains " + f"at the fit's native level " + f"({int(round((1.0 - result_alpha) * 100))}%). The " + f"significance phrasing still uses the requested alpha." + ) unit = self._context.outcome_unit unit_kind = _UNIT_KINDS.get(unit.lower() if unit else "", "unknown") @@ -433,9 +417,15 @@ def _extract_headline(self, dr_schema: Optional[Dict[str, Any]]) -> Dict[str, An ) if att is None or not np.isfinite(att): sign = "undefined" - ci_level = int(round((1.0 - alpha) * 100)) - is_significant = p is not None and np.isfinite(p) and p < alpha if p is not None else False - near_threshold = p is not None and np.isfinite(p) and (alpha - 0.01) < p < (alpha + 0.001) + ci_level = int(round((1.0 - display_alpha) * 100)) + is_significant = ( + p is not None and np.isfinite(p) and p < phrasing_alpha if p is not None else False + ) + near_threshold = ( + p is not None + and np.isfinite(p) + and (phrasing_alpha - 0.01) < p < (phrasing_alpha + 0.001) + ) # Use DR-computed breakdown_M if available for quick reference. 
breakdown_M: Optional[float] = None if dr_schema: diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 52e7698f..f20058fc 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -981,7 +981,13 @@ def _check_pretrends_power(self) -> Dict[str, Any]: "target_power": _to_python_float(getattr(pp, "target_power", 0.80)), "mdv": mdv, "mdv_share_of_att": ratio, - "power_at_M_1": _to_python_float(getattr(pp, "power", None)), + # ``PreTrendsPowerResults.power`` is the power to detect a + # violation of ``violation_magnitude`` (which defaults to the + # MDV when the caller passes ``M=None``). The round-7 CI + # review flagged the prior ``power_at_M_1`` label as a schema + # contract bug — the stored value was not power at ``M=1.0``. + "violation_magnitude": _to_python_float(getattr(pp, "violation_magnitude", None)), + "power_at_violation_magnitude": _to_python_float(getattr(pp, "power", None)), "n_pre_periods": int(getattr(pp, "n_pre_periods", 0) or 0), "tier": tier, "covariance_source": cov_source, @@ -1003,7 +1009,8 @@ def _format_precomputed_pretrends_power(self, obj: Any) -> Dict[str, Any]: "target_power": _to_python_float(getattr(obj, "target_power", 0.80)), "mdv": mdv, "mdv_share_of_att": ratio, - "power_at_M_1": _to_python_float(getattr(obj, "power", None)), + "violation_magnitude": _to_python_float(getattr(obj, "violation_magnitude", None)), + "power_at_violation_magnitude": _to_python_float(getattr(obj, "power", None)), "n_pre_periods": int(getattr(obj, "n_pre_periods", 0) or 0), "tier": _power_tier(ratio), "precomputed": True, diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 9add99ef..3dee904b 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -497,17 +497,21 @@ class _ShimMeta: # Single-knob alpha # --------------------------------------------------------------------------- class TestAlphaKnob: - def test_alpha_drives_ci_level(self, 
event_study_fit): + def test_alpha_equal_to_result_alpha_drives_ci_level(self, event_study_fit): + """When caller's alpha matches the fit's native alpha, ``ci_level`` + reflects that alpha (e.g., alpha=0.05 -> 95% CI).""" fit, _ = event_study_fit - br90 = BusinessReport(fit, alpha=0.10, auto_diagnostics=False) - br95 = BusinessReport(fit, alpha=0.05, auto_diagnostics=False) - assert br90.to_dict()["headline"]["ci_level"] == 90 - assert br95.to_dict()["headline"]["ci_level"] == 95 - - def test_ci_bounds_recomputed_when_alpha_differs_from_result(self, event_study_fit): - """Regression for the P0 CI-label bug: when alpha != results.alpha, - the displayed interval must be recomputed from (att, se) rather than - the stored interval being relabeled to the caller's alpha.""" + br = BusinessReport(fit, alpha=0.05, auto_diagnostics=False) + assert br.to_dict()["headline"]["ci_level"] == 95 + + def test_alpha_mismatch_preserves_fitted_ci_at_native_level(self, event_study_fit): + """Round-7 regression: a caller alpha that differs from the fit's + native alpha must NOT recompute a z-based CI (the fit used t-based + inference with a finite ``df`` that BR cannot reproduce from + ``(att, se)`` alone). The displayed CI stays at the fit's native + level, while significance phrasing uses the caller's alpha. A + caveat records the override. + """ import math fit, _ = event_study_fit @@ -516,11 +520,19 @@ def test_ci_bounds_recomputed_when_alpha_differs_from_result(self, event_study_f h95 = br95.to_dict()["headline"] h90 = br90.to_dict()["headline"] if h95["effect"] is not None and math.isfinite(h95["effect"]): - # 90% bounds must be strictly inside 95% bounds. - assert h90["ci_lower"] > h95["ci_lower"] + 1e-9 - assert h90["ci_upper"] < h95["ci_upper"] - 1e-9 + # Bounds must match between the two: the alpha=0.10 call + # preserves the fit's 95% CI rather than recomputing a 90% z-CI. 
+ assert h90["ci_lower"] == pytest.approx(h95["ci_lower"]) + assert h90["ci_upper"] == pytest.approx(h95["ci_upper"]) + # ``ci_level`` stays at the fit's native level in both cases. assert h95["ci_level"] == 95 - assert h90["ci_level"] == 90 + assert h90["ci_level"] == 95 + # Override is surfaced as an info-level caveat. + topics = {c.get("topic") for c in br90.caveats()} + assert "alpha_override_preserved" in topics, ( + "Alpha mismatch must surface a caveat documenting the preserved " + "native CI level; topics seen: " + str(topics) + ) class TestAlphaOverrideBootstrapAndFiniteDF: @@ -859,14 +871,22 @@ def test_n_bootstrap_positive_alone_preserves_fit_ci(self): topics = {c.get("topic") for c in br.caveats()} assert "alpha_override_preserved" in topics - def test_n_bootstrap_zero_does_not_trigger_preserve_path(self): - """Analytic fits with ``n_bootstrap = 0`` must still honor alpha.""" + def test_n_bootstrap_zero_still_preserves_on_alpha_mismatch(self): + """Analytic fits (``n_bootstrap = 0``) also preserve the fitted CI + on alpha mismatch — BR cannot reproduce a ``DiDResults`` / + ``MultiPeriodDiDResults`` / TROP t-quantile CI without the fit's + finite ``df``, which is not exposed uniformly. Round-7 regression. + """ stub = self._base_stub() stub.n_bootstrap = 0 br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) h = br.to_dict()["headline"] - # Analytic — alpha honored, CI recomputed to 90%. - assert h["ci_level"] == 90 + # Analytic fit's native 95% CI is preserved at 95% on 90% override. 
+ assert h["ci_level"] == 95 + assert h["ci_lower"] == pytest.approx(0.05) + assert h["ci_upper"] == pytest.approx(1.95) + topics = {c.get("topic") for c in br.caveats()} + assert "alpha_override_preserved" in topics def test_dcdh_shaped_bootstrap_stub_preserves_fit_ci(self): """dCDH copies bootstrap se/p/conf_int into top-level fields without @@ -898,6 +918,89 @@ class ChaisemartinDHaultfoeuilleResults: # name-keyed dispatch assert h["ci_upper"] == pytest.approx(2.28) +class TestAnalyticalFiniteDfAlphaOverride: + """Round-7 regressions for the P0 finding that + ``_extract_headline`` was recomputing a normal-z CI on alpha + mismatch for analytical fits whose native inference used a finite + ``df`` (``DifferenceInDifferences`` / ``MultiPeriodDiD`` / TROP) + that BR cannot reproduce from ``(att, se)`` alone. The fix is to + always preserve the fitted CI on alpha mismatch. + """ + + def test_analytical_did_result_preserves_native_ci(self): + from diff_diff import DifferenceInDifferences, generate_did_data + + df = generate_did_data(n_units=80, n_periods=4, treatment_effect=1.5, seed=7) + fit = DifferenceInDifferences().fit(df, outcome="outcome", treatment="treated", time="post") + native_lo, native_hi = fit.conf_int + + br = BusinessReport(fit, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + # Native 95% CI preserved — no z-based recomputation. 
+ assert h["ci_level"] == 95 + assert h["ci_lower"] == pytest.approx(native_lo) + assert h["ci_upper"] == pytest.approx(native_hi) + topics = {c.get("topic") for c in br.caveats()} + assert "alpha_override_preserved" in topics + + def test_multiperiod_preserves_native_ci_on_alpha_override(self): + from diff_diff import MultiPeriodDiD, generate_did_data + + df = generate_did_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + fit = MultiPeriodDiD().fit( + df, + outcome="outcome", + treatment="treated", + time="period", + unit="unit", + reference_period=3, + ) + native_lo, native_hi = fit.avg_conf_int + + br = BusinessReport(fit, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + assert h["ci_level"] == 95 + assert h["ci_lower"] == pytest.approx(native_lo) + assert h["ci_upper"] == pytest.approx(native_hi) + + def test_undefined_df_survey_stub_does_not_invent_finite_ci(self): + """When the fit's native inference returned NaN (rank-deficient + replicate design: ``df_survey = 0``), BR must not recompute a + finite interval — the NaN signal must propagate through.""" + from types import SimpleNamespace + + class _UndefinedDfStub: + pass + + stub = _UndefinedDfStub() + stub.att = 1.0 + stub.se = float("nan") + stub.p_value = float("nan") + stub.conf_int = (float("nan"), float("nan")) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.inference_method = "analytical" + stub.survey_metadata = SimpleNamespace( + weight_type="replicate", + replicate_method="JK1", + effective_n=80.0, + design_effect=1.25, + sum_weights=100.0, + n_strata=None, + n_psu=None, + df_survey=0, + ) + + br = BusinessReport(stub, alpha=0.10, auto_diagnostics=False) + h = br.to_dict()["headline"] + # NaN bounds must propagate — BR must not invent a finite CI. 
+ lo, hi = h["ci_lower"], h["ci_upper"] + assert lo is None or not np.isfinite(lo), f"ci_lower should be NaN/None, got {lo}" + assert hi is None or not np.isfinite(hi), f"ci_upper should be NaN/None, got {hi}" + + class TestDCDHAssumptionTransitionBased: """Regression for the round-5 P1 finding that ``ChaisemartinDHaultfoeuilleResults`` was narrated with generic group- From 7b5c0ad2d1cbcac47d331372054077ead41e5c33 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:48:02 -0400 Subject: [PATCH 11/48] Address eighth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 8 flagged one legitimate P1 (method-aware PT prose + EfficientDiD assumption). The round-8 review's P0 alpha override, P2 power_at_M_1 rename, and P2 analytical-fit regression findings are regressions on its own round-7 assessment — all three were addressed in commit d8fa66d and remain in place at HEAD: ``safe_inference(att, se, alpha)`` is gone from ``_extract_headline``, ``power_at_M_1`` is renamed to ``power_at_violation_magnitude`` with ``violation_magnitude`` exposed, and ``test_n_bootstrap_zero_does_not_trigger_preserve_path`` was replaced by ``test_n_bootstrap_zero_still_preserves_on_alpha_mismatch`` plus the new ``TestAnalyticalFiniteDfAlphaOverride`` suite with real ``DiDResults`` / ``MultiPeriodDiD`` fits and an undefined-d.f. replicate-survey stub. The static reviewer appears to have described the pre-round-7 state of those paths; grep confirms the fixes are present at this SHA. Legitimate round-8 P1 fix: - ``BusinessReport._describe_assumption`` now has an ``EfficientDiDResults``-specific branch that reads ``results.pt_assumption`` (``"all"`` vs ``"post"``) and, when present, ``results.control_group``. PT-All surfaces Lemma 2.1 / over-identification + Hausman pretest language; PT-Post surfaces Corollary 3.2 / just-identified single-baseline DiD language. 
EfficientDiD is pulled out of the generic group-time ATT frozenset per REGISTRY.md §EfficientDiD lines 736-738 and 907. - BR summary and DR ``_render_overall_interpretation`` PT sentences now branch on ``parallel_trends.method``. New ``_pt_method_subject`` / ``_pt_subject_phrase`` helpers return source-faithful subjects ("The pre-period slope-difference test" for ``slope_difference``, "The Hausman PT-All vs PT-Post pretest" for ``hausman``, "Pre-treatment event-study coefficients" for Wald / Bonferroni paths, "The synthetic-control pre-treatment fit" for SDiD, "The factor-model pre-treatment fit" for TROP). A matching ``_pt_method_stat_label`` emits ``joint p`` vs ``p`` so single-statistic tests (slope_difference, hausman) are not mis-labeled with a joint-Wald style statistic label. - Regressions: ``TestEfficientDiDAssumptionPtAllPtPost`` (three cases: PT-All, PT-Post, control_group passthrough) and ``TestMethodAwarePTProse`` (BR slope-difference wording on a crafted schema; DR hausman wording on a real ``EfficientDiD`` fit). New ``edid_fit`` fixture added to ``tests/test_business_report.py``. 117 targeted tests passing; black / ruff / mypy clean.
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 130 +++++++++++++++++++++++----- diff_diff/diagnostic_report.py | 81 ++++++++++++++---- tests/test_business_report.py | 151 +++++++++++++++++++++++++++++++++ 3 files changed, 324 insertions(+), 38 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 65bb2914..bdbdf769 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -264,7 +264,7 @@ def _build_schema(self) -> Dict[str, Any]: pre_trends = _lift_pre_trends(dr_schema) sensitivity = _lift_sensitivity(dr_schema) robustness = _lift_robustness(dr_schema) - assumption = _describe_assumption(estimator_name) + assumption = _describe_assumption(estimator_name, self._results) next_steps = (dr_schema or {}).get("next_steps", []) caveats = _build_caveats(self._results, headline, sample, dr_schema) references = _references_for(estimator_name) @@ -605,7 +605,7 @@ def _lift_robustness(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: } -def _describe_assumption(estimator_name: str) -> Dict[str, Any]: +def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, Any]: """Return the identifying-assumption block for an estimator.""" if estimator_name in { "SyntheticDiDResults", @@ -703,13 +703,58 @@ def _describe_assumption(estimator_name: str) -> Dict[str, Any]: "treatment-onset cohort." ), } + if estimator_name == "EfficientDiDResults": + # Chen, Sant'Anna & Xie (2025) — identification is parameterized + # by ``pt_assumption`` ("all" vs "post"). PT-All is the stronger + # regime (PT across all groups/periods, over-identified — paper + # Lemma 2.1), PT-Post the weaker (PT only in post-treatment, + # just-identified reduction to single-baseline DiD per Corollary + # 3.2). Also read ``control_group`` when present (not_yet_treated + # vs last_cohort) to be source-faithful to REGISTRY.md §EfficientDiD + # lines 736-738 and 907. 
+ pt_assumption = getattr(results, "pt_assumption", "all") + control_group = getattr(results, "control_group", None) + if pt_assumption == "post": + variant = "pt_post" + description = ( + "Identification under PT-Post (Chen, Sant'Anna & Xie " + "2025): parallel trends holds only in post-treatment " + "periods, the comparison group is never-treated, and " + "the baseline is period g-1 only. This is the weaker " + "of the two regimes — just-identified and reducing to " + "standard single-baseline DiD (Corollary 3.2). Also " + "assumes no anticipation (Assumption NA), overlap " + "(Assumption O), and absorbing / irreversible treatment." + ) + else: + variant = "pt_all" + description = ( + "Identification under PT-All (Chen, Sant'Anna & Xie " + "2025): parallel trends holds for all groups and all " + "periods, allowing any not-yet-treated cohort and any " + "pre-treatment period as baseline. The estimator is " + "over-identified (Lemma 2.1), and the paper's optimal " + "combination weights are applied. Also assumes no " + "anticipation (Assumption NA), overlap (Assumption O), " + "and absorbing / irreversible treatment. The Hausman " + "PT-All vs PT-Post pretest (operating on the post-" + "treatment event-study vector ES(e), Theorem A.1) " + "checks whether the stronger PT-All regime is tenable." + ) + block: Dict[str, Any] = { + "parallel_trends_variant": variant, + "no_anticipation": True, + "description": description, + } + if isinstance(control_group, str): + block["control_group"] = control_group + return block if estimator_name in { "CallawaySantAnnaResults", "SunAbrahamResults", "ImputationDiDResults", "TwoStageDiDResults", "StackedDiDResults", - "EfficientDiDResults", "WooldridgeDiDResults", }: return { @@ -918,6 +963,46 @@ def _build_caveats( return caveats +def _pt_method_subject(method: Optional[str]) -> str: + """Return a source-faithful sentence subject for the PT verdict prose. 
+ + The ``parallel_trends.method`` field distinguishes between the + 2x2 slope-difference check, the pre-period event-study Wald / + Bonferroni variants, EfficientDiD's Hausman PT-All vs PT-Post + pretest, SDiD's weighted pre-treatment fit, and TROP's factor- + model identification. Generic "pre-treatment event-study" wording + is wrong for the first and third cases. See round-8 CI review on + PR #318 and REGISTRY.md §EfficientDiD (Hausman pretest). + """ + if method == "slope_difference": + return "The pre-period slope-difference test" + if method == "hausman": + return "The Hausman PT-All vs PT-Post pretest" + if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + return "Pre-treatment event-study coefficients" + if method == "synthetic_fit": + return "The synthetic-control pre-treatment fit" + if method == "factor": + return "The factor-model pre-treatment fit" + return "Pre-treatment data" + + +def _pt_method_stat_label(method: Optional[str]) -> Optional[str]: + """Return the joint-statistic label appropriate to the PT method. + + Returns ``"joint p"`` for Wald / Bonferroni paths, ``"p"`` for the + 2x2 slope-difference and Hausman paths (which are single-statistic + tests), and ``None`` for design-enforced paths that have no p-value. 
+ """ + if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + return "joint p" + if method in {"slope_difference", "hausman"}: + return "p" + if method in {"synthetic_fit", "factor"}: + return None + return "joint p" + + def _references_for(estimator_name: str) -> List[Dict[str, str]]: """Map the estimator to the appropriate citation references.""" base = [ @@ -1125,6 +1210,7 @@ def _render_summary(schema: Dict[str, Any]) -> str: jp = pt.get("joint_p_value") verdict = pt.get("verdict") tier = pt.get("power_tier") + method = pt.get("method") # ``compute_pretrends_power`` currently falls back to ``np.diag(ses**2)`` # for CS / SA / ImputationDiD / Stacked / etc., even when the full # ``event_study_vcov`` is available. Downgrade any "well_powered" tier @@ -1134,41 +1220,41 @@ def _render_summary(schema: Dict[str, Any]) -> str: cov_source = pt.get("power_covariance_source") if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused": tier = "moderately_powered" + subject = _pt_method_subject(method) + stat_label = _pt_method_stat_label(method) + jp_phrase = ( + f" ({stat_label} = {jp:.3g})" if isinstance(jp, (int, float)) and stat_label else "" + ) if verdict == "clear_violation": sentences.append( - f"Pre-treatment data clearly reject parallel trends (joint " - f"p = {jp:.3g}); the headline should be treated as tentative " - f"pending the sensitivity analysis below." - if isinstance(jp, (int, float)) - else "Pre-treatment data clearly reject parallel trends; the " - "headline should be treated as tentative." + f"{subject} clearly reject parallel trends{jp_phrase}; the " + "headline should be treated as tentative pending the " + "sensitivity analysis below." ) elif verdict == "some_evidence_against": sentences.append( - f"Pre-treatment data show some evidence of diverging trends " - f"(joint p = {jp:.3g}); interpret the headline alongside the " - f"sensitivity analysis below." 
- if isinstance(jp, (int, float)) - else "Pre-treatment data show some evidence of diverging trends." + f"{subject} show some evidence against parallel trends" + f"{jp_phrase}; interpret the headline alongside the " + "sensitivity analysis below." ) elif verdict == "no_detected_violation": if tier == "well_powered": sentences.append( - "Pre-treatment data are consistent with parallel trends, " - "and the test is well-powered (the minimum-detectable " + f"{subject} are consistent with parallel trends, and " + "the test is well-powered (the minimum-detectable " "violation is small relative to the estimated effect)." ) elif tier == "moderately_powered": sentences.append( - "Pre-treatment data do not reject parallel trends; the " - "test is moderately informative. See the sensitivity " - "analysis below for bounded-violation guarantees." + f"{subject} do not reject parallel trends; the test is " + "moderately informative. See the sensitivity analysis " + "below for bounded-violation guarantees." ) else: sentences.append( - "Pre-treatment data do not reject parallel trends, but " - "the test has limited power — a non-rejection does not " - "prove the assumption. See the sensitivity analysis below." + f"{subject} do not reject parallel trends, but the test " + "has limited power — a non-rejection does not prove the " + "assumption. See the sensitivity analysis below." ) elif verdict == "design_enforced_pt": sentences.append( diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index f20058fc..59c44cfa 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -2017,6 +2017,45 @@ def _check_headline(check: str, section: Dict[str, Any]) -> Optional[Any]: return None +def _pt_subject_phrase(method: Optional[str]) -> str: + """Return a source-faithful subject for DR's PT verdict sentence. 
+ + Round-8 CI review: the generic "pre-treatment event-study + coefficients" wording mis-describes the 2x2 slope-difference check + (``method="slope_difference"``) and EfficientDiD's Hausman PT-All + vs PT-Post pretest (``method="hausman"``). See REGISTRY.md + §EfficientDiD line 907 for the Hausman test's operating vector. + """ + if method == "slope_difference": + return "The pre-period slope-difference test" + if method == "hausman": + return "The Hausman PT-All vs PT-Post pretest" + if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + return "Pre-treatment event-study coefficients" + if method == "synthetic_fit": + return "The synthetic-control pre-treatment fit" + if method == "factor": + return "The factor-model pre-treatment fit" + return "Pre-treatment data" + + +def _pt_stat_label(method: Optional[str]) -> Optional[str]: + """Label for the joint-statistic p-value in the PT prose. + + Wald / Bonferroni paths take a joint p-value (``joint p``); the 2x2 + slope-difference and Hausman paths are single-statistic tests + (``p``). Design-enforced paths return ``None`` so the sentence + omits a statistic. + """ + if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + return "joint p" + if method in {"slope_difference", "hausman"}: + return "p" + if method in {"synthetic_fit", "factor"}: + return None + return "joint p" + + def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str]) -> str: """Synthesize a plain-English paragraph across DR checks. @@ -2055,47 +2094,57 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str f"On {est}, {treatment} {direction} {outcome} by {val:.3g}{ci_str}{p_str}." 
) - # Sentence 2: parallel trends + power + # Sentence 2: parallel trends + power (method-aware prose per the + # round-8 CI review on PR #318; PT method can be slope_difference + # (2x2), joint_wald / bonferroni (event study), hausman (EfficientDiD + # PT-All vs PT-Post), synthetic_fit (SDiD), or factor (TROP), and the + # generic "event-study coefficients" wording is wrong for the + # 2x2 and Hausman paths). pt = schema.get("parallel_trends") or {} pp = schema.get("pretrends_power") or {} if pt.get("status") == "ran": verdict = pt.get("verdict") jp = pt.get("joint_p_value") - jp_str = f" (joint p = {jp:.3g})" if isinstance(jp, (int, float)) else "" + method = pt.get("method") + subject = _pt_subject_phrase(method) + stat_label = _pt_stat_label(method) + jp_str = ( + f" ({stat_label} = {jp:.3g})" if isinstance(jp, (int, float)) and stat_label else "" + ) if verdict == "clear_violation": sentences.append( - f"Pre-treatment event-study coefficients clearly reject parallel " - f"trends{jp_str}. The headline estimate should be treated as " - f"tentative pending sensitivity analysis." + f"{subject} clearly reject parallel trends{jp_str}. The " + "headline estimate should be treated as tentative pending " + "sensitivity analysis." ) elif verdict == "some_evidence_against": sentences.append( - f"Pre-treatment data show some evidence of diverging trends" + f"{subject} show some evidence against parallel trends" f"{jp_str}. Interpret the headline alongside the sensitivity " - f"analysis below." + "analysis below." ) elif verdict == "no_detected_violation": tier = pp.get("tier") if pp.get("status") == "ran" else "unknown" if tier == "well_powered": sentences.append( - f"Pre-treatment data are consistent with parallel trends" + f"{subject} are consistent with parallel trends" f"{jp_str} and the test is well-powered (MDV is a small " - f"share of the estimated effect), so a material pre-trend " - f"would likely have been detected." 
+ "share of the estimated effect), so a material pre-trend " + "would likely have been detected." ) elif tier == "moderately_powered": sentences.append( - f"Pre-treatment data do not reject parallel trends" + f"{subject} do not reject parallel trends" f"{jp_str}; the test is moderately informative. See the " - f"sensitivity analysis below for bounded-violation " - f"guarantees." + "sensitivity analysis below for bounded-violation " + "guarantees." ) else: sentences.append( - f"Pre-treatment data do not reject parallel trends" + f"{subject} do not reject parallel trends" f"{jp_str}, but the test has limited power — a non-rejection " - f"does not prove the assumption. See the HonestDiD " - f"sensitivity analysis below for a more reliable signal." + "does not prove the assumption. See the HonestDiD " + "sensitivity analysis below for a more reliable signal." ) elif verdict == "design_enforced_pt": rmse = pt.get("pre_treatment_fit_rmse") diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 3dee904b..2024afeb 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -110,6 +110,17 @@ def sdid_fit(): return sdid, fdf +@pytest.fixture(scope="module") +def edid_fit(): + from diff_diff import EfficientDiD + + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + edid = EfficientDiD().fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + return edid, sdf + + # --------------------------------------------------------------------------- # Schema contract # --------------------------------------------------------------------------- @@ -742,6 +753,146 @@ class StaggeredTripleDiffResults: assert "group-time ATT" not in desc +class TestEfficientDiDAssumptionPtAllPtPost: + """Round-8 regression: EfficientDiD has two distinct PT regimes + (PT-All and PT-Post, per Chen-Sant'Anna-Xie 2025 Corollary 3.2 and + Lemma 2.1). 
The old generic group-time PT text was source-unfaithful; + the assumption block must now read ``results.pt_assumption`` and + branch on it. + """ + + def _stub(self, pt_assumption: str, control_group: str = "not_yet_treated"): + class EfficientDiDResults: + pass + + stub = EfficientDiDResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.event_study_effects = None + stub.inference_method = "analytical" + stub.pt_assumption = pt_assumption + stub.control_group = control_group + return stub + + def test_pt_all_uses_pt_all_language(self): + br = BusinessReport(self._stub("all"), auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["parallel_trends_variant"] == "pt_all" + assert "PT-All" in a["description"] + assert "Hausman" in a["description"] + # Must NOT be the old generic group-time PT text. + assert "group-time ATT" not in a["description"] + + def test_pt_post_uses_pt_post_language(self): + br = BusinessReport(self._stub("post"), auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["parallel_trends_variant"] == "pt_post" + assert "PT-Post" in a["description"] + assert "Corollary 3.2" in a["description"] or "single-baseline" in a["description"] + + def test_control_group_is_reflected_in_block(self): + br = BusinessReport(self._stub("all", "last_cohort"), auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a.get("control_group") == "last_cohort" + + +class TestMethodAwarePTProse: + """Round-8 regression: BR and DR summary prose must branch on the + ``parallel_trends.method`` field. Generic "pre-treatment event-study + coefficients" wording is wrong for the 2x2 ``slope_difference`` path + and for EfficientDiD's ``hausman`` PT-All vs PT-Post pretest. 
+ """ + + def test_br_summary_uses_slope_difference_wording_for_simple_did(self): + """Use a stub DR schema with a known slope_difference verdict so + the test is deterministic across pre-period counts. The real + 2x2 fit can produce NaN verdicts when there is only one + pre-period, so we don't rely on a real DR here.""" + + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.0 + stub.se = 0.2 + stub.p_value = 0.001 + stub.conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.inference_method = "analytical" + + # Hand-crafted DR schema with ``method = "slope_difference"``. + from diff_diff.diagnostic_report import DiagnosticReportResults + + fake_schema = { + "schema_version": "1.0", + "estimator": "DiDResults", + "headline_metric": {"name": "att", "value": 1.0}, + "parallel_trends": { + "status": "ran", + "method": "slope_difference", + "joint_p_value": 0.40, + "verdict": "no_detected_violation", + }, + "pretrends_power": {"status": "not_applicable"}, + "sensitivity": {"status": "not_applicable"}, + "placebo": {"status": "skipped", "reason": "opt-in"}, + "bacon": {"status": "not_applicable"}, + "design_effect": {"status": "not_applicable"}, + "heterogeneity": {"status": "not_applicable"}, + "epv": {"status": "not_applicable"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "overall_interpretation": "", + "next_steps": [], + } + fake_dr_results = DiagnosticReportResults( + schema=fake_schema, + interpretation="", + applicable_checks=("parallel_trends",), + skipped_checks={}, + warnings=(), + ) + br = BusinessReport(stub, diagnostics=fake_dr_results) + summary = br.summary() + pt_method = br.to_dict()["pre_trends"].get("method") + assert pt_method == "slope_difference" + # Must NOT use the generic event-study wording. 
+ assert "event-study coefficients" not in summary + # Must use the slope-difference subject phrase. + assert "slope-difference" in summary + + def test_dr_summary_uses_hausman_wording_for_efficient_did(self, edid_fit): + from diff_diff import DiagnosticReport + + fit, sdf = edid_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + summary = dr.summary() + pt = dr.to_dict()["parallel_trends"] + # EfficientDiD's PT check routes through hausman_pretest. + assert pt.get("method") == "hausman" + # The generic event-study wording must not appear for this path. + assert "event-study coefficients" not in summary + + class TestFullReportSingleM: """Regression: ``full_report()`` must not claim full-grid robustness for a single-M HonestDiDResults passthrough. The summary path was fixed earlier; From ed6f2205a807ca230adb1393e9a15ff51b3375f3 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 18:58:03 -0400 Subject: [PATCH 12/48] Address ninth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 9 flagged one legitimate new finding (Hausman propagation). The other four findings are regressions on the reviewer's own round-7 and round-8 assessments — grep at HEAD confirms all four were fixed in earlier commits and remain in place: - P0 alpha override: ``safe_inference(att, se, alpha)`` was removed from ``BusinessReport._extract_headline`` in round 7 (commit d8fa66d). The only remaining references are in the explanatory comment that documents why we preserve rather than recompute. ``grep safe_inference diff_diff/business_report.py`` returns only the comment lines. - P1 EfficientDiD assumption + PT prose: addressed in round 8 (commit 7b5c0ad). ``_describe_assumption`` has an EfficientDiD- specific branch reading ``pt_assumption`` (PT-All vs PT-Post per Corollary 3.2 / Lemma 2.1). 
BR and DR summary prose branch on ``parallel_trends.method`` via the new ``_pt_method_subject`` / ``_pt_subject_phrase`` helpers, so the 2x2 slope-difference and Hausman paths get source-correct subjects. - P2 ``power_at_M_1`` rename: addressed in round 7. Grep confirms the field is ``power_at_violation_magnitude`` with ``violation_magnitude`` exposed; only the explanatory comment references the old name. - P2 test regressions: ``test_ci_bounds_recomputed_when_alpha_differs _from_result`` and ``test_n_bootstrap_zero_does_not_trigger_preserve _path`` were removed in round 7; the new ``TestAnalyticalFiniteDfAlphaOverride`` suite locks in the preserve-always behavior on real ``DiDResults`` / ``MultiPeriodDiD`` fits plus an undefined-d.f. replicate-survey stub. Legitimate round-9 P1 fix: - ``DiagnosticReport._pt_hausman`` now reads ``control_group`` and ``anticipation`` from the fitted result and forwards them to ``EfficientDiD.hausman_pretest``. The prior code passed only ``data/outcome/unit/time/first_treat/alpha``, so a fit that used ``control_group='last_cohort'`` or a non-zero ``anticipation`` was silently diagnosed under the default design rather than its own. New ``TestHausmanPretestPropagatesFitDesign`` regression uses ``unittest.mock.patch`` to verify both fields propagate. 118 targeted tests passing; black / ruff / mypy clean. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 15 +++++++++ tests/test_business_report.py | 58 ++++++++++++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 59c44cfa..4ed251a7 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1613,6 +1613,20 @@ def _pt_hausman(self) -> Dict[str, Any]: ), } + # Propagate the fit's design settings into the pretest. If the + # original fit used non-default ``control_group`` (e.g. 
+ # ``"last_cohort"``) or a non-zero ``anticipation``, rerunning + # with defaults would diagnose a different design than the + # estimate being summarized (round-9 CI review on PR #318). + r = self._results + hausman_kwargs: Dict[str, Any] = {} + fit_control_group = getattr(r, "control_group", None) + if isinstance(fit_control_group, str): + hausman_kwargs["control_group"] = fit_control_group + fit_anticipation = getattr(r, "anticipation", None) + if isinstance(fit_anticipation, (int, float)) and np.isfinite(fit_anticipation): + hausman_kwargs["anticipation"] = int(fit_anticipation) + try: from diff_diff.efficient_did import EfficientDiD @@ -1623,6 +1637,7 @@ def _pt_hausman(self) -> Dict[str, Any]: time=time, first_treat=first_treat, alpha=self._alpha, + **hausman_kwargs, ) except Exception as exc: # noqa: BLE001 return { diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 2024afeb..d49f8a2a 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -893,6 +893,64 @@ def test_dr_summary_uses_hausman_wording_for_efficient_did(self, edid_fit): assert "event-study coefficients" not in summary +class TestHausmanPretestPropagatesFitDesign: + """Round-9 regression: ``_pt_hausman`` must propagate the fitted + result's ``control_group`` and ``anticipation`` into + ``EfficientDiD.hausman_pretest`` so the pretest diagnoses the same + design as the estimate being summarized. Rerunning with defaults + would silently change the identification regime. + """ + + def _real_edid_fit(self): + from diff_diff import EfficientDiD + + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + edid = EfficientDiD().fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + # Force non-default design knobs on the result so the regression + # exercises propagation even when the constructor used defaults. 
+ edid.control_group = "last_cohort" + edid.anticipation = 1 + return edid, sdf + + def test_hausman_pretest_receives_control_group_and_anticipation(self): + from diff_diff import DiagnosticReport + + fit, sdf = self._real_edid_fit() + captured: dict = {} + + def _fake_hausman(*args, **kwargs): + captured.update(kwargs) + + class _Result: + statistic = 0.0 + p_value = 0.5 + df = 1 + + return _Result() + + with patch( + "diff_diff.efficient_did.EfficientDiD.hausman_pretest", + side_effect=_fake_hausman, + ): + DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ).run_all() + + assert ( + captured.get("control_group") == "last_cohort" + ), f"control_group must propagate from the fit; got {captured}" + assert ( + captured.get("anticipation") == 1 + ), f"anticipation must propagate from the fit; got {captured}" + + class TestFullReportSingleM: """Regression: ``full_report()`` must not claim full-grid robustness for a single-M HonestDiDResults passthrough. The summary path was fixed earlier; From 25cb8867c9ed2e161d8e615afb476821c1cafbff Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 19:34:21 -0400 Subject: [PATCH 13/48] Address tenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 Hausman fit-faithful skip: ``DiagnosticReport._pt_hausman`` and the applicability gate now skip with an explicit reason when the original EfficientDiD fit cannot be faithfully replayed from ``(data, outcome, unit, time, first_treat)`` alone. Specifically, fits with ``estimation_path='dr'`` (covariate / doubly-robust path; ``covariates`` is not stored on the result) and survey-weighted fits (the full ``SurveyDesign`` is not persisted) are gated out. Same-design fits (``nocov`` + no survey) still propagate ``control_group`` and ``anticipation``. - P1 EfficientDiD assumption prose branches on ``control_group``. 
Default ``never_treated`` keeps the never-treated comparison phrasing; ``last_cohort`` describes the pseudo-never-treated latest-cohort design with periods at/after that cohort's treatment start dropped (REGISTRY.md §EfficientDiD line 908). Both PT-All and PT-Post branches were updated. - P2 applicability gate: ``_instance_skip_reason`` now has a Hausman branch that skips ``parallel_trends`` when raw panel kwargs are missing, or when the fit used a DR / survey path. Matches reality — ``applicable_checks`` and ``skipped_checks`` now line up with what ``_pt_hausman`` would actually do. - P2 next-steps derivation: ``_collect_next_steps`` now takes the sections dict and marks a step complete only when the section's ``status == "ran"`` rather than from the raw applicability set. Fixes the overstated-completion bug for checks that were applicable but skipped at runtime (e.g., varying-base CS sensitivity, DR/survey EfficientDiD Hausman). - P3 Hausman H statistic surfaced: the schema now reads ``HausmanPretestResult.statistic`` (the actual attribute) with a ``test_statistic`` fallback for any precomputed-passthrough objects that use the alternate name. - Regressions: ``TestHausmanFitFaithfulSkip`` covers the DR and survey-weighted skip paths; ``TestHausmanTestStatisticPopulated`` asserts the H statistic lands in the schema on successful runs; ``TestEfficientDiDAssumptionPtAllPtPost`` gains ``never_treated`` vs ``last_cohort`` branches for both PT-Post and PT-All and fixes the old invalid ``control_group='not_yet_treated'`` stub default. 124 targeted tests passing; black / ruff / mypy clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 61 ++++++++++++---- diff_diff/diagnostic_report.py | 116 ++++++++++++++++++++++++++---- tests/test_business_report.py | 122 +++++++++++++++++++++++++++++++- tests/test_diagnostic_report.py | 9 ++- 4 files changed, 279 insertions(+), 29 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 34106ae1..ac63fd4e 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -703,32 +703,63 @@ def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, # lines 736-738 and 907. pt_assumption = getattr(results, "pt_assumption", "all") control_group = getattr(results, "control_group", None) + # The estimator only accepts ``control_group`` values of + # ``"never_treated"`` (the default) or ``"last_cohort"``. When + # ``last_cohort`` is used, the latest treatment cohort is + # reclassified as a pseudo-never-treated comparison and time + # periods at/after its onset are dropped; describing such a fit + # with generic never-treated language would misstate the + # identifying setup (see REGISTRY.md §EfficientDiD line 908). + is_last_cohort = control_group == "last_cohort" if pt_assumption == "post": variant = "pt_post" + if is_last_cohort: + control_clause = ( + "the comparison group is the latest treated cohort " + "reclassified as pseudo-never-treated (periods " + "at/after that cohort's treatment start are " + "dropped)" + ) + else: + control_clause = "the comparison group is never-treated" description = ( "Identification under PT-Post (Chen, Sant'Anna & Xie " "2025): parallel trends holds only in post-treatment " - "periods, the comparison group is never-treated, and " - "the baseline is period g-1 only. This is the weaker " - "of the two regimes — just-identified and reducing to " - "standard single-baseline DiD (Corollary 3.2). 
Also " - "assumes no anticipation (Assumption NA), overlap " - "(Assumption O), and absorbing / irreversible treatment." + "periods, " + control_clause + ", and the baseline is period g-1 only. This is the " + "weaker of the two regimes — just-identified and " + "reducing to standard single-baseline DiD (Corollary " + "3.2). Also assumes no anticipation (Assumption NA), " + "overlap (Assumption O), and absorbing / irreversible " + "treatment." ) else: variant = "pt_all" + if is_last_cohort: + baseline_clause = ( + "using the latest treated cohort as a pseudo-never-" + "treated comparison (periods at/after that cohort's " + "treatment start are dropped); any earlier cohort " + "and any pre-treatment period can serve as baseline" + ) + else: + baseline_clause = ( + "using never-treated units as comparison; any " + "not-yet-treated cohort and any pre-treatment period " + "can serve as baseline" + ) description = ( "Identification under PT-All (Chen, Sant'Anna & Xie " "2025): parallel trends holds for all groups and all " - "periods, allowing any not-yet-treated cohort and any " - "pre-treatment period as baseline. The estimator is " - "over-identified (Lemma 2.1), and the paper's optimal " - "combination weights are applied. Also assumes no " - "anticipation (Assumption NA), overlap (Assumption O), " - "and absorbing / irreversible treatment. The Hausman " - "PT-All vs PT-Post pretest (operating on the post-" - "treatment event-study vector ES(e), Theorem A.1) " - "checks whether the stronger PT-All regime is tenable." + "periods, " + + baseline_clause + + ". The estimator is over-identified (Lemma 2.1), and " + "the paper's optimal combination weights are applied. " + "Also assumes no anticipation (Assumption NA), overlap " + "(Assumption O), and absorbing / irreversible " + "treatment. The Hausman PT-All vs PT-Post pretest " + "(operating on the post-treatment event-study vector " + "ES(e), Theorem A.1) checks whether the stronger " + "PT-All regime is tenable." 
) block: Dict[str, Any] = { "parallel_trends_variant": variant, diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 37dc0185..e3cd4b63 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -471,6 +471,45 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: "aggregate='event_study' to populate event-study output." ) # vcov is optional for the Bonferroni fallback. + if method == "hausman": + # EfficientDiD's Hausman pretest requires the raw panel + # to refit under PT-All and PT-Post. Gate at applicability + # rather than letting ``_pt_hausman`` skip at runtime, so + # ``applicable_checks`` and ``completed_steps`` reflect + # reality. + hausman_missing = [ + arg + for arg, val in ( + ("data", self._data), + ("outcome", self._outcome), + ("unit", self._unit), + ("time", self._time), + ("first_treat", self._first_treat), + ) + if val is None + ] + if hausman_missing: + return ( + "EfficientDiD.hausman_pretest needs raw panel data; " + "pass data + outcome + unit + time + first_treat to " + "DiagnosticReport. Missing: " + ", ".join(hausman_missing) + "." + ) + # Fit-faithful guard: DR / survey fits cannot be replayed + # under defaults, so skip with an explicit reason rather + # than rerunning a different design. + if getattr(r, "estimation_path", "nocov") != "nocov": + return ( + "Original EfficientDiD fit used the doubly-robust " + "covariate path; ``covariates`` is not stored on " + "the result, so the Hausman pretest cannot be " + "faithfully replayed." + ) + if getattr(r, "survey_metadata", None) is not None: + return ( + "Original EfficientDiD fit used a survey design; " + "replaying the Hausman pretest would require the " + "full ``SurveyDesign`` object." 
+ ) return None if check == "pretrends_power": # ``compute_pretrends_power`` handles CS / SA / ImputationDiD @@ -631,7 +670,7 @@ def _execute(self) -> DiagnosticReportResults: headline = self._extract_headline_metric() # Pull suggested next steps from the practitioner workflow. - next_steps = self._collect_next_steps(applicable) + next_steps = self._collect_next_steps(sections) # Populate schema-level warnings for every section that ended in "error", # so users and agents do not have to scan each section dict to discover @@ -696,17 +735,28 @@ def _context_labels(self) -> Dict[str, str]: "treatment_label": self._treatment_label or "the treatment", } - def _collect_next_steps(self, applicable: set) -> List[Dict[str, Any]]: - """Pull and filter practitioner_next_steps, marking DR-covered steps complete.""" + def _collect_next_steps(self, sections: Dict[str, Dict[str, Any]]) -> List[Dict[str, Any]]: + """Pull and filter practitioner_next_steps, marking DR-covered steps complete. + + A step is marked complete only when its DR section actually ran + (``status == "ran"``). The previous implementation marked steps + complete based on membership in the applicability set, which + overstated completion for checks that were applicable but skipped + at runtime (e.g., Hausman on a DR / survey fit; sensitivity on + varying-base CS). + """ try: from diff_diff.practitioner import practitioner_next_steps + def _ran(key: str) -> bool: + return sections.get(key, {}).get("status") == "ran" + completed = [] - if "parallel_trends" in applicable: + if _ran("parallel_trends"): completed.append("parallel_trends") - if "sensitivity" in applicable: + if _ran("sensitivity"): completed.append("sensitivity") - if "heterogeneity" in applicable: + if _ran("heterogeneity"): completed.append("heterogeneity") ns = practitioner_next_steps( self._results, @@ -1612,12 +1662,50 @@ def _pt_hausman(self) -> Dict[str, Any]: ), } - # Propagate the fit's design settings into the pretest. 
If the - # original fit used non-default ``control_group`` (e.g. - # ``"last_cohort"``) or a non-zero ``anticipation``, rerunning - # with defaults would diagnose a different design than the - # estimate being summarized (round-9 CI review on PR #318). + # Fit-faithful guard. ``EfficientDiDResults`` exposes + # ``control_group``, ``anticipation``, and ``estimation_path`` + # (``"nocov"`` or ``"dr"``) plus ``survey_metadata``, but not the + # ``covariates`` list, ``cluster`` column, or nuisance kwargs + # needed to replay a DR / clustered / survey-weighted fit. If + # the original fit used any of those paths, rerunning the + # pretest under defaults would diagnose a different design than + # the estimate being summarized. Skip with an explicit reason + # instead of silently fibbing. r = self._results + estimation_path = getattr(r, "estimation_path", "nocov") + has_survey = getattr(r, "survey_metadata", None) is not None + if estimation_path != "nocov" or has_survey: + reasons: List[str] = [] + if estimation_path == "dr": + reasons.append( + "the original fit used the doubly-robust path with " + "covariates (``covariates`` list is not stored on " + "``EfficientDiDResults``)" + ) + if has_survey: + reasons.append( + "the original fit used a survey design (replay would " + "require the full ``SurveyDesign`` object)" + ) + return { + "status": "skipped", + "method": "hausman", + "reason": ( + "Cannot faithfully replay the Hausman pretest: " + + "; ".join(reasons) + + ". Rerunning the pretest under defaults would " + "diagnose a different design than the estimate. " + "Rerun ``EfficientDiD.hausman_pretest(...)`` " + "manually with the original fit's kwargs or pass " + "``precomputed={'sensitivity': ...}`` if you have " + "a pretest result." + ), + } + + # Propagate settings we can read off the result. 
Same-design + # replay: only ``control_group`` and ``anticipation`` need to + # match; covariates / cluster / nuisance kwargs are irrelevant + # on the ``nocov`` path we just gated to. hausman_kwargs: Dict[str, Any] = {} fit_control_group = getattr(r, "control_group", None) if isinstance(fit_control_group, str): @@ -1646,11 +1734,15 @@ def _pt_hausman(self) -> Dict[str, Any]: } p_value = _to_python_float(getattr(pt, "p_value", None)) + # ``HausmanPretestResult`` exposes ``statistic`` (not + # ``test_statistic``); keep a fallback in case a precomputed + # passthrough object uses the alternate name. + test_stat = _to_python_float(getattr(pt, "statistic", getattr(pt, "test_statistic", None))) return { "status": "ran", "method": "hausman", "joint_p_value": p_value, - "test_statistic": _to_python_float(getattr(pt, "test_statistic", None)), + "test_statistic": test_stat, "df": _to_python_scalar(getattr(pt, "df", None)), "verdict": _pt_verdict(p_value), } diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 81247a47..5717d8bd 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -759,7 +759,13 @@ class TestEfficientDiDAssumptionPtAllPtPost: branch on it. """ - def _stub(self, pt_assumption: str, control_group: str = "not_yet_treated"): + def _stub(self, pt_assumption: str, control_group: str = "never_treated"): + """Build an EfficientDiD-shaped stub. ``control_group`` defaults to + ``"never_treated"`` (the estimator's actual default); the only other + accepted value is ``"last_cohort"`` (pseudo-never-treated). 
The + earlier ``"not_yet_treated"`` default was invalid for this estimator + and was flagged in round-10 CI review.""" + class EfficientDiDResults: pass @@ -795,6 +801,30 @@ def test_pt_post_uses_pt_post_language(self): assert "PT-Post" in a["description"] assert "Corollary 3.2" in a["description"] or "single-baseline" in a["description"] + def test_pt_post_never_treated_names_never_treated(self): + """Default control_group: description must say never-treated.""" + br = BusinessReport(self._stub("post", "never_treated"), auto_diagnostics=False) + desc = br.to_dict()["assumption"]["description"] + assert "never-treated" in desc + assert "latest treated cohort" not in desc + + def test_pt_post_last_cohort_branch_describes_pseudo_control(self): + """Round-10 regression: ``control_group='last_cohort'`` must not be + narrated with generic never-treated language. The description must + describe the pseudo-never-treated latest-cohort design (REGISTRY.md + §EfficientDiD line 908).""" + br = BusinessReport(self._stub("post", "last_cohort"), auto_diagnostics=False) + desc = br.to_dict()["assumption"]["description"] + assert "latest treated cohort" in desc + assert "pseudo-never-treated" in desc + assert "dropped" in desc + + def test_pt_all_last_cohort_branch_describes_pseudo_control(self): + br = BusinessReport(self._stub("all", "last_cohort"), auto_diagnostics=False) + desc = br.to_dict()["assumption"]["description"] + assert "latest treated cohort" in desc + assert "pseudo-never-treated" in desc + def test_control_group_is_reflected_in_block(self): br = BusinessReport(self._stub("all", "last_cohort"), auto_diagnostics=False) a = br.to_dict()["assumption"] @@ -949,6 +979,96 @@ class _Result: ), f"anticipation must propagate from the fit; got {captured}" +class TestHausmanFitFaithfulSkip: + """Round-10 regression: DR / survey-weighted EfficientDiD fits cannot + replay the Hausman pretest from ``(data, outcome, unit, time, + first_treat)`` alone because the result does not 
expose ``covariates``, + ``cluster``, nuisance kwargs, or the full survey design. DR must skip + with an explicit reason rather than rerunning defaults. + """ + + def _make_fit(self, *, estimation_path="nocov", survey_metadata=None): + from diff_diff import EfficientDiD + + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + edid = EfficientDiD().fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + edid.estimation_path = estimation_path + edid.survey_metadata = survey_metadata + return edid, sdf + + def test_dr_covariate_path_skipped_with_reason(self): + from diff_diff import DiagnosticReport + + fit, sdf = self._make_fit(estimation_path="dr") + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + assert "parallel_trends" not in dr.applicable_checks + reason = dr.skipped_checks.get("parallel_trends", "") + assert "doubly-robust" in reason + + def test_survey_weighted_fit_skipped_with_reason(self): + from types import SimpleNamespace + + from diff_diff import DiagnosticReport + + fake_survey = SimpleNamespace( + weight_type="pweight", + effective_n=80.0, + design_effect=1.25, + sum_weights=100.0, + n_strata=None, + n_psu=None, + df_survey=40, + ) + fit, sdf = self._make_fit(survey_metadata=fake_survey) + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + assert "parallel_trends" not in dr.applicable_checks + reason = dr.skipped_checks.get("parallel_trends", "") + assert "survey design" in reason + + +class TestHausmanTestStatisticPopulated: + """Round-10 P3 regression: ``HausmanPretestResult`` exposes + ``statistic`` (not ``test_statistic``); the DR schema was previously + reading the wrong attribute and losing the H statistic.""" + + def test_test_statistic_field_is_populated_on_success(self, edid_fit): + from diff_diff import 
DiagnosticReport + + fit, sdf = edid_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + pt = dr.to_dict()["parallel_trends"] + if pt["status"] == "ran": + # Method-specific: only Hausman exposes a test_statistic. + assert pt["method"] == "hausman" + ts = pt["test_statistic"] + assert ( + ts is not None and isinstance(ts, float) and np.isfinite(ts) + ), f"Hausman H statistic must be populated on success; got {ts}" + + class TestFullReportSingleM: """Regression: ``full_report()`` must not claim full-grid robustness for a single-M HonestDiDResults passthrough. The summary path was fixed earlier; diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index a22812ac..0262a960 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -926,11 +926,18 @@ def test_hausman_pretest_runs_with_data_kwargs(self, edid_fit): assert pt["method"] == "hausman" def test_hausman_skipped_without_data_kwargs(self, edid_fit): + """Without the raw panel kwargs, PT is now gated at the + applicability level (round-10 CI review) — no method field on + the skip section, but ``applicable_checks`` excludes + ``parallel_trends`` and ``skipped_checks`` names it with the + missing-kwargs reason.""" fit, _ = edid_fit dr = DiagnosticReport(fit) pt = dr.to_dict()["parallel_trends"] assert pt["status"] == "skipped" - assert pt["method"] == "hausman" + assert "parallel_trends" not in dr.applicable_checks + assert "parallel_trends" in dr.skipped_checks + assert "hausman_pretest" in dr.skipped_checks["parallel_trends"] # --------------------------------------------------------------------------- From 4bb7dcdaec8e7fac58f8936c6de4327ae9026a8c Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 20:02:54 -0400 Subject: [PATCH 14/48] Address eleventh round of CI review findings on PR #318 - P1 clustered Hausman replay: ``EfficientDiDResults`` now persists the 
``cluster`` column used at fit time. ``EfficientDiD.fit`` populates the new field; ``DiagnosticReport._pt_hausman`` forwards it to ``EfficientDiD.hausman_pretest`` when present. Without this, clustered no-covariate fits would be replayed unclustered, silently publishing an H statistic / p-value for a different inference design than the estimate. - P2 applicability tightening: ``_instance_skip_reason`` now mirrors the full argument contract of each runner. 2x2 parallel-trends requires ``data`` + ``outcome`` + ``time`` + ``treatment`` (previously gated on ``data`` alone). Bacon requires ``data`` + ``outcome`` + ``time`` + ``unit`` + ``first_treat`` (previously gated on ``data`` + ``first_treat``). Both now skip at the applicability level with a "Missing: ..." reason rather than landing in the runner with an incomplete kwargs set. - Regressions: ``TestHausmanPretestPropagatesCluster`` confirms the cluster column propagates from fit to pretest via ``unittest.mock.patch``. ``test_did_with_data_but_no_column_kwargs_skips_pt`` and ``test_bacon_applicability_requires_all_column_kwargs`` pin the tightened applicability gates. 127 targeted tests passing (+ 272 EfficientDiD unit tests confirm the new ``cluster`` field does not regress the estimator); black / ruff / mypy clean on BR/DR modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 70 +++++++++++++++++++++----- diff_diff/efficient_did.py | 81 +++++++++++++++++++----------- diff_diff/efficient_did_results.py | 6 +++ tests/test_business_report.py | 58 +++++++++++++++++++++ tests/test_diagnostic_report.py | 26 ++++++++++ 5 files changed, 199 insertions(+), 42 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index e3cd4b63..aa06bc98 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -457,11 +457,30 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: return None if check == "parallel_trends": method = _PT_METHOD.get(name) - if method == "two_x_two" and self._data is None: - return ( - "2x2 parallel-trends check needs raw panel data; " - "pass data= with outcome / time / treatment columns." - ) + if method == "two_x_two": + # Mirror the full argument contract of ``_pt_two_x_two``: + # the runner needs ``data`` AND all three column names to + # call ``check_parallel_trends``. Gating only on ``data`` + # (as before) left ``applicable_checks`` overstated when + # one of the column kwargs was missing (round-11 CI + # review on PR #318). + two_x_two_missing = [ + arg + for arg, val in ( + ("data", self._data), + ("outcome", self._outcome), + ("time", self._time), + ("treatment", self._treatment), + ) + if val is None + ] + if two_x_two_missing: + return ( + "2x2 parallel-trends check needs raw panel data + " + "outcome / time / treatment column names. Missing: " + + ", ".join(two_x_two_missing) + + "." + ) if method == "event_study": pre_coefs = _collect_pre_period_coefs(r) if not pre_coefs: @@ -587,13 +606,32 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: return "HonestDiD requires at least one pre-period coefficient." return None if check == "bacon": - # Can run if results is itself Bacon, or if data + first_treat supplied. 
+ # ``BaconDecompositionResults`` carries the decomposition + # directly; no data/column kwargs needed. if name == "BaconDecompositionResults": return None - if self._data is None or self._first_treat is None: + # Otherwise mirror the full argument contract of + # ``_check_bacon`` / ``bacon_decompose``: the runner needs + # ``data``, ``first_treat``, and the ``outcome`` / ``time`` / + # ``unit`` column names. Gating on only ``data`` + + # ``first_treat`` (as before) left ``applicable_checks`` + # overstated when a column kwarg was missing (round-11 CI + # review on PR #318). + bacon_missing = [ + arg + for arg, val in ( + ("data", self._data), + ("outcome", self._outcome), + ("time", self._time), + ("unit", self._unit), + ("first_treat", self._first_treat), + ) + if val is None + ] + if bacon_missing: return ( - "Bacon decomposition needs panel data + first_treat column; " - "pass data= and first_treat=." + "Bacon decomposition needs panel data + outcome / time " + "/ unit / first_treat column names. Missing: " + ", ".join(bacon_missing) + "." ) return None if check == "heterogeneity": @@ -1702,10 +1740,13 @@ def _pt_hausman(self) -> Dict[str, Any]: ), } - # Propagate settings we can read off the result. Same-design - # replay: only ``control_group`` and ``anticipation`` need to - # match; covariates / cluster / nuisance kwargs are irrelevant - # on the ``nocov`` path we just gated to. + # Propagate settings we can read off the result. On the + # ``nocov`` / no-survey path we just gated to, the design + # kwargs that matter for fit-faithful replay are + # ``control_group``, ``anticipation``, and — when the fit was + # clustered — ``cluster``. ``EfficientDiDResults`` persists the + # cluster column so a clustered Hausman statistic is reported + # for a clustered fit rather than a silently-unclustered one. 
hausman_kwargs: Dict[str, Any] = {} fit_control_group = getattr(r, "control_group", None) if isinstance(fit_control_group, str): @@ -1713,6 +1754,9 @@ def _pt_hausman(self) -> Dict[str, Any]: fit_anticipation = getattr(r, "anticipation", None) if isinstance(fit_anticipation, (int, float)) and np.isfinite(fit_anticipation): hausman_kwargs["anticipation"] = int(fit_anticipation) + fit_cluster = getattr(r, "cluster", None) + if isinstance(fit_cluster, str) and fit_cluster: + hausman_kwargs["cluster"] = fit_cluster try: from diff_diff.efficient_did import EfficientDiD diff --git a/diff_diff/efficient_did.py b/diff_diff/efficient_did.py index 138c6790..4dcb93e4 100644 --- a/diff_diff/efficient_did.py +++ b/diff_diff/efficient_did.py @@ -372,9 +372,12 @@ def fit( # Store survey df for safe_inference calls (t-distribution with survey df) self._survey_df = survey_metadata.df_survey if survey_metadata is not None else None # Guard: replicate design with undefined df → NaN inference - if (self._survey_df is None and resolved_survey is not None - and hasattr(resolved_survey, 'uses_replicate_variance') - and resolved_survey.uses_replicate_variance): + if ( + self._survey_df is None + and resolved_survey is not None + and hasattr(resolved_survey, "uses_replicate_variance") + and resolved_survey.uses_replicate_variance + ): self._survey_df = 0 # Bootstrap + survey supported via PSU-level multiplier bootstrap. 
@@ -510,14 +513,18 @@ def fit( n_strata_u = len(np.unique(unit_strata)) if unit_strata is not None else 0 n_psu_u = len(np.unique(unit_psu)) if unit_psu is not None else 0 self._unit_resolved_survey = resolved_survey.subset_to_units( - row_idx, unit_weights_s, unit_strata, unit_psu, unit_fpc, - n_strata_u, n_psu_u, + row_idx, + unit_weights_s, + unit_strata, + unit_psu, + unit_fpc, + n_strata_u, + n_psu_u, ) # Use unit-level df (not panel-level) for t-distribution self._survey_df = self._unit_resolved_survey.df_survey # Re-apply replicate guard: undefined df → NaN inference - if (self._survey_df is None - and self._unit_resolved_survey.uses_replicate_variance): + if self._survey_df is None and self._unit_resolved_survey.uses_replicate_variance: self._survey_df = 0 else: self._unit_resolved_survey = None @@ -717,10 +724,14 @@ def fit( # Filter out comparison pairs with zero survey weight if unit_level_weights is not None and pairs: pairs = [ - (gp, tpre) for gp, tpre in pairs - if np.sum(unit_level_weights[ - never_treated_mask if np.isinf(gp) else cohort_masks[gp] - ]) > 0 + (gp, tpre) + for gp, tpre in pairs + if np.sum( + unit_level_weights[ + never_treated_mask if np.isinf(gp) else cohort_masks[gp] + ] + ) + > 0 ] if not pairs: @@ -1081,6 +1092,7 @@ def fit( efficient_weights=stored_weights if stored_weights else None, omega_condition_numbers=stored_cond if stored_cond else None, control_group=self.control_group, + cluster=self.cluster, influence_functions=eif_by_gt if store_eif else None, bootstrap_results=bootstrap_results, estimation_path="dr" if use_covariates else "nocov", @@ -1108,8 +1120,11 @@ def _recompute_unit_survey_metadata(self, panel_metadata): ) # Propagate effective replicate df if available # (but not the df=0 sentinel — keep metadata as None for undefined df) - if (self._survey_df is not None and self._survey_df != 0 - and meta.df_survey != self._survey_df): + if ( + self._survey_df is not None + and self._survey_df != 0 + and meta.df_survey != 
self._survey_df + ): meta.df_survey = self._survey_df return meta return panel_metadata @@ -1129,7 +1144,9 @@ def _compute_survey_eif_se(self, eif_vals: np.ndarray) -> float: # Score-scale IFs to match TSL bread: psi = w * eif / sum(w) w = self._unit_resolved_survey.weights psi_scaled = w * eif_vals / w.sum() - variance, n_valid = compute_replicate_if_variance(psi_scaled, self._unit_resolved_survey) + variance, n_valid = compute_replicate_if_variance( + psi_scaled, self._unit_resolved_survey + ) # Update survey df to reflect effective replicate count if n_valid < self._unit_resolved_survey.n_replicates: self._survey_df = n_valid - 1 if n_valid > 1 else None @@ -1271,7 +1288,11 @@ def _aggregate_overall( # WIF correction: accounts for uncertainty in cohort-size weights wif = self._compute_wif_contribution( - keepers, effects, unit_cohorts, cohort_fractions, n_units, + keepers, + effects, + unit_cohorts, + cohort_fractions, + n_units, unit_weights=self._unit_level_weights, ) # Compute SE: survey path uses score-level psi to avoid double-weighting @@ -1282,19 +1303,17 @@ def _aggregate_overall( total_w = float(np.sum(uw)) psi_total = uw * agg_eif / total_w + wif / total_w - if (hasattr(self._unit_resolved_survey, 'uses_replicate_variance') - and self._unit_resolved_survey.uses_replicate_variance): + if ( + hasattr(self._unit_resolved_survey, "uses_replicate_variance") + and self._unit_resolved_survey.uses_replicate_variance + ): from diff_diff.survey import compute_replicate_if_variance - variance, _ = compute_replicate_if_variance( - psi_total, self._unit_resolved_survey - ) + variance, _ = compute_replicate_if_variance(psi_total, self._unit_resolved_survey) else: from diff_diff.survey import compute_survey_if_variance - variance = compute_survey_if_variance( - psi_total, self._unit_resolved_survey - ) + variance = compute_survey_if_variance(psi_total, self._unit_resolved_survey) se = float(np.sqrt(max(variance, 0.0))) if np.isfinite(variance) else np.nan else: 
agg_eif_total = agg_eif + wif @@ -1389,7 +1408,11 @@ def _aggregate_event_study( es_keepers = [(g, t) for (g, t) in gt_pairs] es_effects = effs wif_e = self._compute_wif_contribution( - es_keepers, es_effects, unit_cohorts, cohort_fractions, n_units, + es_keepers, + es_effects, + unit_cohorts, + cohort_fractions, + n_units, unit_weights=self._unit_level_weights, ) @@ -1398,8 +1421,10 @@ def _aggregate_event_study( total_w = float(np.sum(uw)) psi_total = uw * agg_eif / total_w + wif_e / total_w - if (hasattr(self._unit_resolved_survey, 'uses_replicate_variance') - and self._unit_resolved_survey.uses_replicate_variance): + if ( + hasattr(self._unit_resolved_survey, "uses_replicate_variance") + and self._unit_resolved_survey.uses_replicate_variance + ): from diff_diff.survey import compute_replicate_if_variance variance, _ = compute_replicate_if_variance( @@ -1408,9 +1433,7 @@ def _aggregate_event_study( else: from diff_diff.survey import compute_survey_if_variance - variance = compute_survey_if_variance( - psi_total, self._unit_resolved_survey - ) + variance = compute_survey_if_variance(psi_total, self._unit_resolved_survey) agg_se = float(np.sqrt(max(variance, 0.0))) if np.isfinite(variance) else np.nan else: agg_eif = agg_eif + wif_e diff --git a/diff_diff/efficient_did_results.py b/diff_diff/efficient_did_results.py index 7f86f5a9..13123ea8 100644 --- a/diff_diff/efficient_did_results.py +++ b/diff_diff/efficient_did_results.py @@ -149,6 +149,12 @@ class EfficientDiDResults: default=None, repr=False ) control_group: str = "never_treated" + # Cluster column used at fit time (None for unclustered fits). Persisted + # so downstream diagnostics — notably ``DiagnosticReport._pt_hausman`` — + # can replay the Hausman PT-All vs PT-Post pretest under the same + # clustering as the original estimate rather than silently producing + # unclustered p-values for a clustered fit. 
+ cluster: Optional[str] = None influence_functions: Optional[Dict[Tuple[Any, Any], "np.ndarray"]] = field( default=None, repr=False ) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 5717d8bd..21d13693 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1042,6 +1042,64 @@ def test_survey_weighted_fit_skipped_with_reason(self): assert "survey design" in reason +class TestHausmanPretestPropagatesCluster: + """Round-11 regression: ``EfficientDiDResults`` now persists the + ``cluster`` column used at fit time, and ``_pt_hausman`` forwards + it to ``EfficientDiD.hausman_pretest``. Without this, clustered + fits would be replayed under unclustered inference, silently + publishing an H statistic / p-value for the wrong design. + """ + + def test_hausman_pretest_receives_cluster_kwarg(self): + import pandas as pd + + from diff_diff import DiagnosticReport, EfficientDiD + + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + # Add a cluster column (e.g., region) to the panel. + sdf = pd.DataFrame(sdf).copy() + sdf["cluster_col"] = sdf["unit"] % 10 + + edid = EfficientDiD(cluster="cluster_col").fit( + sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + # Confirm persistence landed. 
+ assert getattr(edid, "cluster", None) == "cluster_col" + + captured: dict = {} + + def _fake_hausman(*args, **kwargs): + captured.update(kwargs) + + class _Result: + statistic = 0.0 + p_value = 0.5 + df = 1 + + return _Result() + + with patch( + "diff_diff.efficient_did.EfficientDiD.hausman_pretest", + side_effect=_fake_hausman, + ): + DiagnosticReport( + edid, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ).run_all() + + assert ( + captured.get("cluster") == "cluster_col" + ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" + + class TestHausmanTestStatisticPopulated: """Round-10 P3 regression: ``HausmanPretestResult`` exposes ``statistic`` (not ``test_statistic``); the DR schema was previously diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 0262a960..2b4c3ba6 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -234,6 +234,32 @@ def test_did_with_data_runs_pt(self, did_fit): dr = DiagnosticReport(fit, data=df, outcome="outcome", treatment="treated", time="post") assert "parallel_trends" in dr.applicable_checks + def test_did_with_data_but_no_column_kwargs_skips_pt(self, did_fit): + """Round-11 regression: ``applicable_checks`` must match the + runner's full argument contract. 
2x2 PT needs ``data`` AND + ``outcome`` / ``time`` / ``treatment`` — not just ``data``.""" + fit, df = did_fit + dr = DiagnosticReport(fit, data=df) # missing column kwargs + assert "parallel_trends" not in dr.applicable_checks + reason = dr.skipped_checks["parallel_trends"] + assert "outcome" in reason + assert "time" in reason + assert "treatment" in reason + + def test_bacon_applicability_requires_all_column_kwargs(self, cs_fit): + """Round-11 regression: Bacon needs the full ``outcome`` / ``time`` + / ``unit`` / ``first_treat`` contract from ``bacon_decompose``.""" + fit, sdf = cs_fit + dr = DiagnosticReport( + fit, + data=sdf, + first_treat="first_treat", + # intentionally omit outcome / time / unit + ) + assert "bacon" not in dr.applicable_checks + reason = dr.skipped_checks["bacon"] + assert "outcome" in reason or "time" in reason or "unit" in reason + def test_multiperiod_runs_pt_and_power_and_sensitivity(self, multi_period_fit): fit, _ = multi_period_fit dr = DiagnosticReport(fit) From 5ee5d373a61626fccb8594296de7d324631cb1c6 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 20:38:33 -0400 Subject: [PATCH 15/48] Address twelfth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 12 returned ✅ Looks good (approved; no P0/P1). Three small cleanup items picked up: - P3 REPORTING.md / BR docstring: narrowed the "no new statistical computation" claim. The report layer does compose cross-period summaries from per-period inputs already on the result (joint-Wald / Bonferroni pre-trends p-value, MDV-to-ATT ratio, heterogeneity dispersion over post-treatment effects). It does not fit estimators or re-derive variance from raw data. Both the methodology doc and the ``business_report.py`` module docstring now enumerate these explicitly. 
- P2 sensitivity-prose guard: BR ``_render_summary`` and DR ``_render_overall_interpretation`` no longer promise "see the sensitivity analysis below" when the sensitivity block did not run. Computed via ``sens_ran = schema.sensitivity.status == "computed"`` (BR) / ``"ran"`` (DR); the clauses are conditionally appended. Also rewrote the SDiD and TROP sensitivity-skip messages: SDiD now names ``in_time_placebo`` / ``sensitivity_to_zeta_omega`` as the native analogues, and TROP makes the factor-model / HonestDiD non-applicability explicit (effective rank, LOOCV score, selected lambdas). - P3 BR data passthrough: ``BusinessReport`` now accepts ``data``, ``outcome``, ``treatment``, ``unit``, ``time``, and ``first_treat`` kwargs and forwards them to the auto-constructed ``DiagnosticReport``. Without this, the zero-config auto path silently skipped data-dependent checks (2x2 PT on simple DiD, Bacon-from-scratch on staggered estimators, EfficientDiD Hausman pretest) even though the README advertised one-call diagnostics from a fitted result. README example updated to show the passthrough pattern. - Regressions: ``TestBRDataKwargsPassthroughToAutoDR`` (three cases: 2x2 PT via passthrough, Bacon via passthrough, zero-config still renders with graceful skip); ``TestSensitivityProseGuarding`` (two cases: SDiD summary does not mention "sensitivity analysis below"; DR summary on a SDiD-style fit likewise). 132 targeted tests passing; black / ruff / mypy clean on BR/DR modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 9 ++++ diff_diff/business_report.py | 60 +++++++++++++++++++++----- diff_diff/diagnostic_report.py | 53 +++++++++++++++++------ docs/methodology/REPORTING.md | 18 ++++++-- tests/test_business_report.py | 78 ++++++++++++++++++++++++++++++++++ 5 files changed, 192 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 26efb0d0..ebd80e8b 100644 --- a/README.md +++ b/README.md @@ -110,6 +110,15 @@ report = BusinessReport( outcome_unit="$", business_question="Did the loyalty program lift revenue?", treatment_label="the loyalty program", + # Optional: pass the panel + column names so the auto-constructed + # DiagnosticReport can run data-dependent checks (2x2 pre-trends, + # Goodman-Bacon decomposition, EfficientDiD Hausman pretest). + # Without these the auto path still runs but skips those checks. + data=df, + outcome="revenue", + unit="store", + time="month", + first_treat="first_treat", ) print(report.summary()) ``` diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index ac63fd4e..f7978589 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -13,9 +13,13 @@ - Plain English, not academic jargon. The library ships this in addition to, not in place of, the estimator's existing ``results.summary()`` academic output. -- No new statistical computation. Every reported number is either read from - ``results`` or computed by an existing diff-diff utility function; no p-value - or variance is re-derived here. +- No estimator fitting and no variance re-derivation. Every effect, SE, p-value, + CI, and sensitivity bound is either read from ``results`` or produced by an + existing diff-diff utility. 
The report layer does compose a few cross-period + summaries from per-period inputs already on the result (joint-Wald / Bonferroni + pre-trends p-value, MDV-to-ATT ratio, heterogeneity dispersion over + post-treatment effects); see ``docs/methodology/REPORTING.md`` for the full + enumeration. - Optional business context via keyword args (``outcome_label``, ``outcome_unit``, ``business_question``, ``treatment_label``). Without them, BusinessReport uses generic fallbacks — the zero-config path works. @@ -133,6 +137,12 @@ def __init__( auto_diagnostics: bool = True, diagnostics: Optional[Union[DiagnosticReport, DiagnosticReportResults]] = None, include_appendix: bool = True, + data: Optional[Any] = None, + outcome: Optional[str] = None, + treatment: Optional[str] = None, + unit: Optional[str] = None, + time: Optional[str] = None, + first_treat: Optional[str] = None, ): if type(results).__name__ == "BaconDecompositionResults": raise TypeError( @@ -155,6 +165,17 @@ def __init__( self._auto_diagnostics = auto_diagnostics self._diagnostics_arg = diagnostics self._include_appendix = include_appendix + # Raw-data passthrough so the auto-constructed DR can run + # data-dependent checks (2x2 PT on simple DiD, Bacon-from- + # scratch on staggered estimators, EfficientDiD Hausman + # pretest). Without these, the auto path silently skips those + # checks (round-12 CI review on PR #318). 
+ self._dr_data = data + self._dr_outcome = outcome + self._dr_treatment = treatment + self._dr_unit = unit + self._dr_time = time + self._dr_first_treat = first_treat resolved_alpha = alpha if alpha is not None else getattr(results, "alpha", 0.05) self._context = BusinessContext( @@ -242,6 +263,12 @@ def _resolve_diagnostics(self) -> Optional[DiagnosticReportResults]: precomputed=precomputed or None, outcome_label=self._context.outcome_label, treatment_label=self._context.treatment_label, + data=self._dr_data, + outcome=self._dr_outcome, + treatment=self._dr_treatment, + unit=self._dr_unit, + time=self._dr_time, + first_treat=self._dr_first_treat, ) return dr.run_all() @@ -1245,17 +1272,31 @@ def _render_summary(schema: Dict[str, Any]) -> str: jp_phrase = ( f" ({stat_label} = {jp:.3g})" if isinstance(jp, (int, float)) and stat_label else "" ) + # Only point to "the sensitivity analysis below" when a + # sensitivity block actually ran. For estimators that route to + # native diagnostics (SDiD / TROP) or fits where sensitivity was + # skipped / not applicable, the clause would mislead (round-12 + # CI review on PR #318). + sens_ran = (schema.get("sensitivity", {}) or {}).get("status") == "computed" + sens_tail_major = " pending the sensitivity analysis below" if sens_ran else "" + sens_tail_alongside = " alongside the sensitivity analysis below" if sens_ran else "" + sens_tail_see_bounded = ( + " See the sensitivity analysis below for bounded-violation guarantees." + if sens_ran + else "" + ) + sens_tail_see_reliable = " See the sensitivity analysis below." if sens_ran else "" if verdict == "clear_violation": sentences.append( f"{subject} clearly reject parallel trends{jp_phrase}; the " - "headline should be treated as tentative pending the " - "sensitivity analysis below." + "headline should be treated as tentative" + sens_tail_major + "." 
) elif verdict == "some_evidence_against": sentences.append( f"{subject} show some evidence against parallel trends" - f"{jp_phrase}; interpret the headline alongside the " - "sensitivity analysis below." + f"{jp_phrase}; interpret the headline" + + (sens_tail_alongside if sens_ran else " with caution") + + "." ) elif verdict == "no_detected_violation": if tier == "well_powered": @@ -1267,14 +1308,13 @@ def _render_summary(schema: Dict[str, Any]) -> str: elif tier == "moderately_powered": sentences.append( f"{subject} do not reject parallel trends; the test is " - "moderately informative. See the sensitivity analysis " - "below for bounded-violation guarantees." + "moderately informative." + sens_tail_see_bounded ) else: sentences.append( f"{subject} do not reject parallel trends, but the test " "has limited power — a non-rejection does not prove the " - "assumption. See the sensitivity analysis below." + "assumption." + sens_tail_see_reliable ) elif verdict == "design_enforced_pt": sentences.append( diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index aa06bc98..99005cca 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1114,11 +1114,26 @@ def _check_sensitivity(self) -> Dict[str, Any]: return self._format_precomputed_sensitivity(self._precomputed["sensitivity"]) name = type(self._results).__name__ - if name in {"SyntheticDiDResults", "TROPResults"}: + if name == "SyntheticDiDResults": return { "status": "skipped", - "reason": "Estimator uses native sensitivity (see " - "estimator_native_diagnostics).", + "reason": ( + "SyntheticDiD uses native sensitivity analogues " + "(``in_time_placebo``, ``sensitivity_to_zeta_omega``) " + "rather than HonestDiD; see " + "``estimator_native_diagnostics``." + ), + "method": "estimator_native", + } + if name == "TROPResults": + return { + "status": "skipped", + "reason": ( + "TROP identification is factor-model-based; HonestDiD " + "bounds do not apply. 
Use the factor-model fit metrics " + "(effective rank, LOOCV score, selected lambdas) in " + "``estimator_native_diagnostics`` as the analogue." + ), "method": "estimator_native", } @@ -2252,6 +2267,11 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str # 2x2 and Hausman paths). pt = schema.get("parallel_trends") or {} pp = schema.get("pretrends_power") or {} + # Only point to "the sensitivity analysis below" when a sensitivity + # block actually ran. For estimators routing to native diagnostics + # (SDiD / TROP) or fits where sensitivity was skipped / not + # applicable, the clause would be misleading (round-12 CI review). + sens_ran = (schema.get("sensitivity") or {}).get("status") == "ran" if pt.get("status") == "ran": verdict = pt.get("verdict") jp = pt.get("joint_p_value") @@ -2261,17 +2281,29 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str jp_str = ( f" ({stat_label} = {jp:.3g})" if isinstance(jp, (int, float)) and stat_label else "" ) + sens_tail_pending = " pending sensitivity analysis" if sens_ran else "" + sens_tail_alongside = ( + " Interpret the headline alongside the sensitivity analysis below." if sens_ran else "" + ) + sens_tail_bounded = ( + " See the sensitivity analysis below for bounded-violation guarantees." + if sens_ran + else "" + ) + sens_tail_reliable = ( + " See the HonestDiD sensitivity analysis below for a more reliable signal." + if sens_ran + else "" + ) if verdict == "clear_violation": sentences.append( f"{subject} clearly reject parallel trends{jp_str}. The " - "headline estimate should be treated as tentative pending " - "sensitivity analysis." + "headline estimate should be treated as tentative" + sens_tail_pending + "." ) elif verdict == "some_evidence_against": sentences.append( f"{subject} show some evidence against parallel trends" - f"{jp_str}. Interpret the headline alongside the sensitivity " - "analysis below." + f"{jp_str}." 
+ sens_tail_alongside ) elif verdict == "no_detected_violation": tier = pp.get("tier") if pp.get("status") == "ran" else "unknown" @@ -2285,16 +2317,13 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str elif tier == "moderately_powered": sentences.append( f"{subject} do not reject parallel trends" - f"{jp_str}; the test is moderately informative. See the " - "sensitivity analysis below for bounded-violation " - "guarantees." + f"{jp_str}; the test is moderately informative." + sens_tail_bounded ) else: sentences.append( f"{subject} do not reject parallel trends" f"{jp_str}, but the test has limited power — a non-rejection " - "does not prove the assumption. See the HonestDiD " - "sensitivity analysis below for a more reliable signal." + "does not prove the assumption." + sens_tail_reliable ) elif verdict == "design_enforced_pt": rmse = pt.get("pre_treatment_fit_rmse") diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index d9294970..0fc9d3d3 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -15,12 +15,22 @@ here rather than duplicating content. `DiagnosticReportResults`. Both modules dispatch by `type(results).__name__` lookup to avoid -circular imports across the 16 result classes. They perform no new -statistical computation; every reported number is read from the fitted -result or computed by an existing diff-diff utility +circular imports across the 16 result classes. They do no estimator +fitting and do not re-derive any variance from raw data; every effect, +SE, p-value, CI, and sensitivity bound is either read from the fitted +result or produced by an existing diff-diff utility (`compute_honest_did`, `HonestDiD.sensitivity`, `bacon_decompose`, `check_parallel_trends`, `compute_deff_diagnostics`, -`compute_pretrends_power`). +`compute_pretrends_power`). 
The report layer **does** compose a few +cross-period summary statistics from per-period inputs already +produced by the estimator — specifically the joint-Wald / Bonferroni +pre-trends p-value from pre-period event-study coefficients (see +`_pt_event_study`), the MDV-to-ATT ratio for power-tier selection, +and the heterogeneity dispersion block (CV / range / sign- +consistency over post-treatment group / event-study / group-time +effects, pre-period and reference-marker rows excluded). These are +reporting-layer aggregations of inputs already in the result object, +not new inference. ## Design deviations diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 21d13693..d217ed99 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1100,6 +1100,84 @@ class _Result: ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" +class TestBRDataKwargsPassthroughToAutoDR: + """Round-12 regression: ``BusinessReport`` now accepts + ``data`` / ``outcome`` / ``treatment`` / ``unit`` / ``time`` / + ``first_treat`` kwargs and forwards them to the auto-constructed + ``DiagnosticReport``. Without this, data-dependent checks (2x2 PT, + Bacon, EfficientDiD Hausman) are silently skipped on the zero- + config auto path even though the README markets one-call + diagnostics from a fitted result. + """ + + def test_did_fit_gets_2x2_pt_via_passthrough(self, did_fit): + fit, df = did_fit + br = BusinessReport( + fit, + data=df, + outcome="outcome", + treatment="treated", + time="post", + ) + # Auto-DR received the kwargs and ran the 2x2 PT check. 
+ dr_schema = br.to_dict()["diagnostics"]["schema"] + assert dr_schema["parallel_trends"]["status"] == "ran" + assert dr_schema["parallel_trends"]["method"] == "slope_difference" + + def test_cs_fit_gets_bacon_via_passthrough(self, cs_fit): + fit, sdf = cs_fit + br = BusinessReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + dr_schema = br.to_dict()["diagnostics"]["schema"] + # Bacon needs data + outcome + time + unit + first_treat; before + # the passthrough, the auto path skipped because only the + # estimator result was available. + assert dr_schema["bacon"]["status"] == "ran" + + def test_no_passthrough_still_works_and_skips_gracefully(self, did_fit): + """Zero-config auto path must still produce a valid report; it + just skips data-dependent checks.""" + fit, _ = did_fit + br = BusinessReport(fit) # no data kwargs + dr_schema = br.to_dict()["diagnostics"]["schema"] + # PT needs data for 2x2 and was gated out of applicable — section + # is "skipped" rather than "ran". + assert dr_schema["parallel_trends"]["status"] in {"skipped", "not_applicable"} + + +class TestSensitivityProseGuarding: + """Round-12 regression: BR / DR summary prose must not promise a + "sensitivity analysis below" sentence when no sensitivity block + actually ran (e.g., SDiD / TROP routed to native diagnostics, + single-M precomputed passthrough rendered separately, skipped + sensitivity for varying-base CS). + """ + + def test_br_sdid_does_not_mention_sensitivity_below(self, sdid_fit): + fit, _ = sdid_fit + summary = BusinessReport(fit).summary() + # SDiD routes to estimator-native diagnostics, not HonestDiD. + # The PT verdict for SDiD is ``design_enforced_pt`` which does + # not append any "see sensitivity below" clause, so the prose + # should not mention it. + assert "sensitivity analysis below" not in summary + + def test_dr_trop_does_not_mention_sensitivity_below(self, sdid_fit): + # SDiD and TROP both skip HonestDiD. 
Use SDiD as proxy here + # since it already has a fixture; the same guard covers TROP. + from diff_diff import DiagnosticReport + + fit, _ = sdid_fit + summary = DiagnosticReport(fit).summary() + assert "sensitivity analysis below" not in summary + + class TestHausmanTestStatisticPopulated: """Round-10 P3 regression: ``HausmanPretestResult`` exposes ``statistic`` (not ``test_statistic``); the DR schema was previously From 6bdae48a55023bd382f8a678921310c9844668f9 Mon Sep 17 00:00:00 2001 From: igerber Date: Sat, 18 Apr 2026 21:07:56 -0400 Subject: [PATCH 16/48] Address thirteenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 CS ``not_yet_treated`` sample semantics: ``BusinessReport._extract_sample`` no longer maps ``CallawaySantAnnaResults.n_control_units`` to a generic ``n_control`` / "control" label when ``control_group="not_yet_treated"``. That field counts only never-treated units (REGISTRY.md §CallawaySantAnna), while the actual comparison group in that mode is the dynamic not-yet-treated set at each (g, t) cell. New behavior: ``n_control`` is ``None`` for this mode, ``control_group`` and ``n_never_treated`` surface the real semantics in the schema, and both ``summary()`` and ``full_report()`` describe the dynamic comparison group instead of misreporting a possibly-zero never-treated tally as "control". Default ``never_treated`` fits still render the fixed count unchanged. - P3 ``_pt_hausman`` remediation hint: skipped-Hausman reason now points to ``precomputed={'parallel_trends': ...}`` (the actual PT precomputed key) rather than the prior misleading ``'sensitivity'`` alias. 
- P3 source-of-truth wording: ``diagnostic_report.py`` module docstring, ``REPORTING.md``, and ``llms-full.txt`` all now say "no estimator fitting and no variance re-derivation" rather than "no new statistical computation", and explicitly name the raw-data utilities DR may call (``check_parallel_trends``, ``bacon_decompose``, ``EfficientDiD.hausman_pretest``) when the caller supplies panel + column kwargs. Report-layer aggregations remain enumerated in REPORTING.md. - P3 docs consistency: ``docs/api/business_report.rst`` and ``diff_diff/guides/llms-practitioner.txt`` now show the raw-data passthrough kwargs on ``BusinessReport(...)`` alongside the README pattern, with an explicit note that data-dependent checks are skipped otherwise. - Regressions: ``TestCSNotYetTreatedControlGroupSemantics`` covers both the ``not_yet_treated`` path (suppressed ``n_control``, ``control_group`` + ``n_never_treated`` populated, prose mentions "not-yet-treated" / "dynamic") and the default ``never_treated`` path (fixed count preserved). 134 targeted tests passing (BR + DR); guides fingerprint test still clean (18 ``test_guides`` tests pass, confirming the UTF-8 fingerprint in ``llms-full.txt`` remains intact after the prose edit); black / ruff / mypy clean on BR/DR modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 68 ++++++++++++++++++++++---- diff_diff/diagnostic_report.py | 13 +++-- diff_diff/guides/llms-full.txt | 15 ++++-- diff_diff/guides/llms-practitioner.txt | 10 +++- docs/api/business_report.rst | 18 +++++++ docs/methodology/REPORTING.md | 6 ++- tests/test_business_report.py | 53 ++++++++++++++++++++ 7 files changed, 163 insertions(+), 20 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index f7978589..b7d64a5f 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -470,10 +470,34 @@ def _extract_sample(self) -> Dict[str, Any]: """Extract sample metadata from the fitted result.""" r = self._results survey = self._extract_survey_block() + n_treated = _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None))) + n_control_units = _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None))) + + # Control-group semantics. For estimators that expose a + # ``control_group`` kwarg (CS, EfficientDiD), the meaning of + # ``n_control_units`` depends on it. On CallawaySantAnna with + # ``control_group="not_yet_treated"``, ``n_control_units`` counts + # only the never-treated subset, so the actual dynamic + # comparison group can be non-empty even when this count is 0. + # Label the exposed count as never-treated and record the + # active control-group mode so prose can surface the dynamic- + # comparison context instead of misreporting "0 control" + # (round-13 CI review on PR #318). + control_group = getattr(r, "control_group", None) + n_never_treated: Optional[int] = None + n_control: Optional[int] = n_control_units + if isinstance(control_group, str) and control_group == "not_yet_treated": + n_never_treated = n_control_units + # Do not populate a fixed ``n_control`` for this mode: the + # comparison set is dynamic and varies by (g, t) cell. 
+ n_control = None + return { "n_obs": _safe_int(getattr(r, "n_obs", None)), - "n_treated": _safe_int(getattr(r, "n_treated", getattr(r, "n_treated_units", None))), - "n_control": _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None))), + "n_treated": n_treated, + "n_control": n_control, + "n_never_treated": n_never_treated, + "control_group": control_group if isinstance(control_group, str) else None, "n_periods": _safe_int(getattr(r, "n_periods", None)), "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)), "post_periods": _safe_list_len(getattr(r, "post_periods", None)), @@ -1369,21 +1393,32 @@ def _render_summary(schema: Dict[str, Any]) -> str: f"pre-period variation." ) - # Sample sentence. + # Sample sentence. For CS ``control_group="not_yet_treated"`` the + # fixed control count is suppressed because the comparison group is + # dynamic; narrate the mode explicitly rather than misreporting a + # never-treated-only tally as "control" (round-13 CI review). sample = schema.get("sample", {}) or {} n_obs = sample.get("n_obs") n_t = sample.get("n_treated") n_c = sample.get("n_control") + n_nt = sample.get("n_never_treated") + control_mode = sample.get("control_group") if isinstance(n_obs, int): - sentences.append( - f"Sample: {n_obs:,} observations" - + ( - f" ({n_t:,} treated, {n_c:,} control)" - if isinstance(n_t, int) and isinstance(n_c, int) + if isinstance(n_t, int) and isinstance(n_c, int): + sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).") + elif control_mode == "not_yet_treated" and isinstance(n_t, int): + extra = ( + f"; {n_nt:,} never-treated units are also present" + if isinstance(n_nt, int) and n_nt > 0 else "" ) - + "." - ) + sentences.append( + f"Sample: {n_obs:,} observations ({n_t:,} treated) with a " + "dynamic not-yet-treated comparison group (the control set " + f"varies by cohort and period){extra}." 
+ ) + else: + sentences.append(f"Sample: {n_obs:,} observations.") survey = sample.get("survey") if survey and not survey.get("is_trivial"): deff = survey.get("design_effect") @@ -1507,8 +1542,21 @@ def _render_full_report(schema: Dict[str, Any]) -> str: lines.append(f"- Observations: {sample['n_obs']:,}") if isinstance(sample.get("n_treated"), int): lines.append(f"- Treated: {sample['n_treated']:,}") + # ``n_control`` is only populated for estimators whose control set + # is a fixed tally. For CS ``control_group="not_yet_treated"`` the + # comparison group is dynamic per (g, t); report the never-treated + # count (when non-zero) and the dynamic-comparison mode explicitly. if isinstance(sample.get("n_control"), int): lines.append(f"- Control: {sample['n_control']:,}") + elif sample.get("control_group") == "not_yet_treated": + if isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0: + lines.append( + f"- Never-treated units present in the panel: {sample['n_never_treated']:,}" + ) + lines.append( + "- Comparison group: dynamic not-yet-treated units " + "(varies by cohort and period; no fixed control count)" + ) survey = sample.get("survey") if survey: if survey.get("is_trivial"): diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 99005cca..f3a3dd11 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -10,8 +10,15 @@ - No hard pass/fail gates. Severity is conveyed by natural-language phrasing, not a traffic-light enum. See ``docs/methodology/REPORTING.md``. -- No new statistical computation. Every reported number is either read from - ``results`` or computed by an existing diff-diff utility function. +- No estimator fitting and no variance re-derivation from raw data. Every + effect, SE, p-value, CI, and sensitivity bound is either read from + ``results`` or produced by an existing diff-diff utility. 
May call + ``check_parallel_trends`` / ``bacon_decompose`` / + ``EfficientDiD.hausman_pretest`` when the caller supplies the panel + + column kwargs. Report-layer cross-period aggregations (joint-Wald / + Bonferroni pre-trends p-value, heterogeneity dispersion over + post-treatment effects) are enumerated in + ``docs/methodology/REPORTING.md``. - Lazy evaluation. ``DiagnosticReport(results, ...)`` is free; ``run_all()`` triggers compute and caches. - Never prove a null. Pre-trends phrasing uses power information from @@ -1750,7 +1757,7 @@ def _pt_hausman(self) -> Dict[str, Any]: "diagnose a different design than the estimate. " "Rerun ``EfficientDiD.hausman_pretest(...)`` " "manually with the original fit's kwargs or pass " - "``precomputed={'sensitivity': ...}`` if you have " + "``precomputed={'parallel_trends': ...}`` if you have " "a pretest result." ), } diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 1fb0c8af..9710db8c 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -1848,8 +1848,13 @@ Power tier (drives BR phrasing for the `no_detected_violation` verdict): ### Methodology notes -BR and DR perform no new statistical computation — every reported number -is read from the fitted result or computed by an existing diff-diff -utility. Both schemas are experimental in the current release; see -`docs/methodology/REPORTING.md` for phrasing rules, the no-traffic-light -decision, unit-translation policy, and schema stability policy. +BR and DR do no estimator fitting and do not re-derive variance from +raw data — every effect, SE, p-value, CI, and sensitivity bound is +read from the fitted result or produced by an existing diff-diff +utility (may call `check_parallel_trends`, `bacon_decompose`, or +`EfficientDiD.hausman_pretest` when the panel + column kwargs are +supplied). Report-layer cross-period aggregations are enumerated in +`docs/methodology/REPORTING.md`. 
Both schemas are experimental in the +current release; see that document for phrasing rules, the +no-traffic-light decision, unit-translation policy, and schema +stability policy. diff --git a/diff_diff/guides/llms-practitioner.txt b/diff_diff/guides/llms-practitioner.txt index 2a6688d8..a8a78008 100644 --- a/diff_diff/guides/llms-practitioner.txt +++ b/diff_diff/guides/llms-practitioner.txt @@ -457,13 +457,21 @@ print(dr.summary()) # overall interpretation paragraph dr.to_dict() # AI-legible structured schema # Or let BusinessReport auto-construct a DiagnosticReport and render the -# full stakeholder narrative in one call: +# full stakeholder narrative in one call. Pass ``data`` + the column +# names so data-dependent checks (2x2 PT, Goodman-Bacon, EfficientDiD +# Hausman pretest) actually run — without them the auto path still +# produces a report but skips those checks with an explicit reason. br = BusinessReport( cs_result, outcome_label='Revenue per user', outcome_unit='$', business_question='Did the campaign lift revenue?', treatment_label='the campaign', + data=data, + outcome='y', + unit='id', + time='t', + first_treat='g', ) print(br.summary()) # short paragraph block print(br.full_report()) # structured markdown diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst index aa12f5f4..27482742 100644 --- a/docs/api/business_report.rst +++ b/docs/api/business_report.rst @@ -14,6 +14,15 @@ to surface pre-trends, sensitivity, and other validity checks as part of the narrative. Pass ``auto_diagnostics=False`` to skip this, or ``diagnostics=`` to supply an explicit one. +Data-dependent checks (2x2 parallel trends on simple DiD, +Goodman-Bacon decomposition on staggered estimators, the EfficientDiD +Hausman PT-All vs PT-Post pretest) require the raw panel + column +names. 
Pass ``data``, ``outcome``, ``treatment``, ``unit``, ``time``, +and/or ``first_treat`` to ``BusinessReport`` and they are forwarded +to the auto-constructed ``DiagnosticReport``. Without these kwargs, +those specific checks are skipped with an explicit reason while the +rest of the report still renders. + Methodology deviations (no traffic-light gates, pre-trends verdict thresholds, power-aware phrasing, unit-translation policy, schema stability) are documented in :doc:`../methodology/REPORTING`. @@ -35,6 +44,15 @@ Example outcome_unit="$", business_question="Did the loyalty program lift revenue?", treatment_label="the loyalty program", + # Optional: panel + column names so auto diagnostics can run the + # data-dependent checks (2x2 PT, Goodman-Bacon, EfficientDiD + # Hausman). Without these the auto path still runs and just + # skips those checks. + data=df, + outcome="revenue", + unit="store", + time="period", + first_treat="first_treat", ) print(report.summary()) diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index 0fc9d3d3..6385a3a6 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -21,7 +21,11 @@ SE, p-value, CI, and sensitivity bound is either read from the fitted result or produced by an existing diff-diff utility (`compute_honest_did`, `HonestDiD.sensitivity`, `bacon_decompose`, `check_parallel_trends`, `compute_deff_diagnostics`, -`compute_pretrends_power`). The report layer **does** compose a few +`compute_pretrends_power`). When the caller passes the raw panel + +column kwargs, `DiagnosticReport` may call those utilities on the +supplied data (2x2 PT via `check_parallel_trends`, Goodman-Bacon +decomposition via `bacon_decompose`, and the EfficientDiD Hausman +PT-All vs PT-Post pretest via `EfficientDiD.hausman_pretest`). 
The report layer **does** compose a few cross-period summary statistics from per-period inputs already produced by the estimator — specifically the joint-Wald / Bonferroni pre-trends p-value from pre-period event-study coefficients (see diff --git a/tests/test_business_report.py b/tests/test_business_report.py index d217ed99..0971d63b 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1100,6 +1100,59 @@ class _Result: ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" +class TestCSNotYetTreatedControlGroupSemantics: + """Round-13 P1 regression: ``BusinessReport`` must not relabel + ``n_control_units`` as generic "control" for a + ``CallawaySantAnna(control_group='not_yet_treated')`` fit — that + field counts only never-treated units, while the actual comparison + group is the dynamic not-yet-treated set at each (g, t) cell. + """ + + def test_not_yet_treated_fit_does_not_render_misleading_control_count(self): + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + # Fit with the dynamic not-yet-treated comparison mode. + cs = CallawaySantAnna(base_period="universal", control_group="not_yet_treated").fit( + sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport(cs, auto_diagnostics=False) + sample = br.to_dict()["sample"] + + # Fixed ``n_control`` must NOT be populated — the comparison set + # is dynamic per (g, t), not a fixed unit tally. + assert ( + sample["n_control"] is None + ), f"n_control must be None for not_yet_treated; got {sample['n_control']}" + # The new fields surface the real semantics. + assert sample["control_group"] == "not_yet_treated" + assert sample["n_never_treated"] == getattr(cs, "n_control_units", None) + + # Both summary and full_report must describe the dynamic + # comparison group rather than asserting a misleading "control" + # count. 
+ summary = br.summary() + # No "(N treated, N control)" phrasing on this path. + assert " control)" not in summary + assert "not-yet-treated" in summary or "dynamic" in summary + + full = br.full_report() + assert "- Control:" not in full or "not-yet-treated" in full + assert "dynamic not-yet-treated" in full or "not-yet-treated" in full + + def test_never_treated_fit_still_shows_fixed_control_count(self, cs_fit): + """Default path (``control_group='never_treated'``) keeps the + fixed ``n_control`` tally so existing prose is unchanged.""" + fit, _ = cs_fit # default is never_treated + br = BusinessReport(fit, auto_diagnostics=False) + sample = br.to_dict()["sample"] + assert isinstance(sample["n_control"], int) + assert sample["control_group"] == "never_treated" + + class TestBRDataKwargsPassthroughToAutoDR: """Round-12 regression: ``BusinessReport`` now accepts ``data`` / ``outcome`` / ``treatment`` / ``unit`` / ``time`` / From 9a93be391df8279ec4c2b6ce43acbcc6ac7c656c Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 06:08:48 -0400 Subject: [PATCH 17/48] Address fourteenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round 14 returned ✅ Looks good again (approved; no P0/P1). One non-blocking P2 picked up: - P2 Centralize diagonal-fallback tier downgrade: the conservative REPORTING.md deviation that downgrades ``well_powered`` to ``moderately_powered`` when ``compute_pretrends_power`` used a diagonal-SE approximation while the full ``event_study_vcov`` was available was previously applied only inside ``BusinessReport`` summary prose. ``BusinessReport.to_dict()`` / ``full_report()`` still carried the raw tier from ``_lift_pre_trends``, and ``DiagnosticReport.summary()`` also read the raw tier, so the same fit could be rendered as "moderately informative" in one surface and "well-powered" in another. 
Moved the downgrade into ``DiagnosticReport._check_pretrends_power`` (where the tier is first computed) so every downstream surface reads the same adjusted value. Removed the now-redundant BR-side downgrade. - Regressions: ``TestDiagFallbackDowngradeAppliedCentrally`` covers the centralization on a hand-crafted DR schema with a pre-baked ``diag_fallback_available_full_vcov_unused`` cov-source — asserts BR schema tier is ``moderately_powered`` and none of ``summary`` / ``full_report`` / overall-interpretation prose uses well-powered phrasing. A complementary real-fit test runs on the ``cs_fit`` fixture and, if the helper triggered the flagged fallback, confirms the tier is not ``well_powered``. 136 targeted tests passing; black / ruff / mypy clean on BR/DR modules. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 13 ++-- diff_diff/diagnostic_report.py | 11 +++ tests/test_business_report.py | 120 +++++++++++++++++++++++++++++++++ 3 files changed, 135 insertions(+), 9 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index b7d64a5f..adc12b79 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1280,17 +1280,12 @@ def _render_summary(schema: Dict[str, Any]) -> str: if pt.get("status") == "computed": jp = pt.get("joint_p_value") verdict = pt.get("verdict") + # ``tier`` already incorporates the diagonal-fallback downgrade — + # ``DiagnosticReport._check_pretrends_power`` applies it centrally + # so every report surface (BR summary, BR full_report, BR schema, + # DR summary) reads the same adjusted value (round-14 CI review). tier = pt.get("power_tier") method = pt.get("method") - # ``compute_pretrends_power`` currently falls back to ``np.diag(ses**2)`` - # for CS / SA / ImputationDiD / Stacked / etc., even when the full - # ``event_study_vcov`` is available. 
Downgrade any "well_powered" tier - # to "moderately_powered" when we know the diagonal approximation was - # the only input — a diagonal-only MDV can be optimistic because it - # ignores correlations across pre-periods. - cov_source = pt.get("power_covariance_source") - if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused": - tier = "moderately_powered" subject = _pt_method_subject(method) stat_label = _pt_method_stat_label(method) jp_phrase = ( diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index f3a3dd11..1cb007f5 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1068,6 +1068,17 @@ def _check_pretrends_power(self) -> Dict[str, Any]: cov_source = "full_pre_period_vcov" tier = _power_tier(ratio) + # Central diagonal-fallback downgrade. When the helper used the + # diagonal-SE approximation while the full ``event_study_vcov`` + # was available, a ``well_powered`` verdict can be optimistic + # because off-diagonal pre-period correlations are ignored. + # REPORTING.md's conservative deviation says to downgrade in + # that case. Doing it here (once) ensures every downstream + # surface — BR ``summary()``, BR ``full_report()``, BR schema, + # DR ``summary()`` — reads the same adjusted tier (round-14 + # CI review flagged per-surface divergence). 
+ if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused": + tier = "moderately_powered" return { "status": "ran", "method": "compute_pretrends_power", diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 0971d63b..7d768ea6 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1100,6 +1100,126 @@ class _Result: ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" +class TestDiagFallbackDowngradeAppliedCentrally: + """Round-14 regression: when ``compute_pretrends_power`` fell back to + a diagonal-SE approximation while the full ``event_study_vcov`` was + available, the ``well_powered`` tier must be downgraded to + ``moderately_powered`` on **every** report surface (BR summary, BR + full_report, BR schema, DR summary), not just inside one of them. + Centralize the downgrade in ``_check_pretrends_power`` so every + consumer reads the same adjusted tier. REPORTING.md lines 126-139. + """ + + def test_br_schema_tier_is_downgraded(self): + """Smoke-check that the centralized downgrade lands in the DR + schema when ``covariance_source`` is the flagged fallback value.""" + # Build a hand-crafted DR schema exactly as the centralized + # downgrade would emit it — mdv ratio < 0.25 (so the pre- + # downgrade tier is ``well_powered``), cov_source is the + # diag-fallback-with-full-vcov-available sentinel. + from diff_diff.diagnostic_report import DiagnosticReportResults + + schema = { + "schema_version": "1.0", + "estimator": "CallawaySantAnnaResults", + "headline_metric": {"name": "overall_att", "value": 1.0}, + "parallel_trends": { + "status": "ran", + "method": "joint_wald_event_study", + "joint_p_value": 0.40, + "verdict": "no_detected_violation", + }, + "pretrends_power": { + "status": "ran", + "method": "compute_pretrends_power", + "mdv": 0.10, + "mdv_share_of_att": 0.10, + # Central downgrade: tier already reflects the cov-source. 
+ "tier": "moderately_powered", + "covariance_source": "diag_fallback_available_full_vcov_unused", + }, + "sensitivity": {"status": "not_applicable"}, + "placebo": {"status": "skipped", "reason": "opt-in"}, + "bacon": {"status": "not_applicable"}, + "design_effect": {"status": "not_applicable"}, + "heterogeneity": {"status": "not_applicable"}, + "epv": {"status": "not_applicable"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "overall_interpretation": "", + "next_steps": [], + } + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + + dr_results = DiagnosticReportResults( + schema=schema, + interpretation="", + applicable_checks=("parallel_trends", "pretrends_power"), + skipped_checks={}, + warnings=(), + ) + br = BusinessReport(stub, diagnostics=dr_results) + br_schema = br.to_dict() + pt_block = br_schema["pre_trends"] + assert pt_block["power_tier"] == "moderately_powered" + # All three prose surfaces must reflect the downgraded tier — + # none should render the well-powered phrasing ("likely have + # been detected" / well-powered adjective). + summary = br.summary() + full = br.full_report() + for text in (summary, full): + assert "well-powered" not in text.lower() + assert "likely have" not in text + # Positive check: moderately-informative phrasing appears in BR + # prose and BR's overall-interpretation pass-through. 
+ assert ( + "moderately informative" in summary + or "moderately informative" in full + or "moderately-informative" in summary + ) + + def test_center_downgrade_fires_on_real_cs_fit(self, cs_fit): + """On a real CS fit the central downgrade should land in the DR + schema when the helper used the diagonal fallback — no separate + BR-side downgrade is needed.""" + from diff_diff import DiagnosticReport + + fit, sdf = cs_fit + dr = DiagnosticReport( + fit, + data=sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + pp = dr.to_dict()["pretrends_power"] + if pp.get("status") != "ran": + pytest.skip("pretrends_power did not run on this fixture") + cov = pp.get("covariance_source") + if cov != "diag_fallback_available_full_vcov_unused": + pytest.skip( + "fixture did not trigger the diag_fallback_available path; " "nothing to downgrade" + ) + # When the flagged cov_source fires, tier must never be + # ``well_powered`` — centralized downgrade guarantees this. + assert pp["tier"] != "well_powered" + + class TestCSNotYetTreatedControlGroupSemantics: """Round-13 P1 regression: ``BusinessReport`` must not relabel ``n_control_units`` as generic "control" for a From 60122a9616a588b1a1372d6a56fca624ac9a3c7e Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 06:45:48 -0400 Subject: [PATCH 18/48] Address fifteenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 Anticipation-aware horizon classification: the report layer no longer hard-codes ``rel < 0`` / ``rel >= 0`` / ``t >= g`` as the pre/post boundary. On fits with ``anticipation=k`` (CS, SA, EfficientDiD per REGISTRY.md §CallawaySantAnna lines 355-395), horizons ``e ∈ [-k, -1]`` are the anticipation window — treatment- affected, not clean pre-periods. 
The new ``diff_diff.diagnostic_report._pre_post_boundary`` helper reads ``getattr(results, 'anticipation', 0)`` and returns ``-anticipation`` as the shared cutoff. Callers updated: - ``_collect_pre_period_coefs`` (PT + pre-trends power) now keeps ``rel < _pre_post_boundary(results)``. - ``_collect_effect_scalars`` (heterogeneity dispersion) now keeps ``rel >= _pre_post_boundary(results)`` for event-study inputs and ``t >= g - anticipation`` for CS ``group_time_effects``. - ``diff_diff.pretrends.PreTrendsPower._extract_pre_period_info`` CS and SA adapters apply the same ``t < -anticipation`` cutoff, so ``mdv`` / ``n_pre_periods`` / BR power tiers are computed from real pre-treatment coefficients only. - Regressions: ``TestAnticipationAwareHorizonClassification`` covers three cases on a CS-shaped stub with six horizons (-3, -2, -1, 0, 1, 2): - ``anticipation=1``: pre-period collector returns exactly {-3, -2} (anticipation window at -1 excluded); heterogeneity returns {0.80, 1.00, 1.20, 1.40} (anticipation-window effect at rel=-1 is included as treatment-affected). - ``anticipation=0``: classification falls back to the original ``rel < 0`` / ``rel >= 0`` behavior; PT collector gets {-3, -2, -1} and heterogeneity gets the three post values. - Did not shift the CS ``group_time_effects`` cell filter for estimators whose aggregation rule is not anticipation-shifted (e.g., Wooldridge per REGISTRY.md line 1351); ``anticipation`` defaults to ``0`` there via ``getattr`` and the cell filter falls back to the original ``t >= g`` rule automatically. 202 targeted tests passing (134 BR/DR + 68 pretrends); black / ruff / mypy clean on BR/DR modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 59 ++++++++++++++++++++++++-- diff_diff/pretrends.py | 33 ++++++++++++--- tests/test_business_report.py | 75 ++++++++++++++++++++++++++++++++++ 3 files changed, 158 insertions(+), 9 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 1cb007f5..b29634ee 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1576,6 +1576,13 @@ def _collect_effect_scalars(self) -> List[float]: # exclude entries with non-finite effect. es = getattr(r, "event_study_effects", None) if es is not None: + # Anticipation-aware post-treatment cutoff: include horizons + # from the anticipation window onward (where treatment- + # affected effects can live) per REGISTRY.md §CallawaySantAnna + # lines 355-395; round-15 CI review flagged the prior + # ``rel >= 0`` rule as excluding anticipation-window effects + # from the heterogeneity dispersion summary. + post_cutoff = _pre_post_boundary(r) post_only: List[float] = [] try: items = list(es.items()) @@ -1588,7 +1595,7 @@ def _collect_effect_scalars(self) -> List[float]: # Non-integer keys — unknown shape; skip conservatively # rather than mixing into the dispersion summary. continue - if rel < 0: + if rel < post_cutoff: continue if isinstance(entry, dict): if entry.get("n_groups") == 0 or entry.get("n_obs") == 0: @@ -1637,7 +1644,16 @@ def _collect_effect_scalars(self) -> List[float]: try: g_num = float(g_t[0]) t_num = float(g_t[1]) - if t_num < g_num: + # Anticipation-aware post cutoff for (g, t) cells: + # a fit with ``anticipation=k`` treats cells with + # ``t >= g - k`` as treatment-affected (the + # anticipation window is post-announcement). 
+ anticipation = getattr(r, "anticipation", 0) or 0 + try: + anticipation = int(anticipation) + except (TypeError, ValueError): + anticipation = 0 + if t_num < g_num - anticipation: continue except (TypeError, ValueError): pass @@ -2015,6 +2031,38 @@ def _power_tier(ratio: Optional[float]) -> str: return "underpowered" +def _pre_post_boundary(results: Any) -> int: + """Return the relative-time cutoff that separates true pre-period + horizons from treatment (and post-treatment) horizons. + + Horizons ``rel < _pre_post_boundary(results)`` are true pre-period + coefficients suitable for PT tests and pre-trends power. Horizons + ``rel >= _pre_post_boundary(results)`` include the anticipation + window and post-treatment effects — these are the "affected by + treatment (or anticipated treatment)" horizons, and are what + heterogeneity dispersion should summarize. + + For anticipation-aware staggered estimators (CS, SA, EfficientDiD, + etc., per REGISTRY.md §CallawaySantAnna lines 355-395), a fit with + ``anticipation=k`` moves the identification boundary to + ``e = -1 - k`` and treats ``e ∈ [-k, -1]`` as the anticipation + window. True pre-periods are ``e < -k``. Returns ``-anticipation`` + (non-positive integer) in that case, falling back to ``0`` (the + standard ``e < 0`` boundary) when no anticipation field is exposed. + + Round-15 CI review on PR #318 flagged the hard-coded ``rel < 0`` + rule as a methodology mismatch on anticipation fits. + """ + anticipation = getattr(results, "anticipation", 0) + try: + k = int(anticipation) + except (TypeError, ValueError): + return 0 + if not np.isfinite(k) or k < 0: + return 0 + return -k + + def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Optional[float]]]: """Return a sorted list of ``(key, effect, se, p_value)`` for pre-period coefficients. 
@@ -2085,6 +2133,11 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt continue results_list.append((k, eff_f, se_f, _to_python_float(p))) else: + # Anticipation-aware cutoff: for CS/SA/EfficientDiD fits with + # ``anticipation=k``, treat horizons ``e ∈ [-k, -1]`` as the + # anticipation window (not true pre-periods) and only use + # ``e < -k`` for PT tests. + pre_cutoff = _pre_post_boundary(results) es = getattr(results, "event_study_effects", None) or {} for k, entry in es.items(): # Pre-period relative-time keys are negative (convention: e=-1, -2, ...). @@ -2092,7 +2145,7 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt rel = int(k) except (TypeError, ValueError): continue - if rel >= 0: + if rel >= pre_cutoff: continue if not isinstance(entry, dict): continue diff --git a/diff_diff/pretrends.py b/diff_diff/pretrends.py index abf4cd29..23588da0 100644 --- a/diff_diff/pretrends.py +++ b/diff_diff/pretrends.py @@ -613,12 +613,25 @@ def _extract_pre_period_params( "Re-run with aggregate='event_study'." ) - # Get pre-period effects (negative relative times) - # Filter out normalization constraints (n_groups=0) and non-finite SEs + # Get pre-period effects. Anticipation-aware cutoff per + # REGISTRY.md §CallawaySantAnna lines 355-395: with + # ``anticipation=k``, true pre-periods are ``t < -k``; + # ``t ∈ [-k, -1]`` is the anticipation window and must + # not be used for pre-trends power. Filter out + # normalization constraints (n_groups=0) and non-finite + # SEs as well. 
+ _ant = getattr(results, "anticipation", 0) or 0 + try: + _ant = int(_ant) + except (TypeError, ValueError): + _ant = 0 + _pre_cutoff = -_ant pre_effects = { t: data for t, data in results.event_study_effects.items() - if t < 0 and data.get("n_groups", 1) > 0 and np.isfinite(data.get("se", np.nan)) + if t < _pre_cutoff + and data.get("n_groups", 1) > 0 + and np.isfinite(data.get("se", np.nan)) } if not pre_effects: @@ -640,12 +653,20 @@ def _extract_pre_period_params( from diff_diff.sun_abraham import SunAbrahamResults if isinstance(results, SunAbrahamResults): - # Get pre-period effects (negative relative times) - # Filter out normalization constraints (n_groups=0) and non-finite SEs + # Same anticipation-aware pre-period cutoff as + # CallawaySantAnna above. + _ant = getattr(results, "anticipation", 0) or 0 + try: + _ant = int(_ant) + except (TypeError, ValueError): + _ant = 0 + _pre_cutoff = -_ant pre_effects = { t: data for t, data in results.event_study_effects.items() - if t < 0 and data.get("n_groups", 1) > 0 and np.isfinite(data.get("se", np.nan)) + if t < _pre_cutoff + and data.get("n_groups", 1) > 0 + and np.isfinite(data.get("se", np.nan)) } if not pre_effects: diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 7d768ea6..15967b18 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1100,6 +1100,81 @@ class _Result: ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" +class TestAnticipationAwareHorizonClassification: + """Round-15 P1 regression: on anticipation-aware fits (CS / SA / + EfficientDiD with ``anticipation > 0``), the report layer must + classify horizons using the shifted boundary: + + - True pre-periods (PT + pre-trends power): ``rel < -anticipation``. + - Treatment-affected horizons (heterogeneity dispersion): + ``rel >= -anticipation`` (anticipation window is post-announcement). 
+ + Prior code hard-coded ``rel < 0`` / ``rel >= 0`` and could include + anticipation-window coefficients as "pre" in PT / power while + excluding them as "post" in heterogeneity. REGISTRY.md + §CallawaySantAnna lines 355-395 documents the shifted-boundary rule. + """ + + def _cs_stub_with_anticipation(self, *, anticipation: int = 1): + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.base_period = "universal" + stub.anticipation = anticipation + stub.event_study_effects = { + -3: {"effect": -0.05, "se": 0.1, "p_value": 0.62, "n_groups": 15}, + -2: {"effect": 0.04, "se": 0.1, "p_value": 0.69, "n_groups": 15}, + -1: {"effect": 0.80, "se": 0.1, "p_value": 0.01, "n_groups": 15}, + 0: {"effect": 1.00, "se": 0.1, "p_value": 0.001, "n_groups": 15}, + 1: {"effect": 1.20, "se": 0.1, "p_value": 0.001, "n_groups": 12}, + 2: {"effect": 1.40, "se": 0.1, "p_value": 0.001, "n_groups": 10}, + } + return stub + + def test_pre_period_collector_excludes_anticipation_window(self): + from diff_diff.diagnostic_report import _collect_pre_period_coefs + + stub = self._cs_stub_with_anticipation(anticipation=1) + pre = _collect_pre_period_coefs(stub) + keys = sorted(row[0] for row in pre) + # Anticipation window (rel=-1) must be excluded; only -3, -2 remain. + assert keys == [-3, -2], ( + f"pre-period collector must exclude the anticipation " f"window; got {keys}" + ) + + def test_heterogeneity_includes_anticipation_window(self): + from diff_diff import DiagnosticReport + + stub = self._cs_stub_with_anticipation(anticipation=1) + dr = DiagnosticReport(stub) + effects = sorted(dr._collect_effect_scalars()) + # rel ∈ {-1, 0, 1, 2} → {0.80, 1.00, 1.20, 1.40}. 
+ assert effects == pytest.approx([0.80, 1.00, 1.20, 1.40]) + + def test_anticipation_zero_preserves_old_behavior(self): + from diff_diff import DiagnosticReport + from diff_diff.diagnostic_report import _collect_pre_period_coefs + + stub = self._cs_stub_with_anticipation(anticipation=0) + pre = _collect_pre_period_coefs(stub) + assert sorted(row[0] for row in pre) == [-3, -2, -1] + + dr = DiagnosticReport(stub) + effects = sorted(dr._collect_effect_scalars()) + # Only non-negative horizons: 1.00, 1.20, 1.40. + assert effects == pytest.approx([1.00, 1.20, 1.40]) + + class TestDiagFallbackDowngradeAppliedCentrally: """Round-14 regression: when ``compute_pretrends_power`` fell back to a diagonal-SE approximation while the full ``event_study_vcov`` was From ead5d2a455ec8898e4c22f0eac9964a0b8a0e315 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 07:02:18 -0400 Subject: [PATCH 19/48] Address sixteenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 Wooldridge payload support: the collectors no longer assume an ``effect`` key. ``_extract_scalar_effect`` now accepts either ``effect`` or ``att`` on dict payloads and object attrs, and ``_collect_pre_period_coefs`` does the same in its event-study branch. Wooldridge stores ``att`` in ``group_time_effects`` / ``group_effects`` / ``event_study_effects``, so PT and heterogeneity previously skipped silently on Wooldridge fits. - P1 Wooldridge horizon classification: ``_pre_post_boundary`` now returns ``0`` for ``WooldridgeDiDResults`` regardless of ``anticipation``. The CS ``group_time_effects`` cell filter likewise uses ``t >= g`` (unshifted) on Wooldridge. REGISTRY.md §Wooldridge lines 1351-1352 documents this rule — anticipation- window cells are rendered as placebos by Wooldridge aggregation, not post-treatment effects. 
- P3 Docs narrowing: ``docs/methodology/REPORTING.md`` and ``llms-full.txt`` no longer list ``compute_deff_diagnostics`` as a utility DR calls. The ``design_effect`` section is a read-only surface that echoes ``survey_metadata.design_effect`` / ``effective_n`` plus a plain-English band label (the helper needs per-fit internals the result objects do not expose). - Regressions: ``TestWooldridgeResultsRouting`` covers four cases on a Wooldridge-shaped stub with ``att``-keyed payloads: - pre-period collector recognizes ``att`` - heterogeneity collector recognizes ``att`` - ``anticipation=1`` does NOT shift the pre-period cutoff - ``anticipation=1`` does NOT include the anticipation window in heterogeneity (Wooldridge treats it as placebo) 206 targeted tests passing (134 BR + 64 DR + 8 pretrends); black / ruff / mypy clean on BR/DR modules. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 50 +++++++++++++++++----- diff_diff/guides/llms-full.txt | 5 ++- docs/methodology/REPORTING.md | 18 +++++--- tests/test_business_report.py | 77 ++++++++++++++++++++++++++++++++++ 4 files changed, 132 insertions(+), 18 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index b29634ee..3bbd99cd 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1644,15 +1644,24 @@ def _collect_effect_scalars(self) -> List[float]: try: g_num = float(g_t[0]) t_num = float(g_t[1]) - # Anticipation-aware post cutoff for (g, t) cells: - # a fit with ``anticipation=k`` treats cells with - # ``t >= g - k`` as treatment-affected (the - # anticipation window is post-announcement). - anticipation = getattr(r, "anticipation", 0) or 0 - try: - anticipation = int(anticipation) - except (TypeError, ValueError): + # Estimator-specific post cutoff. CS / + # EfficientDiD / SA treat ``t >= g - anticipation`` + # as treatment-affected (anticipation window is + # post-announcement). 
Wooldridge aggregation is + # documented as ``t >= g`` with the anticipation + # window rendered as placebos, not post- + # treatment effects (REGISTRY.md §Wooldridge + # lines 1351-1352). Round-16 CI review flagged + # the blanket anticipation shift as Wooldridge- + # unfaithful. + if type(r).__name__ == "WooldridgeDiDResults": anticipation = 0 + else: + anticipation = getattr(r, "anticipation", 0) or 0 + try: + anticipation = int(anticipation) + except (TypeError, ValueError): + anticipation = 0 if t_num < g_num - anticipation: continue except (TypeError, ValueError): @@ -1988,13 +1997,18 @@ def _extract_scalar_headline( def _extract_scalar_effect(val: Any) -> Optional[float]: - """Pull a scalar ``effect`` out of the many shapes results expose. + """Pull a scalar effect out of the many shapes results expose. - Handles: ``PeriodEffect`` / ``GroupTimeEffect`` objects (``.effect`` attr), - dicts with an ``"effect"`` key, and bare scalars. + Handles: ``PeriodEffect`` / ``GroupTimeEffect`` objects (``.effect`` + or ``.att`` attr), dicts with an ``"effect"`` or ``"att"`` key, and + bare scalars. Wooldridge stores ``att`` in its ``group_time_effects`` + / ``group_effects`` / ``event_study_effects`` payloads rather than + ``effect`` (round-16 CI review on PR #318). """ if isinstance(val, dict): eff = val.get("effect") + if eff is None: + eff = val.get("att") if eff is None: return None try: @@ -2002,6 +2016,8 @@ def _extract_scalar_effect(val: Any) -> Optional[float]: except (TypeError, ValueError): return None eff_attr = getattr(val, "effect", None) + if eff_attr is None: + eff_attr = getattr(val, "att", None) if eff_attr is not None: try: return float(eff_attr) @@ -2052,7 +2068,15 @@ def _pre_post_boundary(results: Any) -> int: Round-15 CI review on PR #318 flagged the hard-coded ``rel < 0`` rule as a methodology mismatch on anticipation fits. 
+ + Estimator-specific override: Wooldridge aggregation keeps + ``t >= g`` and treats anticipation-window cells as placebos, not + post-treatment effects (REGISTRY.md §Wooldridge lines 1351-1352). + The boundary for ``WooldridgeDiDResults`` is therefore ``0`` + regardless of the ``anticipation`` value stored on the result. """ + if type(results).__name__ == "WooldridgeDiDResults": + return 0 anticipation = getattr(results, "anticipation", 0) try: k = int(anticipation) @@ -2158,7 +2182,11 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt # and joint-Wald index are not inflated by non-informative rows. if entry.get("n_groups") == 0 or entry.get("n_obs") == 0: continue + # Wooldridge stores ``att`` rather than ``effect`` in its + # event-study payloads; accept either (round-16 CI review). eff = entry.get("effect") + if eff is None: + eff = entry.get("att") se = entry.get("se") p = entry.get("p_value") if eff is None or se is None: diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 9710db8c..5173094e 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -1853,7 +1853,10 @@ raw data — every effect, SE, p-value, CI, and sensitivity bound is read from the fitted result or produced by an existing diff-diff utility (may call `check_parallel_trends`, `bacon_decompose`, or `EfficientDiD.hausman_pretest` when the panel + column kwargs are -supplied). Report-layer cross-period aggregations are enumerated in +supplied). The `design_effect` section is read-only: it echoes +`survey_metadata.design_effect` / `effective_n` from the fitted +result rather than calling `compute_deff_diagnostics`. Report-layer +cross-period aggregations are enumerated in `docs/methodology/REPORTING.md`. 
Both schemas are experimental in the current release; see that document for phrasing rules, the no-traffic-light decision, unit-translation policy, and schema diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index 6385a3a6..119fbc65 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -20,12 +20,18 @@ fitting and do not re-derive any variance from raw data; every effect, SE, p-value, CI, and sensitivity bound is either read from the fitted result or produced by an existing diff-diff utility (`compute_honest_did`, `HonestDiD.sensitivity`, `bacon_decompose`, -`check_parallel_trends`, `compute_deff_diagnostics`, -`compute_pretrends_power`). When the caller passes the raw panel + -column kwargs, `DiagnosticReport` may call those utilities on the -supplied data (2x2 PT via `check_parallel_trends`, Goodman-Bacon -decomposition via `bacon_decompose`, and the EfficientDiD Hausman -PT-All vs PT-Post pretest via `EfficientDiD.hausman_pretest`). The report layer **does** compose a few +`check_parallel_trends`, `compute_pretrends_power`). When the caller +passes the raw panel + column kwargs, `DiagnosticReport` may call +those utilities on the supplied data (2x2 PT via +`check_parallel_trends`, Goodman-Bacon decomposition via +`bacon_decompose`, and the EfficientDiD Hausman PT-All vs PT-Post +pretest via `EfficientDiD.hausman_pretest`). + +The `design_effect` section of `DiagnosticReport.to_dict()` is a +read-only surface: it echoes `survey_metadata.design_effect` and +`effective_n` from the fitted result along with a plain-English band +label. It does not call `compute_deff_diagnostics` (that helper +needs per-fit internals the result objects do not expose). 
The report layer **does** compose a few cross-period summary statistics from per-period inputs already produced by the estimator — specifically the joint-Wald / Bonferroni pre-trends p-value from pre-period event-study coefficients (see diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 15967b18..1168bcd2 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1100,6 +1100,83 @@ class _Result: ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" +class TestWooldridgeResultsRouting: + """Round-16 P1 regression: the collectors must accept + ``WooldridgeDiDResults`` payloads, which use ``att`` (not + ``effect``). Without this, PT and heterogeneity silently skip on + Wooldridge fits. Also, Wooldridge aggregation keeps ``t >= g`` and + ignores the ``anticipation`` shift used by CS / SA / EfficientDiD + (REGISTRY.md §Wooldridge lines 1351-1352). + """ + + def _wooldridge_stub(self, *, anticipation: int = 0): + class WooldridgeDiDResults: + pass + + stub = WooldridgeDiDResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.anticipation = anticipation + # Event study: Wooldridge payloads use ``att`` not ``effect``. 
+ stub.event_study_effects = { + -2: {"att": -0.05, "se": 0.1, "p_value": 0.62}, + -1: {"att": 0.04, "se": 0.1, "p_value": 0.69}, + 0: {"att": 1.00, "se": 0.1, "p_value": 0.001}, + 1: {"att": 1.20, "se": 0.1, "p_value": 0.001}, + 2: {"att": 1.40, "se": 0.1, "p_value": 0.001}, + } + return stub + + def test_pre_period_collector_reads_att_payload(self): + from diff_diff.diagnostic_report import _collect_pre_period_coefs + + stub = self._wooldridge_stub() + pre = _collect_pre_period_coefs(stub) + keys = sorted(row[0] for row in pre) + assert keys == [ + -2, + -1, + ], f"pre-period collector must read Wooldridge ``att`` payloads; got {keys}" + effects = {row[0]: row[1] for row in pre} + assert effects[-2] == pytest.approx(-0.05) + assert effects[-1] == pytest.approx(0.04) + + def test_heterogeneity_reads_att_payload(self): + from diff_diff import DiagnosticReport + + stub = self._wooldridge_stub() + dr = DiagnosticReport(stub) + effects = sorted(dr._collect_effect_scalars()) + # Event-study post-only: rel >= 0 → {1.00, 1.20, 1.40}. + assert effects == pytest.approx([1.00, 1.20, 1.40]) + + def test_wooldridge_ignores_anticipation_shift_on_pre_periods(self): + from diff_diff.diagnostic_report import _collect_pre_period_coefs + + stub = self._wooldridge_stub(anticipation=1) + pre = _collect_pre_period_coefs(stub) + keys = sorted(row[0] for row in pre) + # Wooldridge keeps rel < 0 regardless of anticipation. + assert keys == [-2, -1] + + def test_wooldridge_ignores_anticipation_shift_on_heterogeneity(self): + from diff_diff import DiagnosticReport + + stub = self._wooldridge_stub(anticipation=1) + dr = DiagnosticReport(stub) + effects = sorted(dr._collect_effect_scalars()) + # Anticipation window (rel=-1) must not leak into the post set + # for Wooldridge even with anticipation=1. 
+ assert effects == pytest.approx([1.00, 1.20, 1.40]) + + class TestAnticipationAwareHorizonClassification: """Round-15 P1 regression: on anticipation-aware fits (CS / SA / EfficientDiD with ``anticipation > 0``), the report layer must From ac7cbf20cec8cb1378c6870df0253033c26f1ae4 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 07:42:47 -0400 Subject: [PATCH 20/48] Address seventeenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 Anticipation-aware assumption block: ``_describe_assumption`` now centralizes the anticipation-adjustment via a new ``_apply_anticipation_to_assumption`` helper applied after the estimator-specific branch. On fits with ``results.anticipation > 0``, the block flips ``no_anticipation`` to ``False``, records ``anticipation_periods``, and appends an anticipation clause to the description ("Anticipation is allowed for the k periods immediately before treatment... not strict no-anticipation"). Covers CS, SA, MultiPeriod, Stacked, TwoStage, Imputation, EfficientDiD, StaggeredTripleDiff, ContinuousDiD, and Wooldridge per the estimator contracts in REGISTRY.md. - P1 Wooldridge sample semantics: ``_extract_sample`` now gates the "reinterpret ``n_control_units`` as ``n_never_treated``" step on a whitelist of estimators whose contracts define that field as a never-treated-only count (CS, SA, Imputation, TwoStage, Stacked, EfficientDiD, StaggeredTripleDiff, dCDH). Wooldridge is explicitly excluded because its ``n_control_units`` is the total eligible comparison set (never-treated + future-treated units that contribute valid not-yet-treated comparisons) per REGISTRY.md §Wooldridge line 1345. Wooldridge ``not_yet_treated`` fits now retain the fixed tally rather than being silently reinterpreted. 
- Regressions: - ``TestAnticipationAwareAssumptionBlock`` covers CS + ``anticipation=2`` (block flips ``no_anticipation`` off, records periods=2, description mentions "not strict no-anticipation"), EfficientDiD + ``anticipation=1`` (singular period wording), and the ``anticipation=0`` no-op path. - ``TestWooldridgeSampleNotYetTreatedSemantics`` covers Wooldridge + ``not_yet_treated`` (n_control preserved; n_never_treated stays None) and the complementary CS + ``not_yet_treated`` path (existing round-13 behavior preserved). 211 targeted tests passing (BR + DR + pretrends); black / ruff / mypy clean on BR/DR modules. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 76 +++++++++++++++++- tests/test_business_report.py | 140 ++++++++++++++++++++++++++++++++++ 2 files changed, 214 insertions(+), 2 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index adc12b79..8c1e1a9e 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -291,7 +291,10 @@ def _build_schema(self) -> Dict[str, Any]: pre_trends = _lift_pre_trends(dr_schema) sensitivity = _lift_sensitivity(dr_schema) robustness = _lift_robustness(dr_schema) - assumption = _describe_assumption(estimator_name, self._results) + assumption = _apply_anticipation_to_assumption( + _describe_assumption(estimator_name, self._results), + self._results, + ) next_steps = (dr_schema or {}).get("next_steps", []) caveats = _build_caveats(self._results, headline, sample, dr_schema) references = _references_for(estimator_name) @@ -483,10 +486,32 @@ def _extract_sample(self) -> Dict[str, Any]: # active control-group mode so prose can surface the dynamic- # comparison context instead of misreporting "0 control" # (round-13 CI review on PR #318). 
+ # + # Estimator-specific exception (round-17 CI review): Wooldridge + # stores ``n_control_units`` as the total eligible comparison + # set (never-treated plus future-treated units that contribute + # valid not-yet-treated comparisons). Re-labeling that total as + # ``n_never_treated`` would overstate never-treated availability. + # Keep the fixed-count labeling for Wooldridge in that mode. control_group = getattr(r, "control_group", None) + name = type(r).__name__ n_never_treated: Optional[int] = None n_control: Optional[int] = n_control_units - if isinstance(control_group, str) and control_group == "not_yet_treated": + _never_treated_count_contract = name in { + "CallawaySantAnnaResults", + "SunAbrahamResults", + "ImputationDiDResults", + "TwoStageDiDResults", + "StackedDiDResults", + "EfficientDiDResults", + "StaggeredTripleDiffResults", + "ChaisemartinDHaultfoeuilleResults", + } + if ( + isinstance(control_group, str) + and control_group == "not_yet_treated" + and _never_treated_count_contract + ): n_never_treated = n_control_units # Do not populate a fixed ``n_control`` for this mode: the # comparison set is dynamic and varies by (g, t) cell. @@ -645,6 +670,53 @@ def _lift_robustness(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: } +def _anticipation_periods(results: Any) -> int: + """Return the non-negative anticipation-period count from a result, or 0. + + Helper for ``_describe_assumption``. Anticipation-capable estimators + (MultiPeriodDiD, CS, SA, ImputationDiD, TwoStageDiD, Stacked, EfficientDiD, + StaggeredTripleDiff, ContinuousDiD, Wooldridge) expose ``anticipation`` + as an int defaulting to ``0``. 
+ """ + a = getattr(results, "anticipation", 0) + try: + k = int(a) + except (TypeError, ValueError): + return 0 + return k if k > 0 else 0 + + +def _apply_anticipation_to_assumption(block: Dict[str, Any], results: Any) -> Dict[str, Any]: + """If the fit used ``anticipation > 0``, flip ``no_anticipation`` off and + append an anticipation clause to the description. + + Round-17 CI review flagged the strict "plus no anticipation" language + on anticipation-enabled fits. Per REGISTRY.md §CallawaySantAnna lines + 355-395 and the matching sections for SA / MultiPeriod / Wooldridge / + EfficientDiD, a fit with ``anticipation=k`` shifts the effective + treatment boundary by ``k`` pre-periods; the identifying assumption + becomes "no treatment effects earlier than ``k`` periods before the + treatment start" rather than strict no-anticipation. + """ + k = _anticipation_periods(results) + if k <= 0: + return block + block = dict(block) # don't mutate the caller's dict + block["no_anticipation"] = False + block["anticipation_periods"] = k + period_word = "period" if k == 1 else "periods" + clause = ( + f" Anticipation is allowed for the {k} {period_word} immediately " + "before treatment: the identifying contract requires no treatment " + f"effects earlier than {k} {period_word} before the treatment " + "start (not strict no-anticipation)." 
+ ) + desc = block.get("description", "") + if isinstance(desc, str): + block["description"] = desc + clause + return block + + def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, Any]: """Return the identifying-assumption block for an estimator.""" if estimator_name in { diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 1168bcd2..3adc7b5f 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1100,6 +1100,146 @@ class _Result: ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" +class TestAnticipationAwareAssumptionBlock: + """Round-17 P1 regression: ``_describe_assumption`` must drop the + strict "plus no anticipation" language when the fit allows + ``anticipation > 0``. REGISTRY.md §CallawaySantAnna lines 355-395 + (and the matching SA / MultiPeriod / Wooldridge / EfficientDiD + sections) treat anticipation as a relaxation of the strict no- + anticipation assumption: no treatment effects earlier than ``k`` + periods before treatment, not none at all. 
+ """ + + def test_cs_with_anticipation_sets_no_anticipation_false(self): + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.event_study_effects = None + stub.anticipation = 2 + + br = BusinessReport(stub, auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert ( + a["no_anticipation"] is False + ), f"anticipation=2 must flip no_anticipation off; got {a}" + assert a["anticipation_periods"] == 2 + assert "2 periods" in a["description"] + assert "not strict no-anticipation" in a["description"] + + def test_efficient_did_with_anticipation_flips_no_anticipation_off(self): + class EfficientDiDResults: + pass + + stub = EfficientDiDResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.event_study_effects = None + stub.pt_assumption = "all" + stub.control_group = "never_treated" + stub.anticipation = 1 + + br = BusinessReport(stub, auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["no_anticipation"] is False + assert a["anticipation_periods"] == 1 + assert "1 period" in a["description"] + + def test_anticipation_zero_preserves_strict_no_anticipation(self): + """Default (``anticipation=0``) keeps the strict text.""" + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.event_study_effects = None + stub.anticipation = 0 + + br = 
BusinessReport(stub, auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["no_anticipation"] is True + assert "anticipation_periods" not in a + assert "not strict no-anticipation" not in a["description"] + + +class TestWooldridgeSampleNotYetTreatedSemantics: + """Round-17 P1 regression: Wooldridge's ``n_control_units`` is the + total eligible comparison set (never-treated plus future-treated + units that contribute valid not-yet-treated comparisons). BR must + NOT reinterpret that count as ``n_never_treated`` for Wooldridge, + which would overstate never-treated availability. CS / SA / + ImputationDiD / etc. retain the existing reinterpretation because + their contracts define ``n_control_units`` as never-treated only. + """ + + def test_wooldridge_not_yet_treated_keeps_fixed_n_control(self): + class WooldridgeDiDResults: + pass + + stub = WooldridgeDiDResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 # Total eligible, NOT never-treated only. + stub.survey_metadata = None + stub.control_group = "not_yet_treated" + + br = BusinessReport(stub, auto_diagnostics=False) + sample = br.to_dict()["sample"] + assert sample["n_control"] == 60, ( + "Wooldridge n_control_units is total eligible controls; " + "must not be hidden behind not_yet_treated reinterpretation" + ) + assert sample["n_never_treated"] is None + + def test_cs_not_yet_treated_still_reinterprets(self): + """CS retains the existing behavior: the fixed ``n_control`` is + suppressed and ``n_never_treated`` surfaces the never-treated + count. 
Regression from round 13.""" + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + cs = CallawaySantAnna(base_period="universal", control_group="not_yet_treated").fit( + sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + aggregate="event_study", + ) + br = BusinessReport(cs, auto_diagnostics=False) + sample = br.to_dict()["sample"] + assert sample["n_control"] is None + assert sample["n_never_treated"] == getattr(cs, "n_control_units", None) + + class TestWooldridgeResultsRouting: """Round-16 P1 regression: the collectors must accept ``WooldridgeDiDResults`` payloads, which use ``att`` (not From d8a49c71f9c3ad573349ca41dfe76cc5672dfe9c Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 08:04:01 -0400 Subject: [PATCH 21/48] Address eighteenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 Dynamic-control sample semantics for ContinuousDiD + StaggeredTripleDiff: ``BusinessReport._extract_sample`` now normalizes the ``control_group`` string (``"not_yet_treated"`` and ``"notyettreated"`` both canonicalize to the same dynamic mode) and surfaces the estimator-specific fixed subset: - ContinuousDiD: added to the never-treated-count contract set; in dynamic mode, ``n_control`` is suppressed and ``n_never_treated`` surfaces the stored D=0 tally (which is only part of the actual comparison set). - StaggeredTripleDiff: ``control_group="notyettreated"`` now suppresses the composite ``n_control_units`` total and exposes ``n_never_enabled`` (separate field on the result, per the estimator contract — never-enabled units are the fixed subset in this mode). - Exposed a new ``dynamic_control`` boolean on the sample schema so both summary() and full_report() branch on the canonical mode rather than a single exact-string check. 
Prose now renders "Never-enabled units present in the panel: N" for triple- difference fits and "Never-treated units present in the panel: N" for the rest. - Regressions: ``TestContinuousDiDDynamicControlSample`` covers the ContinuousDiD schema + summary + full_report path. ``TestStaggeredTripleDiffDynamicControlSample`` covers the triple-difference path with the ``"notyettreated"`` spelling and asserts ``n_never_enabled`` surfaces (not ``n_never_treated``). 213 targeted tests passing (140 BR + 65 DR + 8 pretrends); black / ruff / mypy clean on BR/DR modules. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 114 +++++++++++++++++++++------------- tests/test_business_report.py | 86 +++++++++++++++++++++++++ 2 files changed, 156 insertions(+), 44 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 8c1e1a9e..8de98ad6 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -477,25 +477,34 @@ def _extract_sample(self) -> Dict[str, Any]: n_control_units = _safe_int(getattr(r, "n_control", getattr(r, "n_control_units", None))) # Control-group semantics. For estimators that expose a - # ``control_group`` kwarg (CS, EfficientDiD), the meaning of - # ``n_control_units`` depends on it. On CallawaySantAnna with - # ``control_group="not_yet_treated"``, ``n_control_units`` counts - # only the never-treated subset, so the actual dynamic - # comparison group can be non-empty even when this count is 0. - # Label the exposed count as never-treated and record the - # active control-group mode so prose can surface the dynamic- - # comparison context instead of misreporting "0 control" - # (round-13 CI review on PR #318). + # ``control_group`` kwarg (CS, EfficientDiD, ContinuousDiD, + # StaggeredTripleDiff, ...), the meaning of ``n_control_units`` + # depends on it. 
When the mode is "not-yet-treated" (dynamic + # comparison set), the fixed tally stored on the result is only + # the fully-untreated subset — the actual comparison set varies + # by (g, t) cell. Label the exposed count accordingly so prose + # surfaces the dynamic context instead of misreporting + # "0 control" (round-13 / round-17 / round-18 CI review). # - # Estimator-specific exception (round-17 CI review): Wooldridge - # stores ``n_control_units`` as the total eligible comparison - # set (never-treated plus future-treated units that contribute - # valid not-yet-treated comparisons). Re-labeling that total as - # ``n_never_treated`` would overstate never-treated availability. - # Keep the fixed-count labeling for Wooldridge in that mode. + # Canonicalize both ``"not_yet_treated"`` (CS / EfficientDiD / + # ContinuousDiD / Wooldridge) and ``"notyettreated"`` + # (StaggeredTripleDiff) as the same dynamic mode. + # + # Per-estimator fixed-subset field: + # * CS / SA / Imputation / TwoStage / Stacked / EfficientDiD / + # dCDH / ContinuousDiD — ``n_control_units`` is the + # never-treated tally; surface as ``n_never_treated``. + # * StaggeredTripleDiff — ``n_control_units`` is a composite + # total; the fixed subset is ``n_never_enabled`` (stored + # separately on the result). + # * Wooldridge — ``n_control_units`` is total eligible + # comparisons (never-treated + future-treated) and does not + # map to a never-treated count. Keep on the fixed-count + # path even in dynamic mode. 
control_group = getattr(r, "control_group", None) name = type(r).__name__ n_never_treated: Optional[int] = None + n_never_enabled: Optional[int] = None n_control: Optional[int] = n_control_units _never_treated_count_contract = name in { "CallawaySantAnnaResults", @@ -504,30 +513,36 @@ def _extract_sample(self) -> Dict[str, Any]: "TwoStageDiDResults", "StackedDiDResults", "EfficientDiDResults", - "StaggeredTripleDiffResults", "ChaisemartinDHaultfoeuilleResults", + "ContinuousDiDResults", } - if ( - isinstance(control_group, str) - and control_group == "not_yet_treated" - and _never_treated_count_contract - ): - n_never_treated = n_control_units - # Do not populate a fixed ``n_control`` for this mode: the - # comparison set is dynamic and varies by (g, t) cell. - n_control = None - - return { + _canonical_control = ( + control_group.replace("_", "").lower() if isinstance(control_group, str) else None + ) + is_dynamic_control = _canonical_control == "notyettreated" + if is_dynamic_control: + if name == "StaggeredTripleDiffResults": + n_never_enabled = _safe_int(getattr(r, "n_never_enabled", None)) + n_control = None + elif _never_treated_count_contract: + n_never_treated = n_control_units + n_control = None + + sample_block: Dict[str, Any] = { "n_obs": _safe_int(getattr(r, "n_obs", None)), "n_treated": n_treated, "n_control": n_control, "n_never_treated": n_never_treated, "control_group": control_group if isinstance(control_group, str) else None, + "dynamic_control": is_dynamic_control, "n_periods": _safe_int(getattr(r, "n_periods", None)), "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)), "post_periods": _safe_list_len(getattr(r, "post_periods", None)), "survey": survey, } + if n_never_enabled is not None: + sample_block["n_never_enabled"] = n_never_enabled + return sample_block def _extract_survey_block(self) -> Optional[Dict[str, Any]]: sm = getattr(self._results, "survey_metadata", None) @@ -1460,29 +1475,33 @@ def _render_summary(schema: Dict[str, 
Any]) -> str: f"pre-period variation." ) - # Sample sentence. For CS ``control_group="not_yet_treated"`` the - # fixed control count is suppressed because the comparison group is - # dynamic; narrate the mode explicitly rather than misreporting a - # never-treated-only tally as "control" (round-13 CI review). + # Sample sentence. For fits with a dynamic not-yet-treated + # comparison set (CS / ContinuousDiD / StaggeredTripleDiff / + # EfficientDiD) the fixed control count is suppressed because the + # comparison group varies by (g, t) cell; narrate the mode + # explicitly rather than misreporting a fixed-subset tally as + # "control" (rounds 13 / 17 / 18 CI review). sample = schema.get("sample", {}) or {} n_obs = sample.get("n_obs") n_t = sample.get("n_treated") n_c = sample.get("n_control") n_nt = sample.get("n_never_treated") - control_mode = sample.get("control_group") + n_ne = sample.get("n_never_enabled") + is_dynamic = sample.get("dynamic_control") if isinstance(n_obs, int): if isinstance(n_t, int) and isinstance(n_c, int): sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).") - elif control_mode == "not_yet_treated" and isinstance(n_t, int): - extra = ( - f"; {n_nt:,} never-treated units are also present" - if isinstance(n_nt, int) and n_nt > 0 - else "" - ) + elif is_dynamic and isinstance(n_t, int): + if isinstance(n_ne, int) and n_ne > 0: + subset_clause = f"; {n_ne:,} never-enabled units are also present" + elif isinstance(n_nt, int) and n_nt > 0: + subset_clause = f"; {n_nt:,} never-treated units are also present" + else: + subset_clause = "" sentences.append( f"Sample: {n_obs:,} observations ({n_t:,} treated) with a " "dynamic not-yet-treated comparison group (the control set " - f"varies by cohort and period){extra}." + f"varies by cohort and period){subset_clause}." 
) else: sentences.append(f"Sample: {n_obs:,} observations.") @@ -1610,13 +1629,20 @@ def _render_full_report(schema: Dict[str, Any]) -> str: if isinstance(sample.get("n_treated"), int): lines.append(f"- Treated: {sample['n_treated']:,}") # ``n_control`` is only populated for estimators whose control set - # is a fixed tally. For CS ``control_group="not_yet_treated"`` the - # comparison group is dynamic per (g, t); report the never-treated - # count (when non-zero) and the dynamic-comparison mode explicitly. + # is a fixed tally. For dynamic not-yet-treated modes (CS / + # ContinuousDiD / StaggeredTripleDiff / EfficientDiD) the + # comparison group is dynamic per (g, t); report the estimator- + # specific fixed subset (``n_never_enabled`` for triple-difference; + # ``n_never_treated`` elsewhere) when non-zero, then name the + # dynamic-comparison mode explicitly. if isinstance(sample.get("n_control"), int): lines.append(f"- Control: {sample['n_control']:,}") - elif sample.get("control_group") == "not_yet_treated": - if isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0: + elif sample.get("dynamic_control"): + if isinstance(sample.get("n_never_enabled"), int) and sample["n_never_enabled"] > 0: + lines.append( + f"- Never-enabled units present in the panel: {sample['n_never_enabled']:,}" + ) + elif isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0: lines.append( f"- Never-treated units present in the panel: {sample['n_never_treated']:,}" ) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 3adc7b5f..f0f3fa07 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1187,6 +1187,92 @@ class CallawaySantAnnaResults: assert "not strict no-anticipation" not in a["description"] +class TestContinuousDiDDynamicControlSample: + """Round-18 P1 regression: ContinuousDiD with + ``control_group="not_yet_treated"`` must take the dynamic-control + path in 
``to_dict()``, ``summary()``, and ``full_report()``. The + stored ``n_control_units`` is only the fully-untreated ``D=0`` + tally; the actual comparison set includes future-treated cohorts + beyond the anticipation window. + """ + + def test_continuous_did_not_yet_treated_surfaces_dynamic_mode(self): + class ContinuousDiDResults: + pass + + stub = ContinuousDiDResults() + stub.overall_att = 1.0 + stub.overall_att_se = 0.2 + stub.overall_att_p_value = 0.001 + stub.overall_att_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 120 + stub.n_treated = 50 + stub.n_control = 70 # D=0 (never-treated) count only. + stub.survey_metadata = None + stub.control_group = "not_yet_treated" + + br = BusinessReport(stub, auto_diagnostics=False) + sample = br.to_dict()["sample"] + assert sample["n_control"] is None + assert sample["n_never_treated"] == 70 + assert sample["dynamic_control"] is True + + summary = br.summary() + assert " control)" not in summary + assert "dynamic not-yet-treated" in summary + + full = br.full_report() + assert "- Control: 70" not in full + assert "dynamic not-yet-treated" in full + + +class TestStaggeredTripleDiffDynamicControlSample: + """Round-18 P1 regression: StaggeredTripleDifference with + ``control_group="notyettreated"`` (no underscore per the estimator + contract) must also take the dynamic-control path. Its fixed + subset is ``n_never_enabled`` (separate field) rather than a + never-treated count. + """ + + def test_notyettreated_surfaces_n_never_enabled(self): + class StaggeredTripleDiffResults: + pass + + stub = StaggeredTripleDiffResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 200 + stub.n_treated = 80 + stub.n_control = 120 # Composite total (ignored in this mode). + stub.n_never_enabled = 30 # Fixed subset exposed in this mode. 
+ stub.survey_metadata = None + stub.event_study_effects = None + stub.inference_method = "analytical" + stub.control_group = "notyettreated" # No underscore. + + br = BusinessReport(stub, auto_diagnostics=False) + sample = br.to_dict()["sample"] + assert sample["n_control"] is None + assert sample["dynamic_control"] is True + assert sample["n_never_enabled"] == 30 + assert sample["n_never_treated"] is None, ( + "StaggeredTripleDiff must expose n_never_enabled, not " "n_never_treated" + ) + + summary = br.summary() + assert " control)" not in summary + assert "dynamic not-yet-treated" in summary + assert "30 never-enabled" in summary + + full = br.full_report() + assert "- Control:" not in full + assert "Never-enabled units present in the panel: 30" in full + + class TestWooldridgeSampleNotYetTreatedSemantics: """Round-17 P1 regression: Wooldridge's ``n_control_units`` is the total eligible comparison set (never-treated plus future-treated From dba910797403b58ecb1abe3144abb825c31e0ce7 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 08:21:12 -0400 Subject: [PATCH 22/48] Address nineteenth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - P1 Persist ``anticipation`` on staggered result dataclasses. Round 15/17 added anticipation-aware horizon classification and assumption prose, but only stub-based tests exercised it — real fits were silently returning ``0`` because the fit() constructors never threaded the kwarg onto the result object. Fixed in: - ``CallawaySantAnnaResults`` — new ``anticipation: int = 0`` field; ``staggered.py`` fit() passes ``anticipation=self. anticipation``. - ``SunAbrahamResults`` — same. - ``StaggeredTripleDiffResults`` — same. 
On real ``CallawaySantAnna(anticipation=1).fit(...)`` outputs, ``BusinessReport.assumption`` now flips ``no_anticipation`` to ``False`` and ``_collect_pre_period_coefs`` + ``compute_pretrends_power`` correctly exclude the anticipation window. Confirmed end-to-end in the new regression class. - P2 SDiD / TROP step-6 completion: ``_collect_next_steps`` now marks Baker step 6 (sensitivity) complete when ``estimator_native_diagnostics.status == "ran"`` on those estimators. Previously the generic ``sensitivity`` section was "skipped" for SDiD/TROP (because they route to native diagnostics), so ``next_steps`` redundantly recommended a sensitivity check the report had already executed. - P3 ``llms-full.txt``: DR orchestration line no longer mentions ``compute_deff_diagnostics``. Added an explicit note that ``design_effect`` and ``epv`` are read-only (echo from ``survey_metadata`` / ``results.epv_diagnostics``). - Regressions: ``TestAnticipationPersistsOnRealResults`` fits real ``CallawaySantAnna(anticipation=1)`` and ``SunAbraham(anticipation=1)`` objects and asserts both that the field persists on the result and that the BR assumption block flips ``no_anticipation`` off with the "not strict no-anticipation" clause — exercising the full estimator → result → report chain rather than relying on a hand-set stub attribute. 450 targeted tests passing across BR / DR / staggered / sun_abraham / triple_diff / pretrends suites; black / ruff / mypy clean on BR/DR modules. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 11 ++++++ diff_diff/guides/llms-full.txt | 9 +++-- diff_diff/staggered.py | 1 + diff_diff/staggered_results.py | 8 +++++ diff_diff/staggered_triple_diff.py | 9 ++--- diff_diff/staggered_triple_diff_results.py | 5 +++ diff_diff/sun_abraham.py | 7 ++++ tests/test_business_report.py | 41 ++++++++++++++++++++++ 8 files changed, 82 insertions(+), 9 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 3bbd99cd..e9dd2248 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -801,6 +801,17 @@ def _ran(key: str) -> bool: completed.append("parallel_trends") if _ran("sensitivity"): completed.append("sensitivity") + # SDiD / TROP route their sensitivity analogue through + # ``estimator_native_diagnostics`` rather than HonestDiD. When + # that native block ran, the Baker step-6 sensitivity check + # has effectively been performed; treating the sensitivity + # section as not-run would have ``next_steps`` redundantly + # recommend a check the report already executed (round-19 + # CI review on PR #318). 
+ result_name = type(self._results).__name__ + if result_name in {"SyntheticDiDResults", "TROPResults"} and _ran("estimator_native"): + if "sensitivity" not in completed: + completed.append("sensitivity") if _ran("heterogeneity"): completed.append("heterogeneity") ns = practitioner_next_steps( diff --git a/diff_diff/guides/llms-full.txt b/diff_diff/guides/llms-full.txt index 5173094e..40bb5ae2 100644 --- a/diff_diff/guides/llms-full.txt +++ b/diff_diff/guides/llms-full.txt @@ -1786,11 +1786,14 @@ Status enum values: `ran | skipped | error | not_applicable | not_run | computed Unified diagnostic runner orchestrating `check_parallel_trends`, `compute_pretrends_power`, `HonestDiD.sensitivity`, `bacon_decompose`, -`compute_deff_diagnostics`, `results.epv_diagnostics`, plus -estimator-native surfaces for SyntheticDiD (`pre_treatment_fit`, +plus estimator-native surfaces for SyntheticDiD (`pre_treatment_fit`, `get_weight_concentration`, `in_time_placebo`, `sensitivity_to_zeta_omega`) and TROP (factor-model metrics). EfficientDiD PT uses the native -`hausman_pretest`. +`hausman_pretest`. The `design_effect` section is read-only: it +echoes `survey_metadata.design_effect` / `effective_n` from the +fitted result along with a plain-English band label. The +`epv` section is similarly read-only, reporting from +`results.epv_diagnostics` plus `results.epv_threshold`. 
```python from diff_diff import DiagnosticReport diff --git a/diff_diff/staggered.py b/diff_diff/staggered.py index 2004f2e7..ae711e12 100644 --- a/diff_diff/staggered.py +++ b/diff_diff/staggered.py @@ -2001,6 +2001,7 @@ def fit( alpha=self.alpha, control_group=self.control_group, base_period=self.base_period, + anticipation=self.anticipation, event_study_effects=event_study_effects, group_effects=group_effects, bootstrap_results=bootstrap_results, diff --git a/diff_diff/staggered_results.py b/diff_diff/staggered_results.py index 2cb31d60..9c8f5275 100644 --- a/diff_diff/staggered_results.py +++ b/diff_diff/staggered_results.py @@ -111,6 +111,14 @@ class CallawaySantAnnaResults: alpha: float = 0.05 control_group: str = "never_treated" base_period: str = "varying" + # Anticipation periods (``k``) used at fit time. Persisted on the + # result so downstream diagnostics (``BusinessReport`` / + # ``DiagnosticReport`` / ``compute_pretrends_power``) can classify + # pre-period vs anticipation-window coefficients without re- + # plumbing the kwarg through every call site. See REGISTRY.md + # §CallawaySantAnna lines 355-395 for the shifted-boundary + # contract. 
+ anticipation: int = 0 panel: bool = True event_study_effects: Optional[Dict[int, Dict[str, Any]]] = field(default=None) group_effects: Optional[Dict[Any, Dict[str, Any]]] = field(default=None) diff --git a/diff_diff/staggered_triple_diff.py b/diff_diff/staggered_triple_diff.py index 758d518b..08c6131f 100644 --- a/diff_diff/staggered_triple_diff.py +++ b/diff_diff/staggered_triple_diff.py @@ -136,8 +136,7 @@ def __init__( raise ValueError(f"epv_threshold must be > 0, got {epv_threshold}") if pscore_fallback not in ["error", "unconditional"]: raise ValueError( - f"pscore_fallback must be 'error' or 'unconditional', " - f"got '{pscore_fallback}'" + f"pscore_fallback must be 'error' or 'unconditional', " f"got '{pscore_fallback}'" ) self.estimation_method = estimation_method @@ -707,6 +706,7 @@ def fit( alpha=self.alpha, control_group=self.control_group, base_period=self.base_period, + anticipation=self.anticipation, estimation_method=self.estimation_method, event_study_effects=event_study_effects, group_effects=group_effects, @@ -1379,10 +1379,7 @@ def _compute_pscore( beta_clean = np.where(np.isfinite(beta_logistic), beta_logistic, 0.0) pscore_cache[pscore_key] = (beta_clean, diag) except (np.linalg.LinAlgError, ValueError): - if ( - self.pscore_fallback == "error" - or self.rank_deficient_action == "error" - ): + if self.pscore_fallback == "error" or self.rank_deficient_action == "error": raise ctx = f" for {context_label}" if context_label else "" warnings.warn( diff --git a/diff_diff/staggered_triple_diff_results.py b/diff_diff/staggered_triple_diff_results.py index bc664d4a..6ffc0738 100644 --- a/diff_diff/staggered_triple_diff_results.py +++ b/diff_diff/staggered_triple_diff_results.py @@ -74,6 +74,11 @@ class StaggeredTripleDiffResults: alpha: float = 0.05 control_group: str = "notyettreated" base_period: str = "varying" + # Anticipation periods (``k``) used at fit time. 
Persisted so + # downstream diagnostics in ``BusinessReport`` / ``DiagnosticReport`` + # can render the anticipation-aware assumption block and + # horizon-classification cutoffs accurately on real fits. + anticipation: int = 0 estimation_method: str = "dr" event_study_effects: Optional[Dict[int, Dict[str, Any]]] = field(default=None) group_effects: Optional[Dict[Any, Dict[str, Any]]] = field(default=None) diff --git a/diff_diff/sun_abraham.py b/diff_diff/sun_abraham.py index bb79052f..f3c78f8e 100644 --- a/diff_diff/sun_abraham.py +++ b/diff_diff/sun_abraham.py @@ -79,6 +79,12 @@ class SunAbrahamResults: n_control_units: int alpha: float = 0.05 control_group: str = "never_treated" + # Anticipation periods (``k``) used at fit time. Persisted so + # downstream diagnostics (``BusinessReport`` / ``DiagnosticReport`` + # / ``compute_pretrends_power``) can classify pre-period vs + # anticipation-window coefficients without re-plumbing the kwarg + # through every caller. + anticipation: int = 0 bootstrap_results: Optional["SABootstrapResults"] = field(default=None, repr=False) cohort_effects: Optional[Dict[Tuple[Any, int], Dict[str, Any]]] = field( default=None, repr=False @@ -893,6 +899,7 @@ def _refit_sa_cohort(w_r): n_control_units=n_control_units, alpha=self.alpha, control_group=self.control_group, + anticipation=self.anticipation, bootstrap_results=bootstrap_results, cohort_effects=cohort_effects_storage, survey_metadata=survey_metadata, diff --git a/tests/test_business_report.py b/tests/test_business_report.py index f0f3fa07..b6c811d5 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1100,6 +1100,47 @@ class _Result: ), f"cluster column must propagate from fit to Hausman pretest; got {captured}" +class TestAnticipationPersistsOnRealResults: + """Round-19 P1 regression: ``CallawaySantAnnaResults``, + ``SunAbrahamResults``, and ``StaggeredTripleDiffResults`` must + persist the ``anticipation`` field so the anticipation-aware + 
reporting code (round-15/17) actually fires on real fits. Stub- + only regressions had hidden that the result constructors were + dropping the value. + """ + + def test_cs_fit_persists_anticipation(self): + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + cs = CallawaySantAnna(base_period="universal", anticipation=1).fit( + sdf, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + aggregate="event_study", + ) + assert getattr(cs, "anticipation", None) == 1 + br = BusinessReport(cs, auto_diagnostics=False) + a = br.to_dict()["assumption"] + # Round-17 assumption-aware block now fires on a real fit. + assert a["no_anticipation"] is False + assert a["anticipation_periods"] == 1 + assert "not strict no-anticipation" in a["description"] + + def test_sun_abraham_fit_persists_anticipation(self): + from diff_diff import SunAbraham + + sdf = generate_staggered_data(n_units=100, n_periods=6, treatment_effect=1.5, seed=7) + sa = SunAbraham(anticipation=1).fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + assert getattr(sa, "anticipation", None) == 1 + br = BusinessReport(sa, auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["no_anticipation"] is False + assert a["anticipation_periods"] == 1 + + class TestAnticipationAwareAssumptionBlock: """Round-17 P1 regression: ``_describe_assumption`` must drop the strict "plus no anticipation" language when the fit allows From 124b1bf0a4964a4264500ee668c5dec66e947299 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 08:49:57 -0400 Subject: [PATCH 23/48] Address twentieth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-20 review findings: P1 methodology. `_format_precomputed_pretrends_power` now mirrors the covariance-source annotation and diagonal-fallback downgrade applied by `_check_pretrends_power`. 
Extract `_infer_cov_source()` plus the module-level `_apply_diag_fallback_downgrade()` helper so both paths share identical logic. The precomputed adapter resolves the source fit via `PreTrendsPowerResults.original_results` (populated by `compute_pretrends_power`) and falls back to `self._results`. Without this, the same CS/SA fit could be labeled `well_powered` through the precomputed path while the default path reported `moderately_powered`. P2 code quality. `BusinessReport._build_caveats()` no longer surfaces "HonestDiD sensitivity was not run" as a warning when the skipped sensitivity block is routed to estimator-native diagnostics (`method="estimator_native"`) AND the native battery actually ran (`estimator_native_diagnostics.status == "ran"`). For SDiD / TROP, this now emits an info-severity `sensitivity_native_routed` caveat pointing at the native block instead of the misleading warning. P2 maintainability. Retag the EfficientDiD practitioner workflow step "Run Hausman pretest (PT-All vs PT-Post)" from `_step_name="heterogeneity"` to `_step_name="parallel_trends"`. The Hausman pretest is a PT diagnostic per REGISTRY.md §EfficientDiD, and DR already treats a ran Hausman block as parallel-trends completion — so the previous tag caused `_collect_next_steps()` to recommend a check the report had just executed. P3 coverage. Add regressions covering the `precomputed={"pretrends_power": ...}` adapter — parity with the default path on a real CS fit, plus a stub-based test that pins the `well_powered -> moderately_powered` downgrade when full `event_study_vcov` is available but unused. Pre-emptive audit findings (bundled): Same R19 stub-vs-real-fit pattern as the previous round applied to three more staggered estimators. Persist `anticipation` on `ImputationDiDResults`, `TwoStageDiDResults`, and `StackedDiDResults` so the anticipation-aware horizon classification and assumption-block flipping (rounds 15 / 17) fire on real fits. 
Without the field, real `ImputationDiD(anticipation=1).fit(...)` silently returned `anticipation=0` through `getattr`, and two rounds of anticipation-aware reporting code were ignored. Normalize StackedDiD's control-group choice. StackedDiD exposes the choice as `clean_control` (the public kwarg name from Wing, Freedman & Hollingsworth 2024), not `control_group`. Add a `_control_group_choice()` helper that reads `control_group` first and falls back to `clean_control` for `StackedDiDResults`. Without this alias, a Stacked fit with `clean_control="not_yet_treated"` surfaced as `control_group=None` in the schema and bypassed the dynamic-control branch in `_extract_sample`. This mirrors the source-faithfulness gap R17/R18 flagged for EfficientDiD's `control_group` handling. Surface dCDH Phase-3 configuration in the assumption description. When `controls`, `trends_linear`, or `heterogeneity` is set in fit(), the estimand label changes (`DID^X_l`, `DID^{fd}_l`, `DID^{X,fd}_l`) and the identifying contract becomes conditional on the first-stage residualization / group-specific linear trends. Append an explicit Phase-3 clause listing the active features so the description does not misrepresent the identifying assumption on a Phase-3 fit. Tests: 13 new regressions across `TestAnticipationPersistsOnRealResults` (imputation / two-stage / stacked), `TestStackedCleanControlSurfacesInSampleBlock`, `TestDCDHPhase3AssumptionClause`, `TestSDiDTROPSkippedSensitivityCaveatSuppressed`, `TestEfficientDiDHausmanStepTaggedAsParallelTrends`, and `TestPrecomputed` (parity + downgrade). All 165 BR/DR tests pass; 161 imputation/two-stage tests and 82 stacked/practitioner tests unaffected. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 136 +++++++++++++--- diff_diff/diagnostic_report.py | 119 ++++++++------ diff_diff/imputation.py | 1 + diff_diff/imputation_results.py | 1 + diff_diff/practitioner.py | 11 +- diff_diff/stacked_did.py | 1 + diff_diff/stacked_did_results.py | 1 + diff_diff/two_stage.py | 1 + diff_diff/two_stage_results.py | 1 + tests/test_business_report.py | 265 +++++++++++++++++++++++++++++++ tests/test_diagnostic_report.py | 83 ++++++++++ 11 files changed, 553 insertions(+), 67 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 8de98ad6..42da7d5d 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -501,7 +501,7 @@ def _extract_sample(self) -> Dict[str, Any]: # comparisons (never-treated + future-treated) and does not # map to a never-treated count. Keep on the fixed-count # path even in dynamic mode. - control_group = getattr(r, "control_group", None) + control_group = _control_group_choice(r) name = type(r).__name__ n_never_treated: Optional[int] = None n_never_enabled: Optional[int] = None @@ -701,6 +701,28 @@ def _anticipation_periods(results: Any) -> int: return k if k > 0 else 0 +def _control_group_choice(results: Any) -> Optional[str]: + """Return the control-group choice string for a fitted result, normalized + across estimator-specific attribute names. + + Most anticipation-capable estimators expose the control-group choice as + ``results.control_group``. ``StackedDiDResults`` exposes the same choice + as ``clean_control`` (the public Wing-Freedman-Hollingsworth-2024 kwarg + name). Without this alias, a StackedDiD fit with + ``clean_control="not_yet_treated"`` would surface as ``control_group=None`` + in the business-report schema, and the dynamic-control branch in + ``_extract_sample`` would never fire. 
+ """ + cg = getattr(results, "control_group", None) + if isinstance(cg, str): + return cg + if type(results).__name__ == "StackedDiDResults": + clean = getattr(results, "clean_control", None) + if isinstance(clean, str): + return clean + return None + + def _apply_anticipation_to_assumption(block: Dict[str, Any], results: Any) -> Dict[str, Any]: """If the fit used ``anticipation > 0``, flip ``no_anticipation`` off and append an anticipation clause to the description. @@ -808,27 +830,77 @@ def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, # treatment cohorts" was flagged as a source-faithfulness bug in # PR #318 review; REGISTRY.md §ChaisemartinDHaultfoeuille is # explicit about the transition-set construction. + # + # Phase-3 features (``controls``, ``trends_linear``, + # ``heterogeneity``) each modify the identifying contract and + # change the estimand from ``DID_l`` to ``DID^X_l`` / + # ``DID^{fd}_l`` / the heterogeneity-test variant. When active, + # append an explicit clause so the description does not + # misrepresent the identifying assumption (the reviewer has + # flagged several parallel source-faithfulness gaps elsewhere + # — explicitly surfacing Phase-3 config matches the per-estimator + # walkthrough pattern). + base_description = ( + "Identification is transition-based (de Chaisemartin & " + "D'Haultfoeuille 2020; dynamic companion 2024). At each " + "switching period, the estimator contrasts joiners " + "(D:0->1), leavers (D:1->0), and stable-treated / " + "stable-untreated control cells that share the same " + "treatment state across adjacent periods, yielding the " + "contemporaneous ``DID_M`` and per-horizon ``DID_l`` / " + "``DID_{g,l}`` building blocks. 
The identifying " + "restriction is parallel trends within each transition's " + "stable-control cell (not a single group-time ATT PT " + "condition across all cohorts) plus no anticipation; " + "with non-binary treatment the stable-control match is " + "additionally on exact baseline dose ``D_{g,1}``. " + "Reversible treatment is natively supported, unlike the " + "absorbing-treatment designs that rely on a fixed " + "treatment-onset cohort." + ) + has_controls = ( + results is not None + and getattr(results, "covariate_residuals", None) is not None + ) + has_trends = ( + results is not None + and getattr(results, "linear_trends_effects", None) is not None + ) + has_heterogeneity = ( + results is not None + and getattr(results, "heterogeneity_effects", None) is not None + ) + active_parts: List[str] = [] + if has_controls and has_trends: + active_parts.append( + "the estimand is ``DID^{X,fd}_l`` (covariate-residualized " + "first-differences), and identification holds conditional on " + "the covariates entering the first-stage regression and " + "allowing group-specific linear trends" + ) + elif has_controls: + active_parts.append( + "the estimand is ``DID^X_l``, and identification holds " + "conditional on the covariates entering the first-stage " + "residualization" + ) + elif has_trends: + active_parts.append( + "the estimand is ``DID^{fd}_l`` (first-differenced) and the " + "identifying restriction is relaxed to allow group-specific " + "linear pre-trends" + ) + if has_heterogeneity: + active_parts.append( + "heterogeneity tests ``beta^{het}_l`` are reported per horizon" + ) + if active_parts: + phase3_clause = " Phase-3 configuration: " + "; ".join(active_parts) + "." + base_description = base_description + phase3_clause return { "parallel_trends_variant": "transition_based", "no_anticipation": True, - "description": ( - "Identification is transition-based (de Chaisemartin & " - "D'Haultfoeuille 2020; dynamic companion 2024). 
At each " - "switching period, the estimator contrasts joiners " - "(D:0->1), leavers (D:1->0), and stable-treated / " - "stable-untreated control cells that share the same " - "treatment state across adjacent periods, yielding the " - "contemporaneous ``DID_M`` and per-horizon ``DID_l`` / " - "``DID_{g,l}`` building blocks. The identifying " - "restriction is parallel trends within each transition's " - "stable-control cell (not a single group-time ATT PT " - "condition across all cohorts) plus no anticipation; " - "with non-binary treatment the stable-control match is " - "additionally on exact baseline dose ``D_{g,1}``. " - "Reversible treatment is natively supported, unlike the " - "absorbing-treatment designs that rely on a fixed " - "treatment-onset cohort." - ), + "description": base_description, } if estimator_name == "EfficientDiDResults": # Chen, Sant'Anna & Xie (2025) — identification is parameterized @@ -1071,9 +1143,33 @@ def _build_caveats( # ``base_period='varying'`` — HonestDiD bounds are not interpretable # there). Surface the reason as a warning-severity caveat so readers # do not assume the headline is robust across the R-R grid. + # + # Exception (round-20 P2 CI review on PR #318): SDiD and TROP route + # robustness to ``estimator_native_diagnostics`` and mark the HonestDiD + # sensitivity block ``status="skipped", method="estimator_native"``. + # Surfacing "sensitivity was not run" as a warning contradicts the + # documented native-routing contract when the native battery actually + # ran. Suppress the warning and point readers at the native block + # instead. 
if sens.get("status") == "skipped": reason = sens.get("reason") - if isinstance(reason, str) and reason: + method = sens.get("method") + native = dr_schema.get("estimator_native_diagnostics") or {} + native_ran = native.get("status") == "ran" + if method == "estimator_native" and native_ran: + caveats.append( + { + "severity": "info", + "topic": "sensitivity_native_routed", + "message": ( + "HonestDiD was not run for this estimator. Robustness " + "is covered by the estimator-native sensitivity " + "diagnostics reported under " + "``estimator_native_diagnostics``." + ), + } + ) + elif isinstance(reason, str) and reason: caveats.append( { "severity": "warning", diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index e9dd2248..558a5517 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1046,50 +1046,8 @@ def _check_pretrends_power(self) -> Dict[str, Any]: ): ratio = mdv / abs(att) - # Annotate whether ``compute_pretrends_power`` had access to the full - # pre-period covariance (CS / SA / ImputationDiD currently fall back to - # ``np.diag(ses**2)`` inside ``pretrends.py``, even when - # ``event_study_vcov`` is available). BR uses this field to downgrade - # power-tier prose when only the diagonal approximation was used. - r = self._results - has_full_es_vcov = ( - getattr(r, "event_study_vcov", None) is not None - and getattr(r, "event_study_vcov_index", None) is not None - ) - is_event_study_type = type(r).__name__ in { - "CallawaySantAnnaResults", - "SunAbrahamResults", - "ImputationDiDResults", - "StackedDiDResults", - "StaggeredTripleDiffResults", - "WooldridgeDiDResults", - "ChaisemartinDHaultfoeuilleResults", - "EfficientDiDResults", - "TwoStageDiDResults", - } - if is_event_study_type and has_full_es_vcov: - # ``compute_pretrends_power`` does not currently consume - # ``event_study_vcov`` for these result types (see the reviewer's - # note on pretrends.py). 
Flag the diagonal fallback explicitly so - # the prose layer can hedge. - cov_source = "diag_fallback_available_full_vcov_unused" - elif is_event_study_type: - cov_source = "diag_fallback" - else: - cov_source = "full_pre_period_vcov" - - tier = _power_tier(ratio) - # Central diagonal-fallback downgrade. When the helper used the - # diagonal-SE approximation while the full ``event_study_vcov`` - # was available, a ``well_powered`` verdict can be optimistic - # because off-diagonal pre-period correlations are ignored. - # REPORTING.md's conservative deviation says to downgrade in - # that case. Doing it here (once) ensures every downstream - # surface — BR ``summary()``, BR ``full_report()``, BR schema, - # DR ``summary()`` — reads the same adjusted tier (round-14 - # CI review flagged per-surface divergence). - if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused": - tier = "moderately_powered" + cov_source = self._infer_cov_source(self._results) + tier = _apply_diag_fallback_downgrade(_power_tier(ratio), cov_source) return { "status": "ran", "method": "compute_pretrends_power", @@ -1110,13 +1068,28 @@ def _check_pretrends_power(self) -> Dict[str, Any]: } def _format_precomputed_pretrends_power(self, obj: Any) -> Dict[str, Any]: - """Adapt a pre-computed ``PreTrendsPowerResults`` to the schema shape.""" + """Adapt a pre-computed ``PreTrendsPowerResults`` to the schema shape. + + Round-20 P1 CI review on PR #318: this path must mirror the + covariance-source annotation and diagonal-fallback downgrade that + ``_check_pretrends_power`` applies on the default path. Otherwise + the same fit passed through ``precomputed={"pretrends_power": ...}`` + can be labeled ``well_powered`` while the default path reports + ``moderately_powered`` (per REPORTING.md's conservative deviation + for CS / SA / ImputationDiD event-study fits with full + ``event_study_vcov`` available but unused). 
Resolve the source + fit via ``obj.original_results`` first (which ``compute_pretrends_power`` + populates at construction time), falling back to ``self._results``. + """ mdv = _to_python_float(getattr(obj, "mdv", None)) hm = self._extract_headline_metric() att = hm.get("value") if hm else None ratio: Optional[float] = None if mdv is not None and att is not None and np.isfinite(att) and abs(att) > 0: ratio = mdv / abs(att) + source_fit = getattr(obj, "original_results", None) or self._results + cov_source = self._infer_cov_source(source_fit) + tier = _apply_diag_fallback_downgrade(_power_tier(ratio), cov_source) return { "status": "ran", "method": "precomputed", @@ -1128,10 +1101,44 @@ def _format_precomputed_pretrends_power(self, obj: Any) -> Dict[str, Any]: "violation_magnitude": _to_python_float(getattr(obj, "violation_magnitude", None)), "power_at_violation_magnitude": _to_python_float(getattr(obj, "power", None)), "n_pre_periods": int(getattr(obj, "n_pre_periods", 0) or 0), - "tier": _power_tier(ratio), + "tier": tier, + "covariance_source": cov_source, "precomputed": True, } + @staticmethod + def _infer_cov_source(source_fit: Any) -> str: + """Classify whether ``compute_pretrends_power`` had access to the + full pre-period covariance on ``source_fit``. + + CS / SA / ImputationDiD / EfficientDiD / Stacked / etc. currently + fall back to ``np.diag(ses**2)`` inside ``pretrends.py``, even when + ``event_study_vcov`` is populated on the result; the returned + ``PreTrendsPowerResults.vcov`` therefore ignores off-diagonal pre- + period correlations. Annotating the source explicitly lets BR + downgrade the tier conservatively. 
+ """ + is_event_study_type = type(source_fit).__name__ in { + "CallawaySantAnnaResults", + "SunAbrahamResults", + "ImputationDiDResults", + "StackedDiDResults", + "StaggeredTripleDiffResults", + "WooldridgeDiDResults", + "ChaisemartinDHaultfoeuilleResults", + "EfficientDiDResults", + "TwoStageDiDResults", + } + has_full_es_vcov = ( + getattr(source_fit, "event_study_vcov", None) is not None + and getattr(source_fit, "event_study_vcov_index", None) is not None + ) + if is_event_study_type and has_full_es_vcov: + return "diag_fallback_available_full_vcov_unused" + if is_event_study_type: + return "diag_fallback" + return "full_pre_period_vcov" + def _check_sensitivity(self) -> Dict[str, Any]: """Run HonestDiD over the M grid. Uses ``SensitivityResults.breakdown_M``. @@ -2058,6 +2065,26 @@ def _power_tier(ratio: Optional[float]) -> str: return "underpowered" +def _apply_diag_fallback_downgrade(tier: str, cov_source: str) -> str: + """Conservatively downgrade ``well_powered`` to ``moderately_powered`` + when ``compute_pretrends_power`` used the diagonal-SE approximation + while the full ``event_study_vcov`` was available on the source fit. + + REPORTING.md's conservative deviation: off-diagonal pre-period + correlations are ignored under the diagonal fallback, so a + ``well_powered`` verdict can overstate the real informativeness of + the pre-test. The downgrade applies at every DR path + (``_check_pretrends_power`` and ``_format_precomputed_pretrends_power``) + so BR ``summary()`` / ``full_report()`` / ``to_dict()`` and DR + ``summary()`` all read the same adjusted tier. Round-14 CI review + flagged per-surface divergence; round-20 flagged that the precomputed + adapter bypassed the downgrade entirely. 
+ """ + if tier == "well_powered" and cov_source == "diag_fallback_available_full_vcov_unused": + return "moderately_powered" + return tier + + def _pre_post_boundary(results: Any) -> int: """Return the relative-time cutoff that separates true pre-period horizons from treatment (and post-treatment) horizons. diff --git a/diff_diff/imputation.py b/diff_diff/imputation.py index 49a40333..b5454c6f 100644 --- a/diff_diff/imputation.py +++ b/diff_diff/imputation.py @@ -857,6 +857,7 @@ def _refit_imp(w_r): n_treated_units=n_treated_units, n_control_units=n_control_units, alpha=self.alpha, + anticipation=self.anticipation, bootstrap_results=bootstrap_results, _estimator_ref=self, survey_metadata=survey_metadata, diff --git a/diff_diff/imputation_results.py b/diff_diff/imputation_results.py index 870b7271..e7f7613c 100644 --- a/diff_diff/imputation_results.py +++ b/diff_diff/imputation_results.py @@ -135,6 +135,7 @@ class ImputationDiDResults: n_treated_units: int n_control_units: int alpha: float = 0.05 + anticipation: int = 0 pretrend_results: Optional[Dict[str, Any]] = field(default=None, repr=False) bootstrap_results: Optional[ImputationBootstrapResults] = field(default=None, repr=False) # Internal: stores data needed for pretrend_test() diff --git a/diff_diff/practitioner.py b/diff_diff/practitioner.py index c58a75f6..d6d550b9 100644 --- a/diff_diff/practitioner.py +++ b/diff_diff/practitioner.py @@ -663,7 +663,16 @@ def _handle_efficient(results: Any): "pretest = EfficientDiD.hausman_pretest(\n" " data, outcome='y', unit='id', time='t', first_treat='g')" ), - step_name="heterogeneity", + # The Hausman pretest is a parallel-trends diagnostic per + # REGISTRY.md §EfficientDiD: it tests whether the stronger + # PT-All regime is tenable relative to PT-Post. 
``DiagnosticReport`` + # treats a ran Hausman block as ``parallel_trends`` completion + # (``_check_pt_hausman``), so tagging this practitioner step as + # ``parallel_trends`` keeps ``_collect_next_steps()`` from + # recommending a check the report already executed. Round-20 P2 + # CI review on PR #318 flagged the earlier ``heterogeneity`` tag + # as a mismatched-step-name bug. + step_name="parallel_trends", ), _robustness_compare_step("CS, SA, or BJS"), _covariates_step(), diff --git a/diff_diff/stacked_did.py b/diff_diff/stacked_did.py index 7c610b5c..69e4ffd2 100644 --- a/diff_diff/stacked_did.py +++ b/diff_diff/stacked_did.py @@ -593,6 +593,7 @@ def _refit_stacked(w_r): weighting=self.weighting, clean_control=self.clean_control, alpha=self.alpha, + anticipation=self.anticipation, survey_metadata=survey_metadata, ) diff --git a/diff_diff/stacked_did_results.py b/diff_diff/stacked_did_results.py index 7a45a316..fb5bfb96 100644 --- a/diff_diff/stacked_did_results.py +++ b/diff_diff/stacked_did_results.py @@ -93,6 +93,7 @@ class StackedDiDResults: weighting: str = "aggregate" clean_control: str = "not_yet_treated" alpha: float = 0.05 + anticipation: int = 0 # Survey design metadata (SurveyMetadata instance from diff_diff.survey) survey_metadata: Optional[Any] = field(default=None) diff --git a/diff_diff/two_stage.py b/diff_diff/two_stage.py index 6a385bd9..dc86e438 100644 --- a/diff_diff/two_stage.py +++ b/diff_diff/two_stage.py @@ -841,6 +841,7 @@ def _refit_ts(w_r): n_treated_units=n_treated_units, n_control_units=n_control_units, alpha=self.alpha, + anticipation=self.anticipation, bootstrap_results=bootstrap_results, survey_metadata=survey_metadata, ) diff --git a/diff_diff/two_stage_results.py b/diff_diff/two_stage_results.py index 6097f05a..d7cf7c8c 100644 --- a/diff_diff/two_stage_results.py +++ b/diff_diff/two_stage_results.py @@ -136,6 +136,7 @@ class TwoStageDiDResults: n_treated_units: int n_control_units: int alpha: float = 0.05 + anticipation: int = 0 
bootstrap_results: Optional[TwoStageBootstrapResults] = field(default=None, repr=False) # Survey design metadata (SurveyMetadata instance from diff_diff.survey) survey_metadata: Optional[Any] = field(default=None, repr=False) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index b6c811d5..e64be987 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1140,6 +1140,158 @@ def test_sun_abraham_fit_persists_anticipation(self): assert a["no_anticipation"] is False assert a["anticipation_periods"] == 1 + def test_imputation_fit_persists_anticipation(self): + from diff_diff import ImputationDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + im = ImputationDiD(anticipation=1).fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + assert getattr(im, "anticipation", None) == 1 + br = BusinessReport(im, auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["no_anticipation"] is False + assert a["anticipation_periods"] == 1 + assert "not strict no-anticipation" in a["description"] + + def test_two_stage_fit_persists_anticipation(self): + from diff_diff import TwoStageDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + ts = TwoStageDiD(anticipation=2).fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + assert getattr(ts, "anticipation", None) == 2 + br = BusinessReport(ts, auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["no_anticipation"] is False + assert a["anticipation_periods"] == 2 + assert "2 periods" in a["description"] + + def test_stacked_fit_persists_anticipation(self): + from diff_diff import StackedDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + st = StackedDiD(anticipation=1).fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + assert 
getattr(st, "anticipation", None) == 1 + br = BusinessReport(st, auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["no_anticipation"] is False + assert a["anticipation_periods"] == 1 + + +class TestStackedCleanControlSurfacesInSampleBlock: + """Pre-emptive audit regression: ``StackedDiD`` exposes its control- + group choice as ``clean_control`` (the public Wing-Freedman- + Hollingsworth-2024 kwarg name), not ``control_group``. The business- + report sample-block treatment for ``"not_yet_treated"`` (dynamic + control comparison) must still fire — otherwise a Stacked fit with + ``clean_control="not_yet_treated"`` surfaces as ``control_group=None`` + with ``dynamic_control=False``, which misreports the sample semantics + the same way R17/R18 flagged for EfficientDiD's ``control_group`` + handling. + """ + + def test_stacked_not_yet_treated_surfaces_as_dynamic_control(self): + from diff_diff import StackedDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + st = StackedDiD(clean_control="not_yet_treated").fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + assert getattr(st, "clean_control", None) == "not_yet_treated" + sample = BusinessReport(st, auto_diagnostics=False).to_dict()["sample"] + # The clean_control choice must be normalized to control_group in + # the schema so downstream agents see a consistent key across + # estimators. + assert sample["control_group"] == "not_yet_treated" + assert sample["dynamic_control"] is True + # Under the dynamic-control branch, the fixed tally is relabeled: + # n_never_treated carries the fixed never-treated subset and + # n_control is set to None. 
+ assert sample["n_never_treated"] is not None + assert sample["n_control"] is None + + def test_stacked_never_treated_surfaces_as_fixed_control(self): + from diff_diff import StackedDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + st = StackedDiD(clean_control="never_treated").fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + sample = BusinessReport(st, auto_diagnostics=False).to_dict()["sample"] + assert sample["control_group"] == "never_treated" + assert sample["dynamic_control"] is False + + +class TestDCDHPhase3AssumptionClause: + """Pre-emptive audit regression: ``ChaisemartinDHaultfoeuilleResults`` + populates ``covariate_residuals`` when ``controls`` is set in fit, + ``linear_trends_effects`` when ``trends_linear=True``, and + ``heterogeneity_effects`` when ``heterogeneity`` is set. Each change + modifies the identifying contract and the estimand label + (``DID^X_l`` / ``DID^{fd}_l`` / ``DID^{X,fd}_l``). The BR assumption + description must surface the active configuration so the prose does + not misrepresent the identifying assumption on a Phase-3 fit. 
+ """ + + def test_dcdh_base_case_has_no_phase3_clause(self): + from diff_diff.business_report import _describe_assumption + + class Stub: + covariate_residuals = None + linear_trends_effects = None + heterogeneity_effects = None + + block = _describe_assumption("ChaisemartinDHaultfoeuilleResults", Stub()) + assert "Phase-3 configuration" not in block["description"] + + def test_dcdh_controls_only_surfaces_did_x(self): + import pandas as pd + + from diff_diff.business_report import _describe_assumption + + class Stub: + covariate_residuals = pd.DataFrame({"theta_hat": [0.1]}) + linear_trends_effects = None + heterogeneity_effects = None + + desc = _describe_assumption("ChaisemartinDHaultfoeuilleResults", Stub())["description"] + assert "Phase-3 configuration" in desc + assert "DID^X_l" in desc + assert "first-stage residualization" in desc + assert "DID^{fd}_l" not in desc + + def test_dcdh_trends_linear_only_surfaces_did_fd(self): + from diff_diff.business_report import _describe_assumption + + class Stub: + covariate_residuals = None + linear_trends_effects = {1: {"effect": 0.1}} + heterogeneity_effects = None + + desc = _describe_assumption("ChaisemartinDHaultfoeuilleResults", Stub())["description"] + assert "Phase-3 configuration" in desc + assert "DID^{fd}_l" in desc + assert "group-specific linear pre-trends" in desc + + def test_dcdh_controls_and_trends_surfaces_combined_estimand(self): + import pandas as pd + + from diff_diff.business_report import _describe_assumption + + class Stub: + covariate_residuals = pd.DataFrame({"theta_hat": [0.1]}) + linear_trends_effects = {1: {"effect": 0.1}} + heterogeneity_effects = {1: {}} + + desc = _describe_assumption("ChaisemartinDHaultfoeuilleResults", Stub())["description"] + assert "DID^{X,fd}_l" in desc + assert "heterogeneity tests" in desc + assert "beta^{het}_l" in desc + class TestAnticipationAwareAssumptionBlock: """Round-17 P1 regression: ``_describe_assumption`` must drop the @@ -1770,6 +1922,119 @@ def 
test_dr_trop_does_not_mention_sensitivity_below(self, sdid_fit): assert "sensitivity analysis below" not in summary +class TestSDiDTROPSkippedSensitivityCaveatSuppressed: + """Round-20 P2 regression on PR #318: ``DiagnosticReport`` marks the + HonestDiD sensitivity block ``status="skipped", method="estimator_native"`` + for SDiD / TROP because robustness is routed to the native diagnostics + (``in_time_placebo``, ``sensitivity_to_zeta_omega``, factor-model + metrics) under ``estimator_native_diagnostics``. ``BusinessReport`` + must not surface "HonestDiD sensitivity was not run" as a warning + caveat when the native battery actually ran, because that contradicts + the documented native-routing contract and misleads the reader into + thinking robustness was skipped. + """ + + def test_sdid_native_routed_suppresses_skipped_caveat(self, sdid_fit): + from diff_diff import DiagnosticReport + + fit, _ = sdid_fit + br = BusinessReport(fit) + schema = br.to_dict() + + # BR's lifted ``sensitivity`` block only carries status/reason; the + # ``method`` field lives on the DR schema, which BR reads internally + # to decide caveat suppression. Confirm the DR-side shape separately. + assert schema["sensitivity"]["status"] == "skipped" + dr_schema = DiagnosticReport(fit).to_dict() + assert dr_schema["sensitivity"]["status"] == "skipped" + assert dr_schema["sensitivity"]["method"] == "estimator_native" + native_ran = dr_schema["estimator_native_diagnostics"].get("status") == "ran" + + caveat_topics = [c.get("topic") for c in schema.get("caveats", [])] + if native_ran: + # The fix: no "sensitivity_skipped" warning; instead an info + # caveat pointing at the native block. 
+ assert "sensitivity_skipped" not in caveat_topics + assert "sensitivity_native_routed" in caveat_topics + native_msg = next( + c for c in schema["caveats"] if c.get("topic") == "sensitivity_native_routed" + ) + assert native_msg["severity"] == "info" + assert "estimator-native" in native_msg["message"].lower() + else: + # When the native battery did not produce a ran block, the + # legacy warning behavior is still correct — SDiD users should + # know HonestDiD was not attempted. + assert ( + "sensitivity_skipped" in caveat_topics + or "sensitivity_native_routed" in caveat_topics + ) + + +class TestEfficientDiDHausmanStepTaggedAsParallelTrends: + """Round-20 P2 regression on PR #318: the EfficientDiD practitioner + workflow step "Run Hausman pretest (PT-All vs PT-Post)" must be + tagged ``_step_name="parallel_trends"``, not ``"heterogeneity"``, so + that ``DiagnosticReport._collect_next_steps()`` — which treats a ran + Hausman block as parallel-trends completion — correctly suppresses the + step from the "next steps" list when the report already executed it. + REGISTRY.md §EfficientDiD (lines 895-908) classifies the Hausman + pretest as a parallel-trends diagnostic, so the fix aligns the + practitioner tag with the identification-layer classification. + """ + + def test_hausman_step_is_tagged_parallel_trends(self): + """``practitioner_next_steps`` strips ``_step_name`` from the + returned steps, so we exercise the tagging via the + ``completed_steps=["parallel_trends"]`` filter contract: a + correctly-tagged Hausman step is removed from the output; a + mistagged step remains. 
+ """ + from diff_diff.practitioner import practitioner_next_steps + + class EfficientDiDResults: + pass + + stub = EfficientDiDResults() + stub.overall_att = 1.0 + stub.overall_se = 0.3 + stub.overall_p_value = 0.01 + stub.overall_conf_int = (0.4, 1.6) + stub.alpha = 0.05 + stub.n_obs = 500 + stub.n_treated = 200 + stub.n_control = 300 + stub.survey_metadata = None + stub.event_study_effects = None + stub.pt_assumption = "all" + + # Without any completed steps, the Hausman pretest is included. + baseline = practitioner_next_steps(stub, verbose=False)["next_steps"] + hausman_in_baseline = any( + "Hausman pretest" in s.get("label", "") for s in baseline + ) + assert hausman_in_baseline, ( + "EfficientDiD workflow must include the Hausman pretest step" + ) + + # After marking ``parallel_trends`` complete (which DR does when + # ``_check_pt_hausman`` runs), the Hausman step must be filtered + # out. Before the round-20 retag it was tagged as + # ``heterogeneity`` and survived this filter — that is the bug. + filtered = practitioner_next_steps( + stub, completed_steps=["parallel_trends"], verbose=False + )["next_steps"] + assert not any( + "Hausman pretest" in s.get("label", "") for s in filtered + ), ( + "Hausman step must be tagged as 'parallel_trends' (REGISTRY.md " + "§EfficientDiD classifies it as a PT diagnostic) so that " + "DR's _collect_next_steps() suppresses it after running the same " + "check. Still present after completed_steps=['parallel_trends'] " + "filter, meaning the tag is wrong." 
+ ) + + class TestHausmanTestStatisticPopulated: """Round-10 P3 regression: ``HausmanPretestResult`` exposes ``statistic`` (not ``test_statistic``); the DR schema was previously diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 2b4c3ba6..1f416c3a 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -329,6 +329,89 @@ class _FakeSens: assert schema["sensitivity"]["status"] == "ran" assert schema["sensitivity"]["breakdown_M"] == 0.75 + def test_precomputed_pretrends_power_parity_with_default_path(self, cs_fit): + """Round-20 P1 regression: ``precomputed={"pretrends_power": ...}`` + must apply the same covariance-source annotation and conservative + diagonal-fallback downgrade as ``_check_pretrends_power``. Otherwise + the same fit can be labeled ``well_powered`` through the precomputed + path and ``moderately_powered`` through the default path. + """ + from diff_diff.pretrends import compute_pretrends_power + + fit, data = cs_fit + + # Precompute the power result from the same fit. The compute function + # populates ``original_results`` on the output so DR's precomputed + # adapter can inspect the source fit's event_study_vcov. + pp = compute_pretrends_power(fit, alpha=0.05, target_power=0.80, violation_type="linear") + assert getattr(pp, "original_results", None) is fit + + dr_default = DiagnosticReport(fit, data=data).to_dict() + dr_precomputed = DiagnosticReport( + fit, data=data, precomputed={"pretrends_power": pp} + ).to_dict() + + default_block = dr_default["pretrends_power"] + precomp_block = dr_precomputed["pretrends_power"] + + # Both paths are "ran"; the precomputed path flags itself with + # ``precomputed=True`` while the default path sets ``method= + # compute_pretrends_power``. 
+ assert default_block["status"] == "ran" + assert precomp_block["status"] == "ran" + assert precomp_block.get("precomputed") is True + + # Tier and covariance_source must agree across paths so downstream + # BR prose does not diverge based on which path produced the block. + assert default_block["tier"] == precomp_block["tier"] + assert default_block["covariance_source"] == precomp_block["covariance_source"] + + def test_precomputed_pretrends_power_downgrades_when_full_vcov_unused(self): + """Stub-based regression: when the source fit has both + ``event_study_vcov`` and ``event_study_vcov_index`` populated but + the diagonal fallback was used, the precomputed adapter must emit + ``covariance_source='diag_fallback_available_full_vcov_unused'`` and + downgrade a ``well_powered`` tier to ``moderately_powered`` — just + like the default compute path. Complements the live-fit parity test + by exercising the tier-bumping edge explicitly. + """ + + # Minimal CS-shaped stub with full vcov flagged. + class _CSStub: + overall_att = 1.0 + overall_se = 0.25 + overall_t_stat = 4.0 + overall_p_value = 0.001 + overall_conf_int = (0.5, 1.5) + alpha = 0.05 + n_obs = 400 + n_treated = 80 + n_control = 320 + survey_metadata = None + event_study_effects = None + event_study_vcov = np.eye(3) + event_study_vcov_index = {-2: 0, -1: 1, 0: 2} + + stub = _CSStub() + stub.__class__.__name__ = "CallawaySantAnnaResults" + + class _PPStub: + mdv = 0.1 # |ATT| = 1.0 -> ratio = 0.1 -> well_powered before downgrade + violation_type = "linear" + alpha = 0.05 + target_power = 0.80 + violation_magnitude = 0.1 + power = 0.80 + n_pre_periods = 2 + original_results = stub + + dr = DiagnosticReport(stub, precomputed={"pretrends_power": _PPStub()}) + block = dr.to_dict()["pretrends_power"] + assert block["status"] == "ran" + assert block["covariance_source"] == "diag_fallback_available_full_vcov_unused" + # Downgrade must apply: pre-tier is well_powered, post-tier is moderately_powered. 
+ assert block["tier"] == "moderately_powered" + # --------------------------------------------------------------------------- # Verdict / tier helpers From d26e02b7ec25e89f97f2b39b579ae6678438f440 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 09:13:51 -0400 Subject: [PATCH 24/48] Address twenty-first round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-21 review findings: P1 methodology. Estimator-aware validation for precomputed passthrough on SDiD / TROP. ``DiagnosticReport.__init__`` now rejects ``precomputed["sensitivity"]`` and ``precomputed["parallel_trends"]`` with a ``ValueError`` pointing at ``estimator_native_diagnostics`` when the fit is ``SyntheticDiDResults`` or ``TROPResults`` — these estimators route robustness and pre-trends to native analogues (SDiD: ``in_time_placebo``, ``sensitivity_to_zeta_omega``, ``pre_treatment_fit``; TROP: factor-model fit metrics), and generic HonestDiD / PT inputs are methodology-incompatible with the native- routing contract in REPORTING.md. ``BusinessReport.__init__`` mirrors the guard for ``honest_did_results=`` before it forwards to the auto-constructed DR. Previously the precomputed check fired before the SDiD / TROP skip, silently installing an incompatible diagnostic into the generic section. P1 methodology. Revert the ``StackedDiDResults`` entry in ``_never_treated_count_contract`` that the previous round introduced. ``StackedDiDResults.n_control_units`` is documented as "distinct control units across the trimmed set" (``stacked_did_results.py`` lines 59-62); under ``clean_control="not_yet_treated"`` the trimmed set admits future-treated controls (rule ``A_s > a + kappa_post``) and is NOT a never-treated tally, so relabeling it as ``n_never_treated`` could fabricate never-treated support on an all-eventually-treated panel. 
The update keeps StackedDiD on the fixed-count path, and the ``_control_group_choice`` helper still normalizes ``clean_control`` into ``control_group`` / ``dynamic_control`` in the sample block. Pre-emptive audit findings (bundled): Finding 1 / sensitivity parity. ``_format_precomputed_sensitivity`` now carries ``original_estimate`` and ``original_se`` on the single-M ``HonestDiDResults`` branch so the schema shape matches the grid branch (which exposes them via ``_format_sensitivity_results``). Both fields are documented on ``HonestDiDResults``; dropping them on the single-M branch left downstream tooling with a shape-dependent schema. Finding 2 / lift preserves method. ``_lift_sensitivity`` now preserves the DR ``method`` field when the sensitivity block is not ``ran``. Without it, BR-schema consumers could not distinguish native-routed skips (``method="estimator_native"`` for SDiD / TROP) from methodology-blocked skips (e.g., CS with ``base_period='varying'``) without re-consulting the DR schema. Finding 3 / practitioner step tag alignment. Retag six workflow steps whose ``_step_name="sensitivity"`` caused ``_collect_next_steps`` to suppress them after HonestDiD ran — they recommend specification variation or placebo sweeps that HonestDiD does not replay. Five steps in SA / ImputationDiD / TwoStageDiD / StackedDiD / EfficientDiD retagged ``specification_comparison``; TROP's in-time-or-in-space placebo retagged ``placebo`` (TROP's native battery surfaces factor-model diagnostics only, no placebo). Same class of mistag as the round-20 Hausman finding (``heterogeneity`` → ``parallel_trends``). Tests: updated ``TestStackedCleanControlSurfacesInSampleBlock`` to assert the corrected no-relabel contract and added an all-eventually-treated-panel regression. 
New test classes: ``TestSpecificationComparisonStepTagPersistsAfterSensitivityRuns`` (5 tests), ``TestTROPInTimePlaceboStepTaggedAsPlacebo`` (1), ``TestBRLiftSensitivityPreservesMethodOnSkip`` (1), ``TestSDiDTROPRejectIncompatiblePrecomputedInputs`` (7), plus a single-M precomputed sensitivity parity test under ``TestPrecomputed``. 214 BR / DR / practitioner tests pass; 210 imputation / two-stage / stacked tests unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 48 ++++- diff_diff/diagnostic_report.py | 43 ++++- diff_diff/practitioner.py | 36 +++- tests/test_business_report.py | 325 ++++++++++++++++++++++++++++++-- tests/test_diagnostic_report.py | 35 ++++ 5 files changed, 462 insertions(+), 25 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 42da7d5d..73102bbf 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -160,6 +160,33 @@ def __init__( f"got {type(diagnostics).__name__}." ) + # Estimator-aware validation for ``honest_did_results``. SDiD / + # TROP route robustness to ``estimator_native_diagnostics`` + # (SDiD: ``in_time_placebo``, ``sensitivity_to_zeta_omega``; + # TROP: factor-model fit metrics) and do not accept HonestDiD + # bounds because they are methodology-incompatible with the + # documented native-routing contract in REPORTING.md. Reject + # the passthrough here so it doesn't silently forward to the + # auto-constructed ``DiagnosticReport`` (which now also + # rejects it at construction time — round-21 P1 CI review on + # PR #318). + if honest_did_results is not None and type(results).__name__ in { + "SyntheticDiDResults", + "TROPResults", + }: + raise ValueError( + f"{type(results).__name__} routes robustness to " + "``estimator_native_diagnostics`` — ``honest_did_results`` " + "is not accepted on this estimator because HonestDiD " + "bounds are methodology-incompatible with the native " + "routing documented in REPORTING.md. 
Use the result " + "object's native diagnostics " + "(SDiD: ``in_time_placebo()``, ``sensitivity_to_zeta_omega()``, " + "``pre_treatment_fit``; TROP: ``effective_rank``, " + "``loocv_score``) — BusinessReport surfaces these " + "automatically under ``estimator_native_diagnostics``." + ) + self._results = results self._honest_did_results = honest_did_results self._auto_diagnostics = auto_diagnostics @@ -491,7 +518,7 @@ def _extract_sample(self) -> Dict[str, Any]: # (StaggeredTripleDiff) as the same dynamic mode. # # Per-estimator fixed-subset field: - # * CS / SA / Imputation / TwoStage / Stacked / EfficientDiD / + # * CS / SA / Imputation / TwoStage / EfficientDiD / # dCDH / ContinuousDiD — ``n_control_units`` is the # never-treated tally; surface as ``n_never_treated``. # * StaggeredTripleDiff — ``n_control_units`` is a composite @@ -501,6 +528,15 @@ def _extract_sample(self) -> Dict[str, Any]: # comparisons (never-treated + future-treated) and does not # map to a never-treated count. Keep on the fixed-count # path even in dynamic mode. + # * Stacked — ``n_control_units`` is "distinct control units + # across the trimmed set" (stacked_did_results.py L59-62). + # Under ``clean_control="not_yet_treated"``, the trimmed + # set uses the rule ``A_s > a + kappa_post`` which admits + # future-treated controls; it is NOT a never-treated tally + # and cannot be relabeled as ``n_never_treated``. Keep + # Stacked on the fixed-count path (round-21 P1 CI review + # on PR #318 flagged the earlier relabeling as a + # semantic-contract violation). 
control_group = _control_group_choice(r) name = type(r).__name__ n_never_treated: Optional[int] = None @@ -511,7 +547,6 @@ def _extract_sample(self) -> Dict[str, Any]: "SunAbrahamResults", "ImputationDiDResults", "TwoStageDiDResults", - "StackedDiDResults", "EfficientDiDResults", "ChaisemartinDHaultfoeuilleResults", "ContinuousDiDResults", @@ -638,9 +673,18 @@ def _lift_sensitivity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: return {"status": "skipped", "reason": "auto_diagnostics=False"} sens = dr.get("sensitivity") or {} if sens.get("status") != "ran": + # Preserve ``method`` through to the BR schema so downstream + # consumers can distinguish a native-routed skip + # (``method="estimator_native"`` for SDiD / TROP, where + # robustness is covered by the native battery) from a + # methodology-blocked skip (e.g., CS with + # ``base_period='varying'``). Without it, agents reading the BR + # schema alone cannot tell these cases apart and would have to + # re-consult the DR schema to disambiguate. return { "status": sens.get("status", "not_run"), "reason": sens.get("reason"), + "method": sens.get("method"), } return { "status": "computed", diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 558a5517..87b2c597 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -346,6 +346,37 @@ def __init__( "``heterogeneity``, and ``epv`` are read directly from the " "fitted result and do not accept precomputed overrides." ) + + # Estimator-aware precomputed validation. SDiD / TROP route + # robustness to ``estimator_native_diagnostics`` (SDiD: weighted + # pre-treatment fit, in-time placebo, zeta-omega sensitivity; + # TROP: factor-model fit metrics), and TROP PT is not applicable + # (factor-model identification, not PT). 
Accepting generic + # HonestDiD / parallel-trends precomputed inputs on these + # estimators would surface methodology-incompatible diagnostics + # through the generic report sections — the opposite of the + # native-routing contract documented in REPORTING.md. + # Round-21 P1 CI review on PR #318 flagged this bypass. + _result_name = type(self._results).__name__ + _native_routed_names = {"SyntheticDiDResults", "TROPResults"} + if _result_name in _native_routed_names: + _incompatible_keys = [] + if "sensitivity" in self._precomputed: + _incompatible_keys.append("sensitivity") + if "parallel_trends" in self._precomputed: + _incompatible_keys.append("parallel_trends") + if _incompatible_keys: + raise ValueError( + f"{_result_name} routes robustness and pre-trends " + "diagnostics to ``estimator_native_diagnostics`` — " + "generic HonestDiD and parallel-trends precomputed " + "passthroughs are methodology-incompatible with this " + f"estimator. Rejected precomputed keys: {sorted(_incompatible_keys)}. " + "Use the native diagnostics on the result object " + "(SDiD: ``in_time_placebo``, ``sensitivity_to_zeta_omega``, " + "``pre_treatment_fit``; TROP: ``effective_rank``, " + "``loocv_score``) — DR surfaces these automatically." + ) self._outcome_label = outcome_label self._treatment_label = treatment_label self._cached: Optional[DiagnosticReportResults] = None @@ -1258,7 +1289,15 @@ def _format_sensitivity_results(self, sens: Any) -> Dict[str, Any]: } def _format_precomputed_sensitivity(self, obj: Any) -> Dict[str, Any]: - """Accept either ``SensitivityResults`` (grid) or ``HonestDiDResults`` (single M).""" + """Accept either ``SensitivityResults`` (grid) or ``HonestDiDResults`` (single M). 
+ + The single-M branch preserves ``original_estimate`` and + ``original_se`` for parity with the grid branch — both + ``SensitivityResults`` and ``HonestDiDResults`` carry these fields, + and downstream tooling that reads the schema should see a + consistent shape regardless of which object was passed. (The + grid path surfaces them via ``_format_sensitivity_results``.) + """ if hasattr(obj, "M_values") and hasattr(obj, "breakdown_M"): formatted = self._format_sensitivity_results(obj) formatted["precomputed"] = True @@ -1282,6 +1321,8 @@ def _format_precomputed_sensitivity(self, obj: Any) -> Dict[str, Any]: } ], "breakdown_M": None, + "original_estimate": _to_python_float(getattr(obj, "original_estimate", None)), + "original_se": _to_python_float(getattr(obj, "original_se", None)), "conclusion": "single_M_precomputed", "precomputed": True, } diff --git a/diff_diff/practitioner.py b/diff_diff/practitioner.py index d6d550b9..d094a3b4 100644 --- a/diff_diff/practitioner.py +++ b/diff_diff/practitioner.py @@ -388,7 +388,12 @@ def _handle_sa(results: Any): "# sa_alt = SunAbraham(control_group='not_yet_treated')" ), priority="medium", - step_name="sensitivity", + # DR's sensitivity section runs HonestDiD, not specification + # variation; tagging this as ``sensitivity`` caused + # ``_collect_next_steps`` to suppress it after HonestDiD ran. + # Use ``specification_comparison`` so the recommendation + # persists alongside a completed HonestDiD sensitivity check. + step_name="specification_comparison", ), _step( baker_step=7, @@ -431,7 +436,10 @@ def _handle_imputation(results: Any): "# Leave-one-cohort-out sensitivity analysis" ), priority="medium", - step_name="sensitivity", + # See note on SA handler: DR completes ``sensitivity`` when + # HonestDiD runs, which is unrelated to this specification- + # variation recommendation. Tag separately. 
+ step_name="specification_comparison", ), _robustness_compare_step("CS, SA, or Gardner"), _covariates_step(), @@ -457,7 +465,10 @@ def _handle_two_stage(results: Any): "# Leave-one-cohort-out sensitivity analysis" ), priority="medium", - step_name="sensitivity", + # See note on SA handler: DR completes ``sensitivity`` when + # HonestDiD runs, which is unrelated to this specification- + # variation recommendation. Tag separately. + step_name="specification_comparison", ), _robustness_compare_step("CS, BJS, or SA"), _covariates_step(), @@ -482,7 +493,10 @@ def _handle_stacked(results: Any): "# stacked_alt = StackedDiD(clean_control='not_yet_treated')" ), priority="medium", - step_name="sensitivity", + # See note on SA handler: DR completes ``sensitivity`` when + # HonestDiD runs, which does not replay ``clean_control`` + # variation. Tag separately. + step_name="specification_comparison", ), _step( baker_step=7, @@ -624,7 +638,12 @@ def _handle_trop(results: Any): "# Leave-one-out: drop each treated unit and re-estimate" ), priority="medium", - step_name="sensitivity", + # TROP's estimator-native diagnostics surface factor-model fit + # metrics, not in-time or in-space placebos; DR does not run + # placebos on TROP. Tag separately from ``sensitivity`` so the + # recommendation persists after DR marks the TROP native + # battery complete. + step_name="placebo", ), _robustness_compare_step("SyntheticDiD or CS"), ] @@ -648,7 +667,12 @@ def _handle_efficient(results: Any): "# edid_alt = EfficientDiD(control_group='last_cohort')" ), priority="medium", - step_name="sensitivity", + # See note on SA handler: DR completes ``sensitivity`` when + # HonestDiD runs, which does not re-estimate with an + # alternative control_group. Tag separately so this + # recommendation persists alongside a completed HonestDiD + # block. 
+ step_name="specification_comparison", ), _step( baker_step=7, diff --git a/tests/test_business_report.py b/tests/test_business_report.py index e64be987..be32a109 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1185,16 +1185,22 @@ def test_stacked_fit_persists_anticipation(self): class TestStackedCleanControlSurfacesInSampleBlock: """Pre-emptive audit regression: ``StackedDiD`` exposes its control- group choice as ``clean_control`` (the public Wing-Freedman- - Hollingsworth-2024 kwarg name), not ``control_group``. The business- - report sample-block treatment for ``"not_yet_treated"`` (dynamic - control comparison) must still fire — otherwise a Stacked fit with - ``clean_control="not_yet_treated"`` surfaces as ``control_group=None`` - with ``dynamic_control=False``, which misreports the sample semantics - the same way R17/R18 flagged for EfficientDiD's ``control_group`` - handling. + Hollingsworth-2024 kwarg name), not ``control_group``. The BR sample + block must normalize the key so downstream agents see a consistent + ``control_group`` field across estimators. + + ``n_control_units`` on ``StackedDiDResults`` is documented as + "distinct control units across the trimmed set" (stacked_did_results + L59-62). Under ``clean_control="not_yet_treated"`` the trimmed set + admits future-treated controls by construction, so the count is + NOT a never-treated tally and must not be relabeled as + ``n_never_treated`` — round-21 P1 CI review on PR #318 flagged the + prior relabeling as a semantic-contract violation because it can + fabricate never-treated support that does not exist (e.g., in an + all-eventually-treated panel). 
""" - def test_stacked_not_yet_treated_surfaces_as_dynamic_control(self): + def test_stacked_not_yet_treated_surfaces_without_never_treated_relabel(self): from diff_diff import StackedDiD sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) @@ -1203,16 +1209,19 @@ def test_stacked_not_yet_treated_surfaces_as_dynamic_control(self): ) assert getattr(st, "clean_control", None) == "not_yet_treated" sample = BusinessReport(st, auto_diagnostics=False).to_dict()["sample"] - # The clean_control choice must be normalized to control_group in - # the schema so downstream agents see a consistent key across - # estimators. + # clean_control normalizes into control_group. assert sample["control_group"] == "not_yet_treated" assert sample["dynamic_control"] is True - # Under the dynamic-control branch, the fixed tally is relabeled: - # n_never_treated carries the fixed never-treated subset and - # n_control is set to None. - assert sample["n_never_treated"] is not None - assert sample["n_control"] is None + # n_control_units is "distinct control units in the trimmed set"; + # that count includes future-treated controls and must not be + # relabeled as n_never_treated. + assert sample["n_never_treated"] is None, ( + "StackedDiDResults.n_control_units is the distinct-control-" + "units tally of the trimmed set (includes future-treated " + "controls); it must not be surfaced as n_never_treated." + ) + # The count stays on the n_control path. 
+ assert sample["n_control"] == int(st.n_control_units) def test_stacked_never_treated_surfaces_as_fixed_control(self): from diff_diff import StackedDiD @@ -1225,6 +1234,40 @@ def test_stacked_never_treated_surfaces_as_fixed_control(self): assert sample["control_group"] == "never_treated" assert sample["dynamic_control"] is False + def test_stacked_all_eventually_treated_panel_does_not_fabricate_never_treated(self): + """All-eventually-treated stacked panel with + ``clean_control="not_yet_treated"`` must not claim any + never-treated units, because every unit is eventually treated + (the round-21 reviewer example). + """ + import pandas as pd + + from diff_diff import StackedDiD + + # Every unit is eventually treated (no never-treated). + # Multiple cohorts so Stacked has something to stack against. + sdf = generate_staggered_data( + n_units=80, + n_periods=10, + never_treated_frac=0.0, + treatment_effect=1.5, + seed=7, + ) + # Sanity: the fixture has no never-treated units. + assert sdf[sdf["first_treat"] == 0].empty + + st = StackedDiD( + clean_control="not_yet_treated", kappa_pre=1, kappa_post=1 + ).fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + sample = BusinessReport(st, auto_diagnostics=False).to_dict()["sample"] + assert sample["n_never_treated"] is None, ( + "All-eventually-treated panel under clean_control='not_yet_treated' " + "must not surface any never-treated count; the trimmed stack " + "contains only future-treated controls." + ) + class TestDCDHPhase3AssumptionClause: """Pre-emptive audit regression: ``ChaisemartinDHaultfoeuilleResults`` @@ -2035,6 +2078,256 @@ class EfficientDiDResults: ) +class TestSpecificationComparisonStepTagPersistsAfterSensitivityRuns: + """Pre-emptive audit regression: several practitioner handlers + previously tagged their "compare specifications" / "vary control + group" step as ``_step_name="sensitivity"``. 
DR marks ``sensitivity`` + complete when HonestDiD runs — which is orthogonal to the + specification-variation recommendation — so these steps were + incorrectly suppressed from ``next_steps`` after a fit with + HonestDiD sensitivity. Retag as ``specification_comparison`` so the + recommendations persist alongside a completed HonestDiD block. Same + class of mistag as the round-20 Hausman finding (which was about + ``heterogeneity`` vs ``parallel_trends``). + """ + + @staticmethod + def _build_stub(class_name: str, **extras): + stub_cls = type(class_name, (), {}) + stub = stub_cls() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 400 + stub.n_treated = 100 + stub.n_control = 300 + stub.survey_metadata = None + stub.event_study_effects = None + for k, v in extras.items(): + setattr(stub, k, v) + return stub + + @staticmethod + def _step_labels_after_completed(stub, completed): + from diff_diff.practitioner import practitioner_next_steps + + return [ + s.get("label", "") + for s in practitioner_next_steps( + stub, completed_steps=completed, verbose=False + )["next_steps"] + ] + + def test_sa_specification_falsification_persists_after_sensitivity_runs(self): + stub = self._build_stub("SunAbrahamResults") + labels = self._step_labels_after_completed(stub, completed=["sensitivity"]) + assert any( + "Specification-based falsification" in lab for lab in labels + ), ( + "SA's 'Specification-based falsification' step must persist " + "after DR marks sensitivity complete — HonestDiD does not run " + "control_group / anticipation variation." 
+ ) + + def test_imputation_specification_falsification_persists_after_sensitivity_runs(self): + stub = self._build_stub("ImputationDiDResults") + labels = self._step_labels_after_completed(stub, completed=["sensitivity"]) + assert any("Specification-based falsification" in lab for lab in labels) + + def test_two_stage_specification_falsification_persists_after_sensitivity_runs(self): + stub = self._build_stub("TwoStageDiDResults") + labels = self._step_labels_after_completed(stub, completed=["sensitivity"]) + assert any("Specification-based falsification" in lab for lab in labels) + + def test_stacked_clean_control_variation_persists_after_sensitivity_runs(self): + stub = self._build_stub("StackedDiDResults") + labels = self._step_labels_after_completed(stub, completed=["sensitivity"]) + assert any("Vary clean control" in lab for lab in labels), ( + "StackedDiD's 'Vary clean control definition' step must " + "persist after DR marks sensitivity complete — HonestDiD does " + "not replay clean_control variation." + ) + + def test_efficient_compare_control_groups_persists_after_sensitivity_runs(self): + stub = self._build_stub("EfficientDiDResults", pt_assumption="all") + labels = self._step_labels_after_completed(stub, completed=["sensitivity"]) + assert any("Compare control group definitions" in lab for lab in labels), ( + "EfficientDiD's 'Compare control group definitions' step " + "must persist after DR marks sensitivity complete — HonestDiD " + "does not re-estimate with alternative control_group." + ) + + +class TestTROPInTimePlaceboStepTaggedAsPlacebo: + """Pre-emptive audit regression: the TROP practitioner workflow + step "In-time or in-space placebo" was previously tagged + ``_step_name="sensitivity"``. 
TROP's estimator-native diagnostics + surface factor-model fit metrics (``effective_rank``, ``loocv_score``, + selected lambdas) — not placebos — and + ``DiagnosticReport._collect_next_steps`` marks ``sensitivity`` complete + for SDiD / TROP when the native battery runs. That suppressed the + TROP placebo recommendation unjustly. Retag as ``placebo`` so it + persists. + """ + + def test_trop_placebo_step_persists_after_native_sensitivity_completion(self): + from diff_diff.practitioner import practitioner_next_steps + + class TROPResults: + pass + + stub = TROPResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 400 + stub.n_treated = 40 + stub.n_control = 360 + stub.survey_metadata = None + stub.event_study_effects = None + + labels = [ + s.get("label", "") + for s in practitioner_next_steps( + stub, completed_steps=["sensitivity"], verbose=False + )["next_steps"] + ] + assert any( + "In-time or in-space placebo" in lab for lab in labels + ), ( + "TROP's placebo recommendation must persist after DR marks " + "sensitivity complete (SDiD/TROP native battery) — factor-" + "model diagnostics are not a placebo substitute." + ) + + +class TestSDiDTROPRejectIncompatiblePrecomputedInputs: + """Round-21 P1 CI review on PR #318: ``precomputed={"sensitivity": + ...}`` and ``BusinessReport(honest_did_results=...)`` previously + short-circuited the SDiD / TROP native-routing guards, letting the + generic report sections surface methodology-incompatible HonestDiD + or generic PT diagnostics on estimators that route robustness to + ``estimator_native_diagnostics``. DR / BR must now reject those + passthroughs with a clear error pointing users at the native + diagnostics on the result object. 
+ """ + + @staticmethod + def _dummy_sens_object(): + from types import SimpleNamespace + + return SimpleNamespace( + M_values=[0.5, 1.0], + bounds=[(0.1, 2.0), (-0.2, 2.5)], + robust_cis=[(0.05, 2.1), (-0.3, 2.6)], + breakdown_M=0.75, + method="relative_magnitude", + original_estimate=1.0, + original_se=0.2, + alpha=0.05, + ) + + @staticmethod + def _dummy_pt_object(): + from types import SimpleNamespace + + return SimpleNamespace( + joint_p_value=0.2, n_pre_periods=3, method="event_study" + ) + + def test_dr_rejects_precomputed_sensitivity_on_sdid(self, sdid_fit): + from diff_diff import DiagnosticReport + + fit, _ = sdid_fit + with pytest.raises(ValueError, match="estimator_native_diagnostics"): + DiagnosticReport(fit, precomputed={"sensitivity": self._dummy_sens_object()}) + + def test_dr_rejects_precomputed_parallel_trends_on_sdid(self, sdid_fit): + from diff_diff import DiagnosticReport + + fit, _ = sdid_fit + with pytest.raises(ValueError, match="estimator_native_diagnostics"): + DiagnosticReport(fit, precomputed={"parallel_trends": self._dummy_pt_object()}) + + def test_br_rejects_honest_did_results_on_sdid(self, sdid_fit): + fit, _ = sdid_fit + with pytest.raises(ValueError, match="estimator_native_diagnostics"): + BusinessReport(fit, honest_did_results=self._dummy_sens_object()) + + def test_dr_rejects_precomputed_sensitivity_on_trop(self): + """TROP construction is expensive; use a stub with the right name.""" + from diff_diff import DiagnosticReport + + class TROPResults: + pass + + stub = TROPResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.alpha = 0.05 + stub.n_obs = 100 + with pytest.raises(ValueError, match="estimator_native_diagnostics"): + DiagnosticReport(stub, precomputed={"sensitivity": self._dummy_sens_object()}) + + def test_dr_rejects_precomputed_parallel_trends_on_trop(self): + from diff_diff import DiagnosticReport + + class TROPResults: + pass + + stub = TROPResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + 
stub.alpha = 0.05 + stub.n_obs = 100 + with pytest.raises(ValueError, match="estimator_native_diagnostics"): + DiagnosticReport(stub, precomputed={"parallel_trends": self._dummy_pt_object()}) + + def test_dr_still_accepts_precomputed_on_compatible_estimators(self, cs_fit): + """CS remains a valid passthrough target — the guardrail is + estimator-specific, not a blanket ban. + """ + from diff_diff import DiagnosticReport + + fit, _ = cs_fit + # Should not raise. + DiagnosticReport(fit, precomputed={"sensitivity": self._dummy_sens_object()}) + + def test_br_still_accepts_honest_did_results_on_compatible_estimators(self, cs_fit): + fit, _ = cs_fit + # Should not raise. + BusinessReport(fit, honest_did_results=self._dummy_sens_object()) + + +class TestBRLiftSensitivityPreservesMethodOnSkip: + """Pre-emptive audit regression: ``_lift_sensitivity`` previously + dropped the ``method`` field from BR's ``sensitivity`` block when + ``status != "ran"``. That forced BR-schema consumers to re-consult + the DR schema to distinguish native-routed skips + (``method="estimator_native"`` for SDiD / TROP, where robustness is + covered by the native battery) from methodology-blocked skips (e.g., + CS with ``base_period='varying'``). Preserving the field keeps BR + self-describing. + """ + + def test_sdid_br_schema_exposes_native_method_on_sensitivity_skip(self, sdid_fit): + fit, _ = sdid_fit + sens_block = BusinessReport(fit).to_dict()["sensitivity"] + assert sens_block["status"] == "skipped" + # The round-20 DR fix set method="estimator_native"; BR must pass + # it through so an agent consuming BR alone can tell this is a + # native-routed skip. + assert sens_block.get("method") == "estimator_native", ( + "BR's sensitivity block must preserve method='estimator_native' " + "when DR emitted it; otherwise downstream agents cannot " + f"distinguish native routing from methodology blocks. 
Got: {sens_block}" + ) + + class TestHausmanTestStatisticPopulated: """Round-10 P3 regression: ``HausmanPretestResult`` exposes ``statistic`` (not ``test_statistic``); the DR schema was previously diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 1f416c3a..dd46a9a1 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -412,6 +412,41 @@ class _PPStub: # Downgrade must apply: pre-tier is well_powered, post-tier is moderately_powered. assert block["tier"] == "moderately_powered" + def test_precomputed_single_m_sensitivity_exposes_original_estimate_and_se(self, cs_fit): + """Pre-emptive audit regression: ``_format_precomputed_sensitivity`` + used to drop ``original_estimate`` and ``original_se`` on the + single-M ``HonestDiDResults`` branch, even though both + ``SensitivityResults`` and ``HonestDiDResults`` carry those fields. + The grid branch surfaces them via ``_format_sensitivity_results``, + so dropping them on the single-M branch made the schema shape + dependent on which object type the user passed. Parity fix: the + single-M branch now carries the same fields. + """ + from types import SimpleNamespace + + fit, _ = cs_fit + single_m = SimpleNamespace( + lb=0.3, + ub=1.8, + ci_lb=0.15, + ci_ub=1.95, + M=1.0, + method="relative_magnitude", + original_estimate=1.05, + original_se=0.22, + alpha=0.05, + ) + + block = DiagnosticReport( + fit, precomputed={"sensitivity": single_m} + ).to_dict()["sensitivity"] + assert block["status"] == "ran" + assert block["conclusion"] == "single_M_precomputed" + # Parity with the grid branch: these fields must be present and + # reflect the passed object's values. 
+ assert block["original_estimate"] == 1.05 + assert block["original_se"] == 0.22 + # --------------------------------------------------------------------------- # Verdict / tier helpers From 722400d33ba9f4df47a4cfe99999bccaa99f734e Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 09:38:28 -0400 Subject: [PATCH 25/48] Address twenty-second round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 methodology (Stacked assumption block). Add a dedicated ``StackedDiDResults`` branch in ``_describe_assumption``. Previously StackedDiD was routed through the generic "group-time ATT parallel trends + no anticipation" clause used for CS / SA / Imputation / TwoStage / Wooldridge, which understates its identifying contract. The new branch names Wing-Freedman-Hollingsworth (2024) sub- experiment common trends, the IC1 (event window fits in data) and IC2 (clean controls exist) inclusion conditions, and the active ``clean_control`` rule: - ``never_treated`` -> ``A_s = infinity``; - ``not_yet_treated`` -> ``A_s > a + kappa_post`` (includes future-treated controls); - ``strict`` -> ``A_s > a + kappa_post + kappa_pre`` (strictly untreated across the full event window). Refs: REGISTRY.md Sec.StackedDiD lines 1189-1193 and 1234-1256. P1 methodology (Stacked sample rendering). The prior round only stopped the ``n_never_treated`` relabel; the BR summary and full report still narrated Stacked dynamic clean-control designs as if they had a fixed control pool. Two fixes: 1) ``_extract_sample`` treats both ``clean_control='not_yet_treated'`` and ``'strict'`` as dynamic, clears ``n_control`` under both modes, and exposes the distinct-control-units tally under the dedicated ``n_distinct_controls_trimmed`` key so downstream agents see the sub-experiment-specific context without misreading it as a never-treated tally. 
2) ``_render_summary`` / ``_render_full_report`` unwrap ``schema["estimator"]["class_name"]`` (previously compared a dict to a string, so the Stacked-specific branch never fired) and emit a Stacked-specific clean-control sentence naming the active ``clean_control`` rule and the trimmed distinct-control count, instead of falling through to the generic "dynamic not-yet-treated" phrasing. P1 code quality (precomputed PT / Bacon applicability gate). ``DiagnosticReport.__init__`` advertises ``precomputed["parallel_trends"]`` and ``precomputed["bacon"]`` as supported, but ``_instance_skip_reason`` bypassed the gate only for ``sensitivity`` and ``pretrends_power``. As a result, the two user-facing passthroughs that matter most in practice — a precomputed Hausman-style PT object when DR cannot replay the EfficientDiD fit, and a precomputed ``BaconDecompositionResults`` when raw panel kwargs are unavailable at report time — were skipped before the runner could fire. Add the missing bypasses so advertised contracts land on the runners. Tests: updated the existing ``TestStackedCleanControlSurfacesInSampleBlock`` to assert the corrected no-fixed-control contract plus a new ``strict``-mode regression. New test classes: - ``TestStackedDiDAssumptionBlock`` (3 tests): asserts the dedicated variant, IC1/IC2 naming, and per-``clean_control`` rule text; - ``TestStackedRenderingNarratesDynamicControl`` (2 tests): asserts ``summary()`` and ``full_report()`` no longer render a fixed "treated, control" clause under dynamic clean-control modes and name the sub-experiment-specific comparison explicitly; - ``TestPrecomputed`` gains ``test_precomputed_parallel_trends_bypasses_applicability_gate`` and ``test_precomputed_bacon_bypasses_applicability_gate`` to pin the runtime PT / Bacon passthrough behavior. 222 BR / DR / practitioner tests pass; 49 stacked estimator tests unaffected.
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 174 ++++++++++++++++++++++++++++---- diff_diff/diagnostic_report.py | 18 ++++ tests/test_business_report.py | 144 ++++++++++++++++++++++++-- tests/test_diagnostic_report.py | 53 ++++++++++ 4 files changed, 360 insertions(+), 29 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 73102bbf..40ac7666 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -554,11 +554,33 @@ def _extract_sample(self) -> Dict[str, Any]: _canonical_control = ( control_group.replace("_", "").lower() if isinstance(control_group, str) else None ) - is_dynamic_control = _canonical_control == "notyettreated" + # Stacked has two dynamic (sub-experiment-specific) modes: + # ``not_yet_treated`` (A_s > a + kappa_post) and ``strict`` + # (A_s > a + kappa_post + kappa_pre). Only ``never_treated`` + # (A_s = infinity) is a fixed never-treated pool. Round-22 P1 + # CI review on PR #318 flagged that ``strict`` was being + # misrendered as a fixed control design. + is_stacked_dynamic = ( + name == "StackedDiDResults" + and _canonical_control in {"notyettreated", "strict"} + ) + is_dynamic_control = ( + _canonical_control == "notyettreated" or is_stacked_dynamic + ) if is_dynamic_control: if name == "StaggeredTripleDiffResults": n_never_enabled = _safe_int(getattr(r, "n_never_enabled", None)) n_control = None + elif name == "StackedDiDResults": + # ``n_control_units`` is "distinct control units across + # the trimmed set" (stacked_did_results.py L59-62) which + # includes future-treated controls by construction under + # both dynamic modes. Do NOT relabel as + # ``n_never_treated``; instead surface the count under + # ``n_distinct_controls_trimmed`` (sub-experiment- + # specific context) and clear ``n_control`` so the + # report does not narrate a fixed control pool. 
+ n_control = None elif _never_treated_count_contract: n_never_treated = n_control_units n_control = None @@ -577,6 +599,12 @@ def _extract_sample(self) -> Dict[str, Any]: } if n_never_enabled is not None: sample_block["n_never_enabled"] = n_never_enabled + # Stacked-specific: surface the distinct-control-units tally on a + # dedicated key so agents see the sub-experiment-specific + # comparison count without misreading it as a never-treated + # subset (round-21 / round-22 CI review). + if name == "StackedDiDResults": + sample_block["n_distinct_controls_trimmed"] = n_control_units return sample_block def _extract_survey_block(self) -> Optional[Dict[str, Any]]: @@ -1023,12 +1051,64 @@ def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, if isinstance(control_group, str): block["control_group"] = control_group return block + if estimator_name == "StackedDiDResults": + # Wing, Freedman & Hollingsworth (2024) — identification is + # sub-experiment common trends plus the IC1 (event window fits + # within the data range) and IC2 (clean controls exist for the + # event) inclusion conditions, NOT the generic "group-time ATT + # parallel trends" clause used for CS / SA / etc. (round-22 P1 + # CI review on PR #318). The active ``clean_control`` rule + # determines which units qualify as valid controls for each + # adoption event. REGISTRY.md §StackedDiD lines 1189-1193 + # (identification) and 1234-1256 (clean-control rules). + clean_control = getattr(results, "clean_control", None) + if clean_control == "never_treated": + control_clause = ( + "controls are restricted to units that are never treated " + "over the panel (``A_s = infinity``)" + ) + elif clean_control == "strict": + control_clause = ( + "controls for event ``a`` are units satisfying the strict " + "rule ``A_s > a + kappa_post + kappa_pre`` (strictly " + "untreated across the full pre- and post-event window)" + ) + else: + # Default: "not_yet_treated" — A_s > a + kappa_post. 
+ control_clause = ( + "controls for event ``a`` are units satisfying ``A_s > a + " + "kappa_post`` (not yet treated through the end of the " + "event's post-window, so future-treated units can serve " + "as controls for earlier events)" + ) + block: Dict[str, Any] = { + "parallel_trends_variant": "stacked_sub_experiment", + "no_anticipation": True, + "description": ( + "Identification under Stacked DiD (Wing, Freedman & " + "Hollingsworth 2024): within each stacked sub-experiment " + "parallel trends holds between the treated cohort and the " + "corresponding clean-control set over the event window " + "``[-kappa_pre, +kappa_post]``; " + + control_clause + + ". Sub-experiments are restricted by IC1 (the event " + "window fits within the available time range) and IC2 " + "(at least one clean control exists). The aggregate ATT is " + "a weighted sum over sub-experiments, so the common-trends " + "assumption is sub-experiment-specific, not a single " + "panel-wide group-time ATT condition. Also assumes no " + "anticipation." + ), + } + if isinstance(clean_control, str): + block["control_group"] = clean_control + block["clean_control"] = clean_control + return block if estimator_name in { "CallawaySantAnnaResults", "SunAbrahamResults", "ImputationDiDResults", "TwoStageDiDResults", - "StackedDiDResults", "WooldridgeDiDResults", }: return { @@ -1615,19 +1695,27 @@ def _render_summary(schema: Dict[str, Any]) -> str: f"pre-period variation." ) - # Sample sentence. For fits with a dynamic not-yet-treated - # comparison set (CS / ContinuousDiD / StaggeredTripleDiff / - # EfficientDiD) the fixed control count is suppressed because the - # comparison group varies by (g, t) cell; narrate the mode - # explicitly rather than misreporting a fixed-subset tally as - # "control" (rounds 13 / 17 / 18 CI review). + # Sample sentence. 
For fits with a dynamic comparison set (CS / + # ContinuousDiD / StaggeredTripleDiff / EfficientDiD / + # StackedDiD under ``clean_control in {"not_yet_treated", + # "strict"}``) the fixed control count is suppressed because the + # comparison group varies by cohort/sub-experiment; narrate the + # mode explicitly rather than misreporting a fixed-subset tally as + # "control" (rounds 13 / 17 / 18 / 22 CI review). sample = schema.get("sample", {}) or {} + # ``schema["estimator"]`` is a dict with ``class_name``; unwrap it + # for the per-estimator dynamic-control phrasing branch below. + estimator_block = schema.get("estimator") or {} + estimator = ( + estimator_block.get("class_name") if isinstance(estimator_block, dict) else None + ) n_obs = sample.get("n_obs") n_t = sample.get("n_treated") n_c = sample.get("n_control") n_nt = sample.get("n_never_treated") n_ne = sample.get("n_never_enabled") is_dynamic = sample.get("dynamic_control") + cg = sample.get("control_group") if isinstance(n_obs, int): if isinstance(n_t, int) and isinstance(n_c, int): sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).") @@ -1638,11 +1726,32 @@ def _render_summary(schema: Dict[str, Any]) -> str: subset_clause = f"; {n_nt:,} never-treated units are also present" else: subset_clause = "" - sentences.append( - f"Sample: {n_obs:,} observations ({n_t:,} treated) with a " - "dynamic not-yet-treated comparison group (the control set " - f"varies by cohort and period){subset_clause}." - ) + # Estimator-specific dynamic-comparison phrasing. StackedDiD + # uses sub-experiment-specific clean controls (IC1/IC2 + # trimming) rather than a not-yet-treated rollout; the + # generic phrasing misstates the identification setup. 
+ if estimator == "StackedDiDResults": + cc_label = cg if isinstance(cg, str) else "clean_control" + n_distinct = sample.get("n_distinct_controls_trimmed") + distinct_clause = ( + f" across {n_distinct:,} distinct control units in the trimmed stack" + if isinstance(n_distinct, int) + else "" + ) + sentences.append( + f"Sample: {n_obs:,} observations ({n_t:,} treated) with a " + f"sub-experiment-specific clean-control comparison " + f"(``clean_control='{cc_label}'``): each adoption event is " + f"compared against the units satisfying the rule relative " + f"to that event's window, not a single fixed control " + f"group{distinct_clause}{subset_clause}." + ) + else: + sentences.append( + f"Sample: {n_obs:,} observations ({n_t:,} treated) with a " + "dynamic not-yet-treated comparison group (the control set " + f"varies by cohort and period){subset_clause}." + ) else: sentences.append(f"Sample: {n_obs:,} observations.") survey = sample.get("survey") @@ -1769,12 +1878,19 @@ def _render_full_report(schema: Dict[str, Any]) -> str: if isinstance(sample.get("n_treated"), int): lines.append(f"- Treated: {sample['n_treated']:,}") # ``n_control`` is only populated for estimators whose control set - # is a fixed tally. For dynamic not-yet-treated modes (CS / - # ContinuousDiD / StaggeredTripleDiff / EfficientDiD) the - # comparison group is dynamic per (g, t); report the estimator- + # is a fixed tally. For dynamic modes (CS / ContinuousDiD / + # StaggeredTripleDiff / EfficientDiD / StackedDiD under + # ``clean_control in {"not_yet_treated", "strict"}``) the comparison + # group is dynamic per cohort/sub-experiment; report the estimator- # specific fixed subset (``n_never_enabled`` for triple-difference; - # ``n_never_treated`` elsewhere) when non-zero, then name the - # dynamic-comparison mode explicitly. + # ``n_never_treated`` elsewhere; ``n_distinct_controls_trimmed`` for + # Stacked) when available, then name the dynamic-comparison mode + # explicitly. 
+ estimator_block = schema.get("estimator") or {} + estimator_name = ( + estimator_block.get("class_name") if isinstance(estimator_block, dict) else None + ) + cg = sample.get("control_group") if isinstance(sample.get("n_control"), int): lines.append(f"- Control: {sample['n_control']:,}") elif sample.get("dynamic_control"): @@ -1786,10 +1902,24 @@ def _render_full_report(schema: Dict[str, Any]) -> str: lines.append( f"- Never-treated units present in the panel: {sample['n_never_treated']:,}" ) - lines.append( - "- Comparison group: dynamic not-yet-treated units " - "(varies by cohort and period; no fixed control count)" - ) + if estimator_name == "StackedDiDResults": + n_distinct = sample.get("n_distinct_controls_trimmed") + if isinstance(n_distinct, int): + lines.append( + f"- Distinct control units in trimmed stack: {n_distinct:,}" + ) + cc_label = cg if isinstance(cg, str) else "clean_control" + lines.append( + f"- Comparison group: sub-experiment-specific clean controls " + f"(``clean_control='{cc_label}'``; each adoption event is " + "compared against units satisfying the rule relative to that " + "event's window, not a single fixed control group)" + ) + else: + lines.append( + "- Comparison group: dynamic not-yet-treated units " + "(varies by cohort and period; no fixed control count)" + ) survey = sample.get("survey") if survey: if survey.get("is_trivial"): diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 87b2c597..68a8e1ad 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -494,6 +494,15 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: return "Estimator did not produce results.epv_diagnostics for this fit." return None if check == "parallel_trends": + # Precomputed parallel-trends always unlocks this check. 
The + # EfficientDiD Hausman skip message already points users at + # ``precomputed={'parallel_trends': ...}`` when replay fails + # (DR / survey fits), so applicability must honor the + # override before the replay-gate below fires. Round-22 P1 + # CI review on PR #318 flagged that PT precomputed was + # advertised but skipped before use. + if "parallel_trends" in self._precomputed: + return None method = _PT_METHOD.get(name) if method == "two_x_two": # Mirror the full argument contract of ``_pt_two_x_two``: @@ -644,6 +653,15 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: return "HonestDiD requires at least one pre-period coefficient." return None if check == "bacon": + # Precomputed Bacon always unlocks this check. Users with an + # already-computed ``BaconDecompositionResults`` (e.g., run + # separately against a stored panel that isn't available at + # report time) need the passthrough to land on the Bacon + # runner instead of being skipped for missing column kwargs. + # Round-22 P1 CI review on PR #318 flagged that Bacon + # precomputed was advertised but skipped before use. + if "bacon" in self._precomputed: + return None # ``BaconDecompositionResults`` carries the decomposition # directly; no data/column kwargs needed. if name == "BaconDecompositionResults": diff --git a/tests/test_business_report.py b/tests/test_business_report.py index be32a109..ef4d90c3 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1200,7 +1200,13 @@ class TestStackedCleanControlSurfacesInSampleBlock: all-eventually-treated panel). 
""" - def test_stacked_not_yet_treated_surfaces_without_never_treated_relabel(self): + def test_stacked_not_yet_treated_surfaces_as_dynamic_without_never_treated_relabel(self): + """``clean_control='not_yet_treated'`` is a dynamic, sub- + experiment-specific comparison set (``A_s > a + kappa_post``); + ``n_control`` is cleared (not a fixed tally), ``n_never_treated`` + is NOT relabeled, and the distinct-controls tally is surfaced + under the dedicated ``n_distinct_controls_trimmed`` key. + """ from diff_diff import StackedDiD sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) @@ -1209,19 +1215,42 @@ def test_stacked_not_yet_treated_surfaces_without_never_treated_relabel(self): ) assert getattr(st, "clean_control", None) == "not_yet_treated" sample = BusinessReport(st, auto_diagnostics=False).to_dict()["sample"] - # clean_control normalizes into control_group. assert sample["control_group"] == "not_yet_treated" assert sample["dynamic_control"] is True - # n_control_units is "distinct control units in the trimmed set"; - # that count includes future-treated controls and must not be - # relabeled as n_never_treated. assert sample["n_never_treated"] is None, ( "StackedDiDResults.n_control_units is the distinct-control-" "units tally of the trimmed set (includes future-treated " "controls); it must not be surfaced as n_never_treated." ) - # The count stays on the n_control path. - assert sample["n_control"] == int(st.n_control_units) + # Round-22 correction: ``n_control`` must be cleared under + # dynamic modes so the report does not narrate a fixed control + # tally. The underlying count is surfaced under the dedicated + # Stacked key. 
+ assert sample["n_control"] is None + assert sample["n_distinct_controls_trimmed"] == int(st.n_control_units) + + def test_stacked_strict_clean_control_surfaces_as_dynamic(self): + """``clean_control='strict'`` (``A_s > a + kappa_post + kappa_pre``) + is also a sub-experiment-specific rule — stricter than + ``not_yet_treated`` but still NOT a fixed never-treated pool + (round-22 P1 CI review on PR #318). + """ + from diff_diff import StackedDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + st = StackedDiD(clean_control="strict").fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + sample = BusinessReport(st, auto_diagnostics=False).to_dict()["sample"] + assert sample["control_group"] == "strict" + assert sample["dynamic_control"] is True, ( + "clean_control='strict' is sub-experiment-specific (rule " + "A_s > a + kappa_post + kappa_pre) and must be marked dynamic " + "so the report does not claim a fixed never-treated control " + "pool." + ) + assert sample["n_control"] is None + assert sample["n_never_treated"] is None def test_stacked_never_treated_surfaces_as_fixed_control(self): from diff_diff import StackedDiD @@ -1269,6 +1298,107 @@ def test_stacked_all_eventually_treated_panel_does_not_fabricate_never_treated(s ) +class TestStackedDiDAssumptionBlock: + """Round-22 P1 regression: ``StackedDiDResults`` must get a + dedicated assumption description reflecting Wing-Freedman- + Hollingsworth (2024) identification — sub-experiment common trends + plus IC1 (event window fits) and IC2 (clean controls exist) — not + the generic "group-time ATT" clause used for CS / SA / etc. The + active ``clean_control`` rule must be named in the description. 
+ """ + + @staticmethod + def _stub(clean_control: str): + class StackedDiDResults: + pass + + stub = StackedDiDResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 400 + stub.n_treated = 50 + stub.n_control_units = 300 + stub.survey_metadata = None + stub.event_study_effects = None + stub.clean_control = clean_control + return stub + + def test_not_yet_treated_names_subexperiment_contract(self): + br = BusinessReport(self._stub("not_yet_treated"), auto_diagnostics=False) + a = br.to_dict()["assumption"] + assert a["parallel_trends_variant"] == "stacked_sub_experiment" + desc = a["description"] + assert "Wing, Freedman & Hollingsworth 2024" in desc + assert "sub-experiment" in desc + assert "IC1" in desc and "IC2" in desc + assert "A_s > a + kappa_post" in desc + assert "not_yet_treated" not in desc or "``A_s > a + kappa_post``" in desc + # The active clean_control is carried on the block explicitly for + # consumers that want structured access. + assert a["clean_control"] == "not_yet_treated" + + def test_strict_names_strict_rule(self): + desc = BusinessReport( + self._stub("strict"), auto_diagnostics=False + ).to_dict()["assumption"]["description"] + assert "A_s > a + kappa_post + kappa_pre" in desc + + def test_never_treated_names_fixed_pool(self): + desc = BusinessReport( + self._stub("never_treated"), auto_diagnostics=False + ).to_dict()["assumption"]["description"] + assert "never treated" in desc.lower() + assert "A_s = infinity" in desc + + +class TestStackedRenderingNarratesDynamicControl: + """Round-22 P1 regression: BR ``summary()`` / ``full_report()`` must + narrate Stacked dynamic clean-control designs as sub-experiment- + specific comparisons, not as fixed "N treated / M control" samples. + Previously the ``n_control`` branch fired first and misrendered both + ``clean_control='not_yet_treated'`` and ``'strict'``. 
+ """ + + def test_summary_does_not_narrate_stacked_dynamic_as_fixed_control(self): + from diff_diff import StackedDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + st = StackedDiD(clean_control="not_yet_treated").fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + summary = BusinessReport(st, auto_diagnostics=False).summary() + # Must NOT render a "X treated, Y control" clause (that narration + # implies a fixed comparison pool). + import re + + assert not re.search(r"\d[\d,]*\s+treated,\s+\d[\d,]*\s+control", summary), ( + f"Stacked with dynamic clean-control must not be narrated " + f"as fixed treated/control counts. Got: {summary!r}" + ) + # Must narrate the sub-experiment-specific clean-control contract. + assert "sub-experiment-specific clean-control" in summary + assert "clean_control='not_yet_treated'" in summary + + def test_full_report_names_sub_experiment_comparison_for_stacked_strict(self): + from diff_diff import StackedDiD + + sdf = generate_staggered_data(n_units=80, n_periods=8, treatment_effect=1.5, seed=7) + st = StackedDiD(clean_control="strict").fit( + sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" + ) + md = BusinessReport(st, auto_diagnostics=False).full_report() + # Must NOT emit a bare "Control: N" line. + assert "- Control:" not in md or "- Control: " not in md.split("## Sample")[1].split("##")[0], ( + "Stacked with dynamic clean-control must not render a fixed " + "'- Control: N' line in the Sample section." 
+ ) + assert "sub-experiment-specific clean controls" in md + assert "clean_control='strict'" in md + + class TestDCDHPhase3AssumptionClause: """Pre-emptive audit regression: ``ChaisemartinDHaultfoeuilleResults`` populates ``covariate_residuals`` when ``controls`` is set in fit, diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index dd46a9a1..2f2377b6 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -412,6 +412,59 @@ class _PPStub: # Downgrade must apply: pre-tier is well_powered, post-tier is moderately_powered. assert block["tier"] == "moderately_powered" + def test_precomputed_parallel_trends_bypasses_applicability_gate(self, cs_fit): + """Round-22 P1 regression: ``precomputed["parallel_trends"]`` was + documented as supported but ``_instance_skip_reason`` skipped the + PT check on applicability grounds (missing raw panel / columns + for the event-study replay, non-replayable EfficientDiD fits, + etc.) BEFORE the precomputed runner could fire. The fix + short-circuits the gate when the precomputed key is present so + advertised passthroughs actually land on the runner. + """ + fit, _ = cs_fit + precomputed_pt = { + "status": "ran", + "method": "event_study", + "joint_p_value": 0.42, + "n_pre_periods": 3, + "verdict": "no_detected_violation", + } + + # Without passing ``data`` + column kwargs, the applicability + # gate would previously have marked PT as skipped. With the + # precomputed override, it must land on the formatter instead. + dr = DiagnosticReport(fit, precomputed={"parallel_trends": precomputed_pt}) + pt_block = dr.to_dict()["parallel_trends"] + assert pt_block["status"] == "ran", ( + f"precomputed parallel_trends must bypass the applicability gate. 
" + f"Got status={pt_block.get('status')}, reason={pt_block.get('reason')}" + ) + + def test_precomputed_bacon_bypasses_applicability_gate(self, cs_fit): + """Round-22 P1 regression: ``precomputed["bacon"]`` was + documented as supported but ``_instance_skip_reason`` skipped + Bacon on applicability grounds (``data`` / column kwargs missing) + before the runner could fire. Users with an already-computed + ``BaconDecompositionResults`` must be able to pass it through + without re-supplying the raw panel. + """ + from types import SimpleNamespace + + fit, _ = cs_fit + precomputed_bacon = SimpleNamespace( + weights=None, + att=1.2, + comparison_types={}, + total_weight_later_vs_earlier=0.02, + ) + + dr = DiagnosticReport(fit, precomputed={"bacon": precomputed_bacon}) + bacon_block = dr.to_dict()["bacon"] + assert bacon_block["status"] == "ran", ( + f"precomputed bacon must bypass the applicability gate. " + f"Got status={bacon_block.get('status')}, reason={bacon_block.get('reason')}" + ) + def test_precomputed_single_m_sensitivity_exposes_original_estimate_and_se(self, cs_fit): """Pre-emptive audit regression: ``_format_precomputed_sensitivity`` used to drop ``original_estimate`` and ``original_se`` on the From 61270b89819653c34690dc86e06bf8e78ad154d9 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 09:54:42 -0400 Subject: [PATCH 26/48] Address twenty-third round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 methodology (precomputed PT formatter too narrow). ``_format_precomputed_pt`` previously rejected non-dict inputs outright and read only the 2x2 ``p_value`` key, so two user-facing paths the PR now advertises were broken: 1. ``HausmanPretestResult`` — the native object ``EfficientDiD. hausman_pretest(...)`` returns and the exact type ``_pt_hausman``'s skip message (for non-replayable DR / survey fits) tells users to pass through — was rejected at the first branch. 2. 
Schema-shaped PT dicts from the default DR path (``joint_p_value`` / ``test_statistic`` / ``df``) silently had their statistics dropped and were degraded to ``joint_p_value=None`` / ``verdict="inconclusive"``. This happens whenever one DR run's PT block is replayed into another. The formatter now accepts any of: * ``utils.check_parallel_trends`` 2x2 dicts (``p_value`` key); * schema-shaped dicts (``joint_p_value`` + optional ``test_statistic`` / ``df`` / ``method``); * native result objects exposing ``p_value`` (or ``joint_p_value``) plus optional ``statistic`` / ``test_statistic`` and ``df`` — including ``HausmanPretestResult``. Preserves ``joint_p_value``, ``test_statistic``, ``df``, and ``method`` on the emitted schema instead of collapsing them. Inputs with neither ``joint_p_value`` nor ``p_value`` now surface an ``error`` status with a clear reason instead of silently producing a ``None`` p-value. P3 coverage. Round-22 added a regression that only asserted ``status == "ran"``, which is why the value-preservation bug slipped through. Three new tests pin value preservation: * ``test_precomputed_parallel_trends_preserves_schema_shaped_joint_p`` replays a schema-shaped dict and asserts ``joint_p_value`` / ``test_statistic`` / ``df`` survive; * ``test_precomputed_parallel_trends_accepts_native_hausman_result`` passes a ``HausmanPretestResult``-shaped object and asserts ``statistic`` -> ``test_statistic``, ``p_value`` -> ``joint_p_value``, and ``df`` land on the schema; * ``test_precomputed_parallel_trends_rejects_input_without_p_value`` pins the error path for obviously-wrong inputs. 234 BR / DR / practitioner tests pass (69 DR + 165 BR/practitioner). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 76 ++++++++++++++++++++++++++--- tests/test_diagnostic_report.py | 85 +++++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 7 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 68a8e1ad..b261c27d 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1963,21 +1963,83 @@ def _pt_factor(self) -> Dict[str, Any]: } def _format_precomputed_pt(self, obj: Any) -> Dict[str, Any]: - """Adapt a pre-computed PT result (from utils.check_parallel_trends) to schema shape.""" - if not isinstance(obj, dict): + """Adapt a pre-computed parallel-trends result to the schema shape. + + Accepted inputs (round-23 P1 CI review on PR #318): + * A dict from ``utils.check_parallel_trends`` with ``p_value`` + (2x2 PT shape) — ``joint_p_value`` inherits from ``p_value`` + when only the 2x2 key is supplied. + * A schema-shaped dict with ``joint_p_value`` and optional + ``test_statistic`` / ``df`` / ``method`` (the same shape + ``to_dict()["parallel_trends"]`` emits on the default path), + so a PT block from one DR run can be replayed into another. + * A native result object exposing ``p_value`` (or + ``joint_p_value``) plus optional ``statistic`` / + ``test_statistic`` and ``df`` — in particular, EfficientDiD's + ``HausmanPretestResult``, which is what the ``_pt_hausman`` + skip message points users toward when replay fails on a + non-nocov / survey fit. + + Previously the formatter rejected non-dict inputs outright and + only read ``p_value``, so ``HausmanPretestResult`` could not be + passed through at all and a schema-shaped dict silently lost its + ``joint_p_value`` / ``test_statistic`` / ``df`` fields. 
+ """ + + def _read(name: str) -> Any: + if isinstance(obj, dict): + return obj.get(name) + return getattr(obj, name, None) + + # Accept joint_p_value preferentially, but fall back to the 2x2 + # ``p_value`` key so ``utils.check_parallel_trends`` dicts still + # work as before. + raw_p = _read("joint_p_value") + if raw_p is None: + raw_p = _read("p_value") + p_value = _to_python_float(raw_p) + + # ``HausmanPretestResult`` exposes ``statistic``; schema-shaped + # dicts and the default DR path both use ``test_statistic``. + raw_stat = _read("test_statistic") + if raw_stat is None: + raw_stat = _read("statistic") + test_statistic = _to_python_float(raw_stat) + + df = _to_python_scalar(_read("df")) + method = _read("method") or "precomputed" + + # If no recognized p-value field was supplied at all, surface an + # error rather than silently producing ``joint_p_value=None``. + # Stay permissive about dict shapes — absence of ``test_statistic`` + # or ``df`` is fine (2x2 PT has neither), but a complete absence + # of a p-value / joint-p-value means the input is not a PT result. + if raw_p is None: return { "status": "error", - "reason": "precomputed['parallel_trends'] must be a dict returned by " - "check_parallel_trends or compatible shape.", + "method": method, + "reason": ( + "precomputed['parallel_trends'] must expose either " + "``joint_p_value`` (schema shape / HausmanPretestResult) or " + "``p_value`` (check_parallel_trends 2x2 shape). Got an object " + "with neither: pass a dict with one of those keys, or a " + "native result object (e.g., HausmanPretestResult) exposing " + "``p_value``." 
+ ), } - p_value = _to_python_float(obj.get("p_value")) - return { + + out: Dict[str, Any] = { "status": "ran", - "method": obj.get("method", "precomputed"), + "method": method, "joint_p_value": p_value, "verdict": _pt_verdict(p_value), "precomputed": True, } + if test_statistic is not None: + out["test_statistic"] = test_statistic + if df is not None: + out["df"] = df + return out # -- Headline metric extraction ---------------------------------------- diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 2f2377b6..b810b135 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -440,6 +440,91 @@ def test_precomputed_parallel_trends_bypasses_applicability_gate(self, cs_fit): f"Got status={pt_block.get('status')}, reason={pt_block.get('reason')}" ) + def test_precomputed_parallel_trends_preserves_schema_shaped_joint_p(self, cs_fit): + """Round-23 P1 regression: schema-shaped PT dicts with + ``joint_p_value`` (the key emitted by the default DR path and + the shape users are most likely to replay from one DR to + another) must land on ``joint_p_value`` in the output, not + silently fall through to ``None``. Prior formatter read only + ``p_value``, so a dict with ``joint_p_value=0.42`` was + degraded to ``joint_p_value=None`` / ``verdict="inconclusive"``. + """ + fit, _ = cs_fit + dr = DiagnosticReport( + fit, + precomputed={ + "parallel_trends": { + "joint_p_value": 0.42, + "test_statistic": 5.6, + "df": 3, + "method": "hausman", + } + }, + ) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert pt["method"] == "hausman" + assert pt["joint_p_value"] == 0.42, ( + f"joint_p_value must survive formatting; got {pt.get('joint_p_value')}" + ) + assert pt["test_statistic"] == 5.6 + assert pt["df"] == 3 + # Verdict must be derived from the surviving p-value, not None. 
+ assert pt["verdict"] != "inconclusive" + + def test_precomputed_parallel_trends_accepts_native_hausman_result(self, cs_fit): + """Round-23 P1 regression: ``_pt_hausman`` tells users with + non-replayable EfficientDiD fits to pass a precomputed pretest + result, but the formatter previously rejected non-dict inputs + outright. The ``HausmanPretestResult`` dataclass — the exact + object ``EfficientDiD.hausman_pretest(...)`` returns — must + now pass through with ``statistic`` / ``p_value`` / ``df`` + preserved on the schema. + """ + from types import SimpleNamespace + + fit, _ = cs_fit + # Mirror HausmanPretestResult: the key fields are ``statistic``, + # ``p_value``, ``df``. Uses SimpleNamespace so the test does + # not need EfficientDiD's construction path. + hausman = SimpleNamespace( + statistic=7.2, + p_value=0.065, + df=3, + reject=False, + alpha=0.05, + att_all=1.0, + att_post=1.05, + recommendation="pt_all", + ) + dr = DiagnosticReport(fit, precomputed={"parallel_trends": hausman}) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran", ( + f"Native HausmanPretestResult must be accepted; got " + f"status={pt.get('status')}, reason={pt.get('reason')}" + ) + assert pt["joint_p_value"] == 0.065 + # ``statistic`` on the source object maps to ``test_statistic`` + # in the emitted schema (matches the default ``_pt_hausman`` + # path that also exposes it as ``test_statistic``). + assert pt["test_statistic"] == 7.2 + assert pt["df"] == 3 + + def test_precomputed_parallel_trends_rejects_input_without_p_value(self, cs_fit): + """Inputs without any recognized p-value field (neither + ``joint_p_value`` nor ``p_value``) must surface a clear error, + not silently land on ``joint_p_value=None``. Keeps the formatter + permissive about absent ``test_statistic`` / ``df`` (2x2 PT has + neither) while catching obviously-wrong inputs. 
+ """ + fit, _ = cs_fit + dr = DiagnosticReport( + fit, precomputed={"parallel_trends": {"method": "event_study"}} + ) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "error" + assert "joint_p_value" in pt["reason"] or "p_value" in pt["reason"] + def test_precomputed_bacon_bypasses_applicability_gate(self, cs_fit): """Round-22 P1 regression: ``precomputed["bacon"]`` was documented as supported but ``_instance_skip_reason`` skipped From dae62ac8f5a739ea108c7b3dda540d61bc6eab67 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 10:14:37 -0400 Subject: [PATCH 27/48] Address twenty-fourth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P2 code quality (SDiD jackknife step-tag bug). The SyntheticDiD practitioner step "Leave-one-out influence (jackknife)" was tagged ``_step_name="sensitivity"``. DR's SDiD native battery covers pre-treatment fit, weight concentration, ``in_time_placebo``, and ``sensitivity_to_zeta_omega`` — but not the jackknife LOO workflow, which requires a separate ``variance_method='jackknife'`` fit before ``get_loo_effects_df`` returns anything. As soon as the native block ran, ``_collect_next_steps`` marked ``"sensitivity"`` complete and suppressed the jackknife recommendation, overstating what the report had actually executed. Same class as round-20 Hausman (``heterogeneity`` -> ``parallel_trends``) and the pre-emptive TROP-placebo retag. Retag the step as ``_step_name="loo_jackknife"`` so it persists regardless of which DR blocks ran. No DR check maps to the new tag — the step stays in ``next_steps`` until the user completes it explicitly. P3 coverage. 
Add the two regressions the reviewer specified: * ``test_sdid_jackknife_step_persists_via_practitioner_filter`` (unit-level) asserts ``practitioner_next_steps(sdid_stub, completed_steps=["sensitivity"])`` still surfaces the jackknife label; * ``test_sdid_jackknife_step_persists_in_dr_next_steps`` (integration) asserts ``DiagnosticReport(sdid).to_dict() ["next_steps"]`` preserves the recommendation when only the default native SDiD diagnostics ran. 227 BR / DR / practitioner tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/practitioner.py | 11 +++++- tests/test_business_report.py | 68 +++++++++++++++++++++++++++++++++++ 2 files changed, 78 insertions(+), 1 deletion(-) diff --git a/diff_diff/practitioner.py b/diff_diff/practitioner.py index d094a3b4..cd1d4235 100644 --- a/diff_diff/practitioner.py +++ b/diff_diff/practitioner.py @@ -570,7 +570,16 @@ def _handle_synthetic(results: Any): " 'with positive effective support.')" ), priority="medium", - step_name="sensitivity", + # DR's SyntheticDiD native battery covers pre-treatment fit, + # weight concentration, in-time placebo, and zeta-omega + # sensitivity, but NOT the jackknife LOO workflow (which + # requires a separate ``variance_method='jackknife'`` fit + # via ``get_loo_effects_df``). Tagging this recommendation + # as ``sensitivity`` caused ``_collect_next_steps`` to + # suppress it as soon as the native block ran, even though + # the jackknife was never executed. Round-24 P2 CI review + # on PR #318; same class as round-20 Hausman mistag. 
+ step_name="loo_jackknife", ), _step( baker_step=6, diff --git a/tests/test_business_report.py b/tests/test_business_report.py index ef4d90c3..c559b570 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -2290,6 +2290,74 @@ def test_efficient_compare_control_groups_persists_after_sensitivity_runs(self): ) +class TestSDiDJackknifeStepPersistsAfterNativeSensitivity: + """Round-24 P2 CI review on PR #318: the SyntheticDiD practitioner + step "Leave-one-out influence (jackknife)" must persist after + ``DiagnosticReport`` marks ``sensitivity`` complete via the SDiD + native battery (pre-treatment fit, weight concentration, + ``in_time_placebo``, ``sensitivity_to_zeta_omega``). DR does NOT + run the jackknife LOO workflow — ``get_loo_effects_df`` requires a + separate ``variance_method='jackknife'`` fit — so suppressing the + recommendation when the native block fires overstates what the + report has already executed. Same class as round-20 Hausman and + pre-emptive TROP-placebo retags: step_name was coarser than DR's + actual coverage. + """ + + def test_sdid_jackknife_step_persists_via_practitioner_filter(self): + """Unit-level: ``practitioner_next_steps`` with + ``completed_steps=["sensitivity"]`` still surfaces the jackknife + recommendation because it is now tagged ``loo_jackknife``. 
+ """ + from diff_diff.practitioner import practitioner_next_steps + + class SyntheticDiDResults: + pass + + stub = SyntheticDiDResults() + stub.att = 1.0 + stub.se = 0.2 + stub.p_value = 0.001 + stub.conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 200 + stub.n_treated = 20 + stub.n_control = 180 + stub.survey_metadata = None + stub.event_study_effects = None + + labels = [ + s.get("label", "") + for s in practitioner_next_steps( + stub, completed_steps=["sensitivity"], verbose=False + )["next_steps"] + ] + assert any( + "Leave-one-out influence (jackknife)" in lab for lab in labels + ), ( + "SDiD jackknife recommendation must persist after DR marks " + "sensitivity complete — the SDiD native battery does not run " + "the jackknife LOO workflow (requires a separate " + "variance_method='jackknife' fit)." + ) + + def test_sdid_jackknife_step_persists_in_dr_next_steps(self, sdid_fit): + """Integration: ``DiagnosticReport(...).to_dict()["next_steps"]`` + preserves the jackknife recommendation when only the default + native SDiD diagnostics ran. + """ + from diff_diff import DiagnosticReport + + fit, _ = sdid_fit + next_steps = DiagnosticReport(fit).to_dict()["next_steps"] + labels = [s.get("label", "") for s in next_steps] + assert any("Leave-one-out influence (jackknife)" in lab for lab in labels), ( + "DR next_steps must preserve the SDiD jackknife recommendation " + "when the SDiD native battery ran but the jackknife workflow " + f"did not. Got labels: {labels}" + ) + + class TestTROPInTimePlaceboStepTaggedAsPlacebo: """Pre-emptive audit regression: the TROP practitioner workflow step "In-time or in-space placebo" was previously tagged From 3167f05b75ac3eb083bfa566c2cd1f136823400b Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 10:40:35 -0400 Subject: [PATCH 28/48] Address twenty-fifth round of CI review findings on PR #318 P2 code quality (full_report PT label). ``BusinessReport. 
full_report()`` hard-coded ``joint p = ...`` in the Pre-Trends section, which mislabeled the 2x2 ``slope_difference`` and EfficientDiD ``hausman`` single-statistic tests (both emit a single ``p``, not a joint p) and invented a nonexistent label for design-enforced SDiD ``synthetic_fit`` / TROP ``factor`` paths that have no p-value at all. ``summary()`` was already method-aware via ``_pt_method_stat_label``; the markdown path now uses the same helper and omits the parenthetical entirely for no-p-value methods. P3 docs. ``REPORTING.md``'s "single-knob alpha" note said ``alpha`` drives both the CI level and the phrasing threshold. The implementation and regression tests actually preserve the fit's native CI on alpha mismatch (the stored CI is the only quantile the underlying estimator supplied; bootstrap distributions and finite-df analytical variances are not always retained) and only change the significance phrasing, with an ``alpha_override_preserved`` caveat. Updated the note to describe the preserved-native-CI fallback and the reason for the conservative choice. P3 coverage. Add ``TestFullReportMethodAwarePTLabel`` with three regressions using the same fake-DR-schema pattern the summary tests use: * ``slope_difference`` -> markdown uses ``p = ...``, not ``joint p``; * ``hausman`` -> markdown uses ``p = ...``, not ``joint p``; * ``synthetic_fit`` -> markdown omits any p-value label; verdict still renders. 233 BR / DR / practitioner tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 17 +++++- docs/methodology/REPORTING.md | 23 ++++++-- tests/test_business_report.py | 106 ++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+), 7 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 40ac7666..8c45aa7b 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1820,8 +1820,21 @@ def _render_full_report(schema: Dict[str, Any]) -> str: jp = pt.get("joint_p_value") verdict = pt.get("verdict") tier = pt.get("power_tier") - jp_str = f"joint p = {jp:.3g}" if isinstance(jp, (int, float)) else "joint p unavailable" - lines.append(f"- Verdict: `{verdict}` ({jp_str})") + # Use the method-aware statistic label the summary path already + # uses: "joint p" for Wald / Bonferroni event-study, "p" for + # slope-difference / Hausman single-statistic tests, and None + # for design-enforced SDiD / TROP paths where there is no + # p-value at all. Round-25 P2 CI review on PR #318 flagged the + # hard-coded "joint p" wording as misdescribing 2x2 / Hausman + # fits and inventing a nonexistent p-value for SDiD / TROP. + method = pt.get("method") + stat_label = _pt_method_stat_label(method) + if stat_label and isinstance(jp, (int, float)): + lines.append(f"- Verdict: `{verdict}` ({stat_label} = {jp:.3g})") + elif stat_label: + lines.append(f"- Verdict: `{verdict}` ({stat_label} unavailable)") + else: + lines.append(f"- Verdict: `{verdict}`") if tier: lines.append(f"- Power tier: `{tier}`") mdv = pt.get("mdv") diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index 119fbc65..8c5cf931 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -153,11 +153,24 @@ not new inference. coefficients), which would be unsafe in the presence of non-linear link functions (Poisson QMLE, logit). -- **Note:** Single-knob `alpha`. 
BusinessReport exposes only `alpha` - (defaults to `results.alpha`); there is no separate - `significance_threshold` parameter. `alpha` drives both the CI level - (`(1 - alpha) * 100`% interval) and the phrasing tier threshold - ("statistically significant at the (1 - alpha) * 100% level"). +- **Note:** Single-knob `alpha` with preserved-native-CI fallback. + BusinessReport exposes only `alpha` (defaults to `results.alpha`); + there is no separate `significance_threshold` parameter. When the + requested `alpha` matches the fit's native level, it drives both the + CI level (`(1 - alpha) * 100`% interval) and the phrasing tier + threshold ("statistically significant at the (1 - alpha) * 100% + level"). When the requested `alpha` differs from the fit's native + level (e.g., the user asks for `alpha=0.10` on a result fit with + `alpha=0.05`), BusinessReport does NOT recompute the CI at the + requested level, because the stored CI is the only quantile the + underlying estimator supplied (bootstrap distributions and + finite-df analytical variances are not always retained on the + result). Instead, the schema preserves the fit's native CI (with its + original level) and uses the requested `alpha` only for the + significance-phrasing threshold, and emits an + `alpha_override_preserved` caveat describing the mismatch. This is + the conservative choice: it avoids silently recomputing CIs under + assumptions the estimator may not support. - **Note:** Schema stability policy for the AI-legible `to_dict()` surface. 
New top-level keys count as additive (no version bump); new diff --git a/tests/test_business_report.py b/tests/test_business_report.py index c559b570..17176e4a 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -921,6 +921,112 @@ def test_dr_summary_uses_hausman_wording_for_efficient_did(self, edid_fit): assert "event-study coefficients" not in summary +class TestFullReportMethodAwarePTLabel: + """Round-25 P2 CI review on PR #318: ``BusinessReport.full_report()`` + previously hard-coded ``joint p = ...`` in the Pre-Trends section, + which mislabels the 2x2 ``slope_difference`` and EfficientDiD + ``hausman`` single-statistic tests and invents a nonexistent + ``joint p`` label for design-enforced SDiD / TROP paths that have + no p-value at all. The markdown path must use the same + method-aware label helper the summary path already uses + (``_pt_method_stat_label``). + """ + + @staticmethod + def _stub_result_with_method(method: str): + from diff_diff.diagnostic_report import DiagnosticReportResults + + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.0 + stub.se = 0.2 + stub.p_value = 0.001 + stub.conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.inference_method = "analytical" + + pt_block: dict = { + "status": "ran", + "method": method, + "verdict": "no_detected_violation", + } + # SDiD's synthetic_fit path has no p-value by design; the other + # methods do. 
+ if method != "synthetic_fit": + pt_block["joint_p_value"] = 0.40 + + fake_schema = { + "schema_version": "1.0", + "estimator": "DiDResults", + "headline_metric": {"name": "att", "value": 1.0}, + "parallel_trends": pt_block, + "pretrends_power": {"status": "not_applicable"}, + "sensitivity": {"status": "not_applicable"}, + "placebo": {"status": "skipped", "reason": "opt-in"}, + "bacon": {"status": "not_applicable"}, + "design_effect": {"status": "not_applicable"}, + "heterogeneity": {"status": "not_applicable"}, + "epv": {"status": "not_applicable"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "overall_interpretation": "", + "next_steps": [], + } + fake_dr = DiagnosticReportResults( + schema=fake_schema, + interpretation="", + applicable_checks=("parallel_trends",), + skipped_checks={}, + warnings=(), + ) + return stub, fake_dr + + def _pt_section(self, md: str) -> str: + # The Pre-Trends section is delimited by the next ``##`` heading. + after = md.split("## Pre-Trends", 1)[1] + return after.split("\n## ", 1)[0] + + def test_full_report_slope_difference_uses_single_p_label(self): + stub, fake_dr = self._stub_result_with_method("slope_difference") + md = BusinessReport(stub, diagnostics=fake_dr).full_report() + section = self._pt_section(md) + assert "joint p" not in section, ( + f"2x2 slope_difference is a single-statistic test and must " + f"not be labeled ``joint p`` in the markdown. Got: {section!r}" + ) + # The single-statistic label ``p = ...`` must be present. + assert "p = 0.4" in section + + def test_full_report_hausman_uses_single_p_label(self): + stub, fake_dr = self._stub_result_with_method("hausman") + section = self._pt_section( + BusinessReport(stub, diagnostics=fake_dr).full_report() + ) + assert "joint p" not in section, ( + f"EfficientDiD Hausman is a single-statistic test and must " + f"not be labeled ``joint p`` in the markdown. 
Got: {section!r}" + ) + assert "p = 0.4" in section + + def test_full_report_synthetic_fit_omits_p_label(self): + stub, fake_dr = self._stub_result_with_method("synthetic_fit") + section = self._pt_section( + BusinessReport(stub, diagnostics=fake_dr).full_report() + ) + # No p-value of any kind for design-enforced SDiD PT analogue. + assert "joint p" not in section + assert "p = " not in section + # Verdict must still render. + assert "Verdict:" in section + + class TestHausmanPretestPropagatesFitDesign: """Round-9 regression: ``_pt_hausman`` must propagate the fitted result's ``control_group`` and ``anticipation`` into From d9b16cb5d7bff48884f37ee5c20b79e7b1d31456 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 10:56:42 -0400 Subject: [PATCH 29/48] Address twenty-sixth round of CI review findings on PR #318 P2 methodology (precomputed PT method inference). The round-23 formatter accepted raw ``utils.check_parallel_trends()`` dicts and native ``HausmanPretestResult`` objects but defaulted ``method`` to ``"precomputed"`` when the input lacked an explicit tag. Downstream prose helpers (``_pt_method_subject`` / ``_pt_method_stat_label``) fall through to the generic ``"Pre-treatment data"`` / ``"joint p"`` wording for unknown methods, so the exact passthroughs the ``_pt_hausman`` skip message tells users to supply rendered with the wrong diagnostic label even though the numeric p-value was correct. ``_format_precomputed_pt`` now infers the method from distinguishing fields when not explicitly supplied: * ``HausmanPretestResult`` shape (``statistic`` plus at least one of ``att_all`` / ``att_post`` / ``recommendation`` / ``reject``) -> ``method="hausman"``; * ``utils.check_parallel_trends`` 2x2 shape (``trend_difference`` / ``treated_trend`` / ``control_trend``) -> ``method="slope_difference"``; * otherwise -> ``method="precomputed"``. 
Explicit ``method`` in the input wins over inference (defensive guard for schema-shaped dicts whose companion fields might accidentally overlap). P3 coverage. Three new regressions assert that the inferred method survives through to rendered prose: * raw 2x2 dict -> ``method="slope_difference"``, markdown uses ``p = ...`` (not ``joint p``); * native Hausman-like object -> ``method="hausman"``, markdown uses ``p = ...``; * explicit ``method`` is not overridden by heuristic inference. P3 schema-contract tightening. The MVP placebo section is always emitted with ``status="skipped"``, per REPORTING.md and ``_compute_applicable_checks`` (which seeds ``"placebo"`` into the skipped map unconditionally). Tightened the existing assertion from ``status in {"skipped", "not_applicable"}`` to exact ``status == "skipped"`` plus a non-empty reason check so the regression pins the documented contract rather than both states. 246 BR / DR / practitioner tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 38 +++++++++- tests/test_diagnostic_report.py | 119 +++++++++++++++++++++++++++++++- 2 files changed, 154 insertions(+), 3 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index b261c27d..95def874 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -2007,7 +2007,43 @@ def _read(name: str) -> Any: test_statistic = _to_python_float(raw_stat) df = _to_python_scalar(_read("df")) - method = _read("method") or "precomputed" + + # Method inference (round-26 P2 CI review on PR #318). Downstream + # BR / DR prose keys off ``method`` to pick the right subject and + # statistic label (``"joint p"`` for event-study Wald / + # Bonferroni, ``"p"`` for the 2x2 slope-difference and Hausman + # single-statistic tests, no label for design-enforced paths). 
+ # Defaulting to ``"precomputed"`` made raw 2x2 dicts and native + # Hausman objects render with the wrong subject ("Pre-treatment + # data") and label ("joint p"). Infer from the distinguishing + # fields when ``method`` is not explicit: + # * ``HausmanPretestResult`` / shape: has ``statistic``, plus + # at least one of ``att_all`` / ``att_post`` / ``recommendation`` + # (disambiguates from the schema-shaped dict which may also + # carry ``test_statistic`` but does not carry the Hausman- + # specific companion fields). + # * ``utils.check_parallel_trends`` 2x2 dict: carries + # ``trend_difference`` / ``treated_trend`` / ``control_trend`` + # as its distinguishing fields. + method = _read("method") + if method is None: + hausman_markers = ( + _read("statistic") is not None + and any( + _read(tag) is not None + for tag in ("att_all", "att_post", "recommendation", "reject") + ) + ) + slope_markers = any( + _read(tag) is not None + for tag in ("trend_difference", "treated_trend", "control_trend") + ) + if hausman_markers: + method = "hausman" + elif slope_markers: + method = "slope_difference" + else: + method = "precomputed" # If no recognized p-value field was supplied at all, surface an # error rather than silently producing ``joint_p_value=None``. diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index b810b135..617ec192 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -295,11 +295,19 @@ def test_run_opt_outs_move_checks_to_skipped(self, multi_period_fit): assert dr.skipped_checks["sensitivity"].startswith("run_sensitivity=False") def test_placebo_is_reserved_and_skipped(self, did_fit): - """Placebo is always in _CHECK_NAMES, always skipped in MVP.""" + """Placebo is always in _CHECK_NAMES, always skipped in MVP. 
+ + Round-26 P3: tightened from ``status in {"skipped", + "not_applicable"}`` to exact ``status == "skipped"`` because + both REPORTING.md §MVP scope and the implementation + (``_compute_applicable_checks`` always seeds ``"placebo"`` into + ``skipped``) now pin the MVP contract to a single value. + """ fit, df = did_fit dr = DiagnosticReport(fit, data=df, outcome="outcome", treatment="treated", time="post") placebo_section = dr.to_dict()["placebo"] - assert placebo_section["status"] in {"skipped", "not_applicable"} + assert placebo_section["status"] == "skipped" + assert isinstance(placebo_section.get("reason"), str) and placebo_section["reason"] # --------------------------------------------------------------------------- @@ -510,6 +518,113 @@ def test_precomputed_parallel_trends_accepts_native_hausman_result(self, cs_fit) assert pt["test_statistic"] == 7.2 assert pt["df"] == 3 + def test_precomputed_pt_infers_slope_difference_method_for_raw_2x2_dict(self, cs_fit): + """Round-26 P2 regression: a raw ``utils.check_parallel_trends()`` + dict (no ``method`` key, has ``trend_difference`` / p_value) must + be recognized as the slope-difference 2x2 path and render with + the single-statistic ``p`` label, not the generic ``joint p`` + wording that ``"precomputed"`` falls through to. 
+ """ + from diff_diff import BusinessReport + from diff_diff.diagnostic_report import DiagnosticReportResults + + fit, _ = cs_fit + raw_2x2 = { + "treated_trend": 0.1, + "treated_trend_se": 0.05, + "control_trend": 0.08, + "control_trend_se": 0.04, + "trend_difference": 0.02, + "trend_difference_se": 0.06, + "t_statistic": 0.33, + "p_value": 0.40, + "parallel_trends_plausible": True, + } + dr = DiagnosticReport(fit, precomputed={"parallel_trends": raw_2x2}) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert pt["method"] == "slope_difference", ( + f"Raw check_parallel_trends dict must infer " + f"method='slope_difference'; got {pt.get('method')!r}" + ) + + # Markdown prose must use the single-statistic ``p`` label + # (not ``joint p``, which is Wald / Bonferroni-specific). + br_dr = DiagnosticReportResults( + schema=dr.to_dict(), + interpretation="", + applicable_checks=("parallel_trends",), + skipped_checks={}, + warnings=(), + ) + md = BusinessReport(fit, diagnostics=br_dr).full_report() + pt_section = md.split("## Pre-Trends", 1)[1].split("\n## ", 1)[0] + assert "joint p" not in pt_section + assert "p = 0.4" in pt_section + + def test_precomputed_pt_infers_hausman_method_for_native_object(self, cs_fit): + """Round-26 P2 regression: a native Hausman-like object without + an explicit ``method`` tag (``HausmanPretestResult`` shape: + ``statistic`` + ``att_all`` / ``att_post`` / ``recommendation``) + must be recognized as the Hausman path and render with the + single-statistic ``p`` label, not ``joint p``. + """ + from types import SimpleNamespace + + from diff_diff import BusinessReport + from diff_diff.diagnostic_report import DiagnosticReportResults + + fit, _ = cs_fit + hausman_like = SimpleNamespace( + statistic=4.5, + p_value=0.21, + df=3, + reject=False, + alpha=0.05, + att_all=1.0, + att_post=1.1, + recommendation="pt_all", + # Note: no ``method`` attribute — tests the inference path. 
+ ) + dr = DiagnosticReport(fit, precomputed={"parallel_trends": hausman_like}) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert pt["method"] == "hausman", ( + f"Native Hausman-like object must infer method='hausman'; " + f"got {pt.get('method')!r}" + ) + assert pt["test_statistic"] == 4.5 + assert pt["joint_p_value"] == 0.21 + + # Markdown prose must use the single-statistic ``p`` label. + br_dr = DiagnosticReportResults( + schema=dr.to_dict(), + interpretation="", + applicable_checks=("parallel_trends",), + skipped_checks={}, + warnings=(), + ) + md = BusinessReport(fit, diagnostics=br_dr).full_report() + pt_section = md.split("## Pre-Trends", 1)[1].split("\n## ", 1)[0] + assert "joint p" not in pt_section + assert "p = 0.21" in pt_section + + def test_precomputed_pt_explicit_method_wins_over_inference(self, cs_fit): + """Explicit ``method`` in the input must never be overridden by + the heuristic inference (defensive: e.g., a user passes a + schema-shaped dict labeled ``method='event_study'`` where the + ``trend_difference`` markers would otherwise suggest + slope_difference). + """ + fit, _ = cs_fit + spoofed = { + "method": "event_study", + "joint_p_value": 0.42, + "trend_difference": 0.02, # would otherwise trigger slope_difference inference + } + dr = DiagnosticReport(fit, precomputed={"parallel_trends": spoofed}) + assert dr.to_dict()["parallel_trends"]["method"] == "event_study" + def test_precomputed_parallel_trends_rejects_input_without_p_value(self, cs_fit): """Inputs without any recognized p-value field (neither ``joint_p_value`` nor ``p_value``) must surface a clear error, From e3f9bfec6ef73c2c2b65144e8d3947ffadb4a9c4 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 11:13:54 -0400 Subject: [PATCH 30/48] Address twenty-seventh round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 methodology (survey finite-df PT). 
``_pt_event_study`` previously mapped the joint Wald statistic to ``1 - chi2.cdf(stat, df=k)`` unconditionally. For survey-backed fits the result already carries a finite ``survey_metadata.df_survey``, and the design-based SE reflects the effective sample size — so the chi-square reference, which ignores that finite-sample correction, systematically over-rejects. ``joint_p_value`` and ``verdict`` were therefore materially overconfident on survey-backed event-study PT tests. The joint-Wald branch now reads ``results.survey_metadata.df_survey``. When the value is finite and positive, DR computes ``F = W / k`` against an ``F(k, df_survey)`` reference; the chi-square path is reserved for fits with no finite-df information (a NaN / inf / non-positive ``df_survey`` falls back to it). Schema changes: * the ``method`` tag gains a ``_survey`` suffix on the survey branch (e.g., ``joint_wald_survey``, ``joint_wald_event_study_survey``) so downstream BR / DR prose can flag the finite-sample correction rather than silently presenting a chi-square-style result; * new ``df_denom`` field exposes the denominator df when the F path was used. P3 coverage. Two new regressions on ``TestJointWaldAlignment``: * ``test_joint_wald_uses_F_reference_when_survey_df_is_finite`` builds a stub with ``beta = [1, 1, 1]``, ``V = I``, and ``df_survey = 20``; verifies the statistic is 3.0 (matches the closed form), the method suffix is ``_survey``, ``df_denom`` is 20, and the joint p-value equals ``1 - F.cdf(1.0, dfn=3, dfd=20)`` — and is strictly larger than the chi-square counterpart (confirming the correction actually fires); * ``test_joint_wald_ignores_non_finite_survey_df`` pins the fallback behavior when ``df_survey`` is NaN. Docs. ``REPORTING.md`` gains a dedicated finite-df PT note describing the policy, the method-tag suffix, the ``df_denom`` field, and the chi-square fallback. 242 BR / DR / practitioner tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 48 ++++++++++++++++++++--- docs/methodology/REPORTING.md | 14 +++++++ tests/test_diagnostic_report.py | 69 +++++++++++++++++++++++++++++++++ 3 files changed, 126 insertions(+), 5 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 95def874..af64cdc5 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1012,6 +1012,7 @@ def _pt_event_study(self) -> Dict[str, Any]: vcov_method_tag = "joint_wald_event_study" except TypeError: idx_map_for_wald = None + df_denom: Optional[float] = None if vcov_for_wald is not None and idx_map_for_wald is not None and df > 0: try: keys_in_vcov = [k for (k, _, _, _) in pre_coefs if k in idx_map_for_wald] @@ -1021,11 +1022,42 @@ def _pt_event_study(self) -> Dict[str, Any]: beta = np.array([beta_map[k] for k in keys_in_vcov], dtype=float) v_sub = np.asarray(vcov_for_wald)[np.ix_(idx, idx)] stat = float(beta @ np.linalg.solve(v_sub, beta)) - from scipy.stats import chi2 - joint_p = float(1.0 - chi2.cdf(stat, df=df)) - test_statistic = stat - method = vcov_method_tag + # Round-27 P1 CI review on PR #318: survey-backed + # fits carry a finite ``df_survey`` on + # ``survey_metadata``; using the chi-square reference + # distribution on those produces overconfident + # p-values because it ignores the finite-sample + # correction the design-based SE already reflects. + # When a finite denominator df is available, compute + # ``F = W / k`` (numerator df = k pre-periods) against + # an F(k, df_survey) reference. Reserve the chi-square + # path for fits with no finite-df information. 
+ sm = getattr(r, "survey_metadata", None) + df_survey_raw = getattr(sm, "df_survey", None) if sm is not None else None + df_survey: Optional[float] = None + if df_survey_raw is not None: + try: + df_survey_val = float(df_survey_raw) + if np.isfinite(df_survey_val) and df_survey_val > 0: + df_survey = df_survey_val + except (TypeError, ValueError): + df_survey = None + + if df_survey is not None: + from scipy.stats import f as f_dist + + f_stat = stat / df + joint_p = float(1.0 - f_dist.cdf(f_stat, dfn=df, dfd=df_survey)) + test_statistic = stat + method = f"{vcov_method_tag}_survey" + df_denom = df_survey + else: + from scipy.stats import chi2 + + joint_p = float(1.0 - chi2.cdf(stat, df=df)) + test_statistic = stat + method = vcov_method_tag except Exception: # noqa: BLE001 joint_p = None test_statistic = None @@ -1046,7 +1078,7 @@ def _pt_event_study(self) -> Dict[str, Any]: if ps: joint_p = min(1.0, min(ps) * len(ps)) - return { + out = { "status": "ran", "method": method, "joint_p_value": joint_p, @@ -1056,6 +1088,12 @@ def _pt_event_study(self) -> Dict[str, Any]: "per_period": per_period, "verdict": _pt_verdict(joint_p), } + # Expose the denominator df when the survey F-path was used so + # BR / DR prose can flag the finite-sample correction rather than + # silently presenting a chi-square-style result. + if df_denom is not None: + out["df_denom"] = df_denom + return out def _check_pretrends_power(self) -> Dict[str, Any]: """Compute pre-trends power (MDV) via ``compute_pretrends_power``. diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index 8c5cf931..6d26b48a 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -69,6 +69,20 @@ not new inference. Wald statistic (or Bonferroni fallback when `vcov` is missing). This mirrors the guidance in `practitioner._parallel_trends_step(staggered=True)`. +- **Note:** Survey finite-df PT policy. 
When the fitted result carries + a finite `survey_metadata.df_survey`, `_pt_event_study` computes + `F = W / k` (numerator df = k pre-period coefficients) against an + F(k, df_survey) reference distribution rather than chi-square(k). + The design-based SE already reflects the effective sample size, so + the chi-square reference would systematically over-reject under the + finite-sample correction the SE captures. The schema surfaces the + survey branch via the `method` suffix `_survey` + (e.g., `joint_wald_survey`, `joint_wald_event_study_survey`) and + exposes the denominator df as `df_denom`, so BR / DR prose can flag + the finite-sample correction rather than silently presenting a + chi-square-style result. Non-finite `df_survey` (NaN / inf / + non-positive) falls back to the chi-square path. + - **Note:** Estimator-native validation surfaces are surfaced rather than duplicated. `SyntheticDiDResults` routes parallel-trends to `pre_treatment_fit` (the RMSE of the synthetic-control fit on the diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 617ec192..9d298881 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -816,6 +816,75 @@ def test_falls_back_to_bonferroni_when_vcov_missing(self): pt = dr.to_dict()["parallel_trends"] assert pt["method"] == "bonferroni" + def test_joint_wald_uses_F_reference_when_survey_df_is_finite(self): + """Round-27 P1 regression: event-study PT on a survey-backed fit + must use an F reference distribution with denominator df = + ``survey_metadata.df_survey`` rather than the chi-square + reference. Chi-square over-rejects under a finite-sample + correction; the design-based SE already reflects the effective + sample size and the PT test must match. + """ + from types import SimpleNamespace + + from scipy.stats import chi2, f as f_dist + + # Same fixture as ``test_joint_wald_runs_when_keys_align`` but with + # a survey_metadata carrying a finite df_survey. 
+ pre = [(-3, 1.0, 1.0, 0.32), (-2, 1.0, 1.0, 0.32), (-1, 1.0, 1.0, 0.32)] + interaction_indices = {-3: 0, -2: 1, -1: 2, 0: 3} + vcov = np.eye(4) + stub = self._stub_result( + pre, + interaction_indices, + vcov, + survey_metadata=SimpleNamespace(df_survey=20.0), + ) + + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + + # With beta = [1,1,1] and V = I, the Wald statistic is 3.0. + assert pt["status"] == "ran" + assert pt["test_statistic"] == pytest.approx(3.0, rel=1e-6) + assert pt["df"] == 3 + + # Method tag surfaces the survey branch so BR / DR prose can + # flag the finite-sample correction. Denominator df is exposed + # on the schema for downstream consumers. + assert pt["method"].endswith("_survey") + assert pt["df_denom"] == pytest.approx(20.0) + + # F statistic = W / k = 3.0 / 3 = 1.0; survey p-value uses + # F(3, 20) instead of chi-square(3). + expected_p_survey = float(1.0 - f_dist.cdf(1.0, dfn=3, dfd=20.0)) + expected_p_chi2 = float(1.0 - chi2.cdf(3.0, df=3)) + assert pt["joint_p_value"] == pytest.approx(expected_p_survey, rel=1e-6) + # Chi-square would be noticeably more confident (smaller p) than + # F under finite df; confirm the survey path isn't degenerating + # back to chi-square. + assert expected_p_survey > expected_p_chi2 + + def test_joint_wald_ignores_non_finite_survey_df(self): + """If ``df_survey`` is NaN / inf / non-positive, fall back to + chi-square (no finite-sample correction available). + """ + from types import SimpleNamespace + + pre = [(-3, 1.0, 1.0, 0.32), (-2, 1.0, 1.0, 0.32), (-1, 1.0, 1.0, 0.32)] + interaction_indices = {-3: 0, -2: 1, -1: 2, 0: 3} + vcov = np.eye(4) + stub = self._stub_result( + pre, + interaction_indices, + vcov, + survey_metadata=SimpleNamespace(df_survey=float("nan")), + ) + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + # Non-finite df_survey must not taint the method tag. 
+ assert not pt["method"].endswith("_survey") + assert "df_denom" not in pt + class TestNarrowedApplicabilityAndPlaceboSchema: """Regressions for the round-3 CI-review findings. From cd85edac0d23c5bf3ff9be542185330b0724447e Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 11:32:21 -0400 Subject: [PATCH 31/48] Address twenty-eighth round of CI review findings on PR #318 P2 code quality (TROP applicable_checks mismatch). TROP identification is factor-model-based, not PT-based; the estimator-native ``_pt_factor()`` handler returns ``status="not_applicable"`` and REPORTING.md routes TROP PT to factor-model diagnostics. Exposing ``parallel_trends`` in ``_APPLICABILITY["TROPResults"]`` advertised a handler that never runs, leaving callers who gate workflows on ``applicable_checks`` with a contract mismatch. Remove PT from the TROP applicability set. P2 methodology (CS repeated-cross-section count labels). ``CallawaySantAnna(panel=False)`` stores treated / control counts as OBSERVATIONS rather than units (``staggered_results.py`` lines 183-184 render them as "obs:" in that mode). BR previously labeled them "units" / "present in the panel", which misstates the sample composition on RCS fits. Add a ``count_unit`` field to the BR sample schema (derived from ``results.panel``) and branch the summary / full-report rendering: RCS fits render "never-treated observations" and "present in the repeated cross-section sample" instead of the panel-mode phrasing. P3 coverage (survey PT prose / replay propagation). 
The round-27 fix added the ``_survey`` method suffix and ``df_denom`` schema field but did not carry the provenance through the prose / replay helpers: * ``_pt_method_subject`` and ``_pt_method_stat_label`` didn't recognize ``joint_wald_survey`` / ``joint_wald_event_study_survey``, so BR prose fell through to the generic "Pre-treatment data" / "joint p" default; * ``_lift_pre_trends`` didn't preserve ``df_denom`` in the BR schema, so downstream consumers couldn't see the finite-sample correction without re-consulting the DR schema; * ``_format_precomputed_pt`` didn't carry ``df_denom`` on replay, so a survey-aware DR block round-tripped as a chi-square-style passthrough. All three helpers now recognize / preserve the survey variants. Tests: 7 new regressions. * ``TestCSRepeatedCrossSectionCountLabels`` (3 tests): schema flag, panel-mode wording, RCS-mode wording; * ``TestTROPApplicableChecksExcludesParallelTrends`` (1 test): TROP DR exposes no PT in applicable_checks; * ``TestSurveyPTProsePropagation`` (2 tests): ``_lift_pre_trends`` preserves ``df_denom``, and method helpers return "joint p" + event-study subject for both survey variants; * ``test_precomputed_survey_pt_replay_preserves_df_denom`` (DR): round-trip replay of a ``joint_wald_event_study_survey`` block preserves ``method``, ``df_denom``, and ``df``. 249 BR / DR / practitioner tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 78 ++++++++++++++++--- diff_diff/diagnostic_report.py | 15 +++- tests/test_business_report.py | 131 ++++++++++++++++++++++++++++++++ tests/test_diagnostic_report.py | 23 ++++++ 4 files changed, 237 insertions(+), 10 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 8c45aa7b..cf7ec87d 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -585,6 +585,18 @@ def _extract_sample(self) -> Dict[str, Any]: n_never_treated = n_control_units n_control = None + # Panel-vs-RCS count semantics. CallawaySantAnnaResults stores + # treated/control counts as OBSERVATIONS (not units) when the + # fit used ``panel=False`` — ``staggered_results.py L183-L184`` + # renders those counts as "obs:" rather than "units:". BR + # previously labeled them as "units" / "present in the panel", + # which misstates the sample composition for repeated cross- + # section fits. Carry the flag into the schema so rendering can + # branch. Round-28 P2 CI review on PR #318. 
+ count_unit = ( + "observations" if getattr(r, "panel", True) is False else "units" + ) + sample_block: Dict[str, Any] = { "n_obs": _safe_int(getattr(r, "n_obs", None)), "n_treated": n_treated, @@ -595,6 +607,7 @@ def _extract_sample(self) -> Dict[str, Any]: "n_periods": _safe_int(getattr(r, "n_periods", None)), "pre_periods": _safe_list_len(getattr(r, "pre_periods", None)), "post_periods": _safe_list_len(getattr(r, "post_periods", None)), + "count_unit": count_unit, "survey": survey, } if n_never_enabled is not None: @@ -685,6 +698,11 @@ def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: "joint_p_value": pt.get("joint_p_value"), "verdict": pt.get("verdict"), "n_pre_periods": pt.get("n_pre_periods"), + # Carry the denominator df through when the survey F-reference + # branch was used so BR consumers can flag the finite-sample + # correction without re-consulting the DR schema (round-28 P3 + # CI review on PR #318). + "df_denom": pt.get("df_denom"), "power_status": pp.get("status"), "power_tier": pp.get("tier"), "mdv": pp.get("mdv"), @@ -1356,7 +1374,19 @@ def _pt_method_subject(method: Optional[str]) -> str: return "The pre-period slope-difference test" if method == "hausman": return "The Hausman PT-All vs PT-Post pretest" - if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + if method in { + "joint_wald", + "joint_wald_event_study", + "joint_wald_no_vcov", + "bonferroni", + # Survey-aware event-study PT variants use an F reference + # distribution with denominator df = ``survey_metadata.df_survey`` + # (round-27 P1 fix, documented in REPORTING.md). The subject + # remains the pre-period event-study coefficients; prose elsewhere + # flags the finite-sample correction via ``df_denom``. 
+ "joint_wald_survey", + "joint_wald_event_study_survey", + }: return "Pre-treatment event-study coefficients" if method == "synthetic_fit": return "The synthetic-control pre-treatment fit" @@ -1368,11 +1398,21 @@ def _pt_method_subject(method: Optional[str]) -> str: def _pt_method_stat_label(method: Optional[str]) -> Optional[str]: """Return the joint-statistic label appropriate to the PT method. - Returns ``"joint p"`` for Wald / Bonferroni paths, ``"p"`` for the - 2x2 slope-difference and Hausman paths (which are single-statistic - tests), and ``None`` for design-enforced paths that have no p-value. + Returns ``"joint p"`` for Wald / Bonferroni paths (including the + survey-aware F-reference variants, which remain joint tests on the + pre-period coefficient vector — only the reference distribution + changes), ``"p"`` for the 2x2 slope-difference and Hausman paths + (single-statistic tests), and ``None`` for design-enforced paths + that have no p-value. """ - if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + if method in { + "joint_wald", + "joint_wald_event_study", + "joint_wald_no_vcov", + "bonferroni", + "joint_wald_survey", + "joint_wald_event_study_survey", + }: return "joint p" if method in {"slope_difference", "hausman"}: return "p" @@ -1716,14 +1756,22 @@ def _render_summary(schema: Dict[str, Any]) -> str: n_ne = sample.get("n_never_enabled") is_dynamic = sample.get("dynamic_control") cg = sample.get("control_group") + # Panel-vs-RCS count-unit label. For repeated cross-section fits + # (``panel=False`` on CallawaySantAnna), treated / never-treated + # tallies are observation counts, not unit counts. Keep the + # "N treated" phrasing (the N is still correct), but adjust the + # never-treated clause so it does not claim "units present in + # the panel" for an RCS sample. 
+ count_unit = sample.get("count_unit", "units") + ne_unit_word = "observations" if count_unit == "observations" else "units" if isinstance(n_obs, int): if isinstance(n_t, int) and isinstance(n_c, int): sentences.append(f"Sample: {n_obs:,} observations ({n_t:,} treated, {n_c:,} control).") elif is_dynamic and isinstance(n_t, int): if isinstance(n_ne, int) and n_ne > 0: - subset_clause = f"; {n_ne:,} never-enabled units are also present" + subset_clause = f"; {n_ne:,} never-enabled {ne_unit_word} are also present" elif isinstance(n_nt, int) and n_nt > 0: - subset_clause = f"; {n_nt:,} never-treated units are also present" + subset_clause = f"; {n_nt:,} never-treated {ne_unit_word} are also present" else: subset_clause = "" # Estimator-specific dynamic-comparison phrasing. StackedDiD @@ -1904,16 +1952,28 @@ def _render_full_report(schema: Dict[str, Any]) -> str: estimator_block.get("class_name") if isinstance(estimator_block, dict) else None ) cg = sample.get("control_group") + # Panel-vs-RCS count-unit label for the full report. Mirrors the + # summary path: CallawaySantAnna's ``panel=False`` mode stores + # counts as observations, not units (round-28 P2). 
+ md_count_unit = sample.get("count_unit", "units") + md_ne_unit_word = "observations" if md_count_unit == "observations" else "units" + md_sample_location = ( + "in the repeated cross-section sample" + if md_count_unit == "observations" + else "in the panel" + ) if isinstance(sample.get("n_control"), int): lines.append(f"- Control: {sample['n_control']:,}") elif sample.get("dynamic_control"): if isinstance(sample.get("n_never_enabled"), int) and sample["n_never_enabled"] > 0: lines.append( - f"- Never-enabled units present in the panel: {sample['n_never_enabled']:,}" + f"- Never-enabled {md_ne_unit_word} present " + f"{md_sample_location}: {sample['n_never_enabled']:,}" ) elif isinstance(sample.get("n_never_treated"), int) and sample["n_never_treated"] > 0: lines.append( - f"- Never-treated units present in the panel: {sample['n_never_treated']:,}" + f"- Never-treated {md_ne_unit_word} present " + f"{md_sample_location}: {sample['n_never_treated']:,}" ) if estimator_name == "StackedDiDResults": n_distinct = sample.get("n_distinct_controls_trimmed") diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index af64cdc5..12436816 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -141,8 +141,14 @@ {"parallel_trends", "sensitivity", "design_effect", "estimator_native"} ), "TROPResults": frozenset( + # TROP identification is factor-model-based, not parallel-trends- + # based: the estimator native ``_pt_factor()`` handler returns + # ``status="not_applicable"``, and REPORTING.md routes TROP PT + # to factor-model diagnostics instead. Exposing PT in + # ``applicable_checks`` advertised a handler that never runs — + # round-28 P2 CI review on PR #318 flagged the contract mismatch + # for callers who gate workflows on ``applicable_checks``. 
{ - "parallel_trends", "sensitivity", "design_effect", "heterogeneity", @@ -2113,6 +2119,13 @@ def _read(name: str) -> Any: out["test_statistic"] = test_statistic if df is not None: out["df"] = df + # Preserve the survey-F denominator df when replaying a schema- + # shaped PT block from the default path (round-28 P3 CI review + # on PR #318). Without this, the finite-sample correction + # recorded on the source block is silently dropped at replay. + df_denom = _to_python_float(_read("df_denom")) + if df_denom is not None: + out["df_denom"] = df_denom return out # -- Headline metric extraction ---------------------------------------- diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 17176e4a..796054b6 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -2396,6 +2396,137 @@ def test_efficient_compare_control_groups_persists_after_sensitivity_runs(self): ) +class TestCSRepeatedCrossSectionCountLabels: + """Round-28 P2 CI review on PR #318: ``CallawaySantAnna(panel=False)`` + stores treated / control counts as OBSERVATIONS, not units + (``staggered_results.py L183-L184`` renders them as "obs:" in that + mode). BR previously labeled them as "units" / "present in the + panel", which misstates the sample composition on repeated-cross- + section fits. The schema now carries a ``count_unit`` flag and the + rendering branches on it. 
+ """ + + @staticmethod + def _stub(panel: bool): + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 1000 + stub.n_treated_units = 200 + stub.n_control_units = 800 + stub.survey_metadata = None + stub.event_study_effects = None + stub.control_group = "not_yet_treated" + stub.panel = panel + return stub + + def test_schema_exposes_count_unit(self): + for panel, expected in [(True, "units"), (False, "observations")]: + sample = BusinessReport( + self._stub(panel), auto_diagnostics=False + ).to_dict()["sample"] + assert sample["count_unit"] == expected + + def test_panel_true_renders_unit_wording(self): + br = BusinessReport(self._stub(panel=True), auto_diagnostics=False) + summary = br.summary() + md = br.full_report() + assert "never-treated units" in summary + assert "present in the panel" in md + assert "repeated cross-section sample" not in md + + def test_panel_false_renders_rcs_wording(self): + br = BusinessReport(self._stub(panel=False), auto_diagnostics=False) + summary = br.summary() + md = br.full_report() + # RCS-specific wording in both surfaces. + assert "never-treated observations" in summary + assert "repeated cross-section sample" in md + # No misleading "units" or "panel" claims. + assert "never-treated units" not in summary + assert "present in the panel" not in md + + +class TestTROPApplicableChecksExcludesParallelTrends: + """Round-28 P2 CI review on PR #318: TROP identification is + factor-model-based; its native PT handler returns + ``status="not_applicable"``. Advertising ``parallel_trends`` in + ``DiagnosticReport.applicable_checks`` for TROP was a contract + mismatch for callers using that set to gate workflows or UI. 
+ """ + + def test_trop_applicable_checks_omits_parallel_trends(self): + from diff_diff import DiagnosticReport + + class TROPResults: + pass + + stub = TROPResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.alpha = 0.05 + stub.n_obs = 100 + + dr = DiagnosticReport(stub) + assert "parallel_trends" not in dr.applicable_checks, ( + "TROP PT routes to factor-model diagnostics and is " + "not_applicable; it must not appear in applicable_checks." + ) + + +class TestSurveyPTProsePropagation: + """Round-28 P3 CI review on PR #318: the survey F-reference PT + variants (``joint_wald_survey``, ``joint_wald_event_study_survey``) + must carry through BR's method-aware label helpers so prose uses + "joint p" (not the fall-through default) and preserves the + ``df_denom`` provenance in the BR schema. + """ + + def test_lift_pre_trends_preserves_df_denom(self): + from diff_diff.business_report import _lift_pre_trends + + fake_dr = { + "parallel_trends": { + "status": "ran", + "method": "joint_wald_event_study_survey", + "joint_p_value": 0.35, + "df_denom": 30.0, + "n_pre_periods": 3, + "verdict": "no_detected_violation", + }, + "pretrends_power": {"status": "not_applicable"}, + } + lifted = _lift_pre_trends(fake_dr) + assert lifted["method"] == "joint_wald_event_study_survey" + assert lifted["df_denom"] == 30.0 + + def test_survey_pt_method_stat_label_uses_joint_p(self): + from diff_diff.business_report import ( + _pt_method_stat_label, + _pt_method_subject, + ) + + for method in ("joint_wald_survey", "joint_wald_event_study_survey"): + assert _pt_method_stat_label(method) == "joint p", ( + f"Survey PT variant {method!r} must map to 'joint p' " + f"(the joint test remains; only the reference " + f"distribution changes)." + ) + assert ( + _pt_method_subject(method) + == "Pre-treatment event-study coefficients" + ), ( + f"Survey PT variant {method!r} must use the event-study " + f"subject phrase, not the generic fall-through." 
+ ) + + class TestSDiDJackknifeStepPersistsAfterNativeSensitivity: """Round-24 P2 CI review on PR #318: the SyntheticDiD practitioner step "Leave-one-out influence (jackknife)" must persist after diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 9d298881..e981ac6b 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -864,6 +864,29 @@ def test_joint_wald_uses_F_reference_when_survey_df_is_finite(self): # back to chi-square. assert expected_p_survey > expected_p_chi2 + def test_precomputed_survey_pt_replay_preserves_df_denom(self, cs_fit): + """Round-28 P3 regression: a schema-shaped PT block carrying the + survey ``df_denom`` and ``_survey`` method suffix must round-trip + through ``precomputed={"parallel_trends": ...}`` without losing + the finite-sample provenance. Previously ``_format_precomputed_pt`` + dropped ``df_denom``, so replaying a survey-aware DR block + silently demoted it to a chi-square-style passthrough. + """ + fit, _ = cs_fit + survey_pt = { + "method": "joint_wald_event_study_survey", + "joint_p_value": 0.18, + "test_statistic": 5.2, + "df": 3, + "df_denom": 20.0, + } + dr = DiagnosticReport(fit, precomputed={"parallel_trends": survey_pt}) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + assert pt["method"] == "joint_wald_event_study_survey" + assert pt["df_denom"] == 20.0 + assert pt["df"] == 3 + def test_joint_wald_ignores_non_finite_survey_df(self): """If ``df_survey`` is NaN / inf / non-positive, fall back to chi-square (no finite-sample correction available). From 59d7df7ef81a80272bc1f731b5c8eb151668fd09 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 11:53:32 -0400 Subject: [PATCH 32/48] Address twenty-ninth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P3 code quality (power_reason provenance). 
REPORTING.md lines 118-125 say the ``pretrends_power`` fallback reason is recorded on the BR pre-trends block, but ``_lift_pre_trends`` only carried the enum status and dropped the reason. Downstream schema consumers saw ``power_status="not_applicable"`` with no explanation — e.g., on ``StackedDiDResults`` / ``EfficientDiDResults`` / ``StaggeredTripleDiffResults`` / ``WooldridgeDiDResults`` / ``ChaisemartinDHaultfoeuilleResults`` fits where the power adapter is not yet available. Add a dedicated ``power_reason`` field alongside the existing ``power_status`` enum (additive, no breaking change) and update ``REPORTING.md`` to describe both fields. P3 docs / tests (DR prose for survey PT variants). Round-27 added the ``_survey`` suffix and ``df_denom`` to ``_pt_event_study``, and round-28 updated BR's method-aware helpers to recognize the variants. ``DiagnosticReport``'s own ``_pt_subject_phrase`` / ``_pt_stat_label`` prose helpers were not updated, so DR ``summary()`` / ``full_report()`` still rendered the generic "Pre-treatment data" subject on survey-backed fits. Recognize ``joint_wald_survey`` and ``joint_wald_event_study_survey`` alongside the non-survey variants: subject is the pre-period event-study coefficient vector, statistic label is ``joint p`` (the F-reference correction is a different reference distribution, not a different test). Tests: 2 new regressions. * ``test_lift_pre_trends_exposes_power_reason`` under ``TestSurveyPTProsePropagation``: a fake DR block whose power section did not run (``status="not_applicable"``) surfaces both the enum status and the plain-English reason on the BR schema. * ``test_dr_prose_uses_event_study_subject_for_survey_pt`` under ``TestJointWaldAlignment``: DR's own subject / stat-label helpers return the event-study phrasing and ``joint p`` for both survey variants. 244 BR / DR / practitioner tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 9 +++++++++ diff_diff/diagnostic_report.py | 30 ++++++++++++++++++++++++--- docs/methodology/REPORTING.md | 5 ++++- tests/test_business_report.py | 36 +++++++++++++++++++++++++++++++++ tests/test_diagnostic_report.py | 29 ++++++++++++++++++++++++++ 5 files changed, 105 insertions(+), 4 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index cf7ec87d..1efd23c0 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -704,6 +704,15 @@ def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: # CI review on PR #318). "df_denom": pt.get("df_denom"), "power_status": pp.get("status"), + # Dedicated reason field so schema consumers see the fallback + # explanation when ``compute_pretrends_power`` cannot run + # (``status in {"skipped", "error", "not_applicable"}``). + # REPORTING.md lines 118-125 promise this provenance; round-29 + # P3 CI review on PR #318 flagged that only the enum status was + # being exposed and the reason was dropped at the lift boundary. + # ``power_status`` stays the machine-readable enum; ``power_reason`` + # carries the plain-English explanation. 
+ "power_reason": pp.get("reason"), "power_tier": pp.get("tier"), "mdv": pp.get("mdv"), "mdv_share_of_att": pp.get("mdv_share_of_att"), diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 12436816..de5e187d 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -2540,7 +2540,21 @@ def _pt_subject_phrase(method: Optional[str]) -> str: return "The pre-period slope-difference test" if method == "hausman": return "The Hausman PT-All vs PT-Post pretest" - if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + if method in { + "joint_wald", + "joint_wald_event_study", + "joint_wald_no_vcov", + "bonferroni", + # Survey-aware event-study PT variants use an F(k, df_survey) + # reference rather than chi-square(k); the subject is still the + # pre-period event-study coefficient vector — only the + # reference distribution changes (round-28 / round-29 CI + # review on PR #318). Recognizing the ``_survey`` suffix here + # lets DR prose match the BR prose and the REPORTING.md + # contract. + "joint_wald_survey", + "joint_wald_event_study_survey", + }: return "Pre-treatment event-study coefficients" if method == "synthetic_fit": return "The synthetic-control pre-treatment fit" @@ -2555,9 +2569,19 @@ def _pt_stat_label(method: Optional[str]) -> Optional[str]: Wald / Bonferroni paths take a joint p-value (``joint p``); the 2x2 slope-difference and Hausman paths are single-statistic tests (``p``). Design-enforced paths return ``None`` so the sentence - omits a statistic. + omits a statistic. Survey F-reference variants remain joint tests + on the pre-period coefficient vector and keep the ``joint p`` + label — the correction is a different reference distribution, not + a different test. 
""" - if method in {"joint_wald", "joint_wald_event_study", "joint_wald_no_vcov", "bonferroni"}: + if method in { + "joint_wald", + "joint_wald_event_study", + "joint_wald_no_vcov", + "bonferroni", + "joint_wald_survey", + "joint_wald_event_study_survey", + }: return "joint p" if method in {"slope_difference", "hausman"}: return "p" diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index 6d26b48a..36cb1c22 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -120,7 +120,10 @@ not new inference. `ChaisemartinDHaultfoeuilleResults`) do not yet have a power adapter and therefore render the `no_detected_violation` tier as `underpowered` with the fallback reason recorded in - `schema["pre_trends"]["power_status"]`. BusinessReport then reads + `schema["pre_trends"]["power_reason"]` (plain-English explanation) + while `schema["pre_trends"]["power_status"]` carries the + machine-readable enum (`"ran"` / `"skipped"` / `"error"` / + `"not_applicable"`). BusinessReport then reads `mdv_share_of_att = mdv / abs(att)` and selects a tier: - `< 0.25` → `well_powered` — "the test has 80% power to diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 796054b6..bf915ef2 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -2506,6 +2506,42 @@ def test_lift_pre_trends_preserves_df_denom(self): assert lifted["method"] == "joint_wald_event_study_survey" assert lifted["df_denom"] == 30.0 + def test_lift_pre_trends_exposes_power_reason(self): + """Round-29 P3 regression: when ``compute_pretrends_power`` cannot + run, REPORTING.md lines 118-125 promise the fallback reason is + recorded in the BR pre-trends block. Previously only the enum + status surfaced and the reason was dropped at the lift + boundary; the new ``power_reason`` field carries the + plain-English explanation alongside the existing enum + ``power_status``. 
+ """ + from diff_diff.business_report import _lift_pre_trends + + fake_dr = { + "parallel_trends": { + "status": "ran", + "method": "joint_wald_event_study", + "joint_p_value": 0.35, + "n_pre_periods": 3, + "verdict": "no_detected_violation", + }, + "pretrends_power": { + "status": "not_applicable", + "reason": ( + "StackedDiDResults does not yet have a " + "compute_pretrends_power adapter." + ), + }, + } + lifted = _lift_pre_trends(fake_dr) + # Machine-readable status preserved. + assert lifted["power_status"] == "not_applicable" + # Plain-English reason now exposed on the schema. + assert lifted["power_reason"] == ( + "StackedDiDResults does not yet have a " + "compute_pretrends_power adapter." + ) + def test_survey_pt_method_stat_label_uses_joint_p(self): from diff_diff.business_report import ( _pt_method_stat_label, diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index e981ac6b..2ca49a68 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -887,6 +887,35 @@ def test_precomputed_survey_pt_replay_preserves_df_denom(self, cs_fit): assert pt["df_denom"] == 20.0 assert pt["df"] == 3 + def test_dr_prose_uses_event_study_subject_for_survey_pt(self): + """Round-29 P3 regression: DR's own ``_pt_subject_phrase`` / + ``_pt_stat_label`` helpers previously didn't recognize the + ``_survey`` variants, so summary / full_report prose fell + through to the generic "Pre-treatment data" wording — BR's + helpers were fixed last round but DR's were not. The survey + variants must render with the event-study subject and the + ``joint p`` label; the F-reference correction is a different + reference distribution, not a different test. 
+ """ + from diff_diff.diagnostic_report import ( + _pt_stat_label, + _pt_subject_phrase, + ) + + for method in ( + "joint_wald_survey", + "joint_wald_event_study_survey", + ): + assert ( + _pt_subject_phrase(method) + == "Pre-treatment event-study coefficients" + ), ( + f"DR subject for {method!r} must match the non-survey " + f"event-study phrasing; got " + f"{_pt_subject_phrase(method)!r}" + ) + assert _pt_stat_label(method) == "joint p" + def test_joint_wald_ignores_non_finite_survey_df(self): """If ``df_survey`` is NaN / inf / non-positive, fall back to chi-square (no finite-sample correction available). From c6d672efe53949fd28be34ec7810245168e334c0 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 12:15:14 -0400 Subject: [PATCH 33/48] Address thirtieth round of CI review findings on PR #318 P1 methodology (self-contradictory anticipation prose). ``_apply_anticipation_to_assumption`` previously only APPENDED an anticipation-aware clause. Several base assumption descriptions in ``_describe_assumption`` hard-code a strict "plus no anticipation" (CS / SA / Imputation / TwoStage / Wooldridge generic, StackedDiD sub-experiment, dCDH, TripleDifference, SyntheticDiD, TROP, ContinuousDiD) or "Also assumes no anticipation (Assumption NA) ..." (EfficientDiD PT-All, PT-Post) clause. On an anticipation-enabled fit BR would render both in the same paragraph, contradicting REGISTRY.md's description of anticipation as a SHIFTED effective treatment boundary rather than strict no-anticipation plus an exception. Add ``_strip_strict_no_anticipation`` that removes any of the canonical strict phrasings from a description before the helper appends the relaxed clause. Collapses dangling punctuation and doubled whitespace left by the removal so the rewritten description reads cleanly. The helper still flips ``no_anticipation = False`` and records ``anticipation_periods`` on the block. P3 docs drift. 
Round-29 added ``power_reason`` alongside ``power_status`` and updated one REPORTING.md reference, but a second reference at line 142 still pointed at ``power_status`` for the fallback explanation. Updated to name ``power_reason`` and note that ``power_status`` carries the enum. Tests: ``TestAnticipationStripsStrictNoAnticipationClause`` with five regressions asserts that every anticipation-capable estimator's rendered description under ``anticipation > 0`` drops both "plus no anticipation" and "Also assumes no anticipation" AND still carries the "Anticipation is allowed / not strict no-anticipation" contract: generic group-time, EfficientDiD PT-All, EfficientDiD PT-Post, StackedDiD sub-experiment, and an integration check of ``full_report()``'s rendered Identifying Assumption section. 249 BR / DR / practitioner tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 54 ++++++++++++++++++-- docs/methodology/REPORTING.md | 3 +- tests/test_business_report.py | 96 +++++++++++++++++++++++++++++++++++ 3 files changed, 148 insertions(+), 5 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 1efd23c0..eb1a6609 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -36,6 +36,7 @@ from __future__ import annotations +import re from dataclasses import dataclass from typing import Any, Dict, List, Optional, Union @@ -822,9 +823,51 @@ def _control_group_choice(results: Any) -> Optional[str]: return None +_STRICT_NO_ANTICIPATION_PATTERNS = ( + # Ordered from most specific to least specific so the first match + # wins on strings that could match multiple patterns. Matches are + # case-sensitive because every occurrence in ``_describe_assumption`` + # is a fixed canonical phrase. 
+ ", plus no anticipation", + "plus no anticipation", + " Also assumes no anticipation (Assumption NA), overlap " + "(Assumption O), and absorbing / irreversible treatment.", + " Also assumes no anticipation.", + "Also assumes no anticipation.", + " and no anticipation", +) + + +def _strip_strict_no_anticipation(desc: str) -> str: + """Remove any strict no-anticipation phrasing from ``desc``. + + Several base assumption descriptions in ``_describe_assumption`` + hard-code a strict "plus no anticipation" / "Also assumes no + anticipation" clause (CS / SA / Imputation / TwoStage / Wooldridge + generic, StackedDiD sub-experiment, EfficientDiD PT-Post, EfficientDiD + PT-All, ContinuousDiD, TripleDifference, SyntheticDiD, TROP, dCDH, + and the fallback unconditional branch). When a fit actually allows + anticipation the helper must REPLACE that wording, not append a + contradictory clause on top of it. Round-30 P1 CI review on PR #318. + """ + if not desc: + return desc + out = desc + for pattern in _STRICT_NO_ANTICIPATION_PATTERNS: + out = out.replace(pattern, "") + # Collapse any doubled whitespace or dangling punctuation left by + # the removal (e.g., "cohorts , with..." -> "cohorts, with..."; + # "cohorts . " -> "cohorts."). + out = re.sub(r"\s+\.", ".", out) + out = re.sub(r"\s+,", ",", out) + out = re.sub(r" {2,}", " ", out) + return out.strip() + + +def _apply_anticipation_to_assumption(block: Dict[str, Any], results: Any) -> Dict[str, Any]: - """If the fit used ``anticipation > 0``, flip ``no_anticipation`` off and - append an anticipation clause to the description. + """If the fit used ``anticipation > 0``, flip ``no_anticipation`` off, + strip any strict no-anticipation wording from the base description, + and append an anticipation-aware clause. Round-17 CI review flagged the strict "plus no anticipation" language on anticipation-enabled fits. 
Per REGISTRY.md §CallawaySantAnna lines @@ -832,7 +875,10 @@ def _apply_anticipation_to_assumption(block: Dict[str, Any], results: Any) -> Di EfficientDiD, a fit with ``anticipation=k`` shifts the effective treatment boundary by ``k`` pre-periods; the identifying assumption becomes "no treatment effects earlier than ``k`` periods before the - treatment start" rather than strict no-anticipation. + treatment start" rather than strict no-anticipation. Round-30 CI + review caught that the previous implementation only appended — the + resulting prose said both "strict no-anticipation holds" and + "anticipation is allowed" in the same paragraph. """ k = _anticipation_periods(results) if k <= 0: @@ -849,7 +895,7 @@ def _apply_anticipation_to_assumption(block: Dict[str, Any], results: Any) -> Di ) desc = block.get("description", "") if isinstance(desc, str): - block["description"] = desc + clause + block["description"] = _strip_strict_no_anticipation(desc) + clause return block diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index 36cb1c22..ab29c9f8 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -139,7 +139,8 @@ not new inference. signal." - Power analysis not runnable → fall back to `underpowered` phrasing; the fallback reason is recorded in - `schema["pre_trends"]["power_status"]`. + `schema["pre_trends"]["power_reason"]` (plain-English explanation; + `power_status` carries the enum). Rationale: always-hedging phrasing under-sells well-designed studies; always-confident phrasing over-sells underpowered ones. 
diff --git a/tests/test_business_report.py b/tests/test_business_report.py index bf915ef2..afc8c7e9 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1572,6 +1572,102 @@ class Stub: assert "beta^{het}_l" in desc +class TestAnticipationStripsStrictNoAnticipationClause: + """Round-30 P1 CI review on PR #318: ``_apply_anticipation_to_assumption`` + previously only appended an anticipation clause. Several base + descriptions already say "plus no anticipation" or "Also assumes + no anticipation", so an anticipation-enabled fit would render + self-contradictory prose: the strict clause AND the relaxed one in + the same paragraph. The helper now strips the strict phrasing + before appending. These regressions cover every anticipation- + capable estimator base description that previously carried such + wording. + """ + + _STRICT_PATTERNS = ( + "plus no anticipation", + "Also assumes no anticipation", + ) + + @staticmethod + def _stub(class_name: str, **extras): + stub_cls = type(class_name, (), {}) + stub = stub_cls() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 400 + stub.n_treated = 100 + stub.n_control = 300 + stub.survey_metadata = None + stub.event_study_effects = None + stub.anticipation = 2 + for k, v in extras.items(): + setattr(stub, k, v) + return stub + + def _assert_no_strict_contract(self, description: str): + assert isinstance(description, str) and description + for pat in self._STRICT_PATTERNS: + assert pat not in description, ( + f"Anticipation-enabled fit description must not carry " + f"the strict phrase {pat!r}. Got: {description!r}" + ) + # Must still say anticipation is allowed (relaxed contract). + assert "Anticipation is allowed" in description + assert "not strict no-anticipation" in description + + def test_generic_group_time_strips_strict_clause(self): + # Generic CS/SA/Imputation/TwoStage/Wooldridge branch. 
+ stub = self._stub("CallawaySantAnnaResults") + block = BusinessReport(stub, auto_diagnostics=False).to_dict()["assumption"] + assert block["no_anticipation"] is False + assert block["anticipation_periods"] == 2 + self._assert_no_strict_contract(block["description"]) + + def test_efficient_did_pt_all_strips_strict_clause(self): + stub = self._stub("EfficientDiDResults", pt_assumption="all") + block = BusinessReport(stub, auto_diagnostics=False).to_dict()["assumption"] + self._assert_no_strict_contract(block["description"]) + # PT-All identifying content should still be present. + assert "PT-All" in block["description"] + + def test_efficient_did_pt_post_strips_strict_clause(self): + stub = self._stub("EfficientDiDResults", pt_assumption="post") + block = BusinessReport(stub, auto_diagnostics=False).to_dict()["assumption"] + self._assert_no_strict_contract(block["description"]) + assert "PT-Post" in block["description"] + + def test_stacked_did_strips_strict_clause(self): + stub = self._stub( + "StackedDiDResults", clean_control="not_yet_treated" + ) + block = BusinessReport(stub, auto_diagnostics=False).to_dict()["assumption"] + self._assert_no_strict_contract(block["description"]) + # Stacked sub-experiment identifying content preserved. + assert "IC1" in block["description"] and "IC2" in block["description"] + + def test_rendered_full_report_has_no_strict_contract_for_anticipation(self): + """Integration: the rendered markdown's Identifying Assumption + section must also be free of the strict phrase on an + anticipation-enabled fit. + """ + stub = self._stub("CallawaySantAnnaResults") + md = BusinessReport(stub, auto_diagnostics=False).full_report() + assumption_section = md.split("## Identifying Assumption", 1)[1].split( + "\n## ", 1 + )[0] + for pat in self._STRICT_PATTERNS: + assert pat not in assumption_section, ( + f"Rendered assumption section must not carry the strict " + f"phrase {pat!r} under anticipation > 0. 
Got: " + f"{assumption_section!r}" + ) + assert "Anticipation is allowed" in assumption_section + + class TestAnticipationAwareAssumptionBlock: """Round-17 P1 regression: ``_describe_assumption`` must drop the strict "plus no anticipation" language when the fit allows From 95c4a7a8be3905bb86d72495c78d7f3743ef75a6 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 12:40:21 -0400 Subject: [PATCH 34/48] Address thirty-first round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 methodology (precomputed sensitivity silently dropped outside the sensitivity-capable applicability set). ``DiagnosticReport`` advertises ``precomputed["sensitivity"]`` as a supported escape hatch and ``BusinessReport`` forwards ``honest_did_results`` into it, but ``_APPLICABILITY`` only enabled ``sensitivity`` for ``MultiPeriodDiDResults``, ``CallawaySantAnnaResults``, and ``ChaisemartinDHaultfoeuilleResults`` (plus the SDiD / TROP native- routed families). On every other estimator — SA, Imputation, TwoStage, Stacked, EfficientDiD, Wooldridge, TripleDifference, StaggeredTripleDiff, ContinuousDiD, and plain DiD — the applicability gate filtered the section out before the supplied object reached the runner, so the schema rendered ``sensitivity: {"status": "not_applicable"}`` and users never learned their robustness input had been ignored. ``_compute_applicable_checks`` now treats an explicit ``precomputed`` key as a caller override: the key is unioned into ``type_level`` so it survives the ``_APPLICABILITY`` gate, and the per-check opt-in branch short-circuits before ``_instance_skip_reason`` to make the override contract explicit. SDiD / TROP are still rejected up front in ``__init__`` (round-21 guard); that methodology-incompatibility check lives above this gate and is unaffected. P2 code quality (heterogeneity schema contract). 
``_lift_heterogeneity`` returned ``None`` whenever the DR heterogeneity section didn't run, so ``schema["heterogeneity"]`` was stored as a raw ``None`` and downstream consumers had to special-case this one section to read ``status``. Every other top-level BR section is a dict with a ``status`` field. Return a dict-shaped block on every path: * DR section status ``"ran"`` -> populated statistics + ``status: "ran"``; * DR section non-``ran`` -> ``{"status": <DR status>, "reason": <DR reason>}``; * ``auto_diagnostics=False`` -> ``{"status": "skipped", "reason": "auto_diagnostics=False"}``. Tests: 8 new regressions. * ``TestPrecomputedSensitivityHonoredOnAllCompatibleEstimators`` (4 tests): ``precomputed["sensitivity"]`` on SunAbrahamResults, EfficientDiDResults, plain DiDResults, plus BR ``honest_did_results`` on ImputationDiDResults, all surface a populated sensitivity block with ``breakdown_M = 1.25``; * ``TestHeterogeneityLiftAlwaysReturnsDict`` (4 tests): ``None`` DR, ``skipped`` section, ``not_applicable`` section, and end-to-end BR schema all resolve to dict-shaped blocks with a ``status`` field. 257 BR / DR / practitioner tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 23 ++++- diff_diff/diagnostic_report.py | 24 ++++- tests/test_business_report.py | 176 +++++++++++++++++++++++++++++++++ 3 files changed, 218 insertions(+), 5 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index eb1a6609..4d8aed6a 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -751,13 +751,28 @@ def _lift_sensitivity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: } -def _lift_heterogeneity(dr: Optional[Dict[str, Any]]) -> Optional[Dict[str, Any]]: +def _lift_heterogeneity(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: + """Return the heterogeneity section of the BR schema. 
+ + Round-31 P2 CI review on PR #318: the lift previously returned + ``None`` on any non-``ran`` path, which broke the schema contract + that every top-level BR key resolves to a dict with a ``status`` + field. Downstream consumers had to special-case this one section. + Now returns a dict-shaped ``{"status": ..., "reason": ...}`` block + mirroring DR's own status enum so ``schema["heterogeneity"] + ["status"]`` is always readable. + """ if dr is None: - return None + return {"status": "skipped", "reason": "auto_diagnostics=False"} het = dr.get("heterogeneity") or {} - if het.get("status") != "ran": - return None + status = het.get("status") + if status != "ran": + return { + "status": status or "not_run", + "reason": het.get("reason"), + } return { + "status": "ran", "source": het.get("source"), "n_effects": het.get("n_effects"), "min": het.get("min"), diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index de5e187d..00d48a5d 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -460,6 +460,21 @@ def _compute_applicable_checks(self) -> Tuple[set, Dict[str, str]]: """ type_name = type(self._results).__name__ type_level = set(_APPLICABILITY.get(type_name, frozenset())) + # A precomputed passthrough is a caller-supplied override, not + # a claim about estimator-native applicability. Round-31 P1 CI + # review on PR #318: when a caller passes + # ``precomputed["sensitivity"] = ...`` on an estimator family + # whose ``_APPLICABILITY`` row lacks ``"sensitivity"`` (SA, + # Imputation, TwoStage, Stacked, EfficientDiD, Wooldridge, + # TripleDifference, StaggeredTripleDiff, ContinuousDiD, plain + # DiD), the gate previously filtered the section out silently + # and the supplied result disappeared from the schema. SDiD + # and TROP are still rejected up front in ``__init__`` + # (round-21) because their native-routing contract makes + # HonestDiD methodology-incompatible; those never reach here. 
+ # For every other estimator, an explicit passthrough wins + # over the default applicability matrix. + type_level = type_level | set(self._precomputed) applicable: set = set() skipped: Dict[str, str] = {} @@ -468,7 +483,14 @@ def _compute_applicable_checks(self) -> Tuple[set, Dict[str, str]]: if not self._run_flags.get(check, True): skipped[check] = f"run_{check}=False (user opted out)" continue - # Instance-level gating + # Instance-level gating — skipped when the caller supplied + # a precomputed override (the per-check ``_instance_skip_reason`` + # branches already return None for precomputed keys, but this + # short-circuit makes the override contract explicit and + # survives any future gate additions). + if check in self._precomputed: + applicable.add(check) + continue reason = self._instance_skip_reason(check) if reason is not None: skipped[check] = reason diff --git a/tests/test_business_report.py b/tests/test_business_report.py index afc8c7e9..eaff8846 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -2772,6 +2772,182 @@ class TROPResults: ) +class TestPrecomputedSensitivityHonoredOnAllCompatibleEstimators: + """Round-31 P1 CI review on PR #318: ``DiagnosticReport(precomputed= + {"sensitivity": ...})`` and ``BusinessReport(honest_did_results=...)`` + were silently dropped on estimator families whose ``_APPLICABILITY`` + row lacked ``"sensitivity"`` — SA, Imputation, TwoStage, Stacked, + EfficientDiD, Wooldridge, TripleDifference, StaggeredTripleDiff, + ContinuousDiD, and plain DiD. The applicability gate filtered the + section out before the supplied object reached the runner, so the + schema rendered ``sensitivity: {"status": "not_applicable"}`` and + the user never learned their robustness result had been ignored. + + The gate now honors an explicit passthrough regardless of the + default ``_APPLICABILITY`` matrix. 
SDiD / TROP are still rejected + up front in ``__init__`` (round-21) because their native-routing + contract is methodology-incompatible with HonestDiD. + """ + + @staticmethod + def _fake_grid_sens(): + from types import SimpleNamespace + + return SimpleNamespace( + M_values=[0.5, 1.0, 1.5], + bounds=[(0.1, 2.0), (-0.2, 2.5), (-0.5, 3.0)], + robust_cis=[(0.05, 2.1), (-0.3, 2.6), (-0.6, 3.1)], + breakdown_M=1.25, + method="relative_magnitude", + original_estimate=1.0, + original_se=0.2, + alpha=0.05, + ) + + @staticmethod + def _stub(class_name: str, **extras): + from diff_diff.prep_dgp import generate_staggered_data + + # For estimator types that have fits, we'd use real fits; but + # several of these need specific setup. Stub with minimal + # required fields — the gate fix operates on the applicability + # set and the sensitivity runner short-circuits on the + # precomputed key without touching result internals. + stub_cls = type(class_name, (), {}) + stub = stub_cls() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.att = 1.0 + stub.se = 0.2 + stub.p_value = 0.001 + stub.conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 500 + stub.n_treated = 200 + stub.n_control = 300 + stub.survey_metadata = None + stub.event_study_effects = None + for k, v in extras.items(): + setattr(stub, k, v) + return stub + + def test_dr_precomputed_sensitivity_honored_on_sun_abraham(self): + from diff_diff import DiagnosticReport + + stub = self._stub("SunAbrahamResults") + dr = DiagnosticReport(stub, precomputed={"sensitivity": self._fake_grid_sens()}) + sens = dr.to_dict()["sensitivity"] + assert sens["status"] == "ran", ( + f"precomputed sensitivity on SunAbrahamResults must be honored; " + f"got {sens!r}" + ) + assert sens.get("precomputed") is True + assert sens["breakdown_M"] == 1.25 + + def test_dr_precomputed_sensitivity_honored_on_efficient_did(self): + from diff_diff import DiagnosticReport + + 
stub = self._stub("EfficientDiDResults", pt_assumption="all") + dr = DiagnosticReport(stub, precomputed={"sensitivity": self._fake_grid_sens()}) + sens = dr.to_dict()["sensitivity"] + assert sens["status"] == "ran" + assert sens.get("precomputed") is True + + def test_dr_precomputed_sensitivity_honored_on_plain_did(self): + from diff_diff import DiagnosticReport + + stub = self._stub("DiDResults") + dr = DiagnosticReport(stub, precomputed={"sensitivity": self._fake_grid_sens()}) + sens = dr.to_dict()["sensitivity"] + assert sens["status"] == "ran" + + def test_br_honest_did_results_honored_on_imputation(self): + stub = self._stub("ImputationDiDResults") + br = BusinessReport(stub, honest_did_results=self._fake_grid_sens()) + sens = br.to_dict()["sensitivity"] + assert sens["status"] == "computed", ( + f"honest_did_results on ImputationDiDResults must be honored " + f"by BR; got {sens!r}" + ) + assert sens["breakdown_M"] == 1.25 + + +class TestHeterogeneityLiftAlwaysReturnsDict: + """Round-31 P2 CI review on PR #318: ``_lift_heterogeneity`` used to + return ``None`` whenever the DR heterogeneity section didn't + successfully run, so the BR schema stored a raw ``None`` at + ``schema["heterogeneity"]``. The rest of the schema promises dict- + shaped ``{"status": ..., "reason": ...}`` blocks on every top- + level key; this one broke the contract and forced downstream + consumers to special-case it. 
+ """ + + def test_lift_none_dr_returns_dict(self): + from diff_diff.business_report import _lift_heterogeneity + + block = _lift_heterogeneity(None) + assert isinstance(block, dict) + assert block["status"] == "skipped" + assert "auto_diagnostics" in (block.get("reason") or "") + + def test_lift_skipped_dr_section_returns_dict_with_status(self): + from diff_diff.business_report import _lift_heterogeneity + + block = _lift_heterogeneity( + { + "heterogeneity": { + "status": "skipped", + "reason": "No group_effects or event_study_effects on result.", + } + } + ) + assert block["status"] == "skipped" + assert "No group_effects" in block["reason"] + + def test_lift_not_applicable_dr_section_returns_dict(self): + from diff_diff.business_report import _lift_heterogeneity + + block = _lift_heterogeneity( + { + "heterogeneity": { + "status": "not_applicable", + "reason": "TripleDifferenceResults is a 2-period design.", + } + } + ) + assert block["status"] == "not_applicable" + assert block["reason"] + + def test_br_schema_heterogeneity_is_always_dict(self): + """End-to-end: a fit whose heterogeneity did not run still + exposes a dict-shaped block at ``schema["heterogeneity"]`` + rather than a raw ``None``. 
+ """ + + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.0 + stub.se = 0.2 + stub.p_value = 0.001 + stub.conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 200 + stub.n_treated = 100 + stub.n_control = 100 + stub.survey_metadata = None + + het = BusinessReport(stub, auto_diagnostics=True).to_dict()["heterogeneity"] + assert isinstance(het, dict), ( + f"schema['heterogeneity'] must be a dict (the stable-schema " + f"contract); got {type(het).__name__}: {het!r}" + ) + assert "status" in het + + class TestSDiDTROPRejectIncompatiblePrecomputedInputs: """Round-21 P1 CI review on PR #318: ``precomputed={"sensitivity": ...}`` and ``BusinessReport(honest_did_results=...)`` previously From c8d9d8425ce28d31414b41c3c2a1a86bdc680e74 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 13:00:08 -0400 Subject: [PATCH 35/48] Address thirty-second round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 methodology (precomputed pretrends_power on native-routed estimators). Round-21 added an ``__init__`` guard on SDiD / TROP that rejected ``precomputed["sensitivity"]`` and ``precomputed["parallel_trends"]`` because those estimators route robustness and PT to ``estimator_native_diagnostics``. Round-31's blanket applicability-override exposed a parallel hole: ``precomputed["pretrends_power"]`` was not in the rejection set, so a caller could surface a Roth-style pre-trends power tier on a report whose PT is design-enforced (SDiD ``pre_treatment_fit``) or factor-model (TROP ``effective_rank`` / ``loocv_score``). Extend the guard to reject ``pretrends_power`` uniformly on SDiD / TROP. The error message now names all three methodology- incompatible keys and points users at the native diagnostics on the result object. P2 code quality (heterogeneity section rendered with None values). Round-31 changed ``_lift_heterogeneity`` to always return a dict (stable schema contract). 
The full-report renderer's truthiness guard (``if het:``) then entered the Heterogeneity section on every fit — a dict with ``status="not_applicable"`` is still truthy — and printed ``Source: None`` / ``N effects: None`` / ``Sign consistent: None``. Gate on ``status == "ran"`` instead. P2 code quality (design_effect band label). REPORTING.md and the LLM guide promise a plain-English band label on the ``design_effect`` section, but ``_check_design_effect()`` only emitted numeric fields plus ``is_trivial``. Add a stable ``band_label`` field with four enum values aligned to the docs: * ``deff < 1.05`` -> ``"trivial"``; * ``1.05 <= deff < 2`` -> ``"slightly_reduces"``; * ``2 <= deff < 5`` -> ``"materially_reduces"``; * ``deff >= 5`` -> ``"large_warning"``. Tests: 14 new regressions. * ``TestSDiDTROPRejectPrecomputedPretrendsPower`` (2 tests): SDiD fit + dummy power object -> ``ValueError``; TROP stub + dummy power object -> ``ValueError``; * ``TestHeterogeneityOmittedFromFullReportWhenNotRan`` (1 test): plain-DiD fit (no heterogeneity in applicability row) renders ``full_report()`` without the Heterogeneity header and without any ``: None`` placeholder lines; * ``TestDesignEffectBandLabel`` (4 tests): each of the four bands (trivial 1.01, slightly_reduces 1.5, materially_reduces 3.2, large_warning 7.5) surfaces the right ``band_label``. Plus the 7 tests from the previous two rounds (precomputed sensitivity honored on SA / Imputation / EfficientDiD / plain DiD + heterogeneity-lift dict-shape coverage). 271 BR / DR / practitioner tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 9 ++- diff_diff/diagnostic_report.py | 47 ++++++++++- tests/test_business_report.py | 144 +++++++++++++++++++++++++++++++++ 3 files changed, 194 insertions(+), 6 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 4d8aed6a..cb5d435e 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -2076,8 +2076,13 @@ def _render_full_report(schema: Dict[str, Any]) -> str: lines.append(f"- Effective N: {eff_n:,.0f}") lines.append("") - # Heterogeneity - if het: + # Heterogeneity — only render the populated section when the check + # actually ran. Round-32 P2 CI review on PR #318: round-31 changed + # ``_lift_heterogeneity`` to always return a dict (stable schema + # contract), but the renderer's ``if het:`` truthiness guard then + # entered the block on every fit and printed ``Source: None``, + # ``N effects: None``, etc. Gate on the ``status`` enum instead. + if isinstance(het, dict) and het.get("status") == "ran": lines.append("## Heterogeneity") lines.append("") lines.append(f"- Source: `{het.get('source')}`") diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 00d48a5d..b610e7f9 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -371,13 +371,26 @@ def __init__( _incompatible_keys.append("sensitivity") if "parallel_trends" in self._precomputed: _incompatible_keys.append("parallel_trends") + # Round-32 P1 CI review on PR #318: ``pretrends_power`` is a + # Roth-style power analysis on pre-period event-study + # coefficients under the PT identifying contract. SDiD's PT + # analogue is design-enforced pre-treatment fit and TROP uses + # factor-model identification (PT not applicable); surfacing + # a Roth-style power tier on either would bypass the native- + # routing contract. 
Round-21's guard covered ``sensitivity`` + # and ``parallel_trends`` but not ``pretrends_power``, so the + # round-31 ``_compute_applicable_checks`` broadening exposed + # it. + if "pretrends_power" in self._precomputed: + _incompatible_keys.append("pretrends_power") if _incompatible_keys: raise ValueError( f"{_result_name} routes robustness and pre-trends " "diagnostics to ``estimator_native_diagnostics`` — " - "generic HonestDiD and parallel-trends precomputed " - "passthroughs are methodology-incompatible with this " - f"estimator. Rejected precomputed keys: {sorted(_incompatible_keys)}. " + "generic HonestDiD, parallel-trends, and pre-trends " + "power precomputed passthroughs are methodology-" + "incompatible with this estimator. Rejected " + f"precomputed keys: {sorted(_incompatible_keys)}. " "Use the native diagnostics on the result object " "(SDiD: ``in_time_placebo``, ``sensitivity_to_zeta_omega``, " "``pre_treatment_fit``; TROP: ``effective_rank``, " @@ -1481,7 +1494,22 @@ def _format_bacon(self, bacon: Any) -> Dict[str, Any]: } def _check_design_effect(self) -> Dict[str, Any]: - """Read survey design-effect from ``results.survey_metadata``.""" + """Read survey design-effect from ``results.survey_metadata``. + + Emits a plain-English ``band_label`` alongside the numeric + fields so downstream prose can classify the correction without + re-deriving the threshold rule. REPORTING.md describes the + band breakpoints (round-32 P2 CI review on PR #318 flagged + that the docs advertised the label but the implementation was + only emitting the numeric fields plus ``is_trivial``). + + Bands (per REPORTING.md): + * ``deff < 1.05`` -> ``"trivial"``; + * ``1.05 <= deff < 2`` -> ``"slightly_reduces"``; + * ``2 <= deff < 5`` -> ``"materially_reduces"``; + * ``deff >= 5`` -> ``"large_warning"``. + ``None`` or non-finite deff -> ``band_label=None`` (no classification). 
+ """ sm = getattr(self._results, "survey_metadata", None) if sm is None: return { @@ -1491,6 +1519,16 @@ def _check_design_effect(self) -> Dict[str, Any]: deff = _to_python_float(getattr(sm, "design_effect", None)) eff_n = _to_python_float(getattr(sm, "effective_n", None)) is_trivial = deff is not None and 0.95 <= deff <= 1.05 + if deff is None or not np.isfinite(deff): + band_label: Optional[str] = None + elif deff < 1.05: + band_label = "trivial" + elif deff < 2.0: + band_label = "slightly_reduces" + elif deff < 5.0: + band_label = "materially_reduces" + else: + band_label = "large_warning" return { "status": "ran", "deff": deff, @@ -1501,6 +1539,7 @@ def _check_design_effect(self) -> Dict[str, Any]: "df_survey": _to_python_scalar(getattr(sm, "df_survey", None)), "replicate_method": getattr(sm, "replicate_method", None), "is_trivial": is_trivial, + "band_label": band_label, } def _check_heterogeneity(self) -> Dict[str, Any]: diff --git a/tests/test_business_report.py b/tests/test_business_report.py index eaff8846..1626d2ec 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -2948,6 +2948,150 @@ class DiDResults: assert "status" in het +class TestSDiDTROPRejectPrecomputedPretrendsPower: + """Round-32 P1 CI review on PR #318: round-21 rejected + ``precomputed["sensitivity"]`` / ``precomputed["parallel_trends"]`` + on SDiD / TROP because the native-routing contract makes those + methodology-incompatible. Round-31's broadening of the + applicability gate exposed a parallel hole — ``precomputed[ + "pretrends_power"]`` was not in the rejection set, so a Roth- + style power verdict could surface on a report whose PT is + design-enforced (SDiD) or factor-model (TROP). The guard now + rejects all three precomputed keys uniformly on the native- + routed estimator families. 
+ """ + + @staticmethod + def _dummy_power_object(): + from types import SimpleNamespace + + return SimpleNamespace( + mdv=0.1, + violation_type="linear", + alpha=0.05, + target_power=0.80, + violation_magnitude=0.1, + power=0.80, + n_pre_periods=2, + ) + + def test_dr_rejects_precomputed_pretrends_power_on_sdid(self, sdid_fit): + from diff_diff import DiagnosticReport + + fit, _ = sdid_fit + with pytest.raises(ValueError, match="estimator_native_diagnostics"): + DiagnosticReport( + fit, precomputed={"pretrends_power": self._dummy_power_object()} + ) + + def test_dr_rejects_precomputed_pretrends_power_on_trop(self): + from diff_diff import DiagnosticReport + + class TROPResults: + pass + + stub = TROPResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.alpha = 0.05 + stub.n_obs = 100 + with pytest.raises(ValueError, match="estimator_native_diagnostics"): + DiagnosticReport( + stub, precomputed={"pretrends_power": self._dummy_power_object()} + ) + + +class TestHeterogeneityOmittedFromFullReportWhenNotRan: + """Round-32 P2 CI review on PR #318: round-31 made + ``_lift_heterogeneity`` always return a dict (stable schema + contract), but the full-report renderer's ``if het:`` truthiness + guard then entered the Heterogeneity section on every fit and + printed ``Source: None`` / ``N effects: None`` / ``Sign + consistent: None``. Renderer now gates on ``status == "ran"``. + """ + + def test_full_report_omits_heterogeneity_section_when_skipped(self): + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.0 + stub.se = 0.2 + stub.p_value = 0.001 + stub.conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 200 + stub.n_treated = 100 + stub.n_control = 100 + stub.survey_metadata = None + + md = BusinessReport(stub, auto_diagnostics=True).full_report() + # The section header is only emitted when status == "ran". + # Plain DiD does not have heterogeneity in its applicability + # row, so the section should NOT appear. 
+ assert "## Heterogeneity" not in md, ( + f"Heterogeneity section must be omitted when it did not " + f"run; rendering ``Source: None`` / ``N effects: None`` " + f"is worse than omitting. Got markdown:\n{md}" + ) + # Specifically, none of the placeholder ``None`` lines may + # appear anywhere in the rendered report. + assert "Source: `None`" not in md + assert "N effects: None" not in md + assert "Sign consistent: None" not in md + + +class TestDesignEffectBandLabel: + """Round-32 P2 CI review on PR #318: REPORTING.md promises a + plain-English band label on the ``design_effect`` section, but the + implementation only emitted numeric fields plus ``is_trivial``. + Add a stable ``band_label`` enum aligned with the REPORTING.md + threshold rule. + """ + + @staticmethod + def _stub_with_deff(deff: float): + from types import SimpleNamespace + + from diff_diff import DiagnosticReport + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 500 + stub.n_treated = 100 + stub.n_control_units = 400 + stub.event_study_effects = None + stub.survey_metadata = SimpleNamespace( + design_effect=deff, + effective_n=500.0 / max(deff, 1e-9), + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=None, + replicate_method=None, + ) + return DiagnosticReport(stub).to_dict()["design_effect"] + + def test_trivial_band_under_1_05(self): + assert self._stub_with_deff(1.01)["band_label"] == "trivial" + + def test_slightly_reduces_band_under_2(self): + assert self._stub_with_deff(1.5)["band_label"] == "slightly_reduces" + + def test_materially_reduces_band_under_5(self): + assert self._stub_with_deff(3.2)["band_label"] == "materially_reduces" + + def test_large_warning_band_at_or_above_5(self): + assert self._stub_with_deff(7.5)["band_label"] == "large_warning" + + class 
TestSDiDTROPRejectIncompatiblePrecomputedInputs: """Round-21 P1 CI review on PR #318: ``precomputed={"sensitivity": ...}`` and ``BusinessReport(honest_did_results=...)`` previously From 9cbe42539f92928265081a2852c7f16735a54b5d Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 13:30:25 -0400 Subject: [PATCH 36/48] Address thirty-third round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 methodology (Bonferroni silently shrinks pre-trends test family on undefined inference). ``_collect_pre_period_coefs`` filtered non-finite ``effect`` / ``se`` but accepted ``se == 0`` / ``se < 0``, and ``_pt_event_study``'s Bonferroni fallback dropped NaN per-period p-values while still reporting the full ``df`` / ``n_pre_periods``. A pre-period surface where one or more coefficients had undefined inference (NaN p-value, zero / negative SE) produced a finite joint PT p-value computed on the remaining subset, which BR then lifted into clean "do not reject parallel trends" prose. That violated the ``safe_inference`` contract (``utils.py`` line 175 — ``se <= 0`` yields NaN downstream) and was not a documented REPORTING.md deviation. Fix (DR): * ``_collect_pre_period_coefs`` now rejects ``se <= 0`` alongside non-finite values (same rule as ``safe_inference``). Reference- marker rows (``n_groups == 0`` / ``n_obs == 0``) remain filtered separately — they are synthetic and never represented a real pre-period. Real pre-periods whose inference is undefined now count toward a new ``n_dropped_undefined`` return value. * The collector signature becomes ``(sorted list, n_dropped_undefined)``. All four callers (``_pt_event_study``, PT / sensitivity / pretrends-power applicability gates) updated to unpack the tuple. 
* ``_pt_event_study`` returns an explicit inconclusive PT block when ``n_dropped_undefined > 0``: ``method="inconclusive"``, ``joint_p_value=None``, ``verdict="inconclusive"``, plus ``n_dropped_undefined`` and a ``reason`` pointing at the safe-inference contract. Fix (pretrends): the CS / SA ``compute_pretrends_power`` adapters in ``pretrends.py`` now apply the same ``se > 0`` rule alongside ``np.isfinite(se)`` so pre-trends power never silently includes rows whose per-period SE collapsed. (MPD's adapter already used ``se > 0``.) Tests: replaced the round-14 regression that codified the subset- Bonferroni behavior with three new P0 regressions: * ``test_undefined_pre_period_inference_yields_inconclusive_not_shrunken_bonferroni`` (NaN p-value + zero SE path) -> ``verdict="inconclusive"``, ``method="inconclusive"``, ``n_dropped_undefined==1``; * ``test_zero_se_pre_period_yields_inconclusive`` (finite p but zero SE) -> same inconclusive shape; * ``test_pretrends_power_adapter_filters_zero_se_cs`` pins the ``se > 0`` filter on the CS pretrends-power adapter (zero-SE row must not appear in the returned ``pre_period_ses``). Also migrated the four existing tests that destructured the collector result to the new tuple signature. 266 BR / DR / practitioner tests pass; 90 pretrends / pretrends- event-study tests unaffected. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 106 ++++++++++++++++++++++--------- diff_diff/pretrends.py | 7 +++ tests/test_business_report.py | 8 +-- tests/test_diagnostic_report.py | 107 ++++++++++++++++++++++++++++---- 4 files changed, 181 insertions(+), 47 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index b610e7f9..c11eb440 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -570,7 +570,7 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: + "." 
) if method == "event_study": - pre_coefs = _collect_pre_period_coefs(r) + pre_coefs, _ = _collect_pre_period_coefs(r) if not pre_coefs: return ( "No pre-period event-study coefficients are exposed on " @@ -635,7 +635,7 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: "event_study_effects (from aggregate='event_study' on " "staggered estimators); neither available." ) - pre_coefs = _collect_pre_period_coefs(r) + pre_coefs, _ = _collect_pre_period_coefs(r) if len(pre_coefs) < 2: return "Pre-trends power needs >= 2 pre-treatment periods." return None @@ -689,7 +689,7 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: "HonestDiD needs either results.vcov, event_study_vcov, " "or event_study_effects; none available." ) - pre_coefs = _collect_pre_period_coefs(r) + pre_coefs, _ = _collect_pre_period_coefs(r) if len(pre_coefs) < 1: return "HonestDiD requires at least one pre-period coefficient." return None @@ -996,12 +996,43 @@ def _pt_event_study(self) -> Dict[str, Any]: ImputationDiD style, dict of dicts with ``effect``/``se``/``p_value`` keys). """ r = self._results - pre_coefs = _collect_pre_period_coefs(r) + pre_coefs, n_dropped_undefined = _collect_pre_period_coefs(r) if not pre_coefs: return { "status": "skipped", "reason": "No pre-period event-study coefficients available.", } + # Round-33 P0 CI review on PR #318: if any real pre-period was + # rejected for undefined inference (``se <= 0`` or non-finite + # ``effect`` / ``se``), the Bonferroni fallback used to silently + # shrink the test family on the remaining subset and publish a + # finite joint p-value that then lifted into clean BR prose. + # That violates the ``safe_inference`` contract (``se <= 0`` -> + # NaN downstream). Return an explicit inconclusive PT result + # instead — the user cannot conclude "PT holds" from a + # partially-undefined pre-period surface. 
+ if n_dropped_undefined > 0: + return { + "status": "ran", + "method": "inconclusive", + "joint_p_value": None, + "test_statistic": None, + "df": len(pre_coefs), + "n_pre_periods": len(pre_coefs), + "n_dropped_undefined": n_dropped_undefined, + "verdict": "inconclusive", + "reason": ( + f"{n_dropped_undefined} pre-period coefficient(s) " + "have undefined inference (non-finite effect / SE or " + "SE <= 0). Per the safe-inference contract " + "(``utils.py`` line 175, REGISTRY.md line 197), this " + "yields NaN downstream; the joint PT test is " + "inconclusive on this fit. Re-fit with a different " + "variance method (bootstrap / cluster) if the " + "affected rows are a small number of cohorts, or " + "investigate why the per-period SE collapsed." + ), + } interaction_indices = getattr(r, "interaction_indices", None) vcov = getattr(r, "vcov", None) @@ -2394,8 +2425,11 @@ def _pre_post_boundary(results: Any) -> int: return -k -def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Optional[float]]]: - """Return a sorted list of ``(key, effect, se, p_value)`` for pre-period coefficients. +def _collect_pre_period_coefs( + results: Any, +) -> Tuple[List[Tuple[Any, float, float, Optional[float]]], int]: + """Return ``(sorted list of (key, effect, se, p_value), n_dropped_undefined)`` + for pre-period coefficients. Handles three shapes: * ``pre_period_effects``: dict-of-``PeriodEffect`` on ``MultiPeriodDiDResults``. @@ -2404,26 +2438,31 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt Pre-period entries are those with negative relative-time keys. * ``placebo_event_study``: dict-of-dict on ``ChaisemartinDHaultfoeuilleResults`` — dCDH's dynamic placebos - ``DID^{pl}_l`` are the estimator's pre-period analogue (the - Rambachan-Roth machinery in ``honest_did.py`` consumes them via a - dedicated branch, and this diagnostic must match). Keys are - negative horizons; entries share the event-study dict shape. 
+ ``DID^{pl}_l`` are the estimator's pre-period analogue. Filtering rules (critical for methodology-safe PT tests): - * Entries marked as reference markers (``n_groups == 0`` on the CS / SA / - ImputationDiD / Stacked event-study shape) are excluded. These are - synthetic ``effect=0, se=NaN`` rows injected for universal-base - normalization; treating them as real pre-period evidence would inflate - the Bonferroni denominator and produce bogus zero-deviation entries. - * Entries whose ``effect`` or ``se`` is non-finite (NaN / inf) are - excluded. A NaN SE means inference is undefined — feeding it into - Bonferroni or Wald would produce a false-clean PT verdict. - - Returns an empty list when none of the three sources provides valid + * Entries marked as reference markers (``n_groups == 0`` on CS / SA or + ``n_obs == 0`` on Stacked / TwoStage / Imputation event-study shape) + are excluded. These are synthetic ``effect=0, se=NaN`` rows injected + for universal-base normalization and are NOT counted in + ``n_dropped_undefined`` — they never represented a real pre-period. + * Entries whose ``effect`` or ``se`` is non-finite (NaN / inf) or whose + ``se <= 0`` are excluded as undefined inference (``safe_inference`` + contract, ``utils.py:175``). These ARE real pre-periods whose + inference is undefined, so they contribute to + ``n_dropped_undefined``. Round-33 P0 CI review on PR #318 flagged + that the Bonferroni fallback silently shrank the test family when + this happened, turning partially-undefined PT surfaces into clean + stakeholder-facing verdicts. Callers (``_pt_event_study``) use + ``n_dropped_undefined`` to force an inconclusive verdict rather + than silently shrinking. + + Returns ``([], 0)`` when none of the three sources provides valid pre-period entries. 
""" results_list: List[Tuple[Any, float, float, Optional[float]]] = [] + n_dropped_undefined = 0 pre = getattr(results, "pre_period_effects", None) # dCDH exposes pre-period placebos via ``placebo_event_study``; the # round-6 CI review flagged that routing dCDH through the generic @@ -2436,13 +2475,16 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt se = getattr(pe, "se", None) p = getattr(pe, "p_value", None) if eff is None or se is None: + n_dropped_undefined += 1 continue try: eff_f = float(eff) se_f = float(se) except (TypeError, ValueError): + n_dropped_undefined += 1 continue - if not (np.isfinite(eff_f) and np.isfinite(se_f)): + if not (np.isfinite(eff_f) and np.isfinite(se_f) and se_f > 0): + n_dropped_undefined += 1 continue results_list.append((k, eff_f, se_f, _to_python_float(p))) elif dcdh_placebo: @@ -2454,13 +2496,16 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt se = entry.get("se") p = entry.get("p_value") if eff is None or se is None: + n_dropped_undefined += 1 continue try: eff_f = float(eff) se_f = float(se) except (TypeError, ValueError): + n_dropped_undefined += 1 continue - if not (np.isfinite(eff_f) and np.isfinite(se_f)): + if not (np.isfinite(eff_f) and np.isfinite(se_f) and se_f > 0): + n_dropped_undefined += 1 continue results_list.append((k, eff_f, se_f, _to_python_float(p))) else: @@ -2480,13 +2525,9 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt continue if not isinstance(entry, dict): continue - # Drop universal-base reference markers. Different estimator - # aggregations use different flags for the synthetic marker row - # (all of which carry NaN SE and p-value): - # * CS / SA: ``n_groups == 0`` - # * Stacked / TwoStage / Imputation: ``n_obs == 0`` - # Treat either as a disqualifier so the Bonferroni denominator - # and joint-Wald index are not inflated by non-informative rows. + # Drop universal-base reference markers. 
These are synthetic, + # not a real pre-period, so they do not count toward + # ``n_dropped_undefined``. if entry.get("n_groups") == 0 or entry.get("n_obs") == 0: continue # Wooldridge stores ``att`` rather than ``effect`` in its @@ -2497,17 +2538,20 @@ def _collect_pre_period_coefs(results: Any) -> List[Tuple[Any, float, float, Opt se = entry.get("se") p = entry.get("p_value") if eff is None or se is None: + n_dropped_undefined += 1 continue try: eff_f = float(eff) se_f = float(se) except (TypeError, ValueError): + n_dropped_undefined += 1 continue - if not (np.isfinite(eff_f) and np.isfinite(se_f)): + if not (np.isfinite(eff_f) and np.isfinite(se_f) and se_f > 0): + n_dropped_undefined += 1 continue results_list.append((k, eff_f, se_f, _to_python_float(p))) results_list.sort(key=lambda t: t[0] if isinstance(t[0], (int, float)) else str(t[0])) - return results_list + return results_list, n_dropped_undefined def _pt_verdict(p: Optional[float]) -> str: diff --git a/diff_diff/pretrends.py b/diff_diff/pretrends.py index 23588da0..b249cef6 100644 --- a/diff_diff/pretrends.py +++ b/diff_diff/pretrends.py @@ -626,12 +626,17 @@ def _extract_pre_period_params( except (TypeError, ValueError): _ant = 0 _pre_cutoff = -_ant + # ``safe_inference`` treats ``se <= 0`` as undefined + # inference; filter the same way here so pre-trends + # power never silently includes rows whose per-period + # SE collapsed (round-33 P0 CI review on PR #318). pre_effects = { t: data for t, data in results.event_study_effects.items() if t < _pre_cutoff and data.get("n_groups", 1) > 0 and np.isfinite(data.get("se", np.nan)) + and float(data.get("se", 0.0)) > 0 } if not pre_effects: @@ -661,12 +666,14 @@ def _extract_pre_period_params( except (TypeError, ValueError): _ant = 0 _pre_cutoff = -_ant + # Mirror the ``se > 0`` filter applied on the CS branch. 
pre_effects = { t: data for t, data in results.event_study_effects.items() if t < _pre_cutoff and data.get("n_groups", 1) > 0 and np.isfinite(data.get("se", np.nan)) + and float(data.get("se", 0.0)) > 0 } if not pre_effects: diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 1626d2ec..7398830b 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1932,7 +1932,7 @@ def test_pre_period_collector_reads_att_payload(self): from diff_diff.diagnostic_report import _collect_pre_period_coefs stub = self._wooldridge_stub() - pre = _collect_pre_period_coefs(stub) + pre, _ = _collect_pre_period_coefs(stub) keys = sorted(row[0] for row in pre) assert keys == [ -2, @@ -1955,7 +1955,7 @@ def test_wooldridge_ignores_anticipation_shift_on_pre_periods(self): from diff_diff.diagnostic_report import _collect_pre_period_coefs stub = self._wooldridge_stub(anticipation=1) - pre = _collect_pre_period_coefs(stub) + pre, _ = _collect_pre_period_coefs(stub) keys = sorted(row[0] for row in pre) # Wooldridge keeps rel < 0 regardless of anticipation. assert keys == [-2, -1] @@ -2016,7 +2016,7 @@ def test_pre_period_collector_excludes_anticipation_window(self): from diff_diff.diagnostic_report import _collect_pre_period_coefs stub = self._cs_stub_with_anticipation(anticipation=1) - pre = _collect_pre_period_coefs(stub) + pre, _ = _collect_pre_period_coefs(stub) keys = sorted(row[0] for row in pre) # Anticipation window (rel=-1) must be excluded; only -3, -2 remain. 
assert keys == [-3, -2], ( @@ -2037,7 +2037,7 @@ def test_anticipation_zero_preserves_old_behavior(self): from diff_diff.diagnostic_report import _collect_pre_period_coefs stub = self._cs_stub_with_anticipation(anticipation=0) - pre = _collect_pre_period_coefs(stub) + pre, _ = _collect_pre_period_coefs(stub) assert sorted(row[0] for row in pre) == [-3, -2, -1] dr = DiagnosticReport(stub) diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 2ca49a68..0d85d009 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1009,7 +1009,7 @@ class StackedDiDResults: }, 0: {"effect": 1.5, "se": 0.2, "p_value": 0.0001, "n_obs": 50}, } - coefs = _collect_pre_period_coefs(obj) + coefs, _ = _collect_pre_period_coefs(obj) keys = [k for (k, _, _, _) in coefs] assert -1 not in keys, "n_obs==0 row must be filtered out" assert -2 in keys @@ -1066,7 +1066,7 @@ def test_reference_marker_excluded_from_pt_collection(self): from diff_diff.diagnostic_report import _collect_pre_period_coefs obj = self._cs_stub_with_reference_marker() - coefs = _collect_pre_period_coefs(obj) + coefs, _ = _collect_pre_period_coefs(obj) keys = [k for (k, _, _, _) in coefs] assert -1 not in keys, ( "Universal-base reference marker (n_groups=0) must not appear " @@ -1117,20 +1117,31 @@ class CallawaySantAnnaResults: # "data" was a reference marker. assert pt.get("verdict") != "no_detected_violation" - def test_bonferroni_excludes_nan_p_values(self): - """If a pre-period row has a finite effect/SE but NaN p-value (edge - case on some exotic fits), Bonferroni must skip it, not feed it in.""" + def test_undefined_pre_period_inference_yields_inconclusive_not_shrunken_bonferroni(self): + """Round-33 P0 regression: when any pre-period has undefined + inference (non-finite effect / SE or ``se <= 0``), the Bonferroni + fallback must NOT silently shrink the test family on the + remaining subset and publish a clean joint p-value. 
Per the + ``safe_inference`` contract (``utils.py`` line 175), undefined + SE yields NaN downstream; the joint PT test must be explicitly + inconclusive so BR prose does not render a stakeholder-facing + "parallel trends hold" verdict from a partially-undefined + pre-period surface. + """ import numpy as np + from types import SimpleNamespace class MultiPeriodDiDResults: pass - from types import SimpleNamespace - obj = MultiPeriodDiDResults() + # One valid row + one row with undefined inference: its ``se`` + # is zero and its p-value is NaN. The pre-period collector + # rejects the zero-SE row and counts it in + # ``n_dropped_undefined``, forcing the inconclusive verdict. obj.pre_period_effects = { -2: SimpleNamespace(effect=1.0, se=0.5, p_value=0.04), - -1: SimpleNamespace(effect=0.5, se=0.5, p_value=np.nan), + -1: SimpleNamespace(effect=0.5, se=0.0, p_value=np.nan), } obj.vcov = None obj.interaction_indices = None @@ -1148,10 +1159,82 @@ class MultiPeriodDiDResults: dr = DiagnosticReport(obj, run_sensitivity=False, run_bacon=False) pt = dr.to_dict()["parallel_trends"] - # With only one valid p-value (0.04), Bonferroni should be min(1.0, 0.04*1) = 0.04. - # If the NaN were naively included the test would either error or coerce to 1.0. - assert pt["method"] == "bonferroni" - assert pt["joint_p_value"] == pytest.approx(0.04, abs=1e-9) + + # Method flagged inconclusive; joint_p None; verdict inconclusive. + assert pt["method"] == "inconclusive" + assert pt["joint_p_value"] is None + assert pt["verdict"] == "inconclusive" + # Metadata records how many pre-periods were dropped and why. + assert pt["n_dropped_undefined"] == 1 + assert "undefined inference" in pt["reason"] + + def test_zero_se_pre_period_yields_inconclusive(self): + """Round-33 P0 regression: a pre-period row whose SE is + zero/negative is undefined inference per the ``safe_inference`` + contract and must push the event-study PT to inconclusive. 
+ """ + from types import SimpleNamespace + + class MultiPeriodDiDResults: + pass + + obj = MultiPeriodDiDResults() + obj.pre_period_effects = { + -2: SimpleNamespace(effect=1.0, se=0.5, p_value=0.04), + -1: SimpleNamespace(effect=0.5, se=0.0, p_value=0.99), + } + obj.vcov = None + obj.interaction_indices = None + obj.event_study_vcov = None + obj.event_study_vcov_index = None + obj.avg_att = 1.0 + obj.avg_se = 0.1 + obj.avg_p_value = 0.001 + obj.avg_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 50 + obj.n_control = 50 + obj.survey_metadata = None + + pt = DiagnosticReport( + obj, run_sensitivity=False, run_bacon=False + ).to_dict()["parallel_trends"] + assert pt["verdict"] == "inconclusive" + assert pt["method"] == "inconclusive" + assert pt["n_dropped_undefined"] >= 1 + + def test_pretrends_power_adapter_filters_zero_se_cs(self): + """Round-33 P0 regression: CS / SA ``compute_pretrends_power`` + adapters also use the ``se > 0`` filter alongside + ``np.isfinite(se)`` so the power analysis never includes rows + whose per-period SE collapsed. + """ + from types import SimpleNamespace + + import numpy as np + + from diff_diff.pretrends import compute_pretrends_power + from diff_diff.staggered import CallawaySantAnnaResults + + obj = object.__new__(CallawaySantAnnaResults) + obj.anticipation = 0 + # Three pre-periods: two valid, one with zero SE. The valid + # two are enough to run power analysis; the zero-SE row must + # NOT slip into the `ses` vector and divide-by-zero. 
+ obj.event_study_effects = { + -3: {"effect": 0.1, "se": 0.2, "p_value": 0.7, "n_groups": 1}, + -2: {"effect": 0.0, "se": 0.0, "p_value": float("nan"), "n_groups": 1}, + -1: {"effect": 0.0, "se": 0.2, "p_value": 0.99, "n_groups": 1}, + 0: {"effect": 1.0, "se": 0.2, "p_value": 0.0, "n_groups": 1}, + } + obj.overall_att = 1.0 + obj.alpha = 0.05 + + pp = compute_pretrends_power(obj, alpha=0.05, target_power=0.80, violation_type="linear") + # Zero-SE row must not appear in pre_period_ses. + assert len(pp.pre_period_ses) == 2 + assert np.all(pp.pre_period_ses > 0) class TestPrecomputedValidation: From 94e9110a65037c258320ef21b85f294d97313b16 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 13:46:24 -0400 Subject: [PATCH 37/48] Address thirty-fourth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 methodology (Bonferroni still shrinks family on finite-SE / NaN-p rows). Round-33 tightened the pre-period collector to filter ``se <= 0`` and forced inconclusive when the collector dropped a real pre-period for undefined SE. But rows with finite ``effect`` / ``se`` but ``p_value=NaN`` survive the collector (the SE is positive) and the Bonferroni fallback excluded NaN p-values from ``ps`` and scaled by the reduced family — silently shrinking the test family while still reporting the full ``df`` / ``n_pre_periods``. This state is reachable on replicate-weight survey fits where ``safe_inference`` sees ``df <= 0`` and returns NaN inference fields (``utils.py`` line 175) even though the design-based SE is still defined. The affected surfaces are the Bonferroni-only result types: ``StackedDiDResults``, ``TwoStageDiDResults``, ``ImputationDiDResults``, ``SunAbrahamResults``. Round-34 P0 CI review on PR #318. Fix. ``_pt_event_study``'s Bonferroni fallback now checks for non-finite per-period p-values BEFORE computing. 
If any retained pre-period has a NaN p-value, emit an explicit inconclusive PT block with ``n_dropped_undefined`` and a visible reason pointing at the ``safe_inference`` contract. Otherwise run Bonferroni on the full family as REPORTING.md documents. P2 coverage. The round-33 regression ``test_bonferroni_excludes_nan_p_values`` claimed to cover the NaN-p / valid-SE case but actually used ``se=0.0``, so it only exercised the already-fixed ``se <= 0`` path. Added ``test_finite_se_nan_p_value_yields_inconclusive_on_bonferroni_only_surface`` using a ``StackedDiDResults`` stub with one finite-inference row and one finite-SE / NaN-p row, asserting both the DR schema goes inconclusive AND BR summary prose does not emit "do not reject parallel trends" / "consistent with parallel trends" wording. 357 BR / DR / practitioner / pretrends tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 56 +++++++++++++++++++++++++++------ tests/test_diagnostic_report.py | 56 +++++++++++++++++++++++++++++++++ 2 files changed, 102 insertions(+), 10 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index c11eb440..6073c11f 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1136,17 +1136,53 @@ def _pt_event_study(self) -> Dict[str, Any]: method = "bonferroni" if joint_p is None: - # Bonferroni: min per-period p-value scaled by count, capped at 1. - # NaN p-values are excluded — a non-finite p-value means the - # per-period test was undefined (zero SE, reference marker that - # slipped through, etc.) and must not be treated as clean - # evidence. If no valid p-values remain, joint_p stays None and - # the verdict will be ``inconclusive``. - ps = [ - p["p_value"] + # Bonferroni fallback is only valid when EVERY retained pre- + # period contributes a finite p-value. 
Otherwise we would + # silently shrink the test family (e.g., replicate-weight + # survey fits where ``safe_inference`` returns NaN p-values + # for rows whose effective survey df collapsed — the row's + # ``effect`` / ``se`` is still finite, so the ``se > 0`` + # collector filter lets it through, but a Bonferroni + # computed on the remaining subset publishes a finite joint + # p-value that BR lifts into "consistent with parallel + # trends" prose). Round-34 P0 CI review on PR #318 flagged + # that the round-33 guard only caught the ``se <= 0`` case + # and missed this. + # + # Strategy: if any retained pre-period has non-finite + # ``p_value``, emit an explicit inconclusive PT block with + # a visible count/reason. Otherwise run Bonferroni on the + # full family as documented in REPORTING.md. + nan_p_count = sum( + 1 for p in per_period - if isinstance(p["p_value"], (int, float)) and np.isfinite(p["p_value"]) - ] + if not ( + isinstance(p["p_value"], (int, float)) and np.isfinite(p["p_value"]) + ) + ) + if nan_p_count > 0: + return { + "status": "ran", + "method": "inconclusive", + "joint_p_value": None, + "test_statistic": None, + "df": len(pre_coefs), + "n_pre_periods": len(pre_coefs), + "n_dropped_undefined": nan_p_count, + "per_period": per_period, + "verdict": "inconclusive", + "reason": ( + f"{nan_p_count} retained pre-period coefficient(s) " + "have non-finite per-period p-value (undefined " + "inference per the ``safe_inference`` contract — " + "e.g., replicate-weight survey fits where effective " + "df collapsed). Bonferroni on the remaining subset " + "would silently shrink the test family; the joint " + "PT test is inconclusive on this fit. Inspect the " + "per_period block for the undefined rows." 
+ ), + } + ps = [p["p_value"] for p in per_period] if ps: joint_p = min(1.0, min(ps) * len(ps)) diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 0d85d009..2d32ab27 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1168,6 +1168,62 @@ class MultiPeriodDiDResults: assert pt["n_dropped_undefined"] == 1 assert "undefined inference" in pt["reason"] + def test_finite_se_nan_p_value_yields_inconclusive_on_bonferroni_only_surface(self): + """Round-34 P0 regression: replicate-weight survey fits can emit + event-study rows with finite ``effect`` / ``se`` but + ``p_value=NaN`` when ``safe_inference`` sees ``df <= 0`` — the + design-based SE is still defined but inference fields collapse + to NaN per ``utils.py`` line 175. The round-33 collector filter + (``se > 0``) lets such rows through; the Bonferroni fallback + previously excluded NaN p-values and scaled by the reduced + family, producing a clean joint PT verdict that BR rendered as + "do not reject parallel trends" prose. + + Use a ``StackedDiDResults`` stub (Bonferroni-only surface: no + ``vcov`` / ``event_study_vcov``) with one finite-inference row + and one finite-SE / NaN-p row, and assert DR emits inconclusive. + """ + from diff_diff import BusinessReport + + class StackedDiDResults: + pass + + obj = StackedDiDResults() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 400 + obj.n_treated_units = 100 + obj.n_control_units = 300 + obj.survey_metadata = None + obj.event_study_effects = { + -2: {"effect": 0.1, "se": 0.2, "p_value": 0.62, "n_obs": 400}, + # Finite SE but NaN p-value — models the replicate-weight + # collapsed-df case. Previously stayed in the family but + # was dropped from the Bonferroni denominator. 
+ -1: {"effect": 0.05, "se": 0.3, "p_value": float("nan"), "n_obs": 400}, + } + + dr = DiagnosticReport(obj, run_sensitivity=False, run_bacon=False) + pt = dr.to_dict()["parallel_trends"] + + assert pt["method"] == "inconclusive", ( + f"Bonferroni-only surface with NaN per-period p-value must " + f"return inconclusive; got method={pt.get('method')!r} with " + f"joint_p={pt.get('joint_p_value')!r}" + ) + assert pt["verdict"] == "inconclusive" + assert pt["joint_p_value"] is None + assert pt["n_dropped_undefined"] == 1 + + # And BR must not turn that into "do not reject" / "consistent + # with parallel trends" wording. + br_summary = BusinessReport(obj).summary().lower() + assert "do not reject parallel trends" not in br_summary + assert "consistent with parallel trends" not in br_summary + def test_zero_se_pre_period_yields_inconclusive(self): """Round-33 P0 regression: a pre-period row whose SE is zero/negative is undefined inference per the ``safe_inference`` From 7743b5cb6fcc26367a3b6be016e4d9e9335cc7cf Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 14:00:13 -0400 Subject: [PATCH 38/48] Address thirty-fifth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 methodology (inconclusive PT prose missing). Rounds 33-34 made the event-study PT schema emit ``verdict="inconclusive"`` whenever pre-period inference is undefined (zero / negative SE, non-finite per-period p-value). But neither ``BusinessReport.summary()`` nor ``DiagnosticReport.summary()`` / ``overall_interpretation`` had an ``elif verdict == "inconclusive"`` branch, so the PT sentence was silently omitted from the primary prose output. A missing sentence is indistinguishable from "PT did not run" and drops the identifying-assumption diagnostic from stakeholder output. Add explicit inconclusive branches on both surfaces. 
When ``n_dropped_undefined`` is available, the sentence quotes the count ("3 pre-period rows had undefined inference"); otherwise falls back to a generic "pre-period inference was undefined" clause. Both surfaces now close with "Treat parallel trends as unassessed" so the stakeholder takeaway is explicit. P2 code quality (DEFF ``deff < 0.95`` directional bug). The ``is_trivial`` flag required ``0.95 <= deff <= 1.05`` while ``band_label`` treated anything ``< 1.05`` as trivial. BR's summary keyed off ``not is_trivial`` and narrated "Survey design reduces effective sample size" for ``deff < 0.95``, which is directionally wrong — a precision-improving design has LARGER effective N than nominal N. Two fixes: * Add a dedicated ``band_label="improves_precision"`` enum value for ``deff < 0.95`` so the schema carries the direction explicitly; * Split BR's summary rendering: ``deff < 1.0`` -> "improves effective sample size"; ``deff >= 1.0`` -> "reduces effective sample size". ``is_trivial`` stays at ``0.95 <= deff <= 1.05`` (the tight "effectively no effect" window). P2 coverage. Round-33/34 regressions only asserted absence of false-clean "do not reject" wording; that assertion still passes even when the PT sentence disappears entirely. Added positive regressions: * ``test_summary_prose_surfaces_inconclusive_pt_explicitly`` asserts both ``DiagnosticReport.summary()`` and ``BusinessReport.summary()`` contain the word "inconclusive" on a Bonferroni-only surface with a NaN per-period p-value; * ``test_design_effect_deff_below_95_uses_improves_precision_wording`` pins the new ``band_label`` enum value AND the BR summary "improves effective sample size" wording. 332 BR / DR / practitioner / pretrends tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 46 ++++++++++++++++-- diff_diff/diagnostic_report.py | 37 ++++++++++++++ tests/test_diagnostic_report.py | 85 +++++++++++++++++++++++++++++++++ 3 files changed, 164 insertions(+), 4 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index cb5d435e..a1ce425b 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1758,6 +1758,33 @@ def _render_summary(schema: Dict[str, Any]) -> str: "group's pre-period trajectory (SDiD's weighted-parallel-" "trends analogue)." ) + elif verdict == "inconclusive": + # Round-35 P1 CI review on PR #318: a ``verdict=="inconclusive"`` + # state means one or more pre-period coefficients had + # undefined inference (zero SE, NaN p-value) and the joint + # test cannot be formed. BR previously omitted the sentence + # entirely, so stakeholder prose silently skipped the + # identifying-assumption diagnostic. Name the state + # explicitly and quote the undefined-row count when + # available. + n_dropped = pt.get("n_dropped_undefined") + if isinstance(n_dropped, int) and n_dropped > 0: + rows_word = "row" if n_dropped == 1 else "rows" + sentences.append( + f"The pre-trends test is inconclusive on this fit: " + f"{n_dropped} pre-period {rows_word} had undefined " + "inference (zero / negative SE or a non-finite " + "per-period p-value), so the joint test cannot be " + "formed. Treat parallel trends as unassessed rather " + "than supported." + ) + else: + sentences.append( + "The pre-trends test is inconclusive on this fit: " + "pre-period inference was undefined, so the joint " + "test cannot be formed. Treat parallel trends as " + "unassessed rather than supported." + ) # Sensitivity. 
A ``single_M_precomputed`` sensitivity block has # ``breakdown_M=None`` by construction because only one M was evaluated; @@ -1877,10 +1904,21 @@ def _render_summary(schema: Dict[str, Any]) -> str: deff = survey.get("design_effect") eff_n = survey.get("effective_n") if isinstance(deff, (int, float)) and isinstance(eff_n, (int, float)): - sentences.append( - f"Survey design reduces effective sample size to " - f"~{eff_n:,.0f} (DEFF = {deff:.2g})." - ) + # Round-35 P2 CI review on PR #318: ``deff < 0.95`` is a + # precision-improving design (effective N is LARGER than + # nominal N). Narrating that as "reduces effective sample + # size" is directionally wrong. Branch on the sign of + # the departure from 1. + if deff < 1.0: + sentences.append( + f"Survey design improves effective sample size to " + f"~{eff_n:,.0f} (DEFF = {deff:.2g})." + ) + else: + sentences.append( + f"Survey design reduces effective sample size to " + f"~{eff_n:,.0f} (DEFF = {deff:.2g})." + ) # Highest-severity caveat (if any). caveats = schema.get("caveats", []) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 6073c11f..e9b9d91d 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1585,9 +1585,22 @@ def _check_design_effect(self) -> Dict[str, Any]: } deff = _to_python_float(getattr(sm, "design_effect", None)) eff_n = _to_python_float(getattr(sm, "effective_n", None)) + # Round-35 P2 CI review on PR #318: ``is_trivial`` used to be + # ``0.95 <= deff <= 1.05`` while ``band_label`` treated + # anything ``< 1.05`` as trivial. On a precision-improving + # design (``deff < 0.95``) BR's summary keyed off + # ``not is_trivial`` and narrated "Survey design reduces + # effective sample size", which is directionally wrong — the + # effective N is LARGER than the nominal N. 
Split the band + # into a dedicated ``improves_precision`` label for + # ``deff < 0.95`` and keep ``is_trivial`` restricted to the + # tight "effectively no effect" window so the schema + # carries the precision-improving signal explicitly. is_trivial = deff is not None and 0.95 <= deff <= 1.05 if deff is None or not np.isfinite(deff): band_label: Optional[str] = None + elif deff < 0.95: + band_label = "improves_precision" elif deff < 1.05: band_label = "trivial" elif deff < 2.0: @@ -2846,6 +2859,30 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str else "SDiD's synthetic control is designed to satisfy the " "weighted parallel-trends analogue." ) + elif verdict == "inconclusive": + # Round-35 P1 CI review on PR #318: DR summary / overall + # interpretation must surface the inconclusive state + # explicitly rather than omitting the PT sentence. A missing + # sentence was indistinguishable from "PT did not run", and + # stakeholders reading the summary could not tell that the + # joint test had been attempted but yielded undefined + # inference. + n_dropped = pt.get("n_dropped_undefined") + if isinstance(n_dropped, int) and n_dropped > 0: + rows_word = "row" if n_dropped == 1 else "rows" + sentences.append( + f"Pre-trends is inconclusive on this fit: " + f"{n_dropped} pre-period {rows_word} had undefined " + "inference (zero / negative SE or a non-finite " + "per-period p-value), so the joint test cannot be " + "formed. Treat parallel trends as unassessed." + ) + else: + sentences.append( + "Pre-trends is inconclusive on this fit: pre-period " + "inference was undefined, so the joint test cannot " + "be formed. Treat parallel trends as unassessed." + ) # Sentence 3: sensitivity. 
The "robust across the grid" phrasing is reserved # for genuine SensitivityResults grids; a precomputed single-M HonestDiDResults diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 2d32ab27..be711877 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1168,6 +1168,91 @@ class MultiPeriodDiDResults: assert pt["n_dropped_undefined"] == 1 assert "undefined inference" in pt["reason"] + def test_summary_prose_surfaces_inconclusive_pt_explicitly(self): + """Round-35 P1 regression: when pre-trends is inconclusive + (undefined pre-period inference), both ``BusinessReport.summary()`` + and ``DiagnosticReport.summary()`` must emit explicit inconclusive + prose — not merely omit the PT sentence. A missing sentence was + indistinguishable from "PT did not run" and would silently drop + the identifying-assumption diagnostic from stakeholder output. + """ + from diff_diff import BusinessReport + + class StackedDiDResults: + pass + + obj = StackedDiDResults() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 400 + obj.n_treated_units = 100 + obj.n_control_units = 300 + obj.survey_metadata = None + obj.event_study_effects = { + -2: {"effect": 0.1, "se": 0.2, "p_value": 0.62, "n_obs": 400}, + -1: {"effect": 0.05, "se": 0.3, "p_value": float("nan"), "n_obs": 400}, + } + + dr_summary = DiagnosticReport( + obj, run_sensitivity=False, run_bacon=False + ).summary() + br_summary = BusinessReport(obj).summary() + + # Both summaries must explicitly name the inconclusive state. + for label, prose in [("DR", dr_summary), ("BR", br_summary)]: + assert "inconclusive" in prose.lower(), ( + f"{label}.summary() must surface the inconclusive PT " + f"state explicitly; got: {prose!r}" + ) + # And must not offer false-clean "do not reject" wording. 
+ assert "do not reject parallel trends" not in prose.lower() + assert "consistent with parallel trends" not in prose.lower() + + def test_design_effect_deff_below_95_uses_improves_precision_wording(self): + """Round-35 P2 regression: ``deff < 0.95`` is a precision- + improving survey design — effective N is LARGER than nominal + N. DR emits ``band_label="improves_precision"`` and BR narrates + "improves effective sample size" instead of "reduces". + """ + from types import SimpleNamespace + + from diff_diff import BusinessReport + + class CallawaySantAnnaResults: + pass + + obj = CallawaySantAnnaResults() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 500 + obj.n_treated = 100 + obj.n_control_units = 400 + obj.event_study_effects = None + obj.survey_metadata = SimpleNamespace( + design_effect=0.80, + effective_n=625.0, + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=None, + replicate_method=None, + ) + + # Schema: band_label surfaces the precision-improving state. + deff_block = DiagnosticReport(obj).to_dict()["design_effect"] + assert deff_block["band_label"] == "improves_precision" + + # Prose: BR says "improves", not "reduces". + summary = BusinessReport(obj).summary().lower() + assert "improves effective sample size" in summary + assert "reduces effective sample size" not in summary + def test_finite_se_nan_p_value_yields_inconclusive_on_bonferroni_only_surface(self): """Round-34 P0 regression: replicate-weight survey fits can emit event-study rows with finite ``effect`` / ``se`` but From 6b251afb47de597cbaa67df639f4d5cdccd7ae36 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 14:18:10 -0400 Subject: [PATCH 39/48] Address thirty-sixth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P0 methodology (NaN headline renders as "did not change by nan"). 
``_render_overall_interpretation`` gated the headline sentence on ``val is not None``, which lets a NaN headline effect pass through (``NaN is not None``). Since ``NaN > 0`` and ``NaN < 0`` are both false, the directional branch fell through to the ``else`` clause and rendered "On {est}, {treatment} did not change {outcome} by nan (p = nan, 95% CI: nan to nan)" — misleading stakeholder prose on a failed fit (rank-deficient design, zero effective sample, survey-design collapse). ``BusinessReport``'s equivalent headline renderer already gated on ``np.isfinite(value)``; DR now mirrors that. Branch on finiteness: * Non-finite ``val`` -> emit an explicit estimation-failure sentence naming the common causes (rank deficiency, zero effective sample, survey-design collapse) and directing the reader to inspect the fit before interpreting. * Finite ``val`` -> render the usual directional sentence, but also gate the ``ci_str`` / ``p_str`` fragments on finiteness so a partially-failed fit (finite ATT but NaN CI / p) can't silently render ``CI: nan to nan``. P2 coverage. Round-35 regressions covered BR's NaN headline behavior but not DR's prose surface; the P0 slipped through that gap. Added ``test_nan_headline_yields_estimation_failure_prose_not_did_not_change`` asserting both ``DiagnosticReport.summary()`` and ``to_dict()["overall_interpretation"]`` emit the non-finite sentence (contain "non-finite" or "did not produce") and do not emit "did not change" / "nan" / "CI: nan" fragments. 334 BR / DR / practitioner / pretrends tests pass.
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/diagnostic_report.py | 36 ++++++++++++++++++++---- tests/test_diagnostic_report.py | 50 +++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index e9b9d91d..a9a07444 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -2759,11 +2759,28 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str outcome = labels.get("outcome_label", "the outcome") treatment = labels.get("treatment_label", "the treatment") - # Sentence 1: headline + # Sentence 1: headline. + # Round-36 P0 CI review on PR #318: a non-finite headline value + # (NaN ATT from a failed fit, e.g., rank-deficient design matrix or + # zero effective sample) previously passed the ``val is not None`` + # guard because ``NaN is not None``. Since ``NaN > 0`` and + # ``NaN < 0`` are both false, the directional branch fell through + # to "did not change" and the sentence rendered as "did not change + # ... by nan (p = nan, 95% CI: nan to nan)". BR's equivalent + # headline renderer already gates on ``np.isfinite(value)`` and + # emits an estimation-failure sentence; DR now mirrors that. val = headline.get("value") if isinstance(headline, dict) else None ci = headline.get("conf_int") if isinstance(headline, dict) else None p = headline.get("p_value") if isinstance(headline, dict) else None - if val is not None: + val_finite = isinstance(val, (int, float)) and np.isfinite(val) + if val is not None and not val_finite: + sentences.append( + f"On {est}, {treatment}'s effect on {outcome} is non-finite " + "(the estimation did not produce a usable point estimate). " + "Inspect the fit for rank deficiency, zero effective sample, " + "or a survey-design collapse before interpreting." 
+ ) + elif val_finite: direction = "increased" if val > 0 else "decreased" if val < 0 else "did not change" # Use the headline's own alpha rather than hardcoding 95 so prose # stays consistent with the rendered interval when alpha != 0.05. @@ -2772,12 +2789,19 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str ci_level = int(round((1.0 - headline_alpha) * 100)) else: ci_level = 95 - ci_str = ( - f" ({ci_level}% CI: {ci[0]:.3g} to {ci[1]:.3g})" - if isinstance(ci, (list, tuple)) and len(ci) == 2 and None not in ci + ci_finite = ( + isinstance(ci, (list, tuple)) + and len(ci) == 2 + and all( + isinstance(v, (int, float)) and np.isfinite(v) for v in ci + ) + ) + ci_str = f" ({ci_level}% CI: {ci[0]:.3g} to {ci[1]:.3g})" if ci_finite else "" + p_str = ( + f", p = {p:.3g}" + if isinstance(p, (int, float)) and np.isfinite(p) else "" ) - p_str = f", p = {p:.3g}" if isinstance(p, (int, float)) else "" sentences.append( f"On {est}, {treatment} {direction} {outcome} by {val:.3g}{ci_str}{p_str}." ) diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index be711877..96c1fb56 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1168,6 +1168,56 @@ class MultiPeriodDiDResults: assert pt["n_dropped_undefined"] == 1 assert "undefined inference" in pt["reason"] + def test_nan_headline_yields_estimation_failure_prose_not_did_not_change(self): + """Round-36 P0 regression: a non-finite headline effect + (``NaN`` ATT from a failed fit) previously passed the ``val is + not None`` guard in ``_render_overall_interpretation``. Since + ``NaN > 0`` and ``NaN < 0`` are both false, the directional + branch fell through to "did not change" and rendered + "did not change ... by nan (p = nan, 95% CI: nan to nan)" — + misleading stakeholder prose on a failed fit. 
+ + Both ``DiagnosticReport.summary()`` and + ``to_dict()["overall_interpretation"]`` must now emit an + explicit estimation-failure sentence instead. + """ + import numpy as np + + class DiDResults: + pass + + stub = DiDResults() + stub.att = float("nan") + stub.se = float("nan") + stub.t_stat = float("nan") + stub.p_value = float("nan") + stub.conf_int = (float("nan"), float("nan")) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 50 + stub.n_control = 50 + stub.survey_metadata = None + + dr = DiagnosticReport(stub, run_sensitivity=False, run_bacon=False) + summary = dr.summary() + interp = dr.to_dict()["overall_interpretation"] + + for label, prose in [("summary", summary), ("overall_interpretation", interp)]: + lower = prose.lower() + # Must NOT render directional / numeric prose on a NaN fit. + assert "did not change" not in lower, ( + f"{label} rendered 'did not change' on a NaN fit; got: {prose!r}" + ) + assert "nan" not in lower, ( + f"{label} rendered 'nan' in the stakeholder-facing prose; got: {prose!r}" + ) + assert "by nan" not in lower + assert "ci: nan" not in lower + # Must name the non-finite state explicitly. + assert "non-finite" in lower or "did not produce" in lower, ( + f"{label} must emit an estimation-failure sentence; got: {prose!r}" + ) + def test_summary_prose_surfaces_inconclusive_pt_explicitly(self): """Round-35 P1 regression: when pre-trends is inconclusive (undefined pre-period inference), both ``BusinessReport.summary()`` From 883830351b40b4d97ceafd231aea9d0a16133be1 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 14:41:16 -0400 Subject: [PATCH 40/48] Address thirty-seventh round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P1 methodology (StaggeredTripleDiff never_treated surfaced composite total as control). 
``_extract_sample`` swapped to ``n_never_enabled`` only inside the dynamic ``notyettreated`` branch; under ``control_group="never_treated"`` it left ``n_control = n_control_units``. But ``staggered_triple_diff.py:384`` and REGISTRY.md §StaggeredTripleDifference (line 1730) define ``n_control_units`` as a composite total that also includes eligibility-denied / larger-cohort cells — the valid fixed comparison is only the never-enabled cohort. BR's schema / ``summary()`` / ``full_report()`` therefore misrepresented the comparison count on the ``nevertreated`` path. Added a dedicated branch that fires whenever the canonical control is ``nevertreated`` on ``StaggeredTripleDiffResults`` and surfaces ``n_never_enabled`` as the fixed comparison tally (``n_control`` set to None), regardless of whether the ``is_dynamic_control`` branch runs. P1 code quality (broken CI clause on undefined inference). ``_render_headline_sentence`` gated CI rendering on ``isinstance(lo, (int, float))`` — which accepts ``NaN`` because ``NaN`` is a float — so a fit with a finite point estimate but undefined CI endpoints (survey-df collapse, zero effective clusters, ...) rendered ``(... 95% CI: undefined to undefined)`` in both ``summary()`` and ``full_report()``. DR's own headline renderer already gated on ``np.isfinite`` (round-36 fix); BR now mirrors that: * both CI bounds finite -> usual ``(ci_level% CI: lo to hi)``; * at least one bound supplied but not finite -> explicit ``(inference unavailable: confidence interval is undefined for this fit)`` trailer; * bounds absent -> no trailer. Tests: 4 new regressions across two classes.
* ``TestStaggeredTripleDiffNeverTreatedFixedComparison`` (2 tests): schema-level ``n_control is None`` / ``n_never_enabled == 300`` on the ``never_treated`` mode, plus a prose assertion that the composite 500 does not appear as "500 control" in ``summary()``; * ``TestBRHeadlineOmitsBrokenCIOnUndefinedInference`` (2 tests): NaN-CI stub renders ``summary()`` and ``full_report()`` without "undefined to undefined" / "CI: nan" fragments and with explicit "inference unavailable" language. 278 BR / DR / practitioner tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 36 ++++++++++- tests/test_business_report.py | 110 ++++++++++++++++++++++++++++++++++ 2 files changed, 145 insertions(+), 1 deletion(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index a1ce425b..1a2c1036 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -568,6 +568,20 @@ def _extract_sample(self) -> Dict[str, Any]: is_dynamic_control = ( _canonical_control == "notyettreated" or is_stacked_dynamic ) + # StaggeredTripleDiff comparison-group contract: + # ``n_control_units`` is a composite total that also includes + # the eligibility-denied / larger-cohort cells. Regardless of + # the ``control_group`` mode the valid fixed comparison is the + # never-enabled cohort (``staggered_triple_diff.py:384``, + # REGISTRY.md §StaggeredTripleDifference line 1730). Round-37 + # P1 CI review on PR #318: under ``control_group="never_treated"`` + # (i.e., ``_canonical_control == "nevertreated"``) the composite + # total was being narrated as "control". Surface + # ``n_never_enabled`` instead on both the ``nevertreated`` and + # the dynamic ``notyettreated`` modes. 
+ if name == "StaggeredTripleDiffResults" and _canonical_control == "nevertreated": + n_never_enabled = _safe_int(getattr(r, "n_never_enabled", None)) + n_control = None if is_dynamic_control: if name == "StaggeredTripleDiffResults": n_never_enabled = _safe_int(getattr(r, "n_never_enabled", None)) @@ -1661,11 +1675,31 @@ def _render_headline_sentence(schema: Dict[str, Any]) -> str: magnitude = _format_value(abs(effect), unit, unit_kind) lo = h.get("ci_lower") hi = h.get("ci_upper") + # Round-37 P1 CI review on PR #318: on a finite point estimate + # whose CI bounds are NaN (undefined inference — survey-df + # collapse, zero effective clusters, etc.), the previous isinstance + # check passed because ``NaN`` is a ``float`` and the sentence + # rendered ``(... 95% CI: undefined to undefined)``. Gate on + # ``np.isfinite`` like DR's own headline renderer already does; + # add an explicit inference-unavailable trailer instead of the + # broken CI clause. ci_str = "" - if isinstance(lo, (int, float)) and isinstance(hi, (int, float)): + ci_finite = ( + isinstance(lo, (int, float)) + and isinstance(hi, (int, float)) + and np.isfinite(lo) + and np.isfinite(hi) + ) + if ci_finite: lo_s = _format_value(lo, unit, unit_kind) hi_s = _format_value(hi, unit, unit_kind) ci_str = f" ({h.get('ci_level', 95)}% CI: {lo_s} to {hi_s})" + elif isinstance(lo, (int, float)) or isinstance(hi, (int, float)): + # At least one bound was supplied but not finite -> inference + # undefined. Replace the CI clause with an explicit marker so + # downstream prose does not claim a confidence interval that + # is not actually available. + ci_str = " (inference unavailable: confidence interval is undefined for this fit)" by_clause = f" by {magnitude}" if effect != 0 else "" return f"{treatment.capitalize()} {verb} {outcome}{by_clause}{ci_str}." 
diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 7398830b..ec511aea 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1288,6 +1288,116 @@ def test_stacked_fit_persists_anticipation(self): assert a["anticipation_periods"] == 1 +class TestStaggeredTripleDiffNeverTreatedFixedComparison: + """Round-37 P1 CI review on PR #318: ``StaggeredTripleDiffResults`` + stores ``n_control_units`` as a composite total that also includes + the eligibility-denied cohorts. The valid fixed comparison under + ``control_group="never_treated"`` is the never-enabled cohort + (``staggered_triple_diff.py:384``, REGISTRY.md §StaggeredTripleDifference + line 1730). BR was previously narrating the composite total as + "control" on the ``nevertreated`` mode; the fix surfaces + ``n_never_enabled`` as the fixed comparison count on that path + too (the dynamic ``notyettreated`` path was already correct). + """ + + @staticmethod + def _stub(control_group: str): + class StaggeredTripleDiffResults: + pass + + stub = StaggeredTripleDiffResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 800 + stub.n_treated = 100 + stub.n_control_units = 500 # composite total + stub.n_never_enabled = 300 # fixed never-enabled subset + stub.event_study_effects = None + stub.survey_metadata = None + stub.control_group = control_group + return stub + + def test_never_treated_mode_surfaces_never_enabled_not_composite_total(self): + sample = BusinessReport( + self._stub("never_treated"), auto_diagnostics=False + ).to_dict()["sample"] + # Composite total must not be surfaced as the fixed control + # count on the ``nevertreated`` path. 
+ assert sample["n_control"] is None, ( + f"n_control must not carry the composite n_control_units " + f"total on StaggeredTripleDiff(control_group='never_treated'); " + f"got sample={sample!r}" + ) + assert sample["n_never_enabled"] == 300 + + def test_never_treated_mode_summary_does_not_narrate_composite_as_control(self): + summary = BusinessReport( + self._stub("never_treated"), auto_diagnostics=False + ).summary() + # The composite total must not appear as "500 control" in prose. + import re + + assert not re.search(r"\b500\s+control", summary), ( + f"BR summary must not narrate the composite n_control_units " + f"total as 'control' on StaggeredTripleDiff(control_group=" + f"'never_treated'); got: {summary!r}" + ) + + +class TestBRHeadlineOmitsBrokenCIOnUndefinedInference: + """Round-37 P1 CI review on PR #318: ``_extract_headline`` preserves + the fit's native CI even when it is undefined (e.g., survey-df + collapse produces finite ATT but NaN CI endpoints). The renderer + previously gated on ``isinstance(lo, (int, float))``, which accepts + ``NaN`` (a float) and rendered ``95% CI: undefined to undefined``. + Gate on ``np.isfinite`` instead, and emit an explicit + "inference unavailable" trailer when at least one bound is + non-finite. DR's own headline renderer already handled this + correctly (round-36 fix). + """ + + @staticmethod + def _stub_nan_ci(): + class DiDResults: + pass + + stub = DiDResults() + stub.att = 1.0 + stub.se = float("nan") + stub.t_stat = float("nan") + stub.p_value = float("nan") + stub.conf_int = (float("nan"), float("nan")) + stub.alpha = 0.05 + stub.n_obs = 200 + stub.n_treated = 100 + stub.n_control = 100 + stub.survey_metadata = None + return stub + + def test_summary_does_not_render_undefined_ci_interval(self): + summary = BusinessReport( + self._stub_nan_ci(), auto_diagnostics=False + ).summary() + lower = summary.lower() + # Must not render the broken CI interval fragment. 
+ assert "undefined to undefined" not in lower, summary + assert "95% ci: nan" not in lower + # Must explicitly flag that inference is unavailable. + assert "inference unavailable" in lower + + def test_full_report_does_not_render_undefined_ci_interval(self): + md = BusinessReport( + self._stub_nan_ci(), auto_diagnostics=False + ).full_report() + lower = md.lower() + assert "undefined to undefined" not in lower + assert "95% ci: nan" not in lower + assert "inference unavailable" in lower + + class TestStackedCleanControlSurfacesInSampleBlock: """Pre-emptive audit regression: ``StackedDiD`` exposes its control- group choice as ``clean_control`` (the public Wing-Freedman- From 86c4a5a6c764bf3dd6eabace258e7d6f79c2eb41 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 14:55:16 -0400 Subject: [PATCH 41/48] Address thirty-eighth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P2 methodology (StaggeredTripleDiff fixed-control prose incomplete). Round-37 moved StaggeredTripleDiff's fixed ``control_group="never_ treated"`` schema to ``n_never_enabled`` (REGISTRY.md line 1730 names the never-enabled cohort as the valid fixed comparison) and cleared the composite ``n_control_units`` total from ``n_control``. The renderers, however, only surface ``n_never_enabled`` inside the ``is_dynamic_control`` branch — so the fixed ``never_treated`` path fell through to the generic ``Sample: N observations.`` sentence and the full report omitted the fixed comparison cohort entirely. Added dedicated fixed-never-enabled branches to both renderers: * ``_render_summary`` emits ``Sample: N observations (N_t treated, N_ne never-enabled).`` when the estimator is ``StaggeredTripleDiffResults``, the dynamic branch is not active, and ``n_never_enabled > 0``; * ``_render_full_report`` emits a dedicated bullet ``- Never-enabled units (fixed comparison cohort): N_ne`` under the same condition. P3 coverage. 
Round-37 regression only asserted absence of the wrong ``500 control`` wording; it did not positively assert the valid never-enabled comparison cohort appeared in rendered prose, which is why the P2 above slipped through. Regressions extended: * ``test_never_treated_mode_summary_renders_never_enabled_count`` asserts ``300 never-enabled`` appears in ``summary()`` AND the generic fallback ``Sample: 800 observations.`` does not fire; * new ``test_never_treated_mode_full_report_renders_never_enabled_count`` asserts the sample section of ``full_report()`` names ``never-enabled`` and the ``300`` count while omitting any bare ``- Control: 500`` line. 278 BR / DR / practitioner tests pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 36 +++++++++++++++++++++++++++++++++++ tests/test_business_report.py | 34 ++++++++++++++++++++++++++------- 2 files changed, 63 insertions(+), 7 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 1a2c1036..9e2bea7b 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1931,6 +1931,25 @@ def _render_summary(schema: Dict[str, Any]) -> str: "dynamic not-yet-treated comparison group (the control set " f"varies by cohort and period){subset_clause}." ) + elif ( + estimator == "StaggeredTripleDiffResults" + and isinstance(n_t, int) + and isinstance(n_ne, int) + and n_ne > 0 + ): + # Round-38 P2 CI review on PR #318: StaggeredTripleDiff + # under fixed ``control_group="never_treated"`` had the + # schema moved to ``n_never_enabled`` (round-37) but the + # renderers fell through to the generic + # ``Sample: N observations.`` sentence because the + # ``is_dynamic_control`` branch didn't fire. REGISTRY.md + # §StaggeredTripleDifference line 1730 names the + # never-enabled cohort as the valid fixed comparison on + # this path; the prose must say so. + sentences.append( + f"Sample: {n_obs:,} observations ({n_t:,} treated, " + f"{n_ne:,} never-enabled)." 
+ ) else: sentences.append(f"Sample: {n_obs:,} observations.") survey = sample.get("survey") @@ -2106,6 +2125,23 @@ def _render_full_report(schema: Dict[str, Any]) -> str: ) if isinstance(sample.get("n_control"), int): lines.append(f"- Control: {sample['n_control']:,}") + elif ( + estimator_name == "StaggeredTripleDiffResults" + and isinstance(sample.get("n_never_enabled"), int) + and sample["n_never_enabled"] > 0 + and not sample.get("dynamic_control") + ): + # Round-38 P2 CI review on PR #318: fixed + # ``control_group="never_treated"`` on StaggeredTripleDiff + # clears ``n_control`` (composite total) and populates + # ``n_never_enabled`` (the valid fixed comparison cohort per + # REGISTRY.md line 1730). The full report must render that + # fixed count — the dynamic-control branch below would not + # fire on this path. + lines.append( + f"- Never-enabled units (fixed comparison cohort): " + f"{sample['n_never_enabled']:,}" + ) elif sample.get("dynamic_control"): if isinstance(sample.get("n_never_enabled"), int) and sample["n_never_enabled"] > 0: lines.append( diff --git a/tests/test_business_report.py b/tests/test_business_report.py index ec511aea..6036a2c4 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1333,18 +1333,38 @@ def test_never_treated_mode_surfaces_never_enabled_not_composite_total(self): ) assert sample["n_never_enabled"] == 300 - def test_never_treated_mode_summary_does_not_narrate_composite_as_control(self): + def test_never_treated_mode_summary_renders_never_enabled_count(self): + """Round-38 P3 strengthened regression: the summary must + POSITIVELY surface the valid fixed comparison cohort + (``300 never-enabled``), not merely avoid the wrong + ``500 control`` phrasing. + """ + import re + summary = BusinessReport( self._stub("never_treated"), auto_diagnostics=False ).summary() - # The composite total must not appear as "500 control" in prose. 
- import re - - assert not re.search(r"\b500\s+control", summary), ( - f"BR summary must not narrate the composite n_control_units " - f"total as 'control' on StaggeredTripleDiff(control_group=" + # Old wrong phrasing absent. + assert not re.search(r"\b500\s+control", summary), summary + # New fixed cohort present. + assert "300 never-enabled" in summary, ( + f"BR summary must render the valid fixed never-enabled " + f"comparison cohort on StaggeredTripleDiff(control_group=" f"'never_treated'); got: {summary!r}" ) + # And the generic no-comparison fallback must not fire. + assert "Sample: 800 observations." not in summary + + def test_never_treated_mode_full_report_renders_never_enabled_count(self): + md = BusinessReport( + self._stub("never_treated"), auto_diagnostics=False + ).full_report() + sample_section = md.split("## Sample", 1)[1].split("\n## ", 1)[0] + assert "never-enabled" in sample_section.lower() + assert "300" in sample_section + # No bare "- Control: 500" line (composite total) should appear + # on this path. + assert "- Control: 500" not in sample_section class TestBRHeadlineOmitsBrokenCIOnUndefinedInference: From 9d8ad3496ddc283e7d8947ab46a790ed00ba7287 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 15:11:32 -0400 Subject: [PATCH 42/48] Address thirty-ninth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit P3 maintainability (inconclusive-PT provenance dropped at the lift). DR's ``_pt_event_study`` inconclusive branch emits ``n_dropped_undefined`` (count of real pre-periods rejected for undefined inference) and a detailed ``reason`` tying the skip to the ``safe_inference`` contract. BR's ``_lift_pre_trends`` was dropping both fields at the lift boundary, so BR's own summary renderer — which is already written to quote the count when available — never saw it. 
Preserve both fields on the BR ``pre_trends`` block so the schema carries the same provenance DR does, and summary prose can render "1 pre-period row had undefined inference" instead of the generic "pre-period inference was undefined" fallback. P3 docs drift. Round-35 added ``band_label="improves_precision"`` for ``deff < 0.95`` and code/tests exercise that enum value, but ``REPORTING.md`` still described only the four-band table (``trivial`` / ``slightly_reduces`` / ``materially_reduces`` / ``large_warning``) and ``_check_design_effect``'s docstring listed the same old table. Updated both surfaces to enumerate the full five-value enum with explicit threshold rules and the intuition for the precision-improving band. Tests: 1 new regression. * ``TestInconclusivePTProvenancePreservedOnBRSchema``: NaN-p ``StackedDiDResults`` fit (Bonferroni-only surface) lifts through to BR with ``n_dropped_undefined==1`` on the BR ``pre_trends`` block, a populated ``reason``, and the summary renderer quotes "1 pre-period row had undefined inference". All BR / DR / practitioner tests pass. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 9 +++++++ diff_diff/diagnostic_report.py | 9 +++++-- docs/methodology/REPORTING.md | 15 +++++++++-- tests/test_business_report.py | 46 ++++++++++++++++++++++++++++++++++ 4 files changed, 75 insertions(+), 4 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 9e2bea7b..814604e4 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -713,6 +713,15 @@ def _lift_pre_trends(dr: Optional[Dict[str, Any]]) -> Dict[str, Any]: "joint_p_value": pt.get("joint_p_value"), "verdict": pt.get("verdict"), "n_pre_periods": pt.get("n_pre_periods"), + # Preserve DR's inconclusive-PT provenance on the BR schema so + # downstream consumers (and BR's own summary renderer) see the + # undefined-row count and DR's detailed reason without having + # to re-consult the DR schema (round-39 P3 CI review on PR + # #318). These fields are populated only when + # ``verdict == "inconclusive"`` per ``_pt_event_study``'s + # inconclusive branch (``diagnostic_report.py:999``). + "n_dropped_undefined": pt.get("n_dropped_undefined"), + "reason": pt.get("reason"), # Carry the denominator df through when the survey F-reference # branch was used so BR consumers can flag the finite-sample # correction without re-consulting the DR schema (round-28 P3 diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index a9a07444..9bdb7c96 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1571,11 +1571,16 @@ def _check_design_effect(self) -> Dict[str, Any]: only emitting the numeric fields plus ``is_trivial``). 
Bands (per REPORTING.md): - * ``deff < 1.05`` -> ``"trivial"``; + * ``deff < 0.95`` -> ``"improves_precision"`` (effective N + is LARGER than nominal N — a precision-improving design; + round-35 split this out from the old ``trivial`` bucket); + * ``0.95 <= deff < 1.05`` -> ``"trivial"`` (effectively no + effect on inference); * ``1.05 <= deff < 2`` -> ``"slightly_reduces"``; * ``2 <= deff < 5`` -> ``"materially_reduces"``; * ``deff >= 5`` -> ``"large_warning"``. - ``None`` deff -> ``band_label=None`` (no classification). + ``None`` deff (or non-finite) -> ``band_label=None`` (no + classification). """ sm = getattr(self._results, "survey_metadata", None) if sm is None: diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index ab29c9f8..ced19faf 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -29,8 +29,19 @@ pretest via `EfficientDiD.hausman_pretest`). The `design_effect` section of `DiagnosticReport.to_dict()` is a read-only surface: it echoes `survey_metadata.design_effect` and -`effective_n` from the fitted result along with a plain-English band -label. It does not call `compute_deff_diagnostics` (that helper +`effective_n` from the fitted result along with a `band_label` enum +classifying the deviation from 1. The enum values are: + +- `"improves_precision"` for `deff < 0.95` (effective N is LARGER + than nominal N — a precision-improving design); +- `"trivial"` for `0.95 <= deff < 1.05` (effectively no effect on + inference); +- `"slightly_reduces"` for `1.05 <= deff < 2`; +- `"materially_reduces"` for `2 <= deff < 5`; +- `"large_warning"` for `deff >= 5`; +- `None` when `deff` is missing or non-finite. + +The section does not call `compute_deff_diagnostics` (that helper needs per-fit internals the result objects do not expose). 
The report layer **does** compose a few cross-period summary statistics from per-period inputs already produced by the estimator — specifically the joint-Wald / Bonferroni diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 6036a2c4..6bf4ceb5 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1288,6 +1288,52 @@ def test_stacked_fit_persists_anticipation(self): assert a["anticipation_periods"] == 1 +class TestInconclusivePTProvenancePreservedOnBRSchema: + """Round-39 P3 CI review on PR #318: DR's ``_pt_event_study`` emits + ``n_dropped_undefined`` and a detailed ``reason`` on the + inconclusive PT block (undefined pre-period inference — NaN + per-period p-value or zero / negative SE). BR's ``_lift_pre_trends`` + was dropping both fields at the lift boundary, so the BR schema + and BR's summary renderer lost the provenance DR had already + computed. Preserve both so BR consumers see the exact count of + undefined rows and the same reason without re-consulting the DR + schema. + """ + + def test_n_dropped_undefined_and_reason_land_on_br_pre_trends(self): + class StackedDiDResults: + pass + + obj = StackedDiDResults() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 400 + obj.n_treated_units = 100 + obj.n_control_units = 300 + obj.survey_metadata = None + obj.event_study_effects = { + -2: {"effect": 0.1, "se": 0.2, "p_value": 0.62, "n_obs": 400}, + -1: {"effect": 0.05, "se": 0.3, "p_value": float("nan"), "n_obs": 400}, + } + + br = BusinessReport(obj) + pt = br.to_dict()["pre_trends"] + # Status and verdict reflect the inconclusive outcome. + assert pt["verdict"] == "inconclusive" + # The provenance fields are present on the BR schema. 
+ assert pt["n_dropped_undefined"] == 1 + assert isinstance(pt.get("reason"), str) and pt["reason"] + # And the summary renderer quotes the count (the existing + # inconclusive branch in ``_render_summary`` reads + # ``pt.get("n_dropped_undefined")``; before this fix that lookup + # returned ``None`` because the lift had dropped it). + summary = br.summary() + assert "1 pre-period row had undefined inference" in summary + + class TestStaggeredTripleDiffNeverTreatedFixedComparison: """Round-37 P1 CI review on PR #318: ``StaggeredTripleDiffResults`` stores ``n_control_units`` as a composite total that also includes From cf2f592c69937faabec0bfde4f3c4449d51e8f38 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 15:43:50 -0400 Subject: [PATCH 43/48] Address fortieth round of CI review findings on PR #318 Round-40 landed two P1 methodology findings on the reporting layer, both instances of the same silent-failure class: survey-backed fits routing through diagnostic helpers that don't accept ``survey_design``, silently emitting unweighted results for a weighted estimate. Survey-design threading (``diagnostic_report.py``, ``business_report.py``): - ``DiagnosticReport`` and ``BusinessReport`` now accept ``survey_design=``. BR forwards to the auto-constructed DR; DR threads through ``bacon_decompose(survey_design=...)``. - When ``results.survey_metadata`` is set but ``survey_design`` is not supplied, Bacon and the simple 2x2 parallel-trends helper skip with an explicit reason instead of replaying an unweighted decomposition / verdict for a design that does not match the estimate. Precomputed passthroughs remain honored. - Defense-in-depth skips added at the runner level (``_check_bacon`` / ``_pt_two_x_two``) in case future callers bypass the applicability gate. Docs: - ``REPORTING.md`` documents the new ``survey_design`` contract and the skip-with-reason policy alongside the existing finite-df PT note.
- ``business_report.rst`` and ``diagnostic_report.rst`` surface the kwarg with a pointer to the methodology file. Tests: 7 new regressions (4 DR + 3 BR) covering PT skip with reason on survey-backed DiDResults, precomputed PT override honored, Bacon skip without survey_design, and survey_design forwarded via ``unittest.mock.patch``. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 65 ++++--- diff_diff/diagnostic_report.py | 124 +++++++++++-- docs/api/business_report.rst | 9 + docs/api/diagnostic_report.rst | 19 ++ docs/methodology/REPORTING.md | 17 ++ tests/test_business_report.py | 305 ++++++++++++++++++++++---------- tests/test_diagnostic_report.py | 265 +++++++++++++++++++++++---- 7 files changed, 634 insertions(+), 170 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 814604e4..0e5b3f4e 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -122,6 +122,19 @@ class BusinessReport: include_appendix : bool, default True Whether ``full_report()`` appends the estimator's academic ``results.summary()`` output under a "Technical Appendix" section. + data, outcome, treatment, unit, time, first_treat : optional + Raw panel + column names forwarded to the auto-constructed + ``DiagnosticReport`` so data-dependent checks (2x2 PT on simple + DiD, Bacon-from-scratch, EfficientDiD Hausman pretest) can run. + survey_design : SurveyDesign, optional + The ``SurveyDesign`` object used to fit a survey-weighted + estimator. Forwarded to the auto-constructed ``DiagnosticReport`` + for fit-faithful Goodman-Bacon replay. When the fit carries + ``survey_metadata`` but ``survey_design`` is not supplied, Bacon + and the simple 2x2 parallel-trends check are skipped with an + explicit reason rather than replaying an unweighted decomposition + for a design that does not match the estimate. See + ``docs/methodology/REPORTING.md``. 
""" def __init__( @@ -144,6 +157,7 @@ def __init__( unit: Optional[str] = None, time: Optional[str] = None, first_treat: Optional[str] = None, + survey_design: Optional[Any] = None, ): if type(results).__name__ == "BaconDecompositionResults": raise TypeError( @@ -204,6 +218,13 @@ def __init__( self._dr_unit = unit self._dr_time = time self._dr_first_treat = first_treat + # Round-40 P1 CI review on PR #318: survey-backed fits need + # the ``SurveyDesign`` threaded through to the auto-constructed + # DR so Bacon decomposition is fit-faithful and the 2x2 PT + # skip path triggers for DiDResults with ``survey_metadata``. + # Without this passthrough, the auto path silently replays an + # unweighted decomposition / PT verdict for a weighted fit. + self._dr_survey_design = survey_design resolved_alpha = alpha if alpha is not None else getattr(results, "alpha", 0.05) self._context = BusinessContext( @@ -297,6 +318,7 @@ def _resolve_diagnostics(self) -> Optional[DiagnosticReportResults]: unit=self._dr_unit, time=self._dr_time, first_treat=self._dr_first_treat, + survey_design=self._dr_survey_design, ) return dr.run_all() @@ -561,13 +583,11 @@ def _extract_sample(self) -> Dict[str, Any]: # (A_s = infinity) is a fixed never-treated pool. Round-22 P1 # CI review on PR #318 flagged that ``strict`` was being # misrendered as a fixed control design. - is_stacked_dynamic = ( - name == "StackedDiDResults" - and _canonical_control in {"notyettreated", "strict"} - ) - is_dynamic_control = ( - _canonical_control == "notyettreated" or is_stacked_dynamic - ) + is_stacked_dynamic = name == "StackedDiDResults" and _canonical_control in { + "notyettreated", + "strict", + } + is_dynamic_control = _canonical_control == "notyettreated" or is_stacked_dynamic # StaggeredTripleDiff comparison-group contract: # ``n_control_units`` is a composite total that also includes # the eligibility-denied / larger-cohort cells. 
Regardless of @@ -608,9 +628,7 @@ def _extract_sample(self) -> Dict[str, Any]: # which misstates the sample composition for repeated cross- # section fits. Carry the flag into the schema so rendering can # branch. Round-28 P2 CI review on PR #318. - count_unit = ( - "observations" if getattr(r, "panel", True) is False else "units" - ) + count_unit = "observations" if getattr(r, "panel", True) is False else "units" sample_block: Dict[str, Any] = { "n_obs": _safe_int(getattr(r, "n_obs", None)), @@ -1042,16 +1060,13 @@ def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, "treatment-onset cohort." ) has_controls = ( - results is not None - and getattr(results, "covariate_residuals", None) is not None + results is not None and getattr(results, "covariate_residuals", None) is not None ) has_trends = ( - results is not None - and getattr(results, "linear_trends_effects", None) is not None + results is not None and getattr(results, "linear_trends_effects", None) is not None ) has_heterogeneity = ( - results is not None - and getattr(results, "heterogeneity_effects", None) is not None + results is not None and getattr(results, "heterogeneity_effects", None) is not None ) active_parts: List[str] = [] if has_controls and has_trends: @@ -1074,9 +1089,7 @@ def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, "linear pre-trends" ) if has_heterogeneity: - active_parts.append( - "heterogeneity tests ``beta^{het}_l`` are reported per horizon" - ) + active_parts.append("heterogeneity tests ``beta^{het}_l`` are reported per horizon") if active_parts: phase3_clause = " Phase-3 configuration: " + "; ".join(active_parts) + "." base_description = base_description + phase3_clause @@ -1886,9 +1899,7 @@ def _render_summary(schema: Dict[str, Any]) -> str: # ``schema["estimator"]`` is a dict with ``class_name``; unwrap it # for the per-estimator dynamic-control phrasing branch below. 
estimator_block = schema.get("estimator") or {} - estimator = ( - estimator_block.get("class_name") if isinstance(estimator_block, dict) else None - ) + estimator = estimator_block.get("class_name") if isinstance(estimator_block, dict) else None n_obs = sample.get("n_obs") n_t = sample.get("n_treated") n_c = sample.get("n_control") @@ -1956,8 +1967,7 @@ def _render_summary(schema: Dict[str, Any]) -> str: # never-enabled cohort as the valid fixed comparison on # this path; the prose must say so. sentences.append( - f"Sample: {n_obs:,} observations ({n_t:,} treated, " - f"{n_ne:,} never-enabled)." + f"Sample: {n_obs:,} observations ({n_t:,} treated, " f"{n_ne:,} never-enabled)." ) else: sentences.append(f"Sample: {n_obs:,} observations.") @@ -2148,8 +2158,7 @@ def _render_full_report(schema: Dict[str, Any]) -> str: # fixed count — the dynamic-control branch below would not # fire on this path. lines.append( - f"- Never-enabled units (fixed comparison cohort): " - f"{sample['n_never_enabled']:,}" + f"- Never-enabled units (fixed comparison cohort): " f"{sample['n_never_enabled']:,}" ) elif sample.get("dynamic_control"): if isinstance(sample.get("n_never_enabled"), int) and sample["n_never_enabled"] > 0: @@ -2165,9 +2174,7 @@ def _render_full_report(schema: Dict[str, Any]) -> str: if estimator_name == "StackedDiDResults": n_distinct = sample.get("n_distinct_controls_trimmed") if isinstance(n_distinct, int): - lines.append( - f"- Distinct control units in trimmed stack: {n_distinct:,}" - ) + lines.append(f"- Distinct control units in trimmed stack: {n_distinct:,}") cc_label = cg if isinstance(cg, str) else "clean_control" lines.append( f"- Comparison group: sub-experiment-specific clean controls " diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 9bdb7c96..facff564 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -268,6 +268,15 @@ class DiagnosticReport: HonestDiD restriction type. 
alpha : float, default 0.05 Significance level used across checks. + survey_design : SurveyDesign, optional + The ``SurveyDesign`` object used to fit a survey-weighted + estimator. Required for fit-faithful replay of Goodman-Bacon on a + survey-backed fit; threaded to ``bacon_decompose(survey_design=...)``. + When the fit carries ``survey_metadata`` but ``survey_design`` is + not supplied, Bacon and the simple 2x2 parallel-trends check are + skipped with an explicit reason rather than replaying an + unweighted decomposition for a design that does not match the + estimate. See ``docs/methodology/REPORTING.md``. precomputed : dict, optional Map of check name to a pre-computed result object. Accepted keys (this is the full implemented list; unsupported keys raise @@ -312,6 +321,7 @@ def __init__( sensitivity_M_grid: Tuple[float, ...] = (0.5, 1.0, 1.5, 2.0), sensitivity_method: str = "relative_magnitude", alpha: float = 0.05, + survey_design: Optional[Any] = None, precomputed: Optional[Dict[str, Any]] = None, outcome_label: Optional[str] = None, treatment_label: Optional[str] = None, @@ -339,6 +349,17 @@ def __init__( self._sensitivity_M_grid = tuple(sensitivity_M_grid) self._sensitivity_method = sensitivity_method self._alpha = float(alpha) + # Round-40 P1 CI review on PR #318: survey-backed fits need the + # ``SurveyDesign`` object threaded through to ``bacon_decompose`` + # for a fit-faithful Goodman-Bacon replay, and the unweighted + # 2x2 parallel-trends helper (``utils.check_parallel_trends``) + # cannot be called on a survey-weighted DiDResults without + # silently reporting an unweighted verdict for a weighted fit. + # When the fit carries ``survey_metadata`` but the caller did + # not supply ``survey_design``, both checks skip with an + # explicit reason instead of replaying a different design than + # the estimate. See REPORTING.md "Survey-backed fits". 
+ self._survey_design = survey_design self._precomputed = dict(precomputed or {}) # Validate precomputed keys against the actually-implemented passthrough # set so advertised contracts do not silently diverge from behavior. @@ -569,6 +590,26 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: + ", ".join(two_x_two_missing) + "." ) + # Round-40 P1 CI review on PR #318: the simple 2x2 helper + # ``utils.check_parallel_trends`` is unweighted — it has + # no ``survey_design`` parameter and cannot faithfully + # diagnose the pre-period trajectory of a survey- + # weighted DiDResults. Rather than silently emitting + # an unweighted verdict alongside the weighted estimate, + # skip with an explicit reason. Users can supply + # ``precomputed={'parallel_trends': ...}`` with a + # survey-aware pretest result if they have one. + if getattr(r, "survey_metadata", None) is not None: + return ( + "Original fit used a survey design; the simple " + "2x2 parallel-trends check (``utils." + "check_parallel_trends``) is unweighted and " + "would diagnose a different design than the " + "weighted estimate. Supply a survey-aware " + "pretest via " + "``precomputed={'parallel_trends': ...}`` to " + "opt in." + ) if method == "event_study": pre_coefs, _ = _collect_pre_period_coefs(r) if not pre_coefs: @@ -730,6 +771,27 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: "Bacon decomposition needs panel data + outcome / time " "/ unit / first_treat column names. Missing: " + ", ".join(bacon_missing) + "." ) + # Round-40 P1 CI review on PR #318: ``bacon_decompose`` + # supports a ``survey_design`` kwarg for survey-weighted + # decomposition. When the fitted result carries + # ``survey_metadata`` but the caller did not supply a + # ``survey_design`` object, replaying with defaults would + # produce an unweighted decomposition for a different + # design than the weighted estimate. 
Skip with an explicit + # reason; users can pass ``survey_design=`` on + # ``DiagnosticReport`` / ``BusinessReport`` or supply + # ``precomputed={'bacon': ...}`` with a survey-aware + # decomposition. + if getattr(r, "survey_metadata", None) is not None and self._survey_design is None: + return ( + "Original fit used a survey design; Goodman-Bacon " + "replay under defaults would produce an unweighted " + "decomposition for a different design than the " + "weighted estimate. Pass ``survey_design=`` " + "on DiagnosticReport / BusinessReport, or supply " + "``precomputed={'bacon': ...}`` with a survey-aware " + "decomposition." + ) return None if check == "heterogeneity": # Needs multiple group or event-study effects. Use len() rather than @@ -963,6 +1025,26 @@ def _pt_two_x_two(self) -> Dict[str, Any]: "reason": "Requires treatment= identifying the " "treated-group indicator; not supplied.", } + # Round-40 P1 CI review on PR #318: defense-in-depth. The + # instance-level applicability gate should have already returned + # a skip reason when ``results.survey_metadata`` is non-None and + # no precomputed PT was supplied, but ``_pt_two_x_two`` is also + # reachable directly from ``_check_parallel_trends`` if future + # callers add method dispatch overrides. Guard at the runner + # too to prevent ``utils.check_parallel_trends`` from emitting + # an unweighted verdict for a weighted fit. + if getattr(self._results, "survey_metadata", None) is not None: + return { + "status": "skipped", + "reason": ( + "Original fit used a survey design; the simple 2x2 " + "parallel-trends helper (``utils.check_parallel_trends``) " + "is unweighted and cannot faithfully diagnose a " + "survey-weighted DiDResults. Supply a survey-aware " + "pretest via ``precomputed={'parallel_trends': ...}`` " + "to opt in." 
+ ), + } try: raw = check_parallel_trends( self._data, @@ -1156,9 +1238,7 @@ def _pt_event_study(self) -> Dict[str, Any]: nan_p_count = sum( 1 for p in per_period - if not ( - isinstance(p["p_value"], (int, float)) and np.isfinite(p["p_value"]) - ) + if not (isinstance(p["p_value"], (int, float)) and np.isfinite(p["p_value"])) ) if nan_p_count > 0: return { @@ -1517,6 +1597,24 @@ def _check_bacon(self) -> Dict[str, Any]: "reason": "Bacon decomposition requires data + outcome + unit + time " "+ first_treat on DiagnosticReport; not all supplied.", } + # Round-40 P1 CI review on PR #318: defense-in-depth. The + # instance-level applicability gate should have already returned + # a skip when the result carries ``survey_metadata`` but no + # ``survey_design`` is available to thread through. Guard at + # the runner too in case a future caller bypasses the gate. + if getattr(r, "survey_metadata", None) is not None and self._survey_design is None: + return { + "status": "skipped", + "reason": ( + "Original fit used a survey design; Goodman-Bacon " + "replay under defaults would produce an unweighted " + "decomposition for a different design than the " + "weighted estimate. Pass ``survey_design=`` " + "on DiagnosticReport / BusinessReport, or supply " + "``precomputed={'bacon': ...}`` with a survey-aware " + "decomposition." + ), + } try: from diff_diff.bacon import bacon_decompose @@ -1527,6 +1625,7 @@ def _check_bacon(self) -> Dict[str, Any]: unit=unit, time=time, first_treat=first_treat, + survey_design=self._survey_design, ) except Exception as exc: # noqa: BLE001 return { @@ -2217,12 +2316,9 @@ def _read(name: str) -> Any: # as its distinguishing fields. 
method = _read("method") if method is None: - hausman_markers = ( - _read("statistic") is not None - and any( - _read(tag) is not None - for tag in ("att_all", "att_post", "recommendation", "reject") - ) + hausman_markers = _read("statistic") is not None and any( + _read(tag) is not None + for tag in ("att_all", "att_post", "recommendation", "reject") ) slope_markers = any( _read(tag) is not None @@ -2797,16 +2893,10 @@ def _render_overall_interpretation(schema: Dict[str, Any], labels: Dict[str, str ci_finite = ( isinstance(ci, (list, tuple)) and len(ci) == 2 - and all( - isinstance(v, (int, float)) and np.isfinite(v) for v in ci - ) + and all(isinstance(v, (int, float)) and np.isfinite(v) for v in ci) ) ci_str = f" ({ci_level}% CI: {ci[0]:.3g} to {ci[1]:.3g})" if ci_finite else "" - p_str = ( - f", p = {p:.3g}" - if isinstance(p, (int, float)) and np.isfinite(p) - else "" - ) + p_str = f", p = {p:.3g}" if isinstance(p, (int, float)) and np.isfinite(p) else "" sentences.append( f"On {est}, {treatment} {direction} {outcome} by {val:.3g}{ci_str}{p_str}." ) diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst index 27482742..02f35da0 100644 --- a/docs/api/business_report.rst +++ b/docs/api/business_report.rst @@ -23,6 +23,15 @@ to the auto-constructed ``DiagnosticReport``. Without these kwargs, those specific checks are skipped with an explicit reason while the rest of the report still renders. +For survey-weighted fits (any result carrying +``survey_metadata``) pass the original ``SurveyDesign`` via +``survey_design=``. It is threaded through to +``bacon_decompose`` for a fit-faithful Goodman-Bacon replay. When +``survey_metadata`` is set but ``survey_design`` is not supplied, +Bacon and the simple 2x2 parallel-trends check are skipped with an +explicit reason so the report never emits an unweighted decomposition +or PT verdict for a design that differs from the estimate. 
+ Methodology deviations (no traffic-light gates, pre-trends verdict thresholds, power-aware phrasing, unit-translation policy, schema stability) are documented in :doc:`../methodology/REPORTING`. diff --git a/docs/api/diagnostic_report.rst b/docs/api/diagnostic_report.rst index c795c0c1..c3c3626d 100644 --- a/docs/api/diagnostic_report.rst +++ b/docs/api/diagnostic_report.rst @@ -15,6 +15,25 @@ Methodology deviations (no traffic-light gates, opt-in placebo battery, estimator-native diagnostic routing, power-aware phrasing threshold) are documented in :doc:`../methodology/REPORTING`. +Data-dependent checks (2x2 parallel trends on simple DiD, +Goodman-Bacon decomposition on staggered estimators, the EfficientDiD +Hausman PT-All vs PT-Post pretest) require the raw panel + column +names. Pass ``data``, ``outcome``, ``treatment``, ``unit``, ``time``, +and/or ``first_treat`` and they feed the runners. Without these +kwargs, those specific checks are skipped with an explicit reason +while the rest of the battery still runs. + +For survey-weighted fits (any result carrying +``survey_metadata``) pass the original ``SurveyDesign`` via +``survey_design=``. It is threaded through to +``bacon_decompose`` for a fit-faithful Goodman-Bacon replay. When +``survey_metadata`` is set but ``survey_design`` is not supplied, +Bacon and the simple 2x2 parallel-trends check are skipped with an +explicit reason so the report never emits an unweighted decomposition +or PT verdict for a design that differs from the estimate. Alternatively +supply ``precomputed={'bacon': ...}`` or +``precomputed={'parallel_trends': ...}`` with a survey-aware result. + Example ------- diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index ced19faf..e36fbd7f 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -80,6 +80,23 @@ not new inference. Wald statistic (or Bonferroni fallback when `vcov` is missing).
This mirrors the guidance in `practitioner._parallel_trends_step(staggered=True)`. +- **Note:** Survey-design threading for fit-faithful replay. When the + fitted result carries `survey_metadata`, Goodman-Bacon replay and + the simple 2x2 parallel-trends helper require the original + `SurveyDesign` object to produce a diagnostic that matches the + estimate. `DiagnosticReport(survey_design=...)` and + `BusinessReport(survey_design=...)` accept it and forward to + `bacon_decompose(survey_design=...)`. When `survey_metadata` is set + but `survey_design` is not supplied, both checks skip with an + explicit reason rather than replaying an unweighted decomposition + / PT verdict for a design that differs from the weighted estimate. + Users can alternatively pass `precomputed={'bacon': ...}` / + `precomputed={'parallel_trends': ...}` with a survey-aware result. + Event-study PT on staggered estimators already reads the weighted + pre-period coefficients directly off the fitted result (so does not + need a second replay) and uses the finite-df reference described + below. + - **Note:** Survey finite-df PT policy. When the fitted result carries a finite `survey_metadata.df_survey`, `_pt_event_study` computes `F = W / k` (numerator df = k pre-period coefficients) against an diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 6bf4ceb5..742308f5 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -1006,9 +1006,7 @@ def test_full_report_slope_difference_uses_single_p_label(self): def test_full_report_hausman_uses_single_p_label(self): stub, fake_dr = self._stub_result_with_method("hausman") - section = self._pt_section( - BusinessReport(stub, diagnostics=fake_dr).full_report() - ) + section = self._pt_section(BusinessReport(stub, diagnostics=fake_dr).full_report()) assert "joint p" not in section, ( f"EfficientDiD Hausman is a single-statistic test and must " f"not be labeled ``joint p`` in the markdown. 
Got: {section!r}" @@ -1017,9 +1015,7 @@ def test_full_report_hausman_uses_single_p_label(self): def test_full_report_synthetic_fit_omits_p_label(self): stub, fake_dr = self._stub_result_with_method("synthetic_fit") - section = self._pt_section( - BusinessReport(stub, diagnostics=fake_dr).full_report() - ) + section = self._pt_section(BusinessReport(stub, diagnostics=fake_dr).full_report()) # No p-value of any kind for design-enforced SDiD PT analogue. assert "joint p" not in section assert "p = " not in section @@ -1367,9 +1363,9 @@ class StaggeredTripleDiffResults: return stub def test_never_treated_mode_surfaces_never_enabled_not_composite_total(self): - sample = BusinessReport( - self._stub("never_treated"), auto_diagnostics=False - ).to_dict()["sample"] + sample = BusinessReport(self._stub("never_treated"), auto_diagnostics=False).to_dict()[ + "sample" + ] # Composite total must not be surfaced as the fixed control # count on the ``nevertreated`` path. assert sample["n_control"] is None, ( @@ -1387,9 +1383,7 @@ def test_never_treated_mode_summary_renders_never_enabled_count(self): """ import re - summary = BusinessReport( - self._stub("never_treated"), auto_diagnostics=False - ).summary() + summary = BusinessReport(self._stub("never_treated"), auto_diagnostics=False).summary() # Old wrong phrasing absent. assert not re.search(r"\b500\s+control", summary), summary # New fixed cohort present. @@ -1402,9 +1396,7 @@ def test_never_treated_mode_summary_renders_never_enabled_count(self): assert "Sample: 800 observations." 
not in summary def test_never_treated_mode_full_report_renders_never_enabled_count(self): - md = BusinessReport( - self._stub("never_treated"), auto_diagnostics=False - ).full_report() + md = BusinessReport(self._stub("never_treated"), auto_diagnostics=False).full_report() sample_section = md.split("## Sample", 1)[1].split("\n## ", 1)[0] assert "never-enabled" in sample_section.lower() assert "300" in sample_section @@ -1444,9 +1436,7 @@ class DiDResults: return stub def test_summary_does_not_render_undefined_ci_interval(self): - summary = BusinessReport( - self._stub_nan_ci(), auto_diagnostics=False - ).summary() + summary = BusinessReport(self._stub_nan_ci(), auto_diagnostics=False).summary() lower = summary.lower() # Must not render the broken CI interval fragment. assert "undefined to undefined" not in lower, summary @@ -1455,9 +1445,7 @@ def test_summary_does_not_render_undefined_ci_interval(self): assert "inference unavailable" in lower def test_full_report_does_not_render_undefined_ci_interval(self): - md = BusinessReport( - self._stub_nan_ci(), auto_diagnostics=False - ).full_report() + md = BusinessReport(self._stub_nan_ci(), auto_diagnostics=False).full_report() lower = md.lower() assert "undefined to undefined" not in lower assert "95% ci: nan" not in lower @@ -1551,7 +1539,6 @@ def test_stacked_all_eventually_treated_panel_does_not_fabricate_never_treated(s never-treated units, because every unit is eventually treated (the round-21 reviewer example). """ - import pandas as pd from diff_diff import StackedDiD @@ -1567,9 +1554,7 @@ def test_stacked_all_eventually_treated_panel_does_not_fabricate_never_treated(s # Sanity: the fixture has no never-treated units. 
assert sdf[sdf["first_treat"] == 0].empty - st = StackedDiD( - clean_control="not_yet_treated", kappa_pre=1, kappa_post=1 - ).fit( + st = StackedDiD(clean_control="not_yet_treated", kappa_pre=1, kappa_post=1).fit( sdf, outcome="outcome", unit="unit", time="period", first_treat="first_treat" ) sample = BusinessReport(st, auto_diagnostics=False).to_dict()["sample"] @@ -1623,15 +1608,15 @@ def test_not_yet_treated_names_subexperiment_contract(self): assert a["clean_control"] == "not_yet_treated" def test_strict_names_strict_rule(self): - desc = BusinessReport( - self._stub("strict"), auto_diagnostics=False - ).to_dict()["assumption"]["description"] + desc = BusinessReport(self._stub("strict"), auto_diagnostics=False).to_dict()["assumption"][ + "description" + ] assert "A_s > a + kappa_post + kappa_pre" in desc def test_never_treated_names_fixed_pool(self): - desc = BusinessReport( - self._stub("never_treated"), auto_diagnostics=False - ).to_dict()["assumption"]["description"] + desc = BusinessReport(self._stub("never_treated"), auto_diagnostics=False).to_dict()[ + "assumption" + ]["description"] assert "never treated" in desc.lower() assert "A_s = infinity" in desc @@ -1673,7 +1658,9 @@ def test_full_report_names_sub_experiment_comparison_for_stacked_strict(self): ) md = BusinessReport(st, auto_diagnostics=False).full_report() # Must NOT emit a bare "Control: N" line. - assert "- Control:" not in md or "- Control: " not in md.split("## Sample")[1].split("##")[0], ( + assert ( + "- Control:" not in md or "- Control: " not in md.split("## Sample")[1].split("##")[0] + ), ( "Stacked with dynamic clean-control must not render a fixed " "'- Control: N' line in the Sample section." 
) @@ -1817,9 +1804,7 @@ def test_efficient_did_pt_post_strips_strict_clause(self): assert "PT-Post" in block["description"] def test_stacked_did_strips_strict_clause(self): - stub = self._stub( - "StackedDiDResults", clean_control="not_yet_treated" - ) + stub = self._stub("StackedDiDResults", clean_control="not_yet_treated") block = BusinessReport(stub, auto_diagnostics=False).to_dict()["assumption"] self._assert_no_strict_contract(block["description"]) # Stacked sub-experiment identifying content preserved. @@ -1832,9 +1817,7 @@ def test_rendered_full_report_has_no_strict_contract_for_anticipation(self): """ stub = self._stub("CallawaySantAnnaResults") md = BusinessReport(stub, auto_diagnostics=False).full_report() - assumption_section = md.split("## Identifying Assumption", 1)[1].split( - "\n## ", 1 - )[0] + assumption_section = md.split("## Identifying Assumption", 1)[1].split("\n## ", 1)[0] for pat in self._STRICT_PATTERNS: assert pat not in assumption_section, ( f"Rendered assumption section must not carry the strict " @@ -2561,12 +2544,8 @@ class EfficientDiDResults: # Without any completed steps, the Hausman pretest is included. 
baseline = practitioner_next_steps(stub, verbose=False)["next_steps"] - hausman_in_baseline = any( - "Hausman pretest" in s.get("label", "") for s in baseline - ) - assert hausman_in_baseline, ( - "EfficientDiD workflow must include the Hausman pretest step" - ) + hausman_in_baseline = any("Hausman pretest" in s.get("label", "") for s in baseline) + assert hausman_in_baseline, "EfficientDiD workflow must include the Hausman pretest step" # After marking ``parallel_trends`` complete (which DR does when # ``_check_pt_hausman`` runs), the Hausman step must be filtered @@ -2575,9 +2554,7 @@ class EfficientDiDResults: filtered = practitioner_next_steps( stub, completed_steps=["parallel_trends"], verbose=False )["next_steps"] - assert not any( - "Hausman pretest" in s.get("label", "") for s in filtered - ), ( + assert not any("Hausman pretest" in s.get("label", "") for s in filtered), ( "Hausman step must be tagged as 'parallel_trends' (REGISTRY.md " "§EfficientDiD classifies it as a PT diagnostic) so that " "DR's _collect_next_steps() suppresses it after running the same " @@ -2623,17 +2600,15 @@ def _step_labels_after_completed(stub, completed): return [ s.get("label", "") - for s in practitioner_next_steps( - stub, completed_steps=completed, verbose=False - )["next_steps"] + for s in practitioner_next_steps(stub, completed_steps=completed, verbose=False)[ + "next_steps" + ] ] def test_sa_specification_falsification_persists_after_sensitivity_runs(self): stub = self._build_stub("SunAbrahamResults") labels = self._step_labels_after_completed(stub, completed=["sensitivity"]) - assert any( - "Specification-based falsification" in lab for lab in labels - ), ( + assert any("Specification-based falsification" in lab for lab in labels), ( "SA's 'Specification-based falsification' step must persist " "after DR marks sensitivity complete — HonestDiD does not run " "control_group / anticipation variation." 
@@ -2700,9 +2675,7 @@ class CallawaySantAnnaResults: def test_schema_exposes_count_unit(self): for panel, expected in [(True, "units"), (False, "observations")]: - sample = BusinessReport( - self._stub(panel), auto_diagnostics=False - ).to_dict()["sample"] + sample = BusinessReport(self._stub(panel), auto_diagnostics=False).to_dict()["sample"] assert sample["count_unit"] == expected def test_panel_true_renders_unit_wording(self): @@ -2800,8 +2773,7 @@ def test_lift_pre_trends_exposes_power_reason(self): "pretrends_power": { "status": "not_applicable", "reason": ( - "StackedDiDResults does not yet have a " - "compute_pretrends_power adapter." + "StackedDiDResults does not yet have a " "compute_pretrends_power adapter." ), }, } @@ -2810,8 +2782,7 @@ def test_lift_pre_trends_exposes_power_reason(self): assert lifted["power_status"] == "not_applicable" # Plain-English reason now exposed on the schema. assert lifted["power_reason"] == ( - "StackedDiDResults does not yet have a " - "compute_pretrends_power adapter." + "StackedDiDResults does not yet have a " "compute_pretrends_power adapter." ) def test_survey_pt_method_stat_label_uses_joint_p(self): @@ -2826,10 +2797,7 @@ def test_survey_pt_method_stat_label_uses_joint_p(self): f"(the joint test remains; only the reference " f"distribution changes)." ) - assert ( - _pt_method_subject(method) - == "Pre-treatment event-study coefficients" - ), ( + assert _pt_method_subject(method) == "Pre-treatment event-study coefficients", ( f"Survey PT variant {method!r} must use the event-study " f"subject phrase, not the generic fall-through." 
) @@ -2873,13 +2841,11 @@ class SyntheticDiDResults: labels = [ s.get("label", "") - for s in practitioner_next_steps( - stub, completed_steps=["sensitivity"], verbose=False - )["next_steps"] + for s in practitioner_next_steps(stub, completed_steps=["sensitivity"], verbose=False)[ + "next_steps" + ] ] - assert any( - "Leave-one-out influence (jackknife)" in lab for lab in labels - ), ( + assert any("Leave-one-out influence (jackknife)" in lab for lab in labels), ( "SDiD jackknife recommendation must persist after DR marks " "sensitivity complete — the SDiD native battery does not run " "the jackknife LOO workflow (requires a separate " @@ -2935,13 +2901,11 @@ class TROPResults: labels = [ s.get("label", "") - for s in practitioner_next_steps( - stub, completed_steps=["sensitivity"], verbose=False - )["next_steps"] + for s in practitioner_next_steps(stub, completed_steps=["sensitivity"], verbose=False)[ + "next_steps" + ] ] - assert any( - "In-time or in-space placebo" in lab for lab in labels - ), ( + assert any("In-time or in-space placebo" in lab for lab in labels), ( "TROP's placebo recommendation must persist after DR marks " "sensitivity complete (SDiD/TROP native battery) — factor-" "model diagnostics are not a placebo substitute." @@ -2982,7 +2946,6 @@ def _fake_grid_sens(): @staticmethod def _stub(class_name: str, **extras): - from diff_diff.prep_dgp import generate_staggered_data # For estimator types that have fits, we'd use real fits; but # several of these need specific setup. 
Stub with minimal @@ -3016,8 +2979,7 @@ def test_dr_precomputed_sensitivity_honored_on_sun_abraham(self): dr = DiagnosticReport(stub, precomputed={"sensitivity": self._fake_grid_sens()}) sens = dr.to_dict()["sensitivity"] assert sens["status"] == "ran", ( - f"precomputed sensitivity on SunAbrahamResults must be honored; " - f"got {sens!r}" + f"precomputed sensitivity on SunAbrahamResults must be honored; " f"got {sens!r}" ) assert sens.get("precomputed") is True assert sens["breakdown_M"] == 1.25 @@ -3044,8 +3006,7 @@ def test_br_honest_did_results_honored_on_imputation(self): br = BusinessReport(stub, honest_did_results=self._fake_grid_sens()) sens = br.to_dict()["sensitivity"] assert sens["status"] == "computed", ( - f"honest_did_results on ImputationDiDResults must be honored " - f"by BR; got {sens!r}" + f"honest_did_results on ImputationDiDResults must be honored " f"by BR; got {sens!r}" ) assert sens["breakdown_M"] == 1.25 @@ -3156,9 +3117,7 @@ def test_dr_rejects_precomputed_pretrends_power_on_sdid(self, sdid_fit): fit, _ = sdid_fit with pytest.raises(ValueError, match="estimator_native_diagnostics"): - DiagnosticReport( - fit, precomputed={"pretrends_power": self._dummy_power_object()} - ) + DiagnosticReport(fit, precomputed={"pretrends_power": self._dummy_power_object()}) def test_dr_rejects_precomputed_pretrends_power_on_trop(self): from diff_diff import DiagnosticReport @@ -3172,9 +3131,7 @@ class TROPResults: stub.alpha = 0.05 stub.n_obs = 100 with pytest.raises(ValueError, match="estimator_native_diagnostics"): - DiagnosticReport( - stub, precomputed={"pretrends_power": self._dummy_power_object()} - ) + DiagnosticReport(stub, precomputed={"pretrends_power": self._dummy_power_object()}) class TestHeterogeneityOmittedFromFullReportWhenNotRan: @@ -3298,9 +3255,7 @@ def _dummy_sens_object(): def _dummy_pt_object(): from types import SimpleNamespace - return SimpleNamespace( - joint_p_value=0.2, n_pre_periods=3, method="event_study" - ) + return 
SimpleNamespace(joint_p_value=0.2, n_pre_periods=3, method="event_study") def test_dr_rejects_precomputed_sensitivity_on_sdid(self, sdid_fit): from diff_diff import DiagnosticReport @@ -3758,3 +3713,173 @@ class CallawaySantAnnaResults: assert "sensitivity_skipped" in topics, ( "BR must surface varying-base sensitivity skip as a caveat; " f"got topics {topics}" ) + + +class TestBusinessReportSurveyDesignPassthrough: + """Round-40 P1 CI review on PR #318: ``BusinessReport`` must accept + ``survey_design`` and forward it to the auto-constructed + ``DiagnosticReport``, so Bacon replay on survey-backed fits is + fit-faithful and the simple 2x2 PT path skips with an explicit + reason rather than reporting an unweighted verdict for a weighted + estimate.""" + + def _did_with_survey(self): + from types import SimpleNamespace + + class DiDResults: + pass + + obj = DiDResults() + obj.att = 1.0 + obj.se = 0.2 + obj.t_stat = 5.0 + obj.p_value = 0.001 + obj.conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 400 + obj.n_treated = 100 + obj.n_control = 300 + obj.survey_metadata = SimpleNamespace( + design_effect=1.25, + effective_n=320.0, + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=20.0, + replicate_method=None, + ) + obj.inference_method = "analytical" + return obj + + def _staggered_stub_with_survey(self): + from types import SimpleNamespace + + class CallawaySantAnnaResults: + pass + + obj = CallawaySantAnnaResults() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 600 + obj.n_treated = 200 + obj.n_control_units = 400 + obj.survey_metadata = SimpleNamespace( + design_effect=1.5, + effective_n=400.0, + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=30.0, + replicate_method=None, + ) + obj.event_study_effects = None + return obj + + def test_survey_backed_did_br_rolls_up_pt_skip(self): + """BR's auto-constructed DR must skip the 2x2 PT 
helper on a + survey-backed DiDResults. BR's schema then surfaces the + skipped PT block with the survey-design reason (no unweighted + verdict leaks into the narrative).""" + import pandas as pd + + panel = pd.DataFrame( + { + "outcome": [1.0, 2.0, 1.1, 2.2], + "post": [0, 1, 0, 1], + "treated": [0, 0, 1, 1], + } + ) + obj = self._did_with_survey() + br = BusinessReport( + obj, + outcome_label="Revenue", + outcome_unit="$", + data=panel, + outcome="outcome", + time="post", + treatment="treated", + ) + schema = br.to_dict() + diag = schema.get("diagnostics", {}) + dr_schema = diag.get("schema", {}) if isinstance(diag, dict) else {} + pt_block = dr_schema.get("parallel_trends", {}) if isinstance(dr_schema, dict) else {} + # Round-40 schema: parallel_trends skipped with a survey-design + # reason rather than emitting an unweighted verdict. BR's auto + # path must honor the skip. + assert pt_block.get("status") == "skipped" + reason = (pt_block.get("reason") or "").lower() + assert "survey design" in reason + + def test_survey_backed_staggered_br_forwards_survey_design_to_bacon(self): + """BR must forward ``survey_design`` to the auto-constructed + DR, which in turn threads it to ``bacon_decompose``. Verify via + ``unittest.mock.patch`` that the kwarg reaches the decomposer. 
+ """ + from unittest.mock import MagicMock, patch + + import pandas as pd + + panel = pd.DataFrame( + { + "outcome": [1.0, 2.0, 1.1, 2.2, 1.2, 2.3, 1.3, 2.4], + "unit": [1, 1, 2, 2, 3, 3, 4, 4], + "period": [1, 2, 1, 2, 1, 2, 1, 2], + "first_treat": [0, 0, 0, 0, 2, 2, 2, 2], + } + ) + obj = self._staggered_stub_with_survey() + sentinel_design = object() + fake_decomp = MagicMock() + fake_decomp.total_weight_treated_vs_never = 0.9 + fake_decomp.total_weight_earlier_vs_later = 0.05 + fake_decomp.total_weight_later_vs_earlier = 0.05 + fake_decomp.twfe_estimate = 1.1 + fake_decomp.n_timing_groups = 2 + with patch("diff_diff.bacon.bacon_decompose", return_value=fake_decomp) as m: + br = BusinessReport( + obj, + data=panel, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + survey_design=sentinel_design, + ) + br.to_dict() # trigger DR build + assert m.called, "bacon_decompose was not called" + _, kwargs = m.call_args + assert kwargs.get("survey_design") is sentinel_design + + def test_survey_backed_staggered_br_skips_bacon_without_survey_design(self): + """Without ``survey_design``, BR's DR must skip Bacon with the + survey-design reason (fit-faithful replay requires it).""" + import pandas as pd + + panel = pd.DataFrame( + { + "outcome": [1.0, 2.0, 1.1, 2.2, 1.2, 2.3, 1.3, 2.4], + "unit": [1, 1, 2, 2, 3, 3, 4, 4], + "period": [1, 2, 1, 2, 1, 2, 1, 2], + "first_treat": [0, 0, 0, 0, 2, 2, 2, 2], + } + ) + obj = self._staggered_stub_with_survey() + br = BusinessReport( + obj, + data=panel, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + # survey_design intentionally omitted + ) + schema = br.to_dict() + diag = schema.get("diagnostics", {}) + dr_schema = diag.get("schema", {}) if isinstance(diag, dict) else {} + bacon_block = dr_schema.get("bacon", {}) if isinstance(dr_schema, dict) else {} + assert bacon_block.get("status") == "skipped" + reason = (bacon_block.get("reason") or "").lower() + assert 
"survey design" in reason diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 96c1fb56..5b82c5fa 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -472,9 +472,9 @@ def test_precomputed_parallel_trends_preserves_schema_shaped_joint_p(self, cs_fi pt = dr.to_dict()["parallel_trends"] assert pt["status"] == "ran" assert pt["method"] == "hausman" - assert pt["joint_p_value"] == 0.42, ( - f"joint_p_value must survive formatting; got {pt.get('joint_p_value')}" - ) + assert ( + pt["joint_p_value"] == 0.42 + ), f"joint_p_value must survive formatting; got {pt.get('joint_p_value')}" assert pt["test_statistic"] == 5.6 assert pt["df"] == 3 # Verdict must be derived from the surviving p-value, not None. @@ -590,8 +590,7 @@ def test_precomputed_pt_infers_hausman_method_for_native_object(self, cs_fit): pt = dr.to_dict()["parallel_trends"] assert pt["status"] == "ran" assert pt["method"] == "hausman", ( - f"Native Hausman-like object must infer method='hausman'; " - f"got {pt.get('method')!r}" + f"Native Hausman-like object must infer method='hausman'; " f"got {pt.get('method')!r}" ) assert pt["test_statistic"] == 4.5 assert pt["joint_p_value"] == 0.21 @@ -633,9 +632,7 @@ def test_precomputed_parallel_trends_rejects_input_without_p_value(self, cs_fit) neither) while catching obviously-wrong inputs. 
""" fit, _ = cs_fit - dr = DiagnosticReport( - fit, precomputed={"parallel_trends": {"method": "event_study"}} - ) + dr = DiagnosticReport(fit, precomputed={"parallel_trends": {"method": "event_study"}}) pt = dr.to_dict()["parallel_trends"] assert pt["status"] == "error" assert "joint_p_value" in pt["reason"] or "p_value" in pt["reason"] @@ -690,9 +687,9 @@ def test_precomputed_single_m_sensitivity_exposes_original_estimate_and_se(self, alpha=0.05, ) - block = DiagnosticReport( - fit, precomputed={"sensitivity": single_m} - ).to_dict()["sensitivity"] + block = DiagnosticReport(fit, precomputed={"sensitivity": single_m}).to_dict()[ + "sensitivity" + ] assert block["status"] == "ran" assert block["conclusion"] == "single_M_precomputed" # Parity with the grid branch: these fields must be present and @@ -826,7 +823,8 @@ def test_joint_wald_uses_F_reference_when_survey_df_is_finite(self): """ from types import SimpleNamespace - from scipy.stats import chi2, f as f_dist + from scipy.stats import chi2 + from scipy.stats import f as f_dist # Same fixture as ``test_joint_wald_runs_when_keys_align`` but with # a survey_metadata carrying a finite df_survey. @@ -906,10 +904,7 @@ def test_dr_prose_uses_event_study_subject_for_survey_pt(self): "joint_wald_survey", "joint_wald_event_study_survey", ): - assert ( - _pt_subject_phrase(method) - == "Pre-treatment event-study coefficients" - ), ( + assert _pt_subject_phrase(method) == "Pre-treatment event-study coefficients", ( f"DR subject for {method!r} must match the non-survey " f"event-study phrasing; got " f"{_pt_subject_phrase(method)!r}" @@ -1128,9 +1123,10 @@ def test_undefined_pre_period_inference_yields_inconclusive_not_shrunken_bonferr "parallel trends hold" verdict from a partially-undefined pre-period surface. 
""" - import numpy as np from types import SimpleNamespace + import numpy as np + class MultiPeriodDiDResults: pass @@ -1181,7 +1177,6 @@ def test_nan_headline_yields_estimation_failure_prose_not_did_not_change(self): ``to_dict()["overall_interpretation"]`` must now emit an explicit estimation-failure sentence instead. """ - import numpy as np class DiDResults: pass @@ -1205,18 +1200,18 @@ class DiDResults: for label, prose in [("summary", summary), ("overall_interpretation", interp)]: lower = prose.lower() # Must NOT render directional / numeric prose on a NaN fit. - assert "did not change" not in lower, ( - f"{label} rendered 'did not change' on a NaN fit; got: {prose!r}" - ) - assert "nan" not in lower, ( - f"{label} rendered 'nan' in the stakeholder-facing prose; got: {prose!r}" - ) + assert ( + "did not change" not in lower + ), f"{label} rendered 'did not change' on a NaN fit; got: {prose!r}" + assert ( + "nan" not in lower + ), f"{label} rendered 'nan' in the stakeholder-facing prose; got: {prose!r}" assert "by nan" not in lower assert "ci: nan" not in lower # Must name the non-finite state explicitly. - assert "non-finite" in lower or "did not produce" in lower, ( - f"{label} must emit an estimation-failure sentence; got: {prose!r}" - ) + assert ( + "non-finite" in lower or "did not produce" in lower + ), f"{label} must emit an estimation-failure sentence; got: {prose!r}" def test_summary_prose_surfaces_inconclusive_pt_explicitly(self): """Round-35 P1 regression: when pre-trends is inconclusive @@ -1246,9 +1241,7 @@ class StackedDiDResults: -1: {"effect": 0.05, "se": 0.3, "p_value": float("nan"), "n_obs": 400}, } - dr_summary = DiagnosticReport( - obj, run_sensitivity=False, run_bacon=False - ).summary() + dr_summary = DiagnosticReport(obj, run_sensitivity=False, run_bacon=False).summary() br_summary = BusinessReport(obj).summary() # Both summaries must explicitly name the inconclusive state. 
@@ -1388,9 +1381,9 @@ class MultiPeriodDiDResults: obj.n_control = 50 obj.survey_metadata = None - pt = DiagnosticReport( - obj, run_sensitivity=False, run_bacon=False - ).to_dict()["parallel_trends"] + pt = DiagnosticReport(obj, run_sensitivity=False, run_bacon=False).to_dict()[ + "parallel_trends" + ] assert pt["verdict"] == "inconclusive" assert pt["method"] == "inconclusive" assert pt["n_dropped_undefined"] >= 1 @@ -1401,7 +1394,6 @@ def test_pretrends_power_adapter_filters_zero_se_cs(self): ``np.isfinite(se)`` so the power analysis never includes rows whose per-period SE collapsed. """ - from types import SimpleNamespace import numpy as np @@ -1998,6 +1990,211 @@ class CallawaySantAnnaResults: assert het["sign_consistent"] is True +# --------------------------------------------------------------------------- +# Round-40 P1: survey-design threading for fit-faithful replay +# --------------------------------------------------------------------------- +class TestSurveyDesignThreading: + """Round-40 P1 CI review on PR #318: when a fitted result carries + ``survey_metadata``, Goodman-Bacon and the simple 2x2 PT helper + cannot be faithfully replayed without the original ``SurveyDesign``. + + DR must: + * accept a ``survey_design`` kwarg; + * thread it to ``bacon_decompose(survey_design=...)`` when the + user supplies it; + * skip Bacon with an explicit reason when ``survey_metadata`` is + set but ``survey_design`` is not supplied; + * skip the simple 2x2 PT check with an explicit reason on + survey-backed ``DiDResults`` (the helper has no + ``survey_design`` parameter). 
+ """ + + def _did_with_survey(self): + from types import SimpleNamespace + + class DiDResults: + pass + + obj = DiDResults() + obj.att = 1.0 + obj.se = 0.2 + obj.t_stat = 5.0 + obj.p_value = 0.001 + obj.conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 400 + obj.n_treated = 100 + obj.n_control = 300 + obj.survey_metadata = SimpleNamespace( + design_effect=1.25, + effective_n=320.0, + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=20.0, + replicate_method=None, + ) + obj.inference_method = "analytical" + return obj + + def _staggered_stub_with_survey(self): + """Lightweight CS-like stub carrying survey_metadata for Bacon gating.""" + from types import SimpleNamespace + + class CallawaySantAnnaResults: + pass + + obj = CallawaySantAnnaResults() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 600 + obj.n_treated = 200 + obj.n_control_units = 400 + obj.survey_metadata = SimpleNamespace( + design_effect=1.5, + effective_n=400.0, + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=30.0, + replicate_method=None, + ) + obj.event_study_effects = None + return obj + + def test_survey_backed_did_skips_2x2_pt_with_reason(self): + """Survey-backed ``DiDResults`` must skip the 2x2 PT helper + (``utils.check_parallel_trends`` is unweighted) and produce a + skip reason naming the survey-design replay requirement. 
+ """ + obj = self._did_with_survey() + import pandas as pd + + panel = pd.DataFrame( + { + "outcome": [1.0, 2.0, 1.1, 2.2], + "post": [0, 1, 0, 1], + "treated": [0, 0, 1, 1], + } + ) + dr = DiagnosticReport( + obj, + data=panel, + outcome="outcome", + time="post", + treatment="treated", + ) + assert "parallel_trends" not in dr.applicable_checks + reason = dr.skipped_checks["parallel_trends"] + assert "survey design" in reason.lower() + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "skipped" + + def test_survey_backed_did_with_precomputed_pt_runs(self): + """When the user supplies ``precomputed={'parallel_trends': ...}`` + on a survey-backed DiDResults, DR must honor the override rather + than skip with the survey-design reason. + """ + obj = self._did_with_survey() + precomputed_pt = { + "p_value": 0.42, + "treated_trend": 0.05, + "control_trend": 0.04, + "trend_difference": 0.01, + "t_statistic": 0.8, + } + dr = DiagnosticReport( + obj, + precomputed={"parallel_trends": precomputed_pt}, + ) + assert "parallel_trends" in dr.applicable_checks + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran" + + def test_survey_backed_staggered_skips_bacon_without_survey_design(self): + """CS-like survey-backed fit: Bacon replay must skip with a + reason naming the survey-design requirement rather than produce + an unweighted decomposition for a weighted estimate. 
+ """ + obj = self._staggered_stub_with_survey() + import pandas as pd + + panel = pd.DataFrame( + { + "outcome": [1.0, 2.0, 1.1, 2.2, 1.2, 2.3, 1.3, 2.4], + "unit": [1, 1, 2, 2, 3, 3, 4, 4], + "period": [1, 2, 1, 2, 1, 2, 1, 2], + "first_treat": [0, 0, 0, 0, 2, 2, 2, 2], + } + ) + dr = DiagnosticReport( + obj, + data=panel, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + ) + assert "bacon" not in dr.applicable_checks + reason = dr.skipped_checks["bacon"] + assert "survey design" in reason.lower() + assert "survey_design" in reason or "SurveyDesign" in reason + bacon = dr.to_dict()["bacon"] + assert bacon["status"] == "skipped" + + def test_survey_backed_staggered_threads_survey_design_to_bacon(self): + """When ``survey_design`` is supplied, Bacon applicability flips + back to runnable and ``bacon_decompose`` is invoked with the + survey design. Assert via ``unittest.mock.patch`` that the + kwarg is forwarded. + """ + from unittest.mock import MagicMock, patch + + obj = self._staggered_stub_with_survey() + import pandas as pd + + panel = pd.DataFrame( + { + "outcome": [1.0, 2.0, 1.1, 2.2, 1.2, 2.3, 1.3, 2.4], + "unit": [1, 1, 2, 2, 3, 3, 4, 4], + "period": [1, 2, 1, 2, 1, 2, 1, 2], + "first_treat": [0, 0, 0, 0, 2, 2, 2, 2], + } + ) + + sentinel_design = object() + fake_decomp = MagicMock() + fake_decomp.total_weight_treated_vs_never = 0.9 + fake_decomp.total_weight_earlier_vs_later = 0.05 + fake_decomp.total_weight_later_vs_earlier = 0.05 + fake_decomp.twfe_estimate = 1.1 + fake_decomp.n_timing_groups = 2 + + with patch("diff_diff.bacon.bacon_decompose", return_value=fake_decomp) as m: + dr = DiagnosticReport( + obj, + data=panel, + outcome="outcome", + unit="unit", + time="period", + first_treat="first_treat", + survey_design=sentinel_design, + ) + # Applicability gate passes since survey_design is supplied. 
+ assert "bacon" in dr.applicable_checks + bacon = dr.to_dict()["bacon"] + assert bacon["status"] == "ran" + # The survey_design must be threaded through to + # bacon_decompose as a kwarg so the replayed decomposition + # matches the fitted design. + assert m.called, "bacon_decompose was not called" + _, kwargs = m.call_args + assert kwargs.get("survey_design") is sentinel_design + + # --------------------------------------------------------------------------- # Public API exposure # --------------------------------------------------------------------------- From 020d53748f06078a2db21a59e8ad8ed65028b175 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 16:01:25 -0400 Subject: [PATCH 44/48] Address forty-first round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-41 landed a single P3 finding (PR overall assessment ✅): the survey-threading docs and constructor docstrings overstated what ``survey_design`` unlocks. Supplying the design enables fit-faithful Goodman-Bacon replay, but the simple 2x2 parallel-trends helper (``utils.check_parallel_trends``) has no survey-aware variant, so survey-backed ``DiDResults`` PT is skipped unconditionally regardless of ``survey_design`` — the helper cannot consume the design even when it is available. Docs: - ``REPORTING.md`` now separates the Bacon replay contract (where ``survey_design`` is load-bearing) from the 2x2 PT contract (skip-only; use ``precomputed={'parallel_trends': ...}`` to opt in). - ``business_report.rst`` / ``diagnostic_report.rst`` mirror the split and point the reader at the precomputed-PT opt-in. - Constructor docstrings on both classes clarify the scope. 
Tests: added ``test_survey_backed_did_skips_2x2_pt_even_when_survey_design_supplied`` which passes both ``survey_metadata`` AND ``survey_design`` on a ``DiDResults`` stub and asserts PT still skips with a reason naming the precomputed-PT opt-in (not ``survey_design``). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 12 +++++++---- diff_diff/diagnostic_report.py | 12 +++++++---- docs/api/business_report.rst | 10 ++++++--- docs/api/diagnostic_report.rst | 17 ++++++++++----- docs/methodology/REPORTING.md | 38 +++++++++++++++++++-------------- tests/test_diagnostic_report.py | 38 +++++++++++++++++++++++++++++++++ 6 files changed, 95 insertions(+), 32 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 0e5b3f4e..24ddc722 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -131,10 +131,14 @@ class BusinessReport: estimator. Forwarded to the auto-constructed ``DiagnosticReport`` for fit-faithful Goodman-Bacon replay. When the fit carries ``survey_metadata`` but ``survey_design`` is not supplied, Bacon - and the simple 2x2 parallel-trends check are skipped with an - explicit reason rather than replaying an unweighted decomposition - for a design that does not match the estimate. See - ``docs/methodology/REPORTING.md``. + is skipped with an explicit reason rather than replaying an + unweighted decomposition for a design that does not match the + estimate. The simple 2x2 parallel-trends helper + (``utils.check_parallel_trends``) has no survey-aware variant; + on a survey-backed ``DiDResults`` it is skipped unconditionally + regardless of ``survey_design``. Supply + ``precomputed={'parallel_trends': ...}`` with a survey-aware + pretest to opt in. See ``docs/methodology/REPORTING.md``. 
""" def __init__( diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index facff564..aed7444a 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -273,10 +273,14 @@ class DiagnosticReport: estimator. Required for fit-faithful replay of Goodman-Bacon on a survey-backed fit; threaded to ``bacon_decompose(survey_design=...)``. When the fit carries ``survey_metadata`` but ``survey_design`` is - not supplied, Bacon and the simple 2x2 parallel-trends check are - skipped with an explicit reason rather than replaying an - unweighted decomposition for a design that does not match the - estimate. See ``docs/methodology/REPORTING.md``. + not supplied, Bacon is skipped with an explicit reason rather than + replaying an unweighted decomposition for a design that does not + match the estimate. The simple 2x2 parallel-trends helper + (``utils.check_parallel_trends``) has no survey-aware variant; + on a survey-backed ``DiDResults`` it is skipped unconditionally + regardless of ``survey_design``. Supply + ``precomputed={'parallel_trends': ...}`` with a survey-aware + pretest to opt in. See ``docs/methodology/REPORTING.md``. precomputed : dict, optional Map of check name to a pre-computed result object. Accepted keys (this is the full implemented list; unsupported keys raise diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst index 02f35da0..c95f0710 100644 --- a/docs/api/business_report.rst +++ b/docs/api/business_report.rst @@ -28,9 +28,13 @@ For survey-weighted fits (any result carrying ``survey_design=``. It is threaded through to ``bacon_decompose`` for a fit-faithful Goodman-Bacon replay. When ``survey_metadata`` is set but ``survey_design`` is not supplied, -Bacon and the simple 2x2 parallel-trends check are skipped with an -explicit reason so the report never emits an unweighted decomposition -or PT verdict for a design that differs from the estimate. 
+Bacon is skipped with an explicit reason so the report never emits +an unweighted decomposition for a design that differs from the +estimate. The simple 2x2 parallel-trends helper has no survey-aware +variant and is skipped unconditionally on a survey-backed +``DiDResults`` regardless of ``survey_design``; supply +``precomputed={'parallel_trends': ...}`` with a survey-aware +pretest to opt in. Methodology deviations (no traffic-light gates, pre-trends verdict thresholds, power-aware phrasing, unit-translation policy, schema diff --git a/docs/api/diagnostic_report.rst b/docs/api/diagnostic_report.rst index c3c3626d..0a72e2ca 100644 --- a/docs/api/diagnostic_report.rst +++ b/docs/api/diagnostic_report.rst @@ -28,11 +28,18 @@ For survey-weighted fits (any result carrying ``survey_design=``. It is threaded through to ``bacon_decompose`` for a fit-faithful Goodman-Bacon replay. When ``survey_metadata`` is set but ``survey_design`` is not supplied, -Bacon and the simple 2x2 parallel-trends check are skipped with an -explicit reason so the report never emits an unweighted decomposition -or PT verdict for a design that differs from the estimate. Alternatively -supply ``precomputed={'bacon': ...}`` or -``precomputed={'parallel_trends': ...}`` with a survey-aware result. +Bacon is skipped with an explicit reason so the report never emits +an unweighted decomposition for a design that differs from the +estimate; alternatively supply +``precomputed={'bacon': ...}`` with a +survey-aware result. + +The simple 2x2 parallel-trends helper has no survey-aware variant +and is skipped unconditionally on a survey-backed ``DiDResults`` +regardless of ``survey_design`` — the helper cannot consume the +design even when it is available. Supply +``precomputed={'parallel_trends': ...}`` with a survey-aware +pretest result to opt in.
Example ------- diff --git a/docs/methodology/REPORTING.md b/docs/methodology/REPORTING.md index e36fbd7f..fd10cfda 100644 --- a/docs/methodology/REPORTING.md +++ b/docs/methodology/REPORTING.md @@ -80,22 +80,28 @@ not new inference. Wald statistic (or Bonferroni fallback when `vcov` is missing). This mirrors the guidance in `practitioner._parallel_trends_step(staggered=True)`. -- **Note:** Survey-design threading for fit-faithful replay. When the - fitted result carries `survey_metadata`, Goodman-Bacon replay and - the simple 2x2 parallel-trends helper require the original - `SurveyDesign` object to produce a diagnostic that matches the - estimate. `DiagnosticReport(survey_design=...)` and - `BusinessReport(survey_design=...)` accept it and forward to - `bacon_decompose(survey_design=...)`. When `survey_metadata` is set - but `survey_design` is not supplied, both checks skip with an - explicit reason rather than replaying an unweighted decomposition - / PT verdict for a design that differs from the weighted estimate. - Users can alternatively pass `precomputed={'bacon': ...}` / - `precomputed={'parallel_trends': ...}` with a survey-aware result. - Event-study PT on staggered estimators already reads the weighted - pre-period coefficients directly off the fitted result (so does not - need a second replay) and uses the finite-df reference described - below. +- **Note:** Survey-design threading for fit-faithful Bacon replay. + `DiagnosticReport(survey_design=...)` and + `BusinessReport(survey_design=...)` accept the original + `SurveyDesign` object and forward it to + `bacon_decompose(survey_design=...)` so the Goodman-Bacon + decomposition is computed under the same design as the weighted + estimate. 
When `survey_metadata` is set but `survey_design` is not + supplied, Bacon skips with an explicit reason rather than replaying + an unweighted decomposition for a design that differs from the + weighted estimate; users can alternatively pass + `precomputed={'bacon': ...}` with a survey-aware result. + + The simple 2x2 parallel-trends helper (`utils.check_parallel_trends`) + has no survey-aware variant. On a survey-backed `DiDResults` the + check is skipped **unconditionally**, regardless of whether + `survey_design` is supplied, because the helper cannot consume the + design even when it is available. Users must pass + `precomputed={'parallel_trends': ...}` with a survey-aware pretest + result to opt in. Event-study PT on staggered estimators is + unaffected — it reads the weighted pre-period coefficients directly + off the fitted result and uses the finite-df reference described + below, so no second replay is needed. - **Note:** Survey finite-df PT policy. When the fitted result carries a finite `survey_metadata.df_survey`, `_pt_event_study` computes diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index 5b82c5fa..ca5f7863 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -2093,6 +2093,44 @@ def test_survey_backed_did_skips_2x2_pt_with_reason(self): pt = dr.to_dict()["parallel_trends"] assert pt["status"] == "skipped" + def test_survey_backed_did_skips_2x2_pt_even_when_survey_design_supplied(self): + """Round-41 P3 regression: supplying ``survey_design`` does NOT + unlock the simple 2x2 PT helper. ``utils.check_parallel_trends`` + has no survey-aware variant, so the helper cannot consume the + design even when it is available; the check is skipped + unconditionally on a survey-backed ``DiDResults`` and the skip + reason must point the user at the precomputed-PT opt-in rather + than imply that ``survey_design`` would have helped. 
+ """ + import pandas as pd + + obj = self._did_with_survey() + panel = pd.DataFrame( + { + "outcome": [1.0, 2.0, 1.1, 2.2], + "post": [0, 1, 0, 1], + "treated": [0, 0, 1, 1], + } + ) + sentinel_design = object() + dr = DiagnosticReport( + obj, + data=panel, + outcome="outcome", + time="post", + treatment="treated", + survey_design=sentinel_design, + ) + # Supplying survey_design does not unlock 2x2 PT. + assert "parallel_trends" not in dr.applicable_checks + reason = dr.skipped_checks["parallel_trends"] + # Reason must point at the precomputed-PT opt-in and must not + # claim ``survey_design`` fixes this path. + assert "precomputed" in reason.lower() + assert "parallel_trends" in reason.lower() + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "skipped" + def test_survey_backed_did_with_precomputed_pt_runs(self): """When the user supplies ``precomputed={'parallel_trends': ...}`` on a survey-backed DiDResults, DR must honor the override rather From 5788e26300ef27398649a38c84bb43cf64e2a022 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 16:24:03 -0400 Subject: [PATCH 45/48] Address forty-second round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-42 landed two P1 findings: 1. All-undefined pre-period surface routed to ``skipped`` instead of ``inconclusive`` (``diagnostic_report.py``). When every pre-row is dropped by ``_collect_pre_period_coefs`` for undefined inference (all ``se <= 0`` / non-finite effect/se), the collector returns ``([], n_dropped_undefined > 0)``. Both the applicability gate and ``_pt_event_study`` treated that as "no coefficients available" and skipped, letting BR drop the identifying-assumption warning. Fixed both sites to detect the all-undefined case and route to the explicit ``method="inconclusive"`` runner alongside the partial- undefined case already covered by R33. BR's existing inconclusive phrasing lifts through unchanged. 2. 
Source-faithful assumption text for ``ImputationDiDResults`` and ``TwoStageDiDResults`` (``business_report.py``). BR's ``_describe_assumption`` was grouping both with CS / SA / Wooldridge under the generic "parallel trends across treatment cohorts and time periods (group-time ATT)" template, but BJS (2024) and Gardner (2022) both identify through an untreated-potential-outcome model: unit+time FE fitted on untreated observations (``Omega_0`` = never-treated + not-yet-treated) deliver the counterfactual, and the identifying restriction is on ``E[Y_it(0)] = alpha_i + beta_t`` — not on cohort-time ATT equality. Split each into its own branch mirroring REGISTRY.md §ImputationDiD (lines 1000-1013) and §TwoStageDiD (lines 1113-1128), including the Gardner-BJS algebraic-equivalence note. Tests: 3 new regressions. - ``test_all_pre_periods_undefined_yields_inconclusive_not_skipped``: all pre-rows with ``se == 0``, asserts DR emits ``method="inconclusive"`` / ``status="ran"`` / ``n_pre_periods=0`` / ``n_dropped_undefined=2``, and BR summary emits "inconclusive". - ``test_imputation_did_assumption_uses_untreated_fe_model`` and ``test_two_stage_did_assumption_uses_untreated_fe_model``: lock the new ``parallel_trends_variant="untreated_outcome_fe_model"`` tag, require the registry-backed source attribution and untreated-subset detail, and reject the pre-R42 generic-PT template. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 72 ++++++++++++++++++++++++- diff_diff/diagnostic_report.py | 45 ++++++++++------ tests/test_business_report.py | 95 +++++++++++++++++++++++++++++++++ tests/test_diagnostic_report.py | 83 ++++++++++++++++++++++++++++ 4 files changed, 277 insertions(+), 18 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 24ddc722..9d57679c 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -1232,11 +1232,79 @@ def _describe_assumption(estimator_name: str, results: Any = None) -> Dict[str, block["control_group"] = clean_control block["clean_control"] = clean_control return block + if estimator_name == "ImputationDiDResults": + # Borusyak, Jaravel & Spiess (2024) — identification is through + # an untreated-potential-outcome model: unit+time FE (optionally + # plus covariates) fitted on untreated observations only + # (``Omega_0``) deliver the counterfactual ``Y_it(0)``, and the + # treatment effect ``tau_it`` is the residual on treated + # observations. Writing this as generic "group-time ATT + # parallel trends" misstates the identifying model — the + # restriction is on the UNTREATED outcome's additive FE + # structure, not on cohort-time ATT equality. REGISTRY.md + # §ImputationDiD lines 1000-1013 and Assumption 1 (parallel + # trends) + Assumption 2 (no anticipation on untreated + # observations). Round-42 P1 CI review on PR #318 flagged this + # source-faithfulness gap. + return { + "parallel_trends_variant": "untreated_outcome_fe_model", + "no_anticipation": True, + "description": ( + "Identification under Imputation DiD (Borusyak, Jaravel " + "& Spiess 2024): the untreated potential outcome " + "``Y_it(0)`` follows an additive unit+time fixed-effects " + "model ``Y_it(0) = alpha_i + beta_t [+ X'_it * delta] + " + "epsilon_it``. 
Step 1 estimates those FE on untreated " + "observations only (``Omega_0`` = never-treated plus " + "not-yet-treated cells); Step 2 imputes the " + "counterfactual for treated observations from the " + "fitted FE; Step 3 aggregates ``tau_hat_it = Y_it - " + "Y_hat_it(0)`` with researcher-chosen weights. The " + "identifying restriction is therefore parallel trends " + "of the UNTREATED outcome model (Assumption 1) — " + "``E[Y_it(0)] = alpha_i + beta_t``, holding across all " + "observations — rather than equality of cohort-time " + "ATTs. Also assumes no anticipation on untreated " + "observations (Assumption 2) and absorbing treatment." + ), + } + if estimator_name == "TwoStageDiDResults": + # Gardner (2022) — identification is the same as BJS + # ImputationDiD (point estimates are algebraically equivalent + # per REGISTRY.md §TwoStageDiD line 1130): unit+time FE + # estimated on untreated observations only deliver the + # untreated potential-outcome trajectory; Stage 2 regresses + # the resulting residuals on treatment indicators. Writing + # this as generic "group-time ATT parallel trends" loses the + # load-bearing detail that Stage 1 operates only on untreated + # cells. REGISTRY.md §TwoStageDiD lines 1113-1128 and + # Assumption (same as ImputationDiD). Round-42 P1 CI review on + # PR #318 flagged this source-faithfulness gap. + return { + "parallel_trends_variant": "untreated_outcome_fe_model", + "no_anticipation": True, + "description": ( + "Identification under Two-Stage DiD (Gardner 2022): " + "Stage 1 fits unit + time fixed effects on untreated " + "observations only (``Omega_0``), residualizing the " + "outcome as ``y_tilde_it = Y_it - alpha_hat_i - " + "beta_hat_t``; Stage 2 regresses residualized outcomes " + "on the treatment indicator across treated observations " + "to recover the ATT. 
The point estimates are " + "algebraically equivalent to Borusyak-Jaravel-Spiess " + "imputation (both rely on the same untreated-outcome FE " + "model to construct the counterfactual). The " + "identifying restriction is therefore parallel trends " + "of the UNTREATED outcome: ``E[Y_it(0)] = alpha_i + " + "beta_t`` for all observations (not a group-time ATT " + "equality across cohorts). Also assumes no anticipation " + "(``Y_it = Y_it(0)`` for all untreated observations) " + "and absorbing / irreversible treatment." + ), + } if estimator_name in { "CallawaySantAnnaResults", "SunAbrahamResults", - "ImputationDiDResults", - "TwoStageDiDResults", "WooldridgeDiDResults", }: return { diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index aed7444a..d9255ad6 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -615,8 +615,17 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: "opt in." ) if method == "event_study": - pre_coefs, _ = _collect_pre_period_coefs(r) - if not pre_coefs: + pre_coefs, n_dropped_undefined = _collect_pre_period_coefs(r) + # Round-42 P1 CI review on PR #318: the all-undefined + # pre-period case (every pre-row dropped for ``se <= 0`` + # / non-finite inference) is the twin of the partial- + # undefined case from round-33. It must route to the + # inconclusive runner rather than skip, so the explicit + # ``method="inconclusive"`` / ``n_dropped_undefined`` + # provenance is surfaced through DR's schema and BR's + # summary emits the "inconclusive" identifying- + # assumption warning rather than silently dropping PT. + if not pre_coefs and n_dropped_undefined == 0: return ( "No pre-period event-study coefficients are exposed on " "this fit. 
For staggered estimators, re-fit with " @@ -1083,20 +1092,19 @@ def _pt_event_study(self) -> Dict[str, Any]: """ r = self._results pre_coefs, n_dropped_undefined = _collect_pre_period_coefs(r) - if not pre_coefs: - return { - "status": "skipped", - "reason": "No pre-period event-study coefficients available.", - } - # Round-33 P0 CI review on PR #318: if any real pre-period was - # rejected for undefined inference (``se <= 0`` or non-finite - # ``effect`` / ``se``), the Bonferroni fallback used to silently - # shrink the test family on the remaining subset and publish a - # finite joint p-value that then lifted into clean BR prose. - # That violates the ``safe_inference`` contract (``se <= 0`` -> - # NaN downstream). Return an explicit inconclusive PT result - # instead — the user cannot conclude "PT holds" from a - # partially-undefined pre-period surface. + # Round-33 P0 / Round-42 P1 CI review on PR #318: undefined- + # inference rows must drive an explicit ``inconclusive`` PT + # result rather than either (a) silently shrinking the + # Bonferroni family on the remaining subset and publishing a + # finite joint p-value (R33, mixed-partial case), or (b) + # routing through the empty-coefs ``skipped`` path when every + # pre-row was rejected (R42, all-undefined case). Both violate + # the ``safe_inference`` contract: ``se <= 0`` / non-finite + # effect or SE yields NaN downstream per ``utils.py`` line + # 175, REGISTRY.md line 197. The inconclusive block preserves + # the undefined-row count on the schema so BR's summary can + # quote it and stakeholders see an explicit "PT could not be + # assessed" warning rather than a silent PT-absent narrative. if n_dropped_undefined > 0: return { "status": "ran", @@ -1119,6 +1127,11 @@ def _pt_event_study(self) -> Dict[str, Any]: "investigate why the per-period SE collapsed." 
), } + if not pre_coefs: + return { + "status": "skipped", + "reason": "No pre-period event-study coefficients available.", + } interaction_indices = getattr(r, "interaction_indices", None) vcov = getattr(r, "vcov", None) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index 742308f5..d9c0db95 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -750,6 +750,101 @@ class StaggeredTripleDiffResults: # Must NOT be the generic group-time PT text. assert "group-time ATT" not in desc + def test_imputation_did_assumption_uses_untreated_fe_model(self): + """Round-42 P1 regression: BJS (2024) identifies through the + untreated-outcome FE model (Step 1 estimates FE on ``Omega_0`` + = never-treated + not-yet-treated observations, Assumption 1 + parallel trends applies to ``E[Y_it(0)]``). The old generic + "group-time ATT" wording misstated this: the identifying + restriction is on the UNTREATED outcome's additive FE + structure, not on cohort-time ATT equality. REGISTRY.md + §ImputationDiD lines 1000-1013 and Assumption 1/2. + """ + + class ImputationDiDResults: + pass + + obj = ImputationDiDResults() + obj.overall_att = 1.0 + obj.overall_se = 0.1 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 40 + obj.n_control = 60 + obj.survey_metadata = None + obj.event_study_effects = None + obj.inference_method = "analytical" + obj.anticipation = 0 + + br = BusinessReport(obj, auto_diagnostics=False) + assumption = br.to_dict()["assumption"] + assert assumption["parallel_trends_variant"] == "untreated_outcome_fe_model" + desc = assumption["description"] + # Registry-backed: Borusyak-Jaravel-Spiess attribution. + assert "Borusyak" in desc or "BJS" in desc or "2024" in desc + # Load-bearing source detail: untreated-observation FE model. 
+ assert "untreated" in desc.lower() + assert "Omega_0" in desc or "fixed effect" in desc.lower() + # Must NOT render the pre-R42 generic group-time-ATT template + # that grouped BJS in with CS / SA. + assert ( + "parallel trends across treatment cohorts and time periods (group-time ATT)" not in desc + ), ( + "ImputationDiD identifies via untreated-outcome FE modelling " + "(BJS 2024 Assumption 1), not generic group-time ATT PT. The " + f"assumption description must not use the pre-R42 template. Got: {desc!r}" + ) + + def test_two_stage_did_assumption_uses_untreated_fe_model(self): + """Round-42 P1 regression: Gardner (2022) two-stage DiD shares + BJS's untreated-outcome FE identification (REGISTRY.md explicitly + states "Parallel trends (same as ImputationDiD)" and the point + estimates are algebraically equivalent). Stage 1 fits FE on + untreated observations, Stage 2 residualizes treated observations. + The old generic "group-time ATT" wording dropped the untreated- + subset detail. REGISTRY.md §TwoStageDiD lines 1113-1128. + """ + + class TwoStageDiDResults: + pass + + obj = TwoStageDiDResults() + obj.overall_att = 1.0 + obj.overall_se = 0.1 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 40 + obj.n_control = 60 + obj.survey_metadata = None + obj.event_study_effects = None + obj.inference_method = "analytical" + obj.anticipation = 0 + + br = BusinessReport(obj, auto_diagnostics=False) + assumption = br.to_dict()["assumption"] + assert assumption["parallel_trends_variant"] == "untreated_outcome_fe_model" + desc = assumption["description"] + # Registry-backed: Gardner 2022 attribution. + assert "Gardner" in desc or "2022" in desc + # Load-bearing: Stage 1 operates on untreated observations. + assert "untreated" in desc.lower() + assert "Stage 1" in desc or "stage 1" in desc.lower() + # Must mention the two-stage procedure. 
+ assert "two-stage" in desc.lower() or "Two-Stage" in desc + # Must NOT render the pre-R42 generic group-time-ATT template + # that grouped Gardner in with CS / SA. + assert ( + "parallel trends across treatment cohorts and time periods (group-time ATT)" not in desc + ), ( + "TwoStageDiD identifies via the same untreated-outcome FE " + "model as ImputationDiD (Gardner 2022); the assumption " + f"description must not use the pre-R42 template. Got: {desc!r}" + ) + class TestEfficientDiDAssumptionPtAllPtPost: """Round-8 regression: EfficientDiD has two distinct PT regimes diff --git a/tests/test_diagnostic_report.py b/tests/test_diagnostic_report.py index ca5f7863..2f3800a6 100644 --- a/tests/test_diagnostic_report.py +++ b/tests/test_diagnostic_report.py @@ -1388,6 +1388,89 @@ class MultiPeriodDiDResults: assert pt["method"] == "inconclusive" assert pt["n_dropped_undefined"] >= 1 + def test_all_pre_periods_undefined_yields_inconclusive_not_skipped(self): + """Round-42 P1 regression: the twin of the partially-undefined + case. When every pre-period row is dropped by the collector + for undefined inference (all ``se <= 0`` or non-finite effect/SE), + ``_collect_pre_period_coefs`` returns ``([], n_dropped_undefined > 0)``. + The prior behavior routed through the empty-coefs ``skipped`` + path ("No pre-period event-study coefficients available"), + which let BR drop the identifying-assumption warning and render + a silent-PT-absent narrative. That violates the inconclusive + contract documented in REPORTING.md: when any pre-row is + dropped for undefined inference, the joint PT test is + inconclusive, not skipped. 
+ """ + from diff_diff import BusinessReport + + class StackedDiDResults: + pass + + obj = StackedDiDResults() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 400 + obj.n_treated_units = 100 + obj.n_control_units = 300 + obj.survey_metadata = None + # All pre-rows have ``se == 0`` — undefined inference per the + # safe-inference contract (``utils.py:175``). The collector's + # ``se > 0`` filter drops all of them, leaving pre_coefs=[] + # with n_dropped_undefined=2 (the R42 all-undefined case). + obj.event_study_effects = { + -2: { + "effect": 0.1, + "se": 0.0, + "p_value": 1.0, + "n_obs": 400, + }, + -1: { + "effect": 0.05, + "se": 0.0, + "p_value": 1.0, + "n_obs": 400, + }, + } + + dr = DiagnosticReport(obj, run_sensitivity=False, run_bacon=False) + # Applicability gate: PT must be marked applicable (runs as + # inconclusive), not skipped with "no coefficients available". + assert "parallel_trends" in dr.applicable_checks, ( + "All-undefined pre-period case must keep PT applicable so " + "the inconclusive runner can emit the explicit " + "n_dropped_undefined provenance. Current skipped reasons: " + f"{dr.skipped_checks}" + ) + pt = dr.to_dict()["parallel_trends"] + assert pt["status"] == "ran", pt + assert pt["method"] == "inconclusive", ( + f"All-undefined pre-period family must route to the " + f"inconclusive runner, not 'skipped'. Got status=" + f"{pt.get('status')!r}, method={pt.get('method')!r}, " + f"reason={pt.get('reason')!r}" + ) + assert pt["verdict"] == "inconclusive" + assert pt["joint_p_value"] is None + # All-undefined: n_dropped_undefined equals attempted pre-period + # count (2 rows here), and the valid subset is empty. + assert pt["n_dropped_undefined"] == 2 + assert pt["n_pre_periods"] == 0 + + # BR must surface this as an inconclusive identifying- + # assumption warning, not silently omit PT. 
The "inconclusive" + # verdict phrasing is the load-bearing contract for + # stakeholders. + br_summary = BusinessReport(obj).summary().lower() + assert "inconclusive" in br_summary, ( + f"All-undefined PT must surface 'inconclusive' in BR " f"summary. Got: {br_summary!r}" + ) + # And must not claim PT was untested / no-coefs. + assert "no pre-period event-study coefficients" not in br_summary + assert "consistent with parallel trends" not in br_summary + def test_pretrends_power_adapter_filters_zero_se_cs(self): """Round-33 P0 regression: CS / SA ``compute_pretrends_power`` adapters also use the ``se > 0`` filter alongside From e2115bcf7e7acdf42e7a8182e7941f9a676220fe Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 16:52:21 -0400 Subject: [PATCH 46/48] Address forty-third round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-43 assessment was ✅ with two P2 findings; both are boundary / contract mismatches rather than methodology defects. 1. DEFF==1.05 boundary inconsistency. REPORTING.md defines the ``trivial`` band as ``0.95 <= deff < 1.05`` (half-open) and ``slightly_reduces`` as starting at ``1.05``. The ``is_trivial`` flag in both DR's ``_check_design_effect`` and BR's sample-block copy used ``<= 1.05`` (closed), so exactly ``deff == 1.05`` landed in the ``slightly_reduces`` band AND was flagged ``is_trivial=True`` — internally inconsistent, and the flag suppressed the non-trivial prose the documented threshold says should fire. Aligned both ``is_trivial`` bounds with REPORTING.md's half-open interval. 2. ``BusinessReport`` did not accept the ``precomputed=`` dict that its docstring and API docs advertised as the opt-in path for survey-aware 2x2 PT (``precomputed={'parallel_trends': ...}``) and other escape hatches. The auto path only synthesized ``{"sensitivity": honest_did_results}``, so a user following the BR docs hit a ``TypeError`` on ``__init__``. 
Added ``precomputed=`` kwarg to ``BusinessReport``, eager key validation mirroring DR's set (keys: ``parallel_trends`` / ``sensitivity`` / ``pretrends_power`` / ``bacon``), and forwarded the merged dict to the auto-constructed DR. ``honest_did_results`` remains a shorthand for ``sensitivity``; explicit ``precomputed['sensitivity']`` wins on conflict. Tests: 5 new regressions. - ``test_deff_exactly_1_05_is_slightly_reduces_not_trivial`` + ``test_deff_just_under_1_05_is_trivial`` cover the exact-boundary + adjacent-point schema behavior across DR and BR. - ``TestBusinessReportPrecomputedPassthrough`` covers the happy path (PT precomputed unlocks survey-backed 2x2), eager key validation (unknown key raises ``ValueError`` at BR construction), and the explicit-vs-shorthand precedence rule. Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 67 ++++++++++- diff_diff/diagnostic_report.py | 12 +- docs/api/business_report.rst | 9 ++ tests/test_business_report.py | 196 +++++++++++++++++++++++++++++++++ 4 files changed, 280 insertions(+), 4 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 9d57679c..7f8264d2 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -139,6 +139,15 @@ class BusinessReport: regardless of ``survey_design``. Supply ``precomputed={'parallel_trends': ...}`` with a survey-aware pretest to opt in. See ``docs/methodology/REPORTING.md``. + precomputed : dict, optional + Pre-computed diagnostic objects forwarded to the auto- + constructed ``DiagnosticReport`` (same keys as + ``DiagnosticReport(precomputed=...)``): ``"parallel_trends"``, + ``"sensitivity"``, ``"pretrends_power"``, ``"bacon"``. DR + validates keys and rejects estimator-incompatible entries + (e.g., HonestDiD bounds or generic PT on SDiD / TROP). + ``honest_did_results`` remains a shorthand for ``sensitivity``; + an explicit ``precomputed['sensitivity']`` wins on conflict. 
""" def __init__( @@ -162,6 +171,7 @@ def __init__( time: Optional[str] = None, first_treat: Optional[str] = None, survey_design: Optional[Any] = None, + precomputed: Optional[Dict[str, Any]] = None, ): if type(results).__name__ == "BaconDecompositionResults": raise TypeError( @@ -229,6 +239,42 @@ def __init__( # Without this passthrough, the auto path silently replays an # unweighted decomposition / PT verdict for a weighted fit. self._dr_survey_design = survey_design + # Round-43 P2 CI review on PR #318: BR docs and docstrings + # advertised a ``precomputed={'parallel_trends': ...}`` opt-in + # for survey-aware 2x2 PT and other escape hatches, but BR did + # not actually accept a ``precomputed=`` kwarg — the auto path + # only synthesized ``{"sensitivity": honest_did_results}``, so + # callers following the BR docs hit a ``TypeError`` on + # ``__init__``. Accept the passthrough here and forward every + # key to the auto-constructed DR (which owns validation against + # its implemented-key set and estimator-aware rejection rules). + # ``honest_did_results`` still feeds into ``sensitivity`` as a + # convenience; an explicit ``precomputed['sensitivity']`` wins + # on conflict. + self._dr_precomputed: Dict[str, Any] = dict(precomputed or {}) + # Round-43 P2 CI review on PR #318: mirror DR's eager key + # validation so users get the "unsupported key" error at BR + # construction rather than lazily when the DR is built inside + # ``to_dict()``. Kept in sync with ``DiagnosticReport``'s + # ``_supported_precomputed`` set; the cheapest way to avoid + # drift would be to import the set, but DR currently scopes it + # locally to ``__init__`` so mirror the literal here with a + # pointer comment. 
+ _br_supported_precomputed = { + "parallel_trends", + "sensitivity", + "pretrends_power", + "bacon", + } + _br_unsupported = set(self._dr_precomputed) - _br_supported_precomputed + if _br_unsupported: + raise ValueError( + "precomputed= contains keys that are not implemented: " + f"{sorted(_br_unsupported)}. Supported keys: " + f"{sorted(_br_supported_precomputed)}. ``design_effect``, " + "``heterogeneity``, and ``epv`` are read directly from the " + "fitted result and do not accept precomputed overrides." + ) resolved_alpha = alpha if alpha is not None else getattr(results, "alpha", 0.05) self._context = BusinessContext( @@ -307,9 +353,16 @@ def _resolve_diagnostics(self) -> Optional[DiagnosticReportResults]: raise TypeError("diagnostics= must be a DiagnosticReport or DiagnosticReportResults") if not self._auto_diagnostics: return None - precomputed: Dict[str, Any] = {} + # Round-43 P2 CI review on PR #318: forward the user's + # ``precomputed`` dict through to DR. ``honest_did_results`` + # stays a convenience shortcut for ``sensitivity`` only; an + # explicit ``precomputed['sensitivity']`` from the caller + # wins. DR handles key validation (rejects unsupported keys + # and estimator-incompatible sensitivities / parallel_trends + # entries) so BR just merges and forwards. 
+ precomputed: Dict[str, Any] = dict(self._dr_precomputed) if self._honest_did_results is not None: - precomputed["sensitivity"] = self._honest_did_results + precomputed.setdefault("sensitivity", self._honest_did_results) dr = DiagnosticReport( self._results, alpha=self._context.alpha, @@ -666,7 +719,15 @@ def _extract_survey_block(self) -> Optional[Dict[str, Any]]: "weight_type": getattr(sm, "weight_type", None), "effective_n": _safe_float(getattr(sm, "effective_n", None)), "design_effect": deff, - "is_trivial": deff is not None and 0.95 <= deff <= 1.05, + # Round-43 P2 CI review on PR #318: the ``is_trivial`` + # upper bound matches DR's ``_check_design_effect`` and + # REPORTING.md's ``trivial`` band definition + # ``0.95 <= deff < 1.05`` (half-open). The prior closed + # interval ``<= 1.05`` produced ``is_trivial=True`` at + # exactly ``deff == 1.05`` while the DR schema emitted + # ``band_label="slightly_reduces"`` for the same value, + # suppressing BR's non-trivial prose at that boundary. + "is_trivial": deff is not None and 0.95 <= deff < 1.05, "n_strata": _safe_int(getattr(sm, "n_strata", None)), "n_psu": _safe_int(getattr(sm, "n_psu", None)), "df_survey": _safe_int(getattr(sm, "df_survey", None)), diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index d9255ad6..1b4fbd17 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -1717,7 +1717,17 @@ def _check_design_effect(self) -> Dict[str, Any]: # ``deff < 0.95`` and keep ``is_trivial`` restricted to the # tight "effectively no effect" window so the schema # carries the precision-improving signal explicitly. - is_trivial = deff is not None and 0.95 <= deff <= 1.05 + # + # Round-43 P2 CI review on PR #318: the ``is_trivial`` upper + # bound used ``<= 1.05`` (closed interval) but REPORTING.md + # defines the ``trivial`` band as ``0.95 <= deff < 1.05`` + # (half-open) and ``slightly_reduces`` as ``1.05 <= deff < 2``. 
+ # At exactly ``deff == 1.05`` the schema emitted + # ``band_label="slightly_reduces"`` while also setting + # ``is_trivial=True``, suppressing the non-trivial prose that + # the documented threshold says should fire. Align the + # ``is_trivial`` bound with the band-label bound. + is_trivial = deff is not None and 0.95 <= deff < 1.05 if deff is None or not np.isfinite(deff): band_label: Optional[str] = None elif deff < 0.95: diff --git a/docs/api/business_report.rst b/docs/api/business_report.rst index c95f0710..3017dbf2 100644 --- a/docs/api/business_report.rst +++ b/docs/api/business_report.rst @@ -14,6 +14,15 @@ to surface pre-trends, sensitivity, and other validity checks as part of the narrative. Pass ``auto_diagnostics=False`` to skip this, or ``diagnostics=`` to supply an explicit one. +Pre-computed diagnostics can be forwarded directly to the auto- +constructed ``DiagnosticReport`` via +``precomputed={'parallel_trends': ...}``, +``precomputed={'sensitivity': ...}``, +``precomputed={'pretrends_power': ...}``, or +``precomputed={'bacon': ...}`` — same keys as +``DiagnosticReport(precomputed=...)``. DR validates keys and rejects +estimator-incompatible entries. + Data-dependent checks (2x2 parallel trends on simple DiD, Goodman-Bacon decomposition on staggered estimators, the EfficientDiD Hausman PT-All vs PT-Post pretest) require the raw panel + column diff --git a/tests/test_business_report.py b/tests/test_business_report.py index d9c0db95..f5dbb31d 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -3319,6 +3319,72 @@ def test_materially_reduces_band_under_5(self): def test_large_warning_band_at_or_above_5(self): assert self._stub_with_deff(7.5)["band_label"] == "large_warning" + def test_deff_exactly_1_05_is_slightly_reduces_not_trivial(self): + """Round-43 P2 regression: REPORTING.md defines the ``trivial`` + band as ``0.95 <= deff < 1.05`` (half-open) and + ``slightly_reduces`` as starting at ``1.05``. 
The prior code used + ``is_trivial = 0.95 <= deff <= 1.05`` (closed), producing a + schema that labeled exactly ``deff == 1.05`` as + ``band_label="slightly_reduces"`` AND ``is_trivial=True`` — + internally inconsistent, and the ``is_trivial=True`` flag + suppressed the non-trivial prose that the documented threshold + says should fire. + """ + # DR schema: band_label="slightly_reduces" and is_trivial=False. + block = self._stub_with_deff(1.05) + assert block["band_label"] == "slightly_reduces", ( + f"DEFF==1.05 must land in the ``slightly_reduces`` band per " + f"REPORTING.md (half-open threshold). Got: {block!r}" + ) + assert block["is_trivial"] is False, ( + f"DEFF==1.05 must NOT be classified as trivial (half-open " + f"threshold). Got is_trivial={block['is_trivial']!r}" + ) + + # BR's sample-block ``is_trivial`` must match. + from types import SimpleNamespace + + from diff_diff import BusinessReport + + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.2 + stub.overall_p_value = 0.001 + stub.overall_conf_int = (0.6, 1.4) + stub.alpha = 0.05 + stub.n_obs = 500 + stub.n_treated = 100 + stub.n_control_units = 400 + stub.event_study_effects = None + stub.survey_metadata = SimpleNamespace( + design_effect=1.05, + effective_n=500.0 / 1.05, + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=None, + replicate_method=None, + ) + br = BusinessReport(stub, auto_diagnostics=False) + sample = br.to_dict()["sample"] + assert sample["survey"] is not None + assert sample["survey"]["is_trivial"] is False, ( + f"BR sample-block ``is_trivial`` at DEFF==1.05 must match " + f"DR's half-open threshold. Got: {sample['survey']!r}" + ) + + def test_deff_just_under_1_05_is_trivial(self): + """Round-43 P2 regression: the point just below the upper + bound, ``deff == 1.049``, is still inside the half-open ``trivial`` + band ``[0.95, 1.05)``.
+ """ + block = self._stub_with_deff(1.049) + assert block["band_label"] == "trivial" + assert block["is_trivial"] is True + class TestSDiDTROPRejectIncompatiblePrecomputedInputs: """Round-21 P1 CI review on PR #318: ``precomputed={"sensitivity": @@ -3978,3 +4044,133 @@ def test_survey_backed_staggered_br_skips_bacon_without_survey_design(self): assert bacon_block.get("status") == "skipped" reason = (bacon_block.get("reason") or "").lower() assert "survey design" in reason + + +class TestBusinessReportPrecomputedPassthrough: + """Round-43 P2 CI review on PR #318: ``BusinessReport`` must accept + the ``precomputed=`` dict that its docs advertise and forward every + key to the auto-constructed ``DiagnosticReport``. DR owns key + validation (rejects unsupported keys and estimator-incompatible + entries).""" + + def _did_with_survey(self): + from types import SimpleNamespace + + class DiDResults: + pass + + obj = DiDResults() + obj.att = 1.0 + obj.se = 0.2 + obj.t_stat = 5.0 + obj.p_value = 0.001 + obj.conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 400 + obj.n_treated = 100 + obj.n_control = 300 + obj.survey_metadata = SimpleNamespace( + design_effect=1.25, + effective_n=320.0, + weight_type="pweight", + n_strata=None, + n_psu=None, + df_survey=20.0, + replicate_method=None, + ) + obj.inference_method = "analytical" + return obj + + def test_br_accepts_precomputed_parallel_trends_on_survey_did(self): + """The BR docs advertise + ``precomputed={'parallel_trends': ...}`` as the opt-in for + survey-backed 2x2 PT. 
That contract must actually be + reachable from BR's constructor, not just DR's.""" + obj = self._did_with_survey() + precomputed_pt = { + "p_value": 0.42, + "treated_trend": 0.05, + "control_trend": 0.04, + "trend_difference": 0.01, + "t_statistic": 0.8, + } + br = BusinessReport( + obj, + outcome_label="Revenue", + outcome_unit="$", + precomputed={"parallel_trends": precomputed_pt}, + ) + schema = br.to_dict() + dr_schema = schema["diagnostics"]["schema"] + pt = dr_schema["parallel_trends"] + assert pt["status"] == "ran", ( + "BR precomputed PT passthrough must unlock the otherwise-" + f"skipped survey-backed 2x2 path. Got PT block: {pt!r}" + ) + assert pt["joint_p_value"] == pytest.approx(0.42) + + def test_br_rejects_unsupported_precomputed_key(self): + """DR validates the precomputed key set; BR must raise the same + error rather than silently dropping unsupported keys.""" + obj = self._did_with_survey() + with pytest.raises(ValueError, match="precomputed="): + BusinessReport(obj, precomputed={"unknown_check": object()}) + + def test_br_explicit_precomputed_sensitivity_wins_over_honest_did_results(self): + """When both ``honest_did_results`` (shorthand) and explicit + ``precomputed['sensitivity']`` are supplied, the explicit + passthrough wins. 
Documented contract: honest_did_results is a + convenience that only kicks in when no explicit sensitivity is + present.""" + from types import SimpleNamespace + + class MultiPeriodDiDResults: + pass + + obj = MultiPeriodDiDResults() + obj.avg_att = 1.0 + obj.avg_se = 0.1 + obj.avg_p_value = 0.001 + obj.avg_conf_int = (0.8, 1.2) + obj.alpha = 0.05 + obj.n_obs = 100 + obj.n_treated = 40 + obj.n_control = 60 + obj.survey_metadata = None + obj.pre_period_effects = {} + obj.vcov = None + obj.interaction_indices = None + obj.event_study_vcov = None + obj.event_study_vcov_index = None + + explicit_sens = SimpleNamespace( + M_values=[0.5, 1.0], + bounds=[(0.1, 2.0), (-0.2, 2.5)], + robust_cis=[(0.05, 2.1), (-0.3, 2.6)], + breakdown_M=0.87, # sentinel — the explicit one's breakdown + method="relative_magnitude", + original_estimate=1.0, + original_se=0.1, + alpha=0.05, + ) + shorthand_sens = SimpleNamespace( + M_values=[0.5, 1.0], + bounds=[(0.1, 2.0), (-0.2, 2.5)], + robust_cis=[(0.05, 2.1), (-0.3, 2.6)], + breakdown_M=0.33, # different sentinel + method="relative_magnitude", + original_estimate=1.0, + original_se=0.1, + alpha=0.05, + ) + br = BusinessReport( + obj, + honest_did_results=shorthand_sens, + precomputed={"sensitivity": explicit_sens}, + ) + dr_schema = br.to_dict()["diagnostics"]["schema"] + sens = dr_schema["sensitivity"] + assert sens.get("breakdown_M") == pytest.approx(0.87), ( + "Explicit precomputed['sensitivity'] must win over " + "honest_did_results shorthand. 
Got sens block: " + repr(sens) + ) From 1085e724c0af85a409994a95d053c2107af9d486 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 17:10:19 -0400 Subject: [PATCH 47/48] Address forty-fourth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-44 landed one P1 methodology finding: ``precomputed['sensitivity']`` (and BR's ``honest_did_results`` shorthand that merges into it) bypassed the varying-base ``CallawaySantAnna`` guard. The applicability gate's precomputed early-return fired before the ``base_period != 'universal'`` check, so a displayed CS fit with ``base_period='varying'`` would get its sensitivity section unlocked and the Rambachan-Roth bounds narrated as ordinary robustness — even though HonestDiD explicitly warns those bounds are not valid for interpretation on the consecutive-comparison pre-period surface ``varying`` base produces (REGISTRY.md §CallawaySantAnna line 410, §HonestDiD line 2458). Narrating the bounds alongside the displayed varying-base fit mixes provenance the bounds do not support, which is the silent-failure pattern the varying-base auto-path skip was designed to prevent. Fixes: - ``diagnostic_report.py`` ``__init__``: raise ``ValueError`` when ``precomputed['sensitivity']`` is supplied on ``CallawaySantAnnaResults`` with ``base_period != 'universal'``, mirroring the existing SDiD/TROP rejection pattern for methodology-incompatible passthroughs. - ``diagnostic_report.py`` ``_instance_skip_reason``: reorder the sensitivity gate so the CS varying-base check fires BEFORE the precomputed early-return (defense-in-depth behind the ``__init__`` raise; also protects against callers that mutate ``_precomputed`` post-construction). - ``business_report.py`` ``__init__``: raise on the same interaction when either ``honest_did_results`` or ``precomputed['sensitivity']`` is supplied (or both — the error names each rejected input). 
Tests: 5 new regressions in ``TestCSVaryingBaseSensitivityRejectsPrecomputed`` covering both DR and BR, both passthrough paths, the union-error case, and the universal-base positive case (supported and not rejected). Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 40 ++++++++++++++ diff_diff/diagnostic_report.py | 73 ++++++++++++++++++++----- tests/test_business_report.py | 99 ++++++++++++++++++++++++++++++++++ 3 files changed, 200 insertions(+), 12 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 7f8264d2..451e5540 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -216,6 +216,46 @@ def __init__( "automatically under ``estimator_native_diagnostics``." ) + # Round-44 P1 CI review on PR #318: mirror the SDiD/TROP + # rejection pattern for ``CallawaySantAnna`` fits with + # ``base_period != "universal"``. HonestDiD Rambachan-Roth + # bounds are not valid for interpretation on the consecutive- + # comparison pre-period surface produced by ``varying`` base, + # so narrating precomputed sensitivity (whether passed as + # ``honest_did_results`` or ``precomputed['sensitivity']``) + # alongside a displayed varying-base fit mixes provenance the + # bounds don't support. DR enforces the same guard at + # construction; BR duplicates the check so the error fires + # before the auto-DR is built, matching the existing + # SDiD/TROP UX. REGISTRY.md §CallawaySantAnna line 410, + # §HonestDiD line 2458. 
+ _cs_with_varying_base = type(results).__name__ == "CallawaySantAnnaResults" and ( + getattr(results, "base_period", "universal") != "universal" + ) + if _cs_with_varying_base: + _rejected_inputs: List[str] = [] + if honest_did_results is not None: + _rejected_inputs.append("honest_did_results") + if precomputed is not None and "sensitivity" in precomputed: + _rejected_inputs.append("precomputed['sensitivity']") + if _rejected_inputs: + _base_period = getattr(results, "base_period", "universal") + raise ValueError( + f"CallawaySantAnnaResults with " + f"``base_period={_base_period!r}`` cannot be " + "summarized alongside a precomputed HonestDiD " + "sensitivity object. The Rambachan-Roth bounds are " + "not valid for interpretation on the consecutive-" + "comparison pre-period surface this base yields " + "(REGISTRY.md §CallawaySantAnna / §HonestDiD). " + "Rejected inputs: " + ", ".join(_rejected_inputs) + ". " + "Re-fit the main estimator with " + "``CallawaySantAnna(base_period='universal')`` " + "before passing precomputed sensitivity, or drop " + "the sensitivity passthrough to let BR skip the " + "section with a methodology-critical reason." + ) + self._results = results self._honest_did_results = honest_did_results self._auto_diagnostics = auto_diagnostics diff --git a/diff_diff/diagnostic_report.py b/diff_diff/diagnostic_report.py index 1b4fbd17..0fe798a9 100644 --- a/diff_diff/diagnostic_report.py +++ b/diff_diff/diagnostic_report.py @@ -421,6 +421,38 @@ def __init__( "``pre_treatment_fit``; TROP: ``effective_rank``, " "``loocv_score``) — DR surfaces these automatically." ) + + # Round-44 P1 CI review on PR #318: mirror the SDiD/TROP + # __init__ rejection pattern for ``CallawaySantAnna`` with + # ``base_period != "universal"``. HonestDiD bounds are not + # valid for interpretation on consecutive-comparison + # (``base_period='varying'``) pre-period surfaces (REGISTRY.md + # §CallawaySantAnna line 410 plus §HonestDiD line 2458). 
+ # ``precomputed["sensitivity"]`` would otherwise bypass the + # applicability-gate guard (which already existed for the auto + # path) and let BR/DR narrate the Rambachan-Roth bounds as + # ordinary robustness on a displayed fit whose interpretation + # does not match the bounds' provenance. Reject at + # construction so users get the error up-front rather than a + # late skip in the schema. + if _result_name == "CallawaySantAnnaResults" and "sensitivity" in self._precomputed: + _base_period = getattr(self._results, "base_period", "universal") + if _base_period != "universal": + raise ValueError( + "precomputed['sensitivity'] on " + "CallawaySantAnnaResults requires " + "``base_period='universal'`` on the displayed fit — " + "HonestDiD Rambachan-Roth bounds are not valid for " + "interpretation on the consecutive-comparison " + "pre-period surface produced by " + f"``base_period={_base_period!r}``. Narrating the " + "bounds as robustness alongside a varying-base fit " + "mixes provenance the bounds don't support. Re-fit " + "the main estimator with " + "``CallawaySantAnna(base_period='universal')`` " + "before passing precomputed sensitivity." + ) + self._outcome_label = outcome_label self._treatment_label = treatment_label self._cached: Optional[DiagnosticReportResults] = None @@ -697,16 +729,20 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: # Native SDiD/TROP paths substitute for HonestDiD. if name in {"SyntheticDiDResults", "TROPResults"}: return None - # Precomputed sensitivity always unlocks this check. - if "sensitivity" in self._precomputed: - return None - # CallawaySantAnna with ``base_period='varying'`` (the default) - # produces consecutive-comparison pre-period coefficients; - # HonestDiD explicitly warns those bounds are not valid for - # interpreted sensitivity. Skip at the applicability gate so - # BR/DR do not narrate the grid as robustness. 
Users opting - # in can pass ``precomputed={'sensitivity': ...}`` or re-fit - # with ``base_period='universal'``. + # Round-44 P1 CI review on PR #318: the CS varying-base + # guard MUST fire before the precomputed early-return. + # Previously, ``precomputed["sensitivity"]`` unlocked this + # check unconditionally, letting BR/DR narrate the + # Rambachan-Roth bounds as ordinary robustness even though + # HonestDiD explicitly warns those bounds are not valid + # for interpretation on consecutive-comparison + # (``base_period='varying'``) pre-period surfaces + # (REGISTRY.md §CallawaySantAnna line 410, §HonestDiD line + # 2458). The previous skip message also mis-pointed users + # at ``precomputed`` as the opt-in; that path now routes + # through the same guard, so the correct remediation is to + # re-fit the main estimator with ``base_period='universal'`` + # or to consult HonestDiD outside the report layer. if name == "CallawaySantAnnaResults": base_period = getattr(r, "base_period", "universal") if base_period != "universal": @@ -716,9 +752,22 @@ def _instance_skip_reason(self, check: str) -> Optional[str]: "(Rambachan-Roth bounds are not comparable across the " "consecutive pre-period comparisons produced by " f"``base_period={base_period!r}``). Re-fit with " - "``CallawaySantAnna(base_period='universal')`` or pass " - "``precomputed={'sensitivity': ...}`` to opt in." + "``CallawaySantAnna(base_period='universal')``; " + "``precomputed={'sensitivity': ...}`` is rejected here " + "because the precomputed bounds would be narrated as " + "robustness for a displayed fit whose pre-period " + "surface has a different interpretation than the one " + "the bounds were computed against." ) + # Precomputed sensitivity unlocks this check for every + # other estimator (SDiD/TROP were already rejected at DR + # __init__; CS varying-base is gated above). 
The CS + # guard above runs on the *displayed fit*, not on the + # provenance of the precomputed bounds; it protects + # against narrating bounds whose interpretation is + # incompatible with the fit being summarized. + if "sensitivity" in self._precomputed: + return None # dCDH uses ``placebo_event_study`` as its pre-period surface, # which HonestDiD consumes via a dedicated branch. Accept the # fit when that attribute is populated. diff --git a/tests/test_business_report.py b/tests/test_business_report.py index f5dbb31d..ab0b68c4 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -3876,6 +3876,105 @@ class CallawaySantAnnaResults: ) +class TestCSVaryingBaseSensitivityRejectsPrecomputed: + """Round-44 P1 CI review on PR #318: ``precomputed['sensitivity']`` + (and BR's ``honest_did_results`` shorthand) previously bypassed the + varying-base CS guard — the applicability gate's precomputed early- + return unlocked the sensitivity section and BR/DR narrated the + Rambachan-Roth bounds as ordinary robustness on a displayed fit + whose consecutive-comparison pre-period surface has a different + interpretation than the bounds' provenance (REGISTRY.md + §CallawaySantAnna line 410, §HonestDiD line 2458). 
DR and BR must + reject at construction.""" + + def _cs_varying_base_stub(self): + class CallawaySantAnnaResults: + pass + + stub = CallawaySantAnnaResults() + stub.overall_att = 1.0 + stub.overall_se = 0.3 + stub.overall_p_value = 0.01 + stub.overall_conf_int = (0.4, 1.6) + stub.alpha = 0.05 + stub.n_obs = 100 + stub.n_treated = 40 + stub.n_control = 60 + stub.survey_metadata = None + stub.event_study_effects = None + stub.event_study_vcov = None + stub.event_study_vcov_index = None + stub.vcov = None + stub.interaction_indices = None + stub.base_period = "varying" + stub.inference_method = "analytical" + return stub + + def _dummy_sens_object(self): + from types import SimpleNamespace + + return SimpleNamespace( + M_values=[0.5, 1.0], + bounds=[(0.1, 2.0), (-0.2, 2.5)], + robust_cis=[(0.05, 2.1), (-0.3, 2.6)], + breakdown_M=0.75, + method="relative_magnitude", + original_estimate=1.0, + original_se=0.2, + alpha=0.05, + ) + + def test_dr_rejects_precomputed_sensitivity_on_varying_base_cs(self): + from diff_diff import DiagnosticReport + + stub = self._cs_varying_base_stub() + with pytest.raises(ValueError, match="base_period='universal'"): + DiagnosticReport(stub, precomputed={"sensitivity": self._dummy_sens_object()}) + + def test_dr_allows_precomputed_sensitivity_on_universal_base_cs(self): + """Universal-base CS + precomputed sensitivity is the supported + path — must not be rejected.""" + from diff_diff import DiagnosticReport + + stub = self._cs_varying_base_stub() + stub.base_period = "universal" + # Should not raise. 
+ DiagnosticReport(stub, precomputed={"sensitivity": self._dummy_sens_object()}) + + def test_br_rejects_precomputed_sensitivity_on_varying_base_cs(self): + stub = self._cs_varying_base_stub() + with pytest.raises(ValueError, match="base_period='varying'"): + BusinessReport( + stub, + precomputed={"sensitivity": self._dummy_sens_object()}, + ) + + def test_br_rejects_honest_did_results_on_varying_base_cs(self): + """BR's ``honest_did_results`` shorthand must hit the same + rejection — it becomes ``precomputed['sensitivity']`` under the + hood, and the methodology problem is identical.""" + stub = self._cs_varying_base_stub() + with pytest.raises(ValueError, match="honest_did_results"): + BusinessReport( + stub, + honest_did_results=self._dummy_sens_object(), + ) + + def test_br_rejects_both_passthrough_inputs_names_them(self): + """When both passthrough inputs are supplied, the error must + name both so the user knows every input that was rejected.""" + stub = self._cs_varying_base_stub() + with pytest.raises(ValueError) as excinfo: + BusinessReport( + stub, + honest_did_results=self._dummy_sens_object(), + precomputed={"sensitivity": self._dummy_sens_object()}, + ) + msg = str(excinfo.value) + assert "honest_did_results" in msg + assert "precomputed['sensitivity']" in msg + + class TestBusinessReportSurveyDesignPassthrough: """Round-40 P1 CI review on PR #318: ``BusinessReport`` must accept ``survey_design`` and forward it to the auto-constructed From dcfb4fea110702d661b38face15b78e539a74085 Mon Sep 17 00:00:00 2001 From: igerber Date: Sun, 19 Apr 2026 17:24:35 -0400 Subject: [PATCH 48/48] Address forty-fifth round of CI review findings on PR #318 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Round-45 landed one P1 methodology finding: ``BusinessReport`` emitted the Bacon "re-estimate with a heterogeneity-robust estimator (CS / SA / BJS / Gardner)" caveat on every fit whose Bacon block had ``forbidden_weight > 0.10``, 
including fits that were already produced by one of those robust estimators. Goodman-Bacon is explicitly a decomposition of TWFE weights (``bacon.py`` header, Goodman-Bacon 2021). On a displayed fit that is already heterogeneity-robust (CS / SA / BJS / Gardner / Wooldridge / EfficientDiD / Stacked / dCDH / TripleDifference / StaggeredTripleDiff / SDiD / TROP), a high forbidden-weight share is a statement about what TWFE WOULD have done on this rollout, not a claim that the displayed estimator needs replacement. DR partly preserved this in its prose with an "if not already in use" guard; BR dropped that distinction and rendered the stronger recommendation in stakeholder-facing caveats / full reports. Fix (``business_report.py`` ``_build_caveats``): - Introduce ``_TWFE_STYLE_RESULTS = {DiDResults, MultiPeriodDiDResults, TwoWayFixedEffectsResults}`` — the fits for which the switch-to- robust recommendation is load-bearing. - Keep the original message for TWFE-style fits. - Rephrase for already-robust fits: "TWFE benchmark would be materially biased on this rollout; the displayed estimator is already heterogeneity-robust, so this is a statement about the rollout design (avoid reporting TWFE alongside this fit), not about the current result's validity." Tests: 3 new regressions in ``TestBaconCaveatEstimatorAware``: - CS-like fit with high forbidden-weight does NOT recommend switching. - Spot-check the same rule across SA / Imputation / TwoStage / Stacked / Wooldridge / dCDH / EfficientDiD. - ``MultiPeriodDiDResults`` (TWFE event-study) DOES keep the switch recommendation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- diff_diff/business_report.py | 51 +++++++++-- tests/test_business_report.py | 161 ++++++++++++++++++++++++++++++++++ 2 files changed, 203 insertions(+), 9 deletions(-) diff --git a/diff_diff/business_report.py b/diff_diff/business_report.py index 451e5540..2445251f 100644 --- a/diff_diff/business_report.py +++ b/diff_diff/business_report.py @@ -38,7 +38,7 @@ import re from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Union +from typing import Any, Dict, FrozenSet, List, Optional, Union import numpy as np @@ -1521,23 +1521,56 @@ def _build_caveats( ) # Bacon forbidden comparisons. + # Round-45 P1 CI review on PR #318: Goodman-Bacon is a + # decomposition of TWFE weights (see ``bacon.py`` header and + # Goodman-Bacon 2021). On fits already produced by a + # heterogeneity-robust estimator (CS / SA / BJS / Gardner / + # Wooldridge / EfficientDiD / Stacked / dCDH / TripleDifference / + # StaggeredTripleDiff / SDiD / TROP), a high forbidden-weight share + # says "TWFE would have been materially biased on this rollout", + # not "the displayed estimator needs to be replaced" — the + # displayed estimator is already robust to the heterogeneity that + # Bacon flags. DR partly preserves this with "if not already in + # use" prose; BR must carry the same distinction through to the + # caveat. The TWFE-style estimators whose results route through + # Bacon and for which the "switch to a robust estimator" + # recommendation is load-bearing are the DiDResults-type fits; all + # other result classes are already robust. 
+ _TWFE_STYLE_RESULTS: FrozenSet[str] = frozenset( + {"DiDResults", "MultiPeriodDiDResults", "TwoWayFixedEffectsResults"} + ) if dr_schema: bacon = dr_schema.get("bacon") or {} if bacon.get("status") == "ran": fw = bacon.get("forbidden_weight") if isinstance(fw, (int, float)) and fw > 0.10: + _estimator_name = type(_results).__name__ + if _estimator_name in _TWFE_STYLE_RESULTS: + bacon_message = ( + f"Goodman-Bacon decomposition places {fw:.0%} " + "of implicit TWFE weight on 'forbidden' " + "later-vs-earlier comparisons. TWFE may be " + "materially biased under heterogeneous effects. " + "Re-estimate with a heterogeneity-robust " + "estimator (CS / SA / BJS / Gardner)." + ) + else: + bacon_message = ( + f"Goodman-Bacon decomposition places {fw:.0%} " + "of TWFE weight on 'forbidden' later-vs-earlier " + "comparisons. A TWFE benchmark on this rollout " + "would be materially biased under heterogeneous " + "effects; the displayed estimator is already " + "heterogeneity-robust, so this is a statement " + "about the rollout design (avoid reporting TWFE " + "alongside this fit), not about the current " + "result's validity." + ) caveats.append( { "severity": "warning", "topic": "bacon_contamination", - "message": ( - f"Goodman-Bacon decomposition places {fw:.0%} " - "of implicit TWFE weight on 'forbidden' " - "later-vs-earlier comparisons. TWFE may be " - "materially biased under heterogeneous effects. " - "Re-estimate with a heterogeneity-robust " - "estimator (CS / SA / BJS / Gardner)." - ), + "message": bacon_message, } ) diff --git a/tests/test_business_report.py b/tests/test_business_report.py index ab0b68c4..cda3e5a7 100644 --- a/tests/test_business_report.py +++ b/tests/test_business_report.py @@ -3975,6 +3975,167 @@ def test_br_rejects_both_passthrough_inputs_names_them(self): assert "precomputed['sensitivity']" in msg +class TestBaconCaveatEstimatorAware: + """Round-45 P1 CI review on PR #318: Goodman-Bacon decomposes TWFE + weights. 
On fits already produced by a heterogeneity-robust + estimator (CS / SA / BJS / Gardner / Wooldridge / EfficientDiD / + Stacked / dCDH / TripleDifference / StaggeredTripleDiff / SDiD / + TROP), a high forbidden-weight share says "TWFE would have been + materially biased on this rollout", not "the displayed estimator + needs to be replaced" — the displayed fit is already robust. + + BR's caveat must be estimator-aware: keep the "switch to a robust + estimator" recommendation for TWFE-style fits only. + """ + + @staticmethod + def _bacon_schema_with_high_forbidden_weight(): + """Build a fake ``DiagnosticReportResults`` whose schema carries + a Bacon block with ``forbidden_weight > 0.10`` so BR's caveat + builder fires on the Bacon branch.""" + from diff_diff.diagnostic_report import DiagnosticReportResults + + schema = { + "schema_version": "1.0", + "estimator": {"class_name": "Stub", "display_name": "Stub"}, + "headline_metric": {}, + "parallel_trends": {"status": "skipped", "reason": "stub"}, + "pretrends_power": {"status": "skipped", "reason": "stub"}, + "sensitivity": {"status": "skipped", "reason": "stub"}, + "placebo": {"status": "skipped", "reason": "stub"}, + "bacon": { + "status": "ran", + "twfe_estimate": 1.2, + "weight_by_type": { + "treated_vs_never": 0.5, + "earlier_vs_later": 0.1, + "later_vs_earlier": 0.4, + }, + "forbidden_weight": 0.4, # > 0.10 threshold + "verdict": "materially_contaminated", + "n_timing_groups": 3, + }, + "design_effect": {"status": "skipped", "reason": "stub"}, + "heterogeneity": {"status": "skipped", "reason": "stub"}, + "epv": {"status": "skipped", "reason": "stub"}, + "estimator_native_diagnostics": {"status": "not_applicable"}, + "skipped": {}, + "warnings": [], + "overall_interpretation": "", + "next_steps": [], + } + return DiagnosticReportResults( + schema=schema, + interpretation="", + applicable_checks=("bacon",), + skipped_checks={}, + warnings=(), + ) + + @staticmethod + def _make_cs_like_stub(class_name: str): + cls = 
type(class_name, (), {}) + obj = cls() + obj.overall_att = 1.0 + obj.overall_se = 0.2 + obj.overall_p_value = 0.001 + obj.overall_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 500 + obj.n_treated = 100 + obj.n_control_units = 400 + obj.survey_metadata = None + obj.event_study_effects = None + obj.inference_method = "analytical" + return obj + + @staticmethod + def _make_twfe_style_stub(): + class MultiPeriodDiDResults: + pass + + obj = MultiPeriodDiDResults() + obj.avg_att = 1.0 + obj.avg_se = 0.2 + obj.avg_p_value = 0.001 + obj.avg_conf_int = (0.6, 1.4) + obj.alpha = 0.05 + obj.n_obs = 500 + obj.n_treated = 100 + obj.n_control = 400 + obj.survey_metadata = None + obj.pre_period_effects = None + obj.inference_method = "analytical" + return obj + + def test_cs_like_fit_does_not_recommend_switching_estimators(self): + """On an already-robust CS-style fit with high forbidden + Bacon weight, BR must not recommend switching to a robust + estimator — the displayed fit IS already robust.""" + stub = self._make_cs_like_stub("CallawaySantAnnaResults") + dr = self._bacon_schema_with_high_forbidden_weight() + br = BusinessReport(stub, diagnostics=dr) + caveats = br.caveats() + bacon_caveats = [c for c in caveats if c.get("topic") == "bacon_contamination"] + assert len(bacon_caveats) == 1, ( + f"High forbidden-weight Bacon must surface a caveat. " f"Got caveats: {caveats!r}" + ) + msg = bacon_caveats[0]["message"].lower() + # Must NOT tell the user to switch estimators. + assert "re-estimate with a heterogeneity-robust" not in msg, ( + f"CS is already heterogeneity-robust; must not recommend " + f"switching. Got message: {msg!r}" + ) + # Must frame this as a TWFE benchmark problem / rollout-design + # statement, not a displayed-fit-validity problem. + assert ( + "heterogeneity-robust" in msg + or "already" in msg + or "twfe benchmark" in msg + or "rollout design" in msg + ), f"CS Bacon caveat must reframe as rollout/TWFE issue. 
Got: {msg!r}" + # And the full-report rendering must reflect the softer wording. + md = br.full_report() + assert "Re-estimate with a heterogeneity-robust estimator" not in md, md + + def test_other_robust_estimators_also_avoid_switch_recommendation(self): + """Spot-check the same rule holds for multiple + heterogeneity-robust estimators on the Bacon path.""" + for class_name in ( + "SunAbrahamResults", + "ImputationDiDResults", + "TwoStageDiDResults", + "StackedDiDResults", + "WooldridgeDiDResults", + "ChaisemartinDHaultfoeuilleResults", + "EfficientDiDResults", + ): + stub = self._make_cs_like_stub(class_name) + dr = self._bacon_schema_with_high_forbidden_weight() + br = BusinessReport(stub, diagnostics=dr) + msgs = [c["message"] for c in br.caveats() if c.get("topic") == "bacon_contamination"] + assert msgs, f"{class_name}: Bacon caveat must fire" + assert ( + "Re-estimate with a heterogeneity-robust estimator" not in msgs[0] + ), f"{class_name} is already robust; must not recommend switching. Got: {msgs[0]!r}" + + def test_twfe_style_fit_keeps_switch_recommendation(self): + """The switch-to-robust recommendation is load-bearing for + genuinely TWFE-style fits and must be preserved there.""" + stub = self._make_twfe_style_stub() + dr = self._bacon_schema_with_high_forbidden_weight() + br = BusinessReport(stub, diagnostics=dr) + caveats = br.caveats() + bacon_caveats = [c for c in caveats if c.get("topic") == "bacon_contamination"] + assert len(bacon_caveats) == 1 + msg = bacon_caveats[0]["message"] + # TWFE-style fit: keep the explicit switch recommendation. + assert "Re-estimate with a heterogeneity-robust estimator" in msg, ( + f"MultiPeriodDiDResults (TWFE event-study) must keep the " + f"switch-to-robust recommendation. Got: {msg!r}" + ) + + class TestBusinessReportSurveyDesignPassthrough: """Round-40 P1 CI review on PR #318: ``BusinessReport`` must accept ``survey_design`` and forward it to the auto-constructed