From e18aa27852ff7422b78838a858bc40c18b22cfac Mon Sep 17 00:00:00 2001
From: igerber
Date: Mon, 1 Jun 2026 12:37:39 -0400
Subject: [PATCH 1/2] ci+local: upgrade AI PR reviewer model gpt-5.4 -> gpt-5.5

- .github/workflows/ai_pr_review.yml: CI Codex reviewer model -> gpt-5.5
  (effort xhigh / read-only sandbox unchanged)
- .claude/scripts/openai_review.py: DEFAULT_MODEL -> gpt-5.5; add gpt-5.5
  to _is_reasoning_model (api-backend timeout/token classification only)
- .claude/commands/ai-review-local.md: update default-model +
  reasoning-model references
- CHANGELOG: [Unreleased] entry

No PRICING entry: CI + local reviewer run gpt-5.5 via the codex backend
(subscription/flat-rate), which does not consult the api-backend cost
table.

Validated before the swap via tools/reviewer-eval/ A/B (gpt-5.5 >= gpt-5.4
on every test-backed recall case incl. a bug buried in a ~3k-line diff,
0 new false positives, faster) and an end-to-end CI canary (action CLI
0.135.0 runs gpt-5.5 and caught a planted P0). gpt-5.4 remains accepted.

Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/commands/ai-review-local.md | 12 ++++++------ .claude/scripts/openai_review.py | 10 +++++----- .github/workflows/ai_pr_review.yml | 2 +- CHANGELOG.md | 3 +++ 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md index 7fb4df71..7266988b 100644 --- a/.claude/commands/ai-review-local.md +++ b/.claude/commands/ai-review-local.md @@ -16,7 +16,7 @@ Two backends are supported: | Backend | Latency | Cost | Quality | |---|---|---|---| -| `api` (`gpt-5.4`) | 30-60s | $0.05-0.50/run, metered via `OPENAI_API_KEY` | Single-shot — won't grep, can't load files on its own initiative | +| `api` (`gpt-5.5`) | 30-60s | $0.05-0.50/run, metered via `OPENAI_API_KEY` | Single-shot — won't grep, can't load files on its own initiative | | `codex` (any auth) | 3-15 min | depends on your `codex login` mode (subscription vs API key) — see codex docs | Agentic — matches CI Codex reviewer, can grep / load files / multi-turn | Choose with `--backend {auto,codex,api}` (default `auto`): @@ -66,8 +66,8 @@ Notes: *Api backend only.* - `--force-fresh`: Skip delta-diff mode, run a full fresh review even if previous state exists - `--full-registry`: Include the entire REGISTRY.md instead of selective sections -- `--model `: Override the model (default: `gpt-5.4`). Applies to both backends. -- `--timeout `: HTTP request timeout. If omitted, defaults to 900 for reasoning models (gpt-5.4, *-pro, o1/o3/o4) and 300 otherwise. *Api backend only.* +- `--model `: Override the model (default: `gpt-5.5`). Applies to both backends. +- `--timeout `: HTTP request timeout. If omitted, defaults to 900 for reasoning models (gpt-5.4/gpt-5.5, *-pro, o1/o3/o4) and 300 otherwise. 
*Api backend only.* - `--dry-run`: Print the compiled prompt without invoking the chosen backend (no API call, no codex subprocess) @@ -107,7 +107,7 @@ Step 5 invokes the chosen backend: Parse `$ARGUMENTS` for the optional flags listed above. All flags are optional — the default behavior (auto-detect backend, standard context for api or -agentic loading for codex, selective registry, gpt-5.4) +agentic loading for codex, selective registry, gpt-5.5) requires no arguments. ### Step 2: Validate Prerequisites @@ -417,8 +417,8 @@ would be silently ignored. Note: `--force-fresh` is a skill-only flag — it controls whether delta diffs are generated in Step 4 and is NOT passed to the script. -**Reasoning model handling:** If the model is `gpt-5.4`, contains `-pro`, or starts with -`o1`/`o3`/`o4` (e.g., `gpt-5.4`, `gpt-5.4-pro`, `o3`, `o4-mini`): +**Reasoning model handling:** If the model is `gpt-5.4`/`gpt-5.5`, contains `-pro`, or starts with +`o1`/`o3`/`o4` (e.g., `gpt-5.5`, `gpt-5.4-pro`, `o3`, `o4-mini`): - The script auto-resolves `--timeout` to 900s for reasoning models when omitted, so no extra flag is required unless overriding - Run the Bash command with `run_in_background: true` (bypasses the 600s Bash tool timeout cap) diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index 2c2b6736..6cf8a491 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -1140,7 +1140,7 @@ def compile_prompt( # --------------------------------------------------------------------------- ENDPOINT = "https://api.openai.com/v1/responses" -DEFAULT_MODEL = "gpt-5.4" +DEFAULT_MODEL = "gpt-5.5" DEFAULT_TIMEOUT = 300 # seconds REASONING_TIMEOUT = 900 # seconds DEFAULT_MAX_TOKENS = 16384 @@ -1149,13 +1149,13 @@ def compile_prompt( def _is_reasoning_model(model: str) -> bool: """Return True for models that use internal chain-of-thought reasoning.""" - return model.startswith(("o1", "o3", "o4", "gpt-5.4")) or "-pro" in model + return 
model.startswith(("o1", "o3", "o4", "gpt-5.4", "gpt-5.5")) or "-pro" in model def _resolve_timeout(timeout: "int | None", model: str) -> int: """Auto-resolve omitted --timeout based on model class. - Reasoning models (o1/o3/o4/gpt-5.4/*-pro) get REASONING_TIMEOUT (900s). + Reasoning models (o1/o3/o4/gpt-5.4/gpt-5.5/*-pro) get REASONING_TIMEOUT (900s). Non-reasoning models get DEFAULT_TIMEOUT (300s). Explicit values are passed through unchanged. """ @@ -1321,7 +1321,7 @@ def _build_codex_cmd( ) -> "list[str]": """Construct the argv for `codex exec`. - Pinned to match the CI Codex action's invocation (gpt-5.4 + xhigh effort + + Pinned to match the CI Codex action's invocation (gpt-5.5 + xhigh effort + read-only sandbox) so local reviews give CI-equivalent quality. NOTE: the effort key MUST be `model_reasoning_effort`, not @@ -1705,7 +1705,7 @@ def main() -> None: default=None, help=( f"HTTP request timeout in seconds (api backend only). If omitted, " - f"defaults to {REASONING_TIMEOUT} for reasoning models (gpt-5.4, " + f"defaults to {REASONING_TIMEOUT} for reasoning models (gpt-5.4/gpt-5.5, " f"*-pro, o1/o3/o4) and {DEFAULT_TIMEOUT} otherwise. Ignored under " f"--backend codex (codex manages its own time budget)." 
), diff --git a/.github/workflows/ai_pr_review.yml b/.github/workflows/ai_pr_review.yml index f9748b84..0ac8c426 100644 --- a/.github/workflows/ai_pr_review.yml +++ b/.github/workflows/ai_pr_review.yml @@ -516,7 +516,7 @@ jobs: sandbox: read-only safety-strategy: drop-sudo # Recommended by OpenAI for review quality/consistency: - model: gpt-5.4 + model: gpt-5.5 effort: xhigh - name: Post PR comment (new on every event except initial open) diff --git a/CHANGELOG.md b/CHANGELOG.md index f5ca3c8e..e472872f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - **Covariate names that collide with reserved structural terms now raise `ValueError` instead of silently corrupting the coefficient dict (`DifferenceInDifferences`, `MultiPeriodDiD`, `TwoWayFixedEffects`).** These estimators build their `coefficients` dict by zipping a variable-name list -- structural term names PLUS the user covariate column names appended verbatim -- with the fitted coefficient vector. A covariate whose name equaled a reserved structural name (`const`; the treatment/time column names; the `{treatment}:{time}` interaction; MultiPeriodDiD `period_{p}` dummies and `{treatment}:period_{p}` interactions; `TwoWayFixedEffects` `ATT`; fixed-effect / unit / time dummy names; or an internal `_`-prefixed working column such as `_treat_time` / `_did_treatment` / `_treatment_post`) silently **overwrote** that structural coefficient via Python dict last-write-wins -- e.g. a covariate named `const` dropped the intercept -- with no error or warning. A new shared `validate_covariate_names` helper (`diff_diff/utils.py`) is now called in each of the three `fit()` methods before the design matrix is built; it raises `ValueError` on a collision (the comparison is case-sensitive, so e.g. `Const` is still allowed) **and** on duplicate names within `covariates` (which collapse to a single dict entry the same way). 
Fixed-effect/unit/time dummy reserved names are taken from the same `pd.get_dummies(..., drop_first=True)` call used to build them, so they match exactly (including for pandas `Categorical` columns with a non-default category order). For `TwoWayFixedEffects` the guard fires on **all** variance paths: the default within-transform path returns only `{"ATT": att}` (no covariate is a dict key there), but a covariate named `_treatment_post` would still clobber the internal interaction column, so guarding both paths is uniform and forward-compatible. **Potentially breaking:** a fit that previously *succeeded* with a colliding (or duplicated) covariate name -- silently returning a corrupted coefficient dict -- now raises; rename the covariate column(s). The staggered / influence-function estimators (CallawaySantAnna, SunAbraham, StaggeredTripleDifference, EfficientDiD, TwoStageDiD, ImputationDiD, WooldridgeDiD, dCDH, StackedDiD) key results by `(g, t)` tuples / relative-time indices, never covariate names, and `TripleDifference` / `SyntheticControl` / `SyntheticDiD` do not expose covariates by name, so none are affected. New tests in `tests/test_utils.py`, `tests/test_estimators.py`, and `tests/test_estimators_vcov_type.py`. +### Changed +- **CI + local AI PR-reviewer model upgraded `gpt-5.4` → `gpt-5.5`.** The CI Codex reviewer (`.github/workflows/ai_pr_review.yml`) and the local `/ai-review-local` default (`.claude/scripts/openai_review.py` `DEFAULT_MODEL`) now run `gpt-5.5` @ `xhigh` effort / `read-only` sandbox (all other invocation settings unchanged). 
Validated empirically before the swap via the `tools/reviewer-eval/` A/B harness: on a real-bug corpus plus a k=6 big-diff de-risk, `gpt-5.5` matched-or-beat `gpt-5.4` on every test-backed recall case (including a bug buried in a ~3k-line methodology diff), added zero false positives, and ran faster; an end-to-end CI canary confirmed the action environment (`openai/codex-action@v1`, codex CLI 0.135.0) runs `gpt-5.5` and catches a planted P0. `gpt-5.5` is also added to the reasoning-model set (`_is_reasoning_model`), so the rarely-used api backend would apply the reasoning-model timeout/token limits if invoked with it; no `PRICING` entry is added because the CI + local reviewer run `gpt-5.5` via the **codex backend** (subscription/flat-rate), which does not use the api-backend cost table. `gpt-5.4` remains accepted. + ## [3.5.0] - 2026-06-01 ### Added From e05a104b67a25918ef501bfa189e91171a3193a3 Mon Sep 17 00:00:00 2001 From: igerber Date: Mon, 1 Jun 2026 12:58:05 -0400 Subject: [PATCH 2/2] reviewer-upgrade: add gpt-5.5 PRICING (address PR #522 review P2/P3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add gpt-5.5 ($5/$30) and gpt-5.5-pro ($30/$180) to PRICING at OpenAI's confirmed standard rates (developers.openai.com/api/docs/pricing). The production reviewer uses the flat-rate codex backend, but `--backend auto` falls back to the metered API path when the codex CLI is unavailable — there estimate_cost("gpt-5.5") previously returned None and silently dropped the cost lines. gpt-5.5-pro gets its own entry so it doesn't prefix-fall-back to the standard rate. Also bumps the api-backend cost range note (gpt-5.5 ~2x gpt-5.4) and the CHANGELOG wording. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .claude/commands/ai-review-local.md | 2 +- .claude/scripts/openai_review.py | 2 ++ CHANGELOG.md | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.claude/commands/ai-review-local.md b/.claude/commands/ai-review-local.md index 7266988b..19864883 100644 --- a/.claude/commands/ai-review-local.md +++ b/.claude/commands/ai-review-local.md @@ -16,7 +16,7 @@ Two backends are supported: | Backend | Latency | Cost | Quality | |---|---|---|---| -| `api` (`gpt-5.5`) | 30-60s | $0.05-0.50/run, metered via `OPENAI_API_KEY` | Single-shot — won't grep, can't load files on its own initiative | +| `api` (`gpt-5.5`) | 30-60s | ~$0.10-1.00/run, metered via `OPENAI_API_KEY` | Single-shot — won't grep, can't load files on its own initiative | | `codex` (any auth) | 3-15 min | depends on your `codex login` mode (subscription vs API key) — see codex docs | Agentic — matches CI Codex reviewer, can grep / load files / multi-turn | Choose with `--backend {auto,codex,api}` (default `auto`): diff --git a/.claude/scripts/openai_review.py b/.claude/scripts/openai_review.py index 6cf8a491..312c452b 100644 --- a/.claude/scripts/openai_review.py +++ b/.claude/scripts/openai_review.py @@ -871,6 +871,8 @@ def apply_token_budget( PRICING = { "gpt-5.4": (2.50, 15.00), "gpt-5.4-pro": (30.00, 180.00), + "gpt-5.5": (5.00, 30.00), + "gpt-5.5-pro": (30.00, 180.00), "gpt-4.1": (2.00, 8.00), "gpt-4.1-mini": (0.40, 1.60), "o3": (2.00, 8.00), diff --git a/CHANGELOG.md b/CHANGELOG.md index e472872f..d4fc92d0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - **Covariate names that collide with reserved structural terms now raise `ValueError` instead of silently corrupting the coefficient dict (`DifferenceInDifferences`, `MultiPeriodDiD`, `TwoWayFixedEffects`).** These estimators build their `coefficients` dict by zipping a variable-name list -- 
structural term names PLUS the user covariate column names appended verbatim -- with the fitted coefficient vector. A covariate whose name equaled a reserved structural name (`const`; the treatment/time column names; the `{treatment}:{time}` interaction; MultiPeriodDiD `period_{p}` dummies and `{treatment}:period_{p}` interactions; `TwoWayFixedEffects` `ATT`; fixed-effect / unit / time dummy names; or an internal `_`-prefixed working column such as `_treat_time` / `_did_treatment` / `_treatment_post`) silently **overwrote** that structural coefficient via Python dict last-write-wins -- e.g. a covariate named `const` dropped the intercept -- with no error or warning. A new shared `validate_covariate_names` helper (`diff_diff/utils.py`) is now called in each of the three `fit()` methods before the design matrix is built; it raises `ValueError` on a collision (the comparison is case-sensitive, so e.g. `Const` is still allowed) **and** on duplicate names within `covariates` (which collapse to a single dict entry the same way). Fixed-effect/unit/time dummy reserved names are taken from the same `pd.get_dummies(..., drop_first=True)` call used to build them, so they match exactly (including for pandas `Categorical` columns with a non-default category order). For `TwoWayFixedEffects` the guard fires on **all** variance paths: the default within-transform path returns only `{"ATT": att}` (no covariate is a dict key there), but a covariate named `_treatment_post` would still clobber the internal interaction column, so guarding both paths is uniform and forward-compatible. **Potentially breaking:** a fit that previously *succeeded* with a colliding (or duplicated) covariate name -- silently returning a corrupted coefficient dict -- now raises; rename the covariate column(s). 
The staggered / influence-function estimators (CallawaySantAnna, SunAbraham, StaggeredTripleDifference, EfficientDiD, TwoStageDiD, ImputationDiD, WooldridgeDiD, dCDH, StackedDiD) key results by `(g, t)` tuples / relative-time indices, never covariate names, and `TripleDifference` / `SyntheticControl` / `SyntheticDiD` do not expose covariates by name, so none are affected. New tests in `tests/test_utils.py`, `tests/test_estimators.py`, and `tests/test_estimators_vcov_type.py`. ### Changed -- **CI + local AI PR-reviewer model upgraded `gpt-5.4` → `gpt-5.5`.** The CI Codex reviewer (`.github/workflows/ai_pr_review.yml`) and the local `/ai-review-local` default (`.claude/scripts/openai_review.py` `DEFAULT_MODEL`) now run `gpt-5.5` @ `xhigh` effort / `read-only` sandbox (all other invocation settings unchanged). Validated empirically before the swap via the `tools/reviewer-eval/` A/B harness: on a real-bug corpus plus a k=6 big-diff de-risk, `gpt-5.5` matched-or-beat `gpt-5.4` on every test-backed recall case (including a bug buried in a ~3k-line methodology diff), added zero false positives, and ran faster; an end-to-end CI canary confirmed the action environment (`openai/codex-action@v1`, codex CLI 0.135.0) runs `gpt-5.5` and catches a planted P0. `gpt-5.5` is also added to the reasoning-model set (`_is_reasoning_model`), so the rarely-used api backend would apply the reasoning-model timeout/token limits if invoked with it; no `PRICING` entry is added because the CI + local reviewer run `gpt-5.5` via the **codex backend** (subscription/flat-rate), which does not use the api-backend cost table. `gpt-5.4` remains accepted. +- **CI + local AI PR-reviewer model upgraded `gpt-5.4` → `gpt-5.5`.** The CI Codex reviewer (`.github/workflows/ai_pr_review.yml`) and the local `/ai-review-local` default (`.claude/scripts/openai_review.py` `DEFAULT_MODEL`) now run `gpt-5.5` @ `xhigh` effort / `read-only` sandbox (all other invocation settings unchanged). 
Validated empirically before the swap via the `tools/reviewer-eval/` A/B harness: on a real-bug corpus plus a k=6 big-diff de-risk, `gpt-5.5` matched-or-beat `gpt-5.4` on every test-backed recall case (including a bug buried in a ~3k-line methodology diff), added zero false positives, and ran faster; an end-to-end CI canary confirmed the action environment (`openai/codex-action@v1`, codex CLI 0.135.0) runs `gpt-5.5` and catches a planted P0. `gpt-5.5` is also added to the reasoning-model set (`_is_reasoning_model`, for the api-backend timeout/token limits) and to the `PRICING` table at OpenAI's confirmed standard rates (`gpt-5.5` $5/$30, `gpt-5.5-pro` $30/$180 per 1M input/output tokens) — the production CI + local reviewer run `gpt-5.5` via the flat-rate **codex backend**, but `--backend auto` falls back to the metered API path when the codex CLI is unavailable, so the cost estimate must stay accurate there. `gpt-5.4` remains accepted. ## [3.5.0] - 2026-06-01