From 943557bcc168a94e69b4b88421411a34f2343405 Mon Sep 17 00:00:00 2001 From: Alexander Amiri Date: Thu, 12 Mar 2026 17:19:45 +0100 Subject: [PATCH 1/2] Update docs after plan-review merge and securityhub-summary addition - Remove plan-review.yml and commit-terraform.yml from workflow lists (plan + review now combined in tf-plan.yml / platform-ci.yml) - Add securityhub-summary Lambda to function tables and alert routing - Document ci-infra / ci-infra-plan IAM role split (6 CI roles) - Add javabin-alert-dedup DynamoDB table to monitoring module - Update Lambda count from 6 to 8 across all docs - Add ci-registry role to platform-modules IAM table - Add weekly Security Hub summary to CI workflow schedule --- CLAUDE.md | 15 +++++++++------ docs/ci-workflow.md | 17 +++++++---------- docs/lambda-functions.md | 13 +++++++++++++ docs/platform-modules.md | 7 +++++-- docs/reusable-workflows.md | 15 +++++---------- 5 files changed, 39 insertions(+), 28 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 911c3ae..d299c6b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -116,7 +116,7 @@ terraform/platform/ iam/ GitHub OIDC, CI roles, permission boundary compute/ ECS cluster, ECR base config monitoring/ SNS, EventBridge, Config, GuardDuty, Security Hub - lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, override-cleanup, team-provisioner, apply-gate + lambdas/ slack-alert, cost-report, daily-cost-check, compliance-reporter, override-cleanup, team-provisioner, apply-gate, securityhub-summary identity/ Cognito user pools (internal + external). 
Identity Center is in terraform/org/ ``` @@ -168,12 +168,13 @@ terraform/state/ tf-apply.yml SHA verify + apply via project role eb-deploy.yml Elastic Beanstalk deploy (transition) ecs-deploy.yml ECS task definition update - plan-review.yml LLM risk + cost analysis approve-override.yml Risk gate override (board members) provision-app.yml Team provisioning triggered from registry dispatch - commit-terraform.yml Commit generated TF files back to app repos ``` +> **Note:** Plan + LLM review are combined inline in `platform-ci.yml` for the platform repo +> and in `tf-plan.yml` for app repos (no separate `plan-review.yml` or `commit-terraform.yml`). + ### Lambda Functions | Function | Purpose | |----------|---------| @@ -183,6 +184,7 @@ terraform/state/ | `compliance-reporter` | Reports untagged resources to Slack (no auto-fix) | | `override-cleanup` | Hourly cleanup of stale SSM override tokens | | `team-provisioner` | Syncs Google Groups, GitHub teams, AWS Budgets from registry team YAML | +| `securityhub-summary` | Weekly Security Hub findings summary (Monday 08:00 UTC) | ### Scripts | Script | What | @@ -202,7 +204,7 @@ terraform/state/ | Type | Pattern | |------|---------| | Resources | `javabin-{purpose}` | -| IAM roles | `javabin-ci-{purpose}` (CI), `javabin-{service}` (runtime) | +| IAM roles | `javabin-ci-{purpose}` (CI — 6 roles including `ci-infra` / `ci-infra-plan` split), `javabin-{service}` (runtime) | | Lambdas | `javabin-{function}` | | S3 buckets | `javabin-{purpose}-{account_id}` | | SSM params | `/javabin/{namespace}/{name}` | @@ -219,6 +221,7 @@ Cost Anomaly ──► javabin-alerts SNS ──► slack-alert Lambda ──► Scheduled: Monday 08:00 UTC ──► cost-report ──► #javabin-cost-alerts + Monday 08:00 UTC ──► securityhub-summary ──► #javabin-infra-alerts Daily 08:00 UTC ──► daily-cost-check ──► #javabin-cost-alerts (only on spikes) EventBridge (Create/Run) ──► compliance-reporter (report to Slack, no auto-fix) @@ -267,10 +270,10 @@ The SA JSON key is at 
`/javabin/platform/google-admin-sa`, the impersonation tar | 1 | Identity (Google + Identity Center + Cognito) | **Deployed** — GCP SA with domain-wide delegation, Identity Center with ABAC + 3 permission sets in `terraform/org/`. Cognito pool TF exists but not yet applied (needs Google OAuth client). | | 2a | Networking | **Deployed** — VPC, subnets, NAT | | 2b | Ingress | **Deployed** — ALB + ACM cert | -| 2c | IAM / OIDC | **Deployed** — 5 CI roles (infra, per-app, deploy, override-approver, registry) | +| 2c | IAM / OIDC | **Deployed** — 6 CI roles (infra, infra-plan, per-app, deploy, override-approver, registry) | | 2d | Compute | **Deployed** — ECS cluster + ECR repos | | 2e | Monitoring | **Deployed** — GuardDuty, Security Hub, Config, SNS | -| 2f | Lambda Functions | **Deployed** — 6 working (Google/GitHub/Budget/Cognito/Identity Center sync live) | +| 2f | Lambda Functions | **Deployed** — 8 functions (Google/GitHub/Budget/Cognito/Identity Center sync live) | | 2g | Platform CI | **Done** — plan → LLM review → apply pipeline working | | 3a | Reusable Terraform Modules | **Code done** — 12 modules in repo | | 3b | GitHub Actions Workflows | **Code done** — 14 reusable workflows | diff --git a/docs/ci-workflow.md b/docs/ci-workflow.md index 7fde398..25734ee 100644 --- a/docs/ci-workflow.md +++ b/docs/ci-workflow.md @@ -5,27 +5,24 @@ Defined in `.github/workflows/platform-ci.yml`. 
Runs on pushes to `terraform/pla ## Pipeline ``` -push to main ──► plan ──► review ──► apply -PR ──► plan ──► review ──► PR comments (no apply) +push to main ──► plan (includes LLM review) ──► apply +PR ──► plan (includes LLM review) ──► PR comments (no apply) Monday 06:00 ──► drift detection ──► Slack if drift found +Monday 08:00 ──► securityhub-summary Lambda ──► #javabin-infra-alerts ``` ## Jobs -### plan -- OIDC → `javabin-ci-infra` role +### plan (includes LLM review) +- OIDC → `javabin-ci-infra-plan` role (read-only) - `terraform init`, `validate`, `fmt -check`, `plan` - Uploads `tfplan` + `plan-output.txt` to S3 with SHA256 hash - Posts plan output as PR comment on pull requests -- Outputs: `has_changes`, `plan_key`, `plan_sha256` - -### review -- Downloads plan text from S3 (does NOT re-run terraform plan) -- Runs `scripts/review-plan.py` which calls Bedrock (Claude Haiku) for risk analysis +- Runs `scripts/review-plan.py` inline — calls Bedrock (Claude Haiku) for risk analysis - Structured output via Bedrock Converse tool use: `{risk: LOW|MEDIUM|HIGH, summary: ..., findings: [...]}` - Posts review as PR comment - HIGH risk on main → sends Slack notification -- Output: `risk_level` +- Outputs: `has_changes`, `plan_key`, `plan_sha256`, `risk_level` ### apply - Only on `push` to `main` with changes diff --git a/docs/lambda-functions.md b/docs/lambda-functions.md index 0253129..cd92517 100644 --- a/docs/lambda-functions.md +++ b/docs/lambda-functions.md @@ -17,6 +17,8 @@ Terraform in `terraform/platform/lambdas/`. Features: GitHub OIDC attribution (extracts actor/repo/SHA from session tags), cost estimation via shared pricing module, Security Hub finding formatting. +Uses the `javabin-alert-dedup` DynamoDB table for deduplication of alerts (prevents repeated notifications for the same finding). 
+ ## cost-report **Trigger:** EventBridge schedule — Monday 08:00 UTC @@ -55,6 +57,17 @@ Filters by identity: only reports resources created by known CI/platform identit No Slack integration. Logs cleanup actions to CloudWatch. +## securityhub-summary + +**Trigger:** EventBridge schedule — Monday 08:00 UTC +**Purpose:** Weekly summary of active HIGH/CRITICAL Security Hub findings. Reuses the slack-alert code with the `summary_handler` entry point. + +| SSM Parameter | Channel | +|---------------|---------| +| `/javabin/slack/platform-resource-alerts-webhook` | #javabin-infra-alerts | + +Queries Security Hub for active findings at HIGH and CRITICAL severity, aggregates by resource type and finding title, and posts a formatted summary to Slack. + ## team-provisioner **Trigger:** (Future) Registry repo merge events diff --git a/docs/platform-modules.md b/docs/platform-modules.md index 5321aa5..1cc09cf 100644 --- a/docs/platform-modules.md +++ b/docs/platform-modules.md @@ -47,10 +47,12 @@ GitHub OIDC provider (data source), CI roles, permission boundary, ECS execution | Role | Trust | Purpose | |------|-------|---------| -| `javabin-ci-infra` | platform repo, main branch | Plan + apply platform TF | +| `javabin-ci-infra` | platform repo, main branch | Apply platform TF | +| `javabin-ci-infra-plan` | platform repo, main branch + PRs | Read-only plan + review for platform TF | | `javabin-ci-app-{repo}` | Per-app, pinned to `tf-plan.yml` | App TF plan + apply | | `javabin-ci-deploy-{repo}` | Per-app, pinned to `tf-apply.yml` | ECR push, ECS deploy | | `javabin-ci-override-approver` | Board members, pinned to `approve-override.yml` | SSM override tokens | +| `javabin-ci-registry` | Registry repo | Team provisioning dispatch | | `javabin-ecs-execution` | ECS tasks | Pull images, write logs, read secrets | **Key design:** `job_workflow_ref` condition prevents app repos from writing rogue workflows. 
@@ -76,10 +78,11 @@ SNS topics, EventBridge rules, Config, GuardDuty, Security Hub. | AWS Config | Configuration recorder + S3 delivery | | GuardDuty | Threat detection | | Security Hub | Findings aggregation | +| `javabin-alert-dedup` DynamoDB | Deduplication table used by slack-alert Lambda | ## lambdas -6 Lambda functions — see [lambda-functions.md](lambda-functions.md) for details. +8 Lambda functions — see [lambda-functions.md](lambda-functions.md) for details. ## identity diff --git a/docs/reusable-workflows.md b/docs/reusable-workflows.md index f17d687..6002ee6 100644 --- a/docs/reusable-workflows.md +++ b/docs/reusable-workflows.md @@ -24,12 +24,12 @@ That's it. The pipeline auto-detects your repo contents and runs the right jobs. ``` detect ──► build-jvm ──► docker-build ──► ecs-deploy ├─► build-ts ──┘ │ - ├─► tf-plan ──► plan-review ──► tf-apply + ├─► tf-plan (includes LLM review) ──► tf-apply └─► eb-deploy (transitional) ``` -- **PRs:** detect → build → tf-plan → plan-review → PR comments -- **Main:** detect → build → docker-build → tf-plan → plan-review → tf-apply → ecs-deploy +- **PRs:** detect → build → tf-plan (plan + review) → PR comments +- **Main:** detect → build → docker-build → tf-plan (plan + review) → tf-apply → ecs-deploy ## Workflow Reference @@ -79,15 +79,10 @@ Docker BuildKit build + ECR push. Uses GitHub Actions cache (zero ECR storage fo Generates `terraform/` from `app.yaml` if present (via `generate-terraform.sh`), runs plan, uploads to S3 with SHA256. Posts plan to PR. -**OIDC role:** `javabin-ci-app-{repo}` -**Outputs:** `has_changes`, `plan_key`, `plan_sha256` - -### plan-review.yml - -Downloads plan text from S3, runs LLM risk analysis via Bedrock Claude Haiku. Posts review to PR (on PRs) or Slack (on direct push). +LLM review is now performed inline within this workflow (no separate `plan-review.yml`). After planning, it downloads the plan text and runs `scripts/review-plan.py` via Bedrock Claude Haiku for risk analysis. 
**OIDC role:** `javabin-ci-app-{repo}` -**Outputs:** `risk_level` (LOW, MEDIUM, HIGH, FAILED) +**Outputs:** `has_changes`, `plan_key`, `plan_sha256`, `risk_level` ### tf-apply.yml

From 9e201fe19d85ce77e9817cdf99d8f846958459e8 Mon Sep 17 00:00:00 2001
From: Alexander Amiri
Date: Thu, 12 Mar 2026 17:50:09 +0100
Subject: [PATCH 2/2] Fix: run CI on all PRs so required status check always reports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove paths filter from pull_request trigger — the workflow now runs
on every PR but skips plan/review steps when no infra files changed.
This ensures "Terraform Plan" status check always reports, even for
doc-only PRs, so the ruleset requirement is satisfied.
---
Review note: the "Check for infra changes" step was amended so that it
fails open (runs the plan) on events without a base commit and when the
base commit cannot be diffed. Hunk offsets and the diffstat below are
updated to match; the post-image blob id on the `index` line is stale
and must be regenerated with `git format-patch` before relying on
3-way application.

 .github/workflows/platform-ci.yml | 34 ++++++++++++++++++++++++++++------
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/platform-ci.yml b/.github/workflows/platform-ci.yml
index 60da484..fa22973 100644
--- a/.github/workflows/platform-ci.yml
+++ b/.github/workflows/platform-ci.yml
@@ -9,11 +9,6 @@ on:
       - 'scripts/**'
       - '.github/workflows/**'
   pull_request:
-    paths:
-      - 'terraform/platform/**'
-      - 'terraform/lambda-src/**'
-      - 'scripts/**'
-      - '.github/workflows/**'
   schedule:
     # Drift detection — Monday 06:00 UTC
     - cron: '0 6 * * 1'
@@ -48,13 +43,36 @@ jobs:
       risk_level: ${{ steps.review.outputs.risk_level }}
     steps:
       - uses: actions/checkout@v6
+        with:
+          fetch-depth: 0
+
+      - name: Check for infra changes
+        id: changes
+        env:
+          BASE: ${{ github.event.pull_request.base.sha || github.event.before }}
+        run: |
+          # Only push and pull_request events carry a base commit. Any other
+          # trigger, or a base git cannot resolve (new branch, force-push),
+          # fails open: run the plan rather than silently skip it.
+          has_infra_changes=true
+          if [ "$GITHUB_EVENT_NAME" = "push" ] || [ "$GITHUB_EVENT_NAME" = "pull_request" ]; then
+            if ! CHANGED="$(git diff --name-only "$BASE" HEAD)"; then
+              echo "Cannot diff against '$BASE' — running plan to be safe."
+            elif ! grep -qE '^(terraform/|scripts/|\.github/workflows/)' <<<"$CHANGED"; then
+              has_infra_changes=false
+              echo "No infrastructure changes — skipping plan."
+            fi
+          fi
+          echo "has_infra_changes=$has_infra_changes" >> "$GITHUB_OUTPUT"
 
       - uses: hashicorp/setup-terraform@v4
+        if: steps.changes.outputs.has_infra_changes == 'true'
         with:
           terraform_version: "1.7"
           terraform_wrapper: false
 
       - name: Configure AWS credentials via OIDC
+        if: steps.changes.outputs.has_infra_changes == 'true'
         uses: aws-actions/configure-aws-credentials@v6
         with:
           role-to-assume: arn:aws:iam::${{ env.AWS_ACCOUNT_ID }}:role/javabin-ci-infra-plan
@@ -62,19 +80,23 @@ jobs:
           role-session-name: javabin-platform-plan-${{ github.run_id }}
 
       - name: Terraform Init
+        if: steps.changes.outputs.has_infra_changes == 'true'
         working-directory: ${{ env.TF_ROOT }}
         run: terraform init -input=false
 
       - name: Terraform Validate
+        if: steps.changes.outputs.has_infra_changes == 'true'
         working-directory: ${{ env.TF_ROOT }}
         run: terraform validate
 
       - name: Terraform Format
+        if: steps.changes.outputs.has_infra_changes == 'true'
         working-directory: ${{ env.TF_ROOT }}
         run: terraform fmt -recursive
 
       - name: Terraform Plan
         id: plan
+        if: steps.changes.outputs.has_infra_changes == 'true'
         run: scripts/run-plan.sh "${{ env.TF_ROOT }}" -lock-timeout=5m
 
       - name: Upload plan and output to S3
@@ -91,7 +113,7 @@ jobs:
           retention-days: 1
 
       - name: Post plan to PR
-        if: github.event_name == 'pull_request'
+        if: github.event_name == 'pull_request' && steps.changes.outputs.has_infra_changes == 'true'
         env:
           GH_TOKEN: ${{ github.token }}
           PR_NUMBER: ${{ github.event.pull_request.number }}