diff --git a/scripts/sweep-classifiers/README.adoc b/scripts/sweep-classifiers/README.adoc
new file mode 100644
index 00000000..5457cdad
--- /dev/null
+++ b/scripts/sweep-classifiers/README.adoc
@@ -0,0 +1,103 @@
+= Sweep classifiers
+:SPDX-License-Identifier: PMPL-1.0-or-later
+
+Per-template classifiers used to triage the wrapper-sweep work that
+follows each of the foundational reusable PRs filed against
+`standards`:
+
+[cols="2,1,3", options="header"]
+|===
+| Classifier | Reusable PR | Template
+| `classify-mirror.sh` | #187 | `mirror.yml` (7-forge mirror bundle)
+| `classify-secret-scanner.sh` | #190 | `secret-scanner.yml` (trufflehog/gitleaks/rust/shell)
+| `classify-codeql.sh` | #192 | `codeql.yml` (CodeQL security analysis)
+| `classify-hypatia-scan.sh` | #193 | `hypatia-scan.yml` (Hypatia neurosymbolic scan)
+|===
+
+== Pipeline
+
+. Paginate `gh api /search/code` for the template across the estate.
+. Group results by blob SHA; fetch each unique blob exactly once.
+. Classify each blob (job-set match / line-count band / language matrix).
+. Emit per-(repo, path) TSV: `<repo>\t<path>\t<sha>\t<class>\t<reason>\t<lines>\t<extra>`.
+
+The expensive step (blob fetch) is cached in `$BLOBS_DIR`, so reruns
+are fast.
+
+== Usage
+
+[source,bash]
+----
+# 1. Paginate the search (one-time per template):
+gh api --paginate -X GET '/search/code' \
+  -f q='filename:mirror.yml path:.github/workflows org:hyperpolymath' \
+  --jq '.items[] | {repo: .repository.name, sha: .sha}' \
+  > /tmp/mirror-full.json
+
+# 2. Run the classifier:
+BLOBS_DIR=/tmp/mirror-blobs \
+  ./classify-mirror.sh /tmp/mirror-full.json > /tmp/mirror-classification.tsv
+
+# 3. Summarise (the class is column 4, after repo, path and sha):
+awk -F'\t' '{print $4}' /tmp/mirror-classification.tsv | sort | uniq -c | sort -rn
+----
+
+== Nested-path caveat — 3 layers of undercount
+
+`gh api /search/code` undercounts monorepo-nested workflow files in
+three compounding ways:
+
+. *Layer 1 — `path:` filter is a prefix match.* `path:.github/workflows`
+  excludes paths like `a2ml/bindings/deno/.github/workflows/mirror.yml`
+  outright.
+. *Layer 2 — even broad queries are org-scope-truncated.* Removing the
+  `path:` filter and running `filename:codeql.yml org:hyperpolymath`
+  returns ~190 paths, but per-repo enumeration of just
+  `developer-ecosystem` finds 170 codeql.yml files. The endpoint
+  silently caps results once it hits internal limits. Empirically
+  validated against `scorecard.yml`: broad query saw 152 paths, all
+  top-level; per-repo enumeration revealed 626 nested copies the
+  broad query missed entirely.
+. *Layer 3 — nested workflows do not execute.* GitHub Actions only runs
+  workflows from the repo-root `.github/workflows/` directory. Nested
+  copies are inert vendored templates or stale leftovers. Security-driven
+  campaigns (e.g. propagating a missing guardrail) do NOT close
+  additional attack surface via nested wrappers. Single-source-of-truth
+  campaigns still benefit.
+
+For accurate counts, use `list-workflow-paths.sh` (this directory),
+which walks `gh repo list` and queries each repo's Git Tree API
+directly — bypassing Layers 1 and 2.
+
+[source,bash]
+----
+./list-workflow-paths.sh codeql.yml \
+  > /tmp/drift-survey/codeql-all-tuples.tsv
+
+# Top-level vs nested split:
+awk -F'\t' '{print $4}' /tmp/drift-survey/codeql-all-tuples.tsv \
+  | sort | uniq -c
+----
+
+Output format: `<repo>\t<path>\t<sha>\t<scope>`.
+
+The legacy `gh api /search/code` query is still useful as a quick first
+look, but `list-workflow-paths.sh` is the source of truth for any
+sweep-planning decision.
+
+== Output classes (varies by classifier)
+
+* `TRIVIAL` / `TRIVIAL_CURRENT` / `TRIVIAL_DEFAULT` — canonical match;
+  pure mechanical wrapper conversion.
+* `MISSING_*` / `SLIM_*` / `OLDER_*` — propagation lag; same wrapper as
+  TRIVIAL upgrades the workflow body on first run after merge.
+* `SINGLE_NON_DEFAULT` / `MULTI_LANGUAGE` — requires one or more input
+  overrides at the call site.
+* `NEEDS_REVIEW` — extra jobs, line count out of any known band, or
+  custom workflow body. Per-repo diff required before wrapper.
+
+== Related
+
+* `scripts/apply-baseline.sh` — Hypatia-finding baseline filter (paired
+  with `scripts/tests/apply-baseline-test.sh`).
+* PRs #187, #190, #192, #193 — the reusables these classifiers triage.
diff --git a/scripts/sweep-classifiers/_lib.sh b/scripts/sweep-classifiers/_lib.sh
new file mode 100755
index 00000000..865523e2
--- /dev/null
+++ b/scripts/sweep-classifiers/_lib.sh
@@ -0,0 +1,28 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# _lib.sh — Shared helpers for classify-*.sh scripts.
+#
+# Sourced by every classifier. Not directly executable.
+#
+# Provides: normalize_input
+#
+# Emits one TSV row per workflow file: "<repo>\t<path>\t<sha>".
+# Accepts two input shapes auto-detected from the first byte of the file:
+#
+#   1. JSONL from `gh api /search/code` (legacy):
+#        {"repo":"foo","sha":"abc123"}
+#      Emitted path is empty (no nested-copy info available).
+#
+#   2. 
TSV from `list-workflow-paths.sh` (preferred — handles nested):
+#        foo<TAB><path><TAB>abc123<TAB>top-level|nested
+#      Emitted path is the original `<path>` value; the scope column
+#      is dropped.
+#
+# normalize_input <file>
+#   Arguments: $1 - path to the JSONL or TSV input file.
+#   Outputs:   one "<repo>\t<path>\t<sha>" row per record on stdout.
+#   Returns:   1, with a message on stderr, when <file> is not readable;
+#              otherwise the exit status of the jq/awk invocation.
+#
+# NOTE: JSONL records without a `path` key yield an empty <path> column,
+# i.e. two adjacent tabs. `IFS=$'\t' read` collapses adjacent tabs; split
+# such rows with awk -F'\t' or on a non-whitespace separator.
+
+normalize_input() {
+  local file="$1"
+  if [[ ! -r "$file" ]]; then
+    printf 'normalize_input: cannot read input file: %s\n' "$file" >&2
+    return 1
+  fi
+  # A leading "{" marks JSONL; anything else is treated as TSV.
+  if [[ "$(head -c1 "$file")" == "{" ]]; then
+    jq -r '[.repo, (.path // ""), .sha] | @tsv' "$file"
+  else
+    # Rows with fewer than three tab-separated fields are dropped.
+    awk -F'\t' 'NF >= 3 { print $1 "\t" $2 "\t" $3 }' "$file"
+  fi
+}
diff --git a/scripts/sweep-classifiers/classify-codeql.sh b/scripts/sweep-classifiers/classify-codeql.sh
new file mode 100755
index 00000000..bc3fd341
--- /dev/null
+++ b/scripts/sweep-classifiers/classify-codeql.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-codeql.sh — classify per-repo codeql.yml for #192 sweep.
+#
+# Canonical: 49 lines, single `analyze` job, matrix language=javascript-typescript
+# build-mode=none.
+#
+# Classes:
+#   TRIVIAL_DEFAULT    — single javascript-typescript language, default wrapper
+#   SINGLE_NON_DEFAULT — single rust or actions language, override wrapper
+#   MULTI_LANGUAGE     — 2+ languages, multi-call wrapper
+#   NEEDS_REVIEW       — NONE language matrix, unknown single language, or
+#                        large custom workflow (more than 80 lines)
+#
+# Usage: classify-codeql.sh [input-file]
+#   $BLOBS_DIR — blob cache directory (default /tmp/drift-survey/codeql-blobs)
+#   $GH_ORG    — organisation owning the repos (default hyperpolymath)
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_lib.sh
+. "$SCRIPT_DIR/_lib.sh"
+
+INPUT="${1:-/tmp/drift-survey/codeql-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/codeql-blobs}"
+GH_ORG="${GH_ORG:-hyperpolymath}"
+mkdir -p "$BLOBS_DIR"
+
+# classify_blob <blob-file>
+#   Prints one "<class>\t<reason>\t<lines>\t<languages>" row for a fetched
+#   codeql.yml blob. Separators are written as printf '\t' escapes so the
+#   columns are real TSV fields regardless of how this file is re-indented.
+classify_blob() {
+  local blob="$1" langs lines
+  # Values of every `- language:` entry, de-quoted, sorted and comma-joined.
+  # `|| true`: grep exits 1 when nothing matches; the empty result is
+  # mapped to NONE below instead of surfacing as a pipefail error.
+  langs=$(grep -E '^[[:space:]]*- language:' "$blob" 2>/dev/null \
+    | sed -e 's/.*language:[[:space:]]*//' -e 's/[[:space:]]*$//' \
+          -e "s/^[\"']//" -e "s/[\"']\$//" \
+    | sort -u | paste -sd, - || true)
+  langs="${langs:-NONE}"
+  # Arithmetic expansion strips the padding some `wc` implementations emit.
+  lines=$(( $(wc -l < "$blob") ))
+
+  if [ "$lines" -gt 80 ]; then
+    printf 'NEEDS_REVIEW\tcustom_workflow_%s_lines\t%s\t%s\n' "$lines" "$lines" "$langs"
+    return
+  fi
+  case "$langs" in
+    javascript-typescript) printf 'TRIVIAL_DEFAULT\t-\t%s\t%s\n' "$lines" "$langs" ;;
+    rust|actions) printf 'SINGLE_NON_DEFAULT\t+language=%s\t%s\t%s\n' "$langs" "$lines" "$langs" ;;
+    NONE) printf 'NEEDS_REVIEW\tno_language_matrix\t%s\t%s\n' "$lines" "$langs" ;;
+    *,*) printf 'MULTI_LANGUAGE\t+per-language-wrapper\t%s\t%s\n' "$lines" "$langs" ;;
+    *) printf 'NEEDS_REVIEW\tunknown_lang:%s\t%s\t%s\n' "$langs" "$lines" "$langs" ;;
+  esac
+}
+
+echo "[fetch] retrieving unique blobs..." >&2
+# Unusable rows (empty/null sha, empty repo) are dropped in awk: with tab in
+# IFS, `read` strips a leading tab, which would shift the repo into $sha.
+normalize_input "$INPUT" \
+  | awk -F'\t' '$1 != "" && $3 != "" && $3 != "null" { print $3 "\t" $1 }' \
+  | sort -u \
+  | while IFS=$'\t' read -r sha repo; do
+      blob_file="$BLOBS_DIR/$sha.yml"
+      [ -s "$blob_file" ] && continue
+      # Decode into a side file and rename only on success, so a failed or
+      # partial fetch never leaves a non-empty *.yml that later runs would
+      # classify and treat as cached.
+      tmp_file="$blob_file.part"
+      if gh api "/repos/$GH_ORG/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+          | base64 -d > "$tmp_file" && [ -s "$tmp_file" ]; then
+        mv -f -- "$tmp_file" "$blob_file"
+      else
+        rm -f -- "$tmp_file"
+        echo "::warn fetch failed for $sha ($repo)" >&2
+      fi
+    done
+
+declare -A SHA_CLASS
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue   # also skips the literal pattern when nothing matches
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+# Emit one row per (repo, path). Rows are re-delimited with US (0x1f) before
+# `read`: tab is IFS whitespace, so bash would collapse the two adjacent tabs
+# of an empty <path> and shift <sha> into $path.
+fetch_failed=$'NEEDS_REVIEW\tfetch_failed\t-\t-'
+normalize_input "$INPUT" | tr '\t' '\037' | while IFS=$'\037' read -r repo path sha; do
+  if [ -z "$sha" ] || [ "$sha" = "null" ]; then
+    printf '%s\t%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$path" "$sha"
+    continue
+  fi
+  printf '%s\t%s\t%s\t%s\n' "$repo" "$path" "$sha" "${SHA_CLASS[$sha]:-$fetch_failed}"
+done
diff --git a/scripts/sweep-classifiers/classify-hypatia-scan.sh b/scripts/sweep-classifiers/classify-hypatia-scan.sh
new file mode 100755
index 00000000..cb2fe912
--- /dev/null
+++ b/scripts/sweep-classifiers/classify-hypatia-scan.sh
@@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-hypatia-scan.sh — classify per-repo hypatia-scan.yml for #193 sweep.
+#
+# Canonical (standards/.github/workflows/hypatia-scan.yml): 416 lines,
+# single `scan` job. Every estate variant carries the same job set;
+# drift is pure propagation lag, manifested as line count.
+#
+# Classes (bands as implemented by the `case` patterns in classify_blob):
+#   TRIVIAL_CURRENT      — 410-429 lines (current canonical-1)
+#   SLIM_PROPAGATION_LAG — 240-259 lines (older slimmer version)
+#   OLDER_SLIM           — 205-239 lines (much older slim version)
+#   NEEDS_REVIEW         — anything else (multi-job, line count out of band,
+#                          or self-source like hypatia/standards repos)
+#
+# NOTE(review): this header previously listed 410-416 / 245-260 / 205-235,
+# which did not match the patterns; confirm which bands are intended.
+#
+# Usage: classify-hypatia-scan.sh [input-file]
+#   $BLOBS_DIR — blob cache directory (default /tmp/drift-survey/hypatia-blobs)
+#   $GH_ORG    — organisation owning the repos (default hyperpolymath)
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_lib.sh
+. "$SCRIPT_DIR/_lib.sh"
+
+INPUT="${1:-/tmp/drift-survey/hypatia-scan-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/hypatia-blobs}"
+GH_ORG="${GH_ORG:-hyperpolymath}"
+mkdir -p "$BLOBS_DIR"
+
+# classify_blob <blob-file>
+#   Prints one "<class>\t<reason>\t<lines>\t<jobs>" row for a fetched
+#   hypatia-scan.yml blob: NEEDS_REVIEW unless the only job is `scan`,
+#   otherwise a class chosen by line-count band.
+classify_blob() {
+  local blob="$1" jobs lines
+  # Job ids are the key-only lines under `jobs:` at the indentation of the
+  # first such key, so any consistent indent width is accepted.
+  jobs=$(awk '
+    /^jobs:[[:space:]]*$/ { in_jobs = 1; next }
+    in_jobs && /^[A-Za-z]/ { exit }          # next column-0 key ends the block
+    in_jobs && /^ +[a-z][a-z0-9_-]*:[[:space:]]*$/ {
+      match($0, /^ +/)
+      if (!job_indent) job_indent = RLENGTH  # indent of the first job id
+      if (RLENGTH != job_indent) next        # deeper key-only line, not a job
+      sub(/^ +/, ""); sub(/:.*/, ""); print
+    }
+  ' "$blob" 2>/dev/null | sort -u | paste -sd, -)
+  jobs="${jobs:-NONE}"
+  # Arithmetic expansion strips the padding some `wc` implementations emit.
+  lines=$(( $(wc -l < "$blob") ))
+  if [ "$jobs" != "scan" ]; then
+    printf 'NEEDS_REVIEW\tjob_set:%s\t%s\t%s\n' "$jobs" "$lines" "$jobs"
+    return
+  fi
+  case "$lines" in
+    41[0-9]|42[0-9]) printf 'TRIVIAL_CURRENT\t-\t%s\tscan\n' "$lines" ;;
+    24[0-9]|25[0-9]) printf 'SLIM_PROPAGATION_LAG\t+upgrade-to-current\t%s\tscan\n' "$lines" ;;
+    20[5-9]|21[0-9]|22[0-9]|23[0-9]) printf 'OLDER_SLIM\t+upgrade-to-current\t%s\tscan\n' "$lines" ;;
+    *) printf 'NEEDS_REVIEW\tline_count_oob:%s\t%s\tscan\n' "$lines" "$lines" ;;
+  esac
+}
+
+echo "[fetch] retrieving unique blobs..." >&2
+# Unusable rows (empty/null sha, empty repo) are dropped in awk: with tab in
+# IFS, `read` strips a leading tab, which would shift the repo into $sha.
+normalize_input "$INPUT" \
+  | awk -F'\t' '$1 != "" && $3 != "" && $3 != "null" { print $3 "\t" $1 }' \
+  | sort -u \
+  | while IFS=$'\t' read -r sha repo; do
+      blob_file="$BLOBS_DIR/$sha.yml"
+      [ -s "$blob_file" ] && continue
+      # Decode into a side file and rename only on success, so a failed or
+      # partial fetch never leaves a non-empty *.yml that later runs would
+      # classify and treat as cached.
+      tmp_file="$blob_file.part"
+      if gh api "/repos/$GH_ORG/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+          | base64 -d > "$tmp_file" && [ -s "$tmp_file" ]; then
+        mv -f -- "$tmp_file" "$blob_file"
+      else
+        rm -f -- "$tmp_file"
+        echo "::warn fetch failed for $sha ($repo)" >&2
+      fi
+    done
+
+declare -A SHA_CLASS
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue   # also skips the literal pattern when nothing matches
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+# Emit one row per (repo, path). Rows are re-delimited with US (0x1f) before
+# `read`: tab is IFS whitespace, so bash would collapse the two adjacent tabs
+# of an empty <path> and shift <sha> into $path.
+fetch_failed=$'NEEDS_REVIEW\tfetch_failed\t-\t-'
+normalize_input "$INPUT" | tr '\t' '\037' | while IFS=$'\037' read -r repo path sha; do
+  if [ -z "$sha" ] || [ "$sha" = "null" ]; then
+    printf '%s\t%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$path" "$sha"
+    continue
+  fi
+  printf '%s\t%s\t%s\t%s\n' "$repo" "$path" "$sha" "${SHA_CLASS[$sha]:-$fetch_failed}"
+done
diff --git a/scripts/sweep-classifiers/classify-mirror.sh b/scripts/sweep-classifiers/classify-mirror.sh
new file mode 100755
index 00000000..8d579ccf
--- /dev/null
+++ b/scripts/sweep-classifiers/classify-mirror.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-mirror.sh — classify per-repo mirror.yml for the #187 sweep.
+#
+# Reads $1 (default /tmp/drift-survey/mirror-full.json, the output of
+# `gh api --paginate /search/code` filtered to mirror.yml in
+# .github/workflows). For each unique blob SHA, fetches the blob once
+# (cached in $BLOBS_DIR) and classifies as:
+#
+#   TRIVIAL      — exactly the canonical 7 forges, line count in band,
+#                  no extra/missing top-level jobs.
+#   NEEDS_REVIEW — anything else (extra forges, missing forges,
+#                  structural diff). Reported with reason.
+#
+# Output: TSV to stdout, one row per (repo, path):
+#   <repo>\t<path>\t<sha>\t<class>\t<reason>\t<lines>\t<forges>
+#
+# Honours $GH_ORG (default: hyperpolymath) when fetching blobs.
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_lib.sh
+. "$SCRIPT_DIR/_lib.sh"
+
+INPUT="${1:-/tmp/drift-survey/mirror-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/blobs}"
+GH_ORG="${GH_ORG:-hyperpolymath}"
+mkdir -p "$BLOBS_DIR"
+
+# Canonical 7 forges (alphabetical for stable compare).
+CANON_FORGES="bitbucket,codeberg,disroot,gitea,gitlab,radicle,sourcehut"
+LINE_MIN=130
+LINE_MAX=160
+
+# classify_blob <blob-file>
+#   Prints one "<class>\t<reason>\t<lines>\t<forges>" row for a fetched
+#   mirror.yml blob.
+classify_blob() {
+  local blob="$1"
+  local job_ids forges line_count extra_jobs
+
+  # Every job id under `jobs:`, in file order. The block runs from `jobs:`
+  # to the next column-0 key or EOF; job ids are the key-only lines at the
+  # indentation of the first such key, so any consistent indent width works.
+  job_ids=$(awk '
+    /^jobs:[[:space:]]*$/ { in_jobs = 1; next }
+    in_jobs && /^[A-Za-z]/ { exit }          # next column-0 key ends the block
+    in_jobs && /^ +[a-z][a-z0-9_-]*:[[:space:]]*$/ {
+      match($0, /^ +/)
+      if (!job_indent) job_indent = RLENGTH  # indent of the first job id
+      if (RLENGTH != job_indent) next        # deeper key-only line, not a job
+      sub(/^ +/, ""); sub(/:.*/, ""); print
+    }
+  ' "$blob" 2>/dev/null)
+
+  # Forges: jobs named mirror-<forge>. The whole suffix is kept, so a
+  # mirror-* job with digits or hyphens in its name shows up as a forge-set
+  # mismatch instead of being counted as neither a forge nor an extra job.
+  forges=$(awk '/^mirror-/ { sub(/^mirror-/, ""); print }' <<< "$job_ids" \
+    | LC_ALL=C sort -u | paste -sd, -)
+  forges="${forges:-NONE}"
+
+  # Any non-mirror top-level jobs, in file order.
+  extra_jobs=$(awk '$0 != "" && !/^mirror-/' <<< "$job_ids" | paste -sd, -)
+
+  # Arithmetic expansion strips the padding some `wc` implementations emit.
+  line_count=$(( $(wc -l < "$blob") ))
+
+  if [ "$forges" != "$CANON_FORGES" ]; then
+    printf 'NEEDS_REVIEW\tforge_set:%s\t%s\t%s\n' "$forges" "$line_count" "$forges"
+    return
+  fi
+  if [ -n "$extra_jobs" ]; then
+    printf 'NEEDS_REVIEW\textra_jobs:%s\t%s\t%s\n' "$extra_jobs" "$line_count" "$forges"
+    return
+  fi
+  if [ "$line_count" -lt "$LINE_MIN" ] || [ "$line_count" -gt "$LINE_MAX" ]; then
+    printf 'NEEDS_REVIEW\tline_count_oob:%s\t%s\t%s\n' "$line_count" "$line_count" "$forges"
+    return
+  fi
+  printf 'TRIVIAL\t-\t%s\t%s\n' "$line_count" "$forges"
+}
+
+# 1. Fetch each unique SHA's blob (cached).
+echo "[fetch] unique SHAs to retrieve..." >&2
+# Unusable rows (empty/null sha, empty repo) are dropped in awk: with tab in
+# IFS, `read` strips a leading tab, which would shift the repo into $sha.
+normalize_input "$INPUT" \
+  | awk -F'\t' '$1 != "" && $3 != "" && $3 != "null" { print $3 "\t" $1 }' \
+  | sort -u \
+  | while IFS=$'\t' read -r sha repo; do
+      blob_file="$BLOBS_DIR/$sha.yml"
+      [ -s "$blob_file" ] && continue
+      # Decode into a side file and rename only on success, so a failed or
+      # partial fetch never leaves a non-empty *.yml that later runs would
+      # classify and treat as cached.
+      tmp_file="$blob_file.part"
+      if gh api "/repos/$GH_ORG/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+          | base64 -d > "$tmp_file" && [ -s "$tmp_file" ]; then
+        mv -f -- "$tmp_file" "$blob_file"
+      else
+        rm -f -- "$tmp_file"
+        echo "::warn fetch failed for $sha ($repo)" >&2
+      fi
+    done
+
+# 2. Classify each unique SHA, cache in a map.
+declare -A SHA_CLASS
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue   # also skips the literal pattern when nothing matches
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+# 3. 
Emit per-(repo, path) TSV: repo \t path \t sha \t class \t reason \t lines \t forges
+# Rows are re-delimited with US (0x1f) before `read`: tab is IFS whitespace,
+# so bash would collapse the two adjacent tabs of an empty <path> and shift
+# <sha> into $path.
+fetch_failed=$'NEEDS_REVIEW\tfetch_failed\t-\t-'
+normalize_input "$INPUT" | tr '\t' '\037' | while IFS=$'\037' read -r repo path sha; do
+  if [ -z "$sha" ] || [ "$sha" = "null" ]; then
+    printf '%s\t%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$path" "$sha"
+    continue
+  fi
+  printf '%s\t%s\t%s\t%s\n' "$repo" "$path" "$sha" "${SHA_CLASS[$sha]:-$fetch_failed}"
+done
diff --git a/scripts/sweep-classifiers/classify-secret-scanner.sh b/scripts/sweep-classifiers/classify-secret-scanner.sh
new file mode 100755
index 00000000..03326073
--- /dev/null
+++ b/scripts/sweep-classifiers/classify-secret-scanner.sh
@@ -0,0 +1,78 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-secret-scanner.sh — classify per-repo secret-scanner.yml for #190 sweep.
+#
+# Canonical (standards/.github/workflows/secret-scanner.yml @ 080c394):
+#   4 jobs — trufflehog, gitleaks, rust-secrets, shell-secrets
+#
+# Classifies each estate copy as:
+#   TRIVIAL               — exactly the canonical 4 jobs, no extras.
+#   MISSING_SHELL_SECRETS — 3 canonical jobs (trufflehog/gitleaks/rust-secrets);
+#                           the post-Cloudflare-leak guardrail is absent.
+#   MISSING_RUST_SHELL    — 2 jobs (trufflehog/gitleaks); slim variant.
+#   NEEDS_REVIEW          — extra jobs, missing trufflehog/gitleaks core,
+#                           or otherwise unrecognised job set. The line
+#                           count is reported but not used to classify.
+#
+# Usage: classify-secret-scanner.sh [input-file]
+#   $BLOBS_DIR — blob cache directory (default /tmp/drift-survey/secret-blobs)
+#   $GH_ORG    — organisation owning the repos (default hyperpolymath)
+
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# shellcheck source=_lib.sh
+. "$SCRIPT_DIR/_lib.sh"
+
+INPUT="${1:-/tmp/drift-survey/secret-scanner-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/secret-blobs}"
+GH_ORG="${GH_ORG:-hyperpolymath}"
+mkdir -p "$BLOBS_DIR"
+
+# Known job sets, sorted and comma-joined exactly as extract_jobs prints them.
+ALL_CANONICAL="gitleaks,rust-secrets,shell-secrets,trufflehog"
+MISSING_SHELL="gitleaks,rust-secrets,trufflehog"
+SLIM_2JOB="gitleaks,trufflehog"
+
+# extract_jobs <blob-file>
+#   Prints the sorted, comma-joined ids of the jobs under `jobs:`. Job ids are
+#   the key-only lines at the indentation of the first such key, so any
+#   consistent indent width is accepted.
+extract_jobs() {
+  awk '
+    /^jobs:[[:space:]]*$/ { in_jobs = 1; next }
+    in_jobs && /^[A-Za-z]/ { exit }          # next column-0 key ends the block
+    in_jobs && /^ +[a-z][a-z0-9_-]*:[[:space:]]*$/ {
+      match($0, /^ +/)
+      if (!job_indent) job_indent = RLENGTH  # indent of the first job id
+      if (RLENGTH != job_indent) next        # deeper key-only line, not a job
+      sub(/^ +/, ""); sub(/:.*/, ""); print
+    }
+  ' "$1" 2>/dev/null | LC_ALL=C sort -u | paste -sd, -
+}
+
+# classify_blob <blob-file>
+#   Prints one "<class>\t<reason>\t<lines>\t<jobs>" row for a fetched
+#   secret-scanner.yml blob, chosen purely by its job set.
+classify_blob() {
+  local blob="$1" jobs lines
+  jobs=$(extract_jobs "$blob")
+  jobs="${jobs:-NONE}"
+  # Arithmetic expansion strips the padding some `wc` implementations emit.
+  lines=$(( $(wc -l < "$blob") ))
+
+  case "$jobs" in
+    "$ALL_CANONICAL") printf 'TRIVIAL\t-\t%s\t%s\n' "$lines" "$jobs" ;;
+    "$MISSING_SHELL") printf 'MISSING_SHELL_SECRETS\t+shell-secrets\t%s\t%s\n' "$lines" "$jobs" ;;
+    "$SLIM_2JOB") printf 'MISSING_RUST_SHELL\t+rust-secrets,+shell-secrets\t%s\t%s\n' "$lines" "$jobs" ;;
+    *) printf 'NEEDS_REVIEW\tjob_set:%s\t%s\t%s\n' "$jobs" "$lines" "$jobs" ;;
+  esac
+}
+
+# 1. Fetch unique blobs (cached).
+echo "[fetch] retrieving unique blobs..." >&2
+# Unusable rows (empty/null sha, empty repo) are dropped in awk: with tab in
+# IFS, `read` strips a leading tab, which would shift the repo into $sha.
+normalize_input "$INPUT" \
+  | awk -F'\t' '$1 != "" && $3 != "" && $3 != "null" { print $3 "\t" $1 }' \
+  | sort -u \
+  | while IFS=$'\t' read -r sha repo; do
+      blob_file="$BLOBS_DIR/$sha.yml"
+      [ -s "$blob_file" ] && continue
+      # Decode into a side file and rename only on success, so a failed or
+      # partial fetch never leaves a non-empty *.yml that later runs would
+      # classify and treat as cached.
+      tmp_file="$blob_file.part"
+      if gh api "/repos/$GH_ORG/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+          | base64 -d > "$tmp_file" && [ -s "$tmp_file" ]; then
+        mv -f -- "$tmp_file" "$blob_file"
+      else
+        rm -f -- "$tmp_file"
+        echo "::warn fetch failed for $sha ($repo)" >&2
+      fi
+    done
+
+# 2. Classify unique blobs.
+declare -A SHA_CLASS
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue   # also skips the literal pattern when nothing matches
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+# 3. Per-(repo, path) TSV.
+# Rows are re-delimited with US (0x1f) before `read`: tab is IFS whitespace,
+# so bash would collapse the two adjacent tabs of an empty <path> and shift
+# <sha> into $path.
+fetch_failed=$'NEEDS_REVIEW\tfetch_failed\t-\t-'
+normalize_input "$INPUT" | tr '\t' '\037' | while IFS=$'\037' read -r repo path sha; do
+  if [ -z "$sha" ] || [ "$sha" = "null" ]; then
+    printf '%s\t%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$path" "$sha"
+    continue
+  fi
+  printf '%s\t%s\t%s\t%s\n' "$repo" "$path" "$sha" "${SHA_CLASS[$sha]:-$fetch_failed}"
+done
diff --git a/scripts/sweep-classifiers/list-workflow-paths.sh b/scripts/sweep-classifiers/list-workflow-paths.sh
new file mode 100755
index 00000000..7e3fc5b7
--- /dev/null
+++ b/scripts/sweep-classifiers/list-workflow-paths.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# list-workflow-paths.sh — enumerate every (repo, path, blob-sha) tuple
+# for a given workflow filename across the hyperpolymath estate, INCLUDING
+# nested copies in monorepos.
+#
+# Why this exists (the 3-layer GitHub Code Search undercount):
+#
+#   Layer 1 — path-prefix filter excludes nested workflows.
+#     `path:.github/workflows` matches the path PREFIX, so files like
+#     `a2ml/bindings/deno/.github/workflows/secret-scanner.yml` (inside
+#     the standards monorepo) are NEVER returned by the path-filtered
+#     query. Dropping the `path:` filter helps, but introduces Layer 2.
+#
+#   Layer 2 — org-scoped /search/code is severely truncated.
+#     A broad `filename:codeql.yml org:hyperpolymath` query against the
+#     code-search endpoint returns ~190 paths total, but per-repo
+#     enumeration of just developer-ecosystem reveals 170 codeql.yml
+#     files — i.e. the broad query missed ~518 nested copies
+#     estate-wide. The endpoint silently drops results once it hits
+#     internal caps.
+#
+#   Layer 3 — nested workflows don't actually execute.
+#     GitHub Actions only runs workflows from the repo-root
+#     `.github/workflows/` directory. Nested copies are inert
+#     (vendored templates or stale leftovers). For SECURITY-driven
+#     campaigns (e.g. the secret-scanner shell-secrets propagation),
+#     nested copies do NOT close additional attack surface. For
+#     SINGLE-SOURCE-OF-TRUTH campaigns they still matter.
+#
+# This script bypasses Layer 1 and Layer 2 by walking the repo list
+# directly and using each repo's git-tree API (which IS exhaustive).
+#
+# Usage:
+#   ./list-workflow-paths.sh <workflow-filename>
+#
+# Output (TSV to stdout, one row per matching file):
+#   <repo>\t<path>\t<sha>\t<scope>      (scope is "top-level" or "nested")
+#
+# Example:
+#   ./list-workflow-paths.sh codeql.yml \
+#     > /tmp/drift-survey/codeql-all-tuples.tsv
+#
+# Notes:
+#   - Requires `gh` CLI + jq.
+#   - Honours $GH_ORG (default: hyperpolymath).
+#   - Skips archived repos and repos without a default branch.
+#   - Output is sorted by repo then path (byte order).
+#   - Repos whose tree cannot be fetched or parsed, and repos whose tree
+#     listing is truncated, are reported on stderr.
+#   - Each repo costs one Git Tree API call (cheap; uses core
+#     bucket, not code_search). 300 repos ≈ 300 API calls;
+#     under the 5000/hr core limit.
+
+set -euo pipefail
+
+WORKFLOW="${1:?usage: $0 <workflow-filename>}"
+GH_ORG="${GH_ORG:-hyperpolymath}"
+
+# 1. Get every non-archived repo in the org.
+gh repo list "$GH_ORG" --limit 1000 --no-archived \
+    --json name,defaultBranchRef \
+    --jq '.[] | select(.defaultBranchRef != null) |
+          "\(.name)\t\(.defaultBranchRef.name)"' \
+  | sort \
+  | while IFS=$'\t' read -r repo branch; do
+      [ -n "$branch" ] || continue   # no default branch name: nothing to walk
+      # 2. Pull the full recursive tree for the default branch. A failed
+      #    call (e.g. an empty repo) is reported rather than swallowed, so
+      #    an undercount is never silent.
+      if ! tree_json=$(gh api "repos/$GH_ORG/$repo/git/trees/$branch?recursive=1" 2>/dev/null); then
+        echo "::warn tree fetch failed for $repo ($branch); repo skipped" >&2
+        continue
+      fi
+      # A truncated listing still succeeds but carries "truncated": true.
+      if jq -e '.truncated == true' >/dev/null 2>&1 <<< "$tree_json"; then
+        echo "::warn tree for $repo is truncated; matches may be missing" >&2
+      fi
+      jq -r --arg wf "$WORKFLOW" --arg r "$repo" '
+        .tree[]?
+        | select(.type == "blob")
+        | select(.path | endswith("/.github/workflows/" + $wf)
+                        or . == ".github/workflows/" + $wf)
+        | "\($r)\t\(.path)\t\(.sha)\t"
+          + (if .path == ".github/workflows/" + $wf
+             then "top-level"
+             else "nested"
+             end)' <<< "$tree_json" \
+        || echo "::warn could not parse tree for $repo; repo skipped" >&2
+    done \
+  | LC_ALL=C sort -t $'\t' -k1,1 -k2,2
+
+# Caveat — if a repo's tree is large enough to hit the 100k-entry
+# truncation limit, the recursive=1 call returns `"truncated": true`
+# and may drop entries beyond the cap. The loop above warns on stderr
+# when that happens. None of the hyperpolymath monorepos are believed
+# to hit this, but if a repo's blob count grows past 100k, switch to
+# per-subdir tree calls.