hyperpolymath · hyperpolymath · May 26, 2026 · May 26, 2026
diff --git a/scripts/sweep-classifiers/README.adoc b/scripts/sweep-classifiers/README.adoc
@@ -0,0 +1,78 @@
+= Sweep classifiers
+:SPDX-License-Identifier: PMPL-1.0-or-later
+
+Per-template classifiers used to triage the wrapper-sweep work that
+follows each of the foundational reusable PRs filed against
+`standards`:
+
+[cols="2,1,3", options="header"]
+|===
+| Classifier | Reusable PR | Template
+| `classify-mirror.sh`         | #187 | `mirror.yml` (7-forge mirror bundle)
+| `classify-secret-scanner.sh` | #190 | `secret-scanner.yml` (trufflehog/gitleaks/rust/shell)
+| `classify-codeql.sh`         | #192 | `codeql.yml` (CodeQL security analysis)
+| `classify-hypatia-scan.sh`   | #193 | `hypatia-scan.yml` (Hypatia neurosymbolic scan)
+|===
+
+== Pipeline
+
+. Paginate `gh api /search/code` for the template across the estate.
+. Group results by blob SHA; fetch each unique blob exactly once.
+. Classify each blob (job-set match / line-count band / language matrix).
+. Emit per-repo TSV: `<repo>\t<sha>\t<class>\t<reason>\t<lines>\t<details>`.
+
+The expensive step (blob fetch) is cached in `$BLOBS_DIR`, so reruns
+are fast.
+
+== Usage
+
+[source,bash]
+----
+# 1. Paginate the search (one-time per template):
+gh api --paginate -X GET '/search/code' \
+  -f q='filename:mirror.yml path:.github/workflows org:hyperpolymath' \
+  --jq '.items[] | {repo: .repository.name, sha: .sha}' \
+  > /tmp/mirror-full.json
+
+# 2. Run the classifier:
+BLOBS_DIR=/tmp/mirror-blobs \
+  ./classify-mirror.sh /tmp/mirror-full.json > /tmp/mirror-classification.tsv
+
+# 3. Summarise:
+awk -F'\t' '{print $3}' /tmp/mirror-classification.tsv | sort | uniq -c | sort -rn
+----
+
+== Nested-path caveat
+
+`path:.github/workflows` in the GitHub code-search API matches the path
+PREFIX. Monorepo nested workflow files (e.g.,
+`a2ml/bindings/deno/.github/workflows/mirror.yml`) are EXCLUDED by that
+filter. To include them, use:
+
+[source,bash]
+----
+gh api --paginate -X GET '/search/code' \
+  -f q='filename:mirror.yml org:hyperpolymath' \
+  --jq '.items[] | {repo: .repository.name, path: .path, sha: .sha}' \
+  > /tmp/mirror-full-with-nested.json
+----
+
+Then keep only entries whose `.path` is `.github/workflows/<template>.yml`
+or ends in `/.github/workflows/<template>.yml` (root-level and nested files).
+
+== Output classes (varies by classifier)
+
+* `TRIVIAL` / `TRIVIAL_CURRENT` / `TRIVIAL_DEFAULT` — canonical match;
+  pure mechanical wrapper conversion.
+* `MISSING_*` / `SLIM_*` / `OLDER_*` — propagation lag; the same wrapper used
+  for TRIVIAL upgrades the workflow body on first run after merge.
+* `SINGLE_NON_DEFAULT` / `MULTI_LANGUAGE` — requires one or more input
+  overrides at the call site.
+* `NEEDS_REVIEW` — extra jobs, line count out of any known band, or
+  custom workflow body. Per-repo diff required before wrapper.
+
+== Related
+
+* `scripts/apply-baseline.sh` — Hypatia-finding baseline filter (paired
+  with `scripts/tests/apply-baseline-test.sh`).
+* PRs #187, #190, #192, #193 — the reusables these classifiers triage.
diff --git a/scripts/sweep-classifiers/classify-codeql.sh b/scripts/sweep-classifiers/classify-codeql.sh
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-codeql.sh — classify per-repo codeql.yml for #192 sweep.
+#
+# Canonical: 49 lines, single `analyze` job, matrix language=javascript-typescript
+# build-mode=none.
+#
+# Classes:
+#   TRIVIAL_DEFAULT       — single javascript-typescript language, default wrapper
+#   SINGLE_NON_DEFAULT    — single rust or actions language, override wrapper
+#   MULTI_LANGUAGE        — 2+ languages, multi-call wrapper
+#   NEEDS_REVIEW          — NONE language matrix, or large custom workflow
+
+set -euo pipefail
+REPOS_JSON="${1:-/tmp/drift-survey/codeql-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/codeql-blobs}"
+mkdir -p "$BLOBS_DIR"
+
+classify_blob() {
+  local blob="$1" langs lines jobs
+  jobs=$(awk '/^jobs:[[:space:]]*$/{in_jobs=1; next} in_jobs && /^[A-Za-z]/{exit} in_jobs && /^  [a-z][a-z0-9_-]*:[[:space:]]*$/{sub(/^  /,""); sub(/:.*/,""); print}' "$blob" 2>/dev/null | sort -u | paste -sd, -)
+  langs=$(grep -E "^\s*- language:" "$blob" 2>/dev/null | sed 's/.*language: //; s/[[:space:]]*$//' | sort -u | paste -sd, -)
+  lines=$(wc -l < "$blob")
+  langs="${langs:-NONE}"
+
+  if [ "$lines" -gt 80 ]; then
+    echo "NEEDS_REVIEW	custom_workflow_${lines}_lines	$lines	$langs"
+    return
+  fi
+  case "$langs" in
+    javascript-typescript)  echo "TRIVIAL_DEFAULT	-	$lines	$langs" ;;
+    rust|actions)           echo "SINGLE_NON_DEFAULT	+language=$langs	$lines	$langs" ;;
+    NONE)                   echo "NEEDS_REVIEW	no_language_matrix	$lines	$langs" ;;
+    *,*)                    echo "MULTI_LANGUAGE	+per-language-wrapper	$lines	$langs" ;;
+    *)                      echo "NEEDS_REVIEW	unknown_lang:$langs	$lines	$langs" ;;
+  esac
+}
+
+echo "[fetch] retrieving unique blobs..." >&2
+jq -r '.sha' "$REPOS_JSON" | sort -u | grep -v '^null$' | while read -r sha; do
+  blob_file="$BLOBS_DIR/$sha.yml"
+  [ -s "$blob_file" ] && continue
+  repo=$(jq -r --arg s "$sha" 'select(.sha == $s) | .repo' "$REPOS_JSON" | head -1)
+  gh api "/repos/hyperpolymath/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+    | base64 -d > "$blob_file" || echo "::warn fetch failed for $sha ($repo)" >&2
+done
+
+declare -A SHA_CLASS
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+jq -r '"\(.repo)\t\(.sha)"' "$REPOS_JSON" | while IFS=$'\t' read -r repo sha; do
+  [ -z "$sha" ] || [ "$sha" = "null" ] && { printf '%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$sha"; continue; }
+  printf '%s\t%s\t%s\n' "$repo" "$sha" "${SHA_CLASS[$sha]:-NEEDS_REVIEW	fetch_failed	-	-}"
+done
diff --git a/scripts/sweep-classifiers/classify-hypatia-scan.sh b/scripts/sweep-classifiers/classify-hypatia-scan.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-hypatia-scan.sh — classify per-repo hypatia-scan.yml for #193 sweep.
+#
+# Usage: [BLOBS_DIR=<dir>] classify-hypatia-scan.sh [<repos.json>]
+#   <repos.json> is a stream of {"repo": ..., "sha": ...} objects.
+#
+# Canonical (standards/.github/workflows/hypatia-scan.yml): 416 lines,
+# single `scan` job. Every estate variant carries the same job set;
+# drift is pure propagation lag, manifested as line count.
+#
+# Classes (line bands are exactly the ones classify_blob matches):
+#   TRIVIAL_CURRENT          — 410-429 lines (around the 416-line canonical)
+#   SLIM_PROPAGATION_LAG     — 240-259 lines (older slimmer version)
+#   OLDER_SLIM               — 205-239 lines (much older slim version)
+#   NEEDS_REVIEW             — anything else (multi-job, line count out of band,
+#                              self-source like hypatia/standards repos, blob missing)
+#
+# Output: TSV to stdout, one row per repo:
+#   <repo>\t<sha>\t<class>\t<reason>\t<lines>\t<jobs>
+
+set -euo pipefail
+REPOS_JSON="${1:-/tmp/drift-survey/hypatia-scan-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/hypatia-blobs}"
+mkdir -p "$BLOBS_DIR"
+
+# Columns 3-6 for a SHA whose blob is absent from the cache after the fetch step.
+FETCH_FAILED=$'NEEDS_REVIEW\tfetch_failed\t-\t-'
+
+# extract_jobs FILE
+#   Prints the sorted, comma-joined ids of the jobs under the top-level `jobs:`
+#   key. Assumes job ids are indented by exactly two spaces.
+extract_jobs() {
+  awk '
+    /^jobs:[[:space:]]*$/ { in_jobs = 1; next }
+    in_jobs && /^[A-Za-z]/ { exit }   # next column-0 key ends the block
+    in_jobs && /^  [A-Za-z_][A-Za-z0-9_-]*:[[:space:]]*(#.*)?$/ {
+      sub(/^  /, ""); sub(/:.*$/, ""); print
+    }
+  ' "$1" 2>/dev/null | LC_ALL=C sort -u | paste -sd, -
+}
+
+# classify_blob FILE
+#   Prints "<class>\t<reason>\t<lines>\t<jobs>" for one workflow blob.
+classify_blob() {
+  local blob="$1" jobs lines
+  jobs=$(extract_jobs "$blob")
+  jobs="${jobs:-NONE}"
+  # Arithmetic expansion strips BSD `wc -l` padding, which would defeat the globs.
+  lines=$(( $(wc -l < "$blob") ))
+  if [ "$jobs" != "scan" ]; then
+    printf 'NEEDS_REVIEW\tjob_set:%s\t%s\t%s\n' "$jobs" "$lines" "$jobs"
+    return
+  fi
+  case "$lines" in
+    41[0-9]|42[0-9])  printf 'TRIVIAL_CURRENT\t-\t%s\tscan\n' "$lines" ;;
+    24[0-9]|25[0-9])  printf 'SLIM_PROPAGATION_LAG\t+upgrade-to-current\t%s\tscan\n' "$lines" ;;
+    20[5-9]|21[0-9]|22[0-9]|23[0-9])  printf 'OLDER_SLIM\t+upgrade-to-current\t%s\tscan\n' "$lines" ;;
+    *)  printf 'NEEDS_REVIEW\tline_count_oob:%s\t%s\tscan\n' "$lines" "$lines" ;;
+  esac
+}
+
+# 1. Fetch each unique blob once. A non-empty $BLOBS_DIR/<sha>.yml is the cache;
+#    downloads land in a .part file first so a failed or truncated fetch is
+#    never mistaken for a cached blob on the next run.
+echo "[fetch] retrieving unique blobs..." >&2
+jq -r '.sha // empty' "$REPOS_JSON" | LC_ALL=C sort -u | while read -r sha; do
+  blob_file="$BLOBS_DIR/$sha.yml"
+  [ -s "$blob_file" ] && continue
+  # Any repo carrying this SHA will do; first() avoids a `| head -1` SIGPIPE.
+  repo=$(jq -rn --arg s "$sha" 'first(inputs | select(.sha == $s) | .repo)' "$REPOS_JSON")
+  if gh api "/repos/hyperpolymath/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+    | base64 -d > "$blob_file.part" && [ -s "$blob_file.part" ]; then
+    mv -- "$blob_file.part" "$blob_file"
+  else
+    rm -f -- "$blob_file.part"
+    echo "::warn fetch failed for $sha ($repo)" >&2
+  fi
+done
+
+# 2. Classify every cached blob; the file's basename is its SHA.
+declare -A SHA_CLASS=()
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+# 3. Emit one TSV row per repo. jq renders a JSON null SHA as the string "null".
+jq -r '"\(.repo)\t\(.sha)"' "$REPOS_JSON" | while IFS=$'\t' read -r repo sha; do
+  if [ -z "$sha" ] || [ "$sha" = "null" ]; then
+    printf '%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$sha"
+    continue
+  fi
+  printf '%s\t%s\t%s\n' "$repo" "$sha" "${SHA_CLASS[$sha]:-$FETCH_FAILED}"
+done
diff --git a/scripts/sweep-classifiers/classify-mirror.sh b/scripts/sweep-classifiers/classify-mirror.sh
@@ -0,0 +1,113 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-mirror.sh — classify per-repo mirror.yml for the #187 sweep.
+#
+# Usage: [BLOBS_DIR=<dir>] classify-mirror.sh [<repos.json>]
+#
+# Reads <repos.json> (default /tmp/drift-survey/mirror-full.json), the output
+# of `gh api --paginate /search/code` filtered to mirror.yml in
+# .github/workflows and reduced to a stream of {"repo": ..., "sha": ...}
+# objects. For each unique blob SHA, fetches the blob once (cached in
+# $BLOBS_DIR) and classifies as:
+#
+#   TRIVIAL       — exactly the canonical 7 forges, line count in band,
+#                   no extra/missing top-level jobs.
+#   NEEDS_REVIEW  — anything else (extra forges, missing forges,
+#                   structural diff, blob missing). Reported with reason.
+#
+# Output: TSV to stdout, one row per repo:
+#   <repo>\t<sha>\t<classification>\t<reason>\t<line_count>\t<forges>
+
+set -euo pipefail
+
+REPOS_JSON="${1:-/tmp/drift-survey/mirror-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/blobs}"
+mkdir -p "$BLOBS_DIR"
+
+# Canonical 7 forges (alphabetical for stable compare).
+CANON_FORGES="bitbucket,codeberg,disroot,gitea,gitlab,radicle,sourcehut"
+LINE_MIN=130
+LINE_MAX=160
+
+# Columns 3-6 for a SHA whose blob is absent from the cache after the fetch step.
+FETCH_FAILED=$'NEEDS_REVIEW\tfetch_failed\t-\t-'
+
+# list_jobs FILE
+#   Prints the ids of the jobs under the top-level `jobs:` key, one per line,
+#   sorted and de-duplicated. Assumes job ids are indented by exactly two spaces.
+list_jobs() {
+  awk '
+    /^jobs:[[:space:]]*$/ { in_jobs = 1; next }
+    in_jobs && /^[A-Za-z]/ { exit }   # next column-0 key ends the block
+    in_jobs && /^  [A-Za-z_][A-Za-z0-9_-]*:[[:space:]]*(#.*)?$/ {
+      sub(/^  /, ""); sub(/:.*$/, ""); print
+    }
+  ' "$1" 2>/dev/null | LC_ALL=C sort -u
+}
+
+# classify_blob FILE
+#   Prints "<classification>\t<reason>\t<line_count>\t<forges>" for one blob.
+classify_blob() {
+  local blob="$1"
+  local job_ids forges extra_jobs line_count
+
+  job_ids=$(list_jobs "$blob")
+  # Forges: every job id starting with `mirror-`, prefix stripped. Ids such as
+  # `mirror-sr-ht` are included, so they show up as a forge-set mismatch.
+  forges=$(sed -n 's/^mirror-//p' <<<"$job_ids" | paste -sd, -)
+  forges="${forges:-NONE}"
+  # Everything else is an extra job; grep exits 1 when there are none.
+  extra_jobs=$(grep -v '^mirror-' <<<"$job_ids" | paste -sd, - || true)
+  # Arithmetic expansion strips the leading padding BSD `wc -l` prints.
+  line_count=$(( $(wc -l < "$blob") ))
+
+  if [ "$forges" != "$CANON_FORGES" ]; then
+    printf 'NEEDS_REVIEW\tforge_set:%s\t%s\t%s\n' "$forges" "$line_count" "$forges"
+    return
+  fi
+  if [ -n "$extra_jobs" ]; then
+    printf 'NEEDS_REVIEW\textra_jobs:%s\t%s\t%s\n' "$extra_jobs" "$line_count" "$forges"
+    return
+  fi
+  if [ "$line_count" -lt "$LINE_MIN" ] || [ "$line_count" -gt "$LINE_MAX" ]; then
+    printf 'NEEDS_REVIEW\tline_count_oob:%s\t%s\t%s\n' "$line_count" "$line_count" "$forges"
+    return
+  fi
+  printf 'TRIVIAL\t-\t%s\t%s\n' "$line_count" "$forges"
+}
+
+# 1. Fetch each unique blob once. A non-empty $BLOBS_DIR/<sha>.yml is the cache;
+#    downloads land in a .part file first so a failed or truncated fetch is
+#    never mistaken for a cached blob on the next run.
+echo "[fetch] unique SHAs to retrieve..." >&2
+jq -r '.sha // empty' "$REPOS_JSON" | LC_ALL=C sort -u | while read -r sha; do
+  blob_file="$BLOBS_DIR/$sha.yml"
+  [ -s "$blob_file" ] && continue
+  # Any repo carrying this SHA will do; first() avoids a `| head -1` SIGPIPE.
+  repo=$(jq -rn --arg s "$sha" 'first(inputs | select(.sha == $s) | .repo)' "$REPOS_JSON")
+  if gh api "/repos/hyperpolymath/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+    | base64 -d > "$blob_file.part" && [ -s "$blob_file.part" ]; then
+    mv -- "$blob_file.part" "$blob_file"
+  else
+    rm -f -- "$blob_file.part"
+    echo "::warn fetch failed for $sha ($repo)" >&2
+  fi
+done
+
+# 2. Classify every cached blob; the file's basename is its SHA.
+declare -A SHA_CLASS=()
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+# 3. Emit one TSV row per repo. jq renders a JSON null SHA as the string "null".
+jq -r '"\(.repo)\t\(.sha)"' "$REPOS_JSON" | while IFS=$'\t' read -r repo sha; do
+  if [ -z "$sha" ] || [ "$sha" = "null" ]; then
+    printf '%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$sha"
+    continue
+  fi
+  printf '%s\t%s\t%s\n' "$repo" "$sha" "${SHA_CLASS[$sha]:-$FETCH_FAILED}"
+done
diff --git a/scripts/sweep-classifiers/classify-secret-scanner.sh b/scripts/sweep-classifiers/classify-secret-scanner.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# SPDX-License-Identifier: PMPL-1.0-or-later
+# classify-secret-scanner.sh — classify per-repo secret-scanner.yml for #190 sweep.
+#
+# Usage: [BLOBS_DIR=<dir>] classify-secret-scanner.sh [<repos.json>]
+#   <repos.json> is a stream of {"repo": ..., "sha": ...} objects.
+#
+# Canonical (standards/.github/workflows/secret-scanner.yml @ 080c394):
+#   4 jobs — trufflehog, gitleaks, rust-secrets, shell-secrets
+#
+# Classifies each estate copy as:
+#   TRIVIAL                  — exactly the canonical 4 jobs, no extras.
+#   MISSING_SHELL_SECRETS    — 3 canonical jobs (trufflehog/gitleaks/rust-secrets);
+#                              the post-Cloudflare-leak guardrail is absent.
+#   MISSING_RUST_SHELL       — 2 jobs (trufflehog/gitleaks); slim variant with
+#                              both rust-secrets and shell-secrets absent.
+#   NEEDS_REVIEW             — extra jobs, missing trufflehog/gitleaks core,
+#                              blob missing, or otherwise unrecognised job set.
+#
+# Output: TSV to stdout, one row per repo:
+#   <repo>\t<sha>\t<class>\t<reason>\t<lines>\t<jobs>
+
+set -euo pipefail
+
+REPOS_JSON="${1:-/tmp/drift-survey/secret-scanner-full.json}"
+BLOBS_DIR="${BLOBS_DIR:-/tmp/drift-survey/secret-blobs}"
+mkdir -p "$BLOBS_DIR"
+
+# Recognised job sets, in the sorted comma-joined form extract_jobs prints.
+ALL_CANONICAL="gitleaks,rust-secrets,shell-secrets,trufflehog"
+MISSING_SHELL="gitleaks,rust-secrets,trufflehog"
+SLIM_2JOB="gitleaks,trufflehog"
+
+# Columns 3-6 for a SHA whose blob is absent from the cache after the fetch step.
+FETCH_FAILED=$'NEEDS_REVIEW\tfetch_failed\t-\t-'
+
+# extract_jobs FILE
+#   Prints the sorted, comma-joined ids of the jobs under the top-level `jobs:`
+#   key. Assumes job ids are indented by exactly two spaces.
+extract_jobs() {
+  awk '
+    /^jobs:[[:space:]]*$/ { in_jobs = 1; next }
+    in_jobs && /^[A-Za-z]/ { exit }   # next column-0 key ends the block
+    in_jobs && /^  [A-Za-z_][A-Za-z0-9_-]*:[[:space:]]*(#.*)?$/ {
+      sub(/^  /, ""); sub(/:.*$/, ""); print
+    }
+  ' "$1" 2>/dev/null | LC_ALL=C sort -u | paste -sd, -
+}
+
+# classify_blob FILE
+#   Prints "<class>\t<reason>\t<lines>\t<jobs>" for one workflow blob.
+classify_blob() {
+  local blob="$1" jobs lines
+  jobs=$(extract_jobs "$blob")
+  jobs="${jobs:-NONE}"
+  # Arithmetic expansion strips the leading padding BSD `wc -l` prints.
+  lines=$(( $(wc -l < "$blob") ))
+
+  case "$jobs" in
+    "$ALL_CANONICAL")  printf 'TRIVIAL\t-\t%s\t%s\n' "$lines" "$jobs" ;;
+    "$MISSING_SHELL")  printf 'MISSING_SHELL_SECRETS\t+shell-secrets\t%s\t%s\n' "$lines" "$jobs" ;;
+    "$SLIM_2JOB")      printf 'MISSING_RUST_SHELL\t+rust-secrets,+shell-secrets\t%s\t%s\n' "$lines" "$jobs" ;;
+    *)                 printf 'NEEDS_REVIEW\tjob_set:%s\t%s\t%s\n' "$jobs" "$lines" "$jobs" ;;
+  esac
+}
+
+# 1. Fetch each unique blob once. A non-empty $BLOBS_DIR/<sha>.yml is the cache;
+#    downloads land in a .part file first so a failed or truncated fetch is
+#    never mistaken for a cached blob on the next run.
+echo "[fetch] retrieving unique blobs..." >&2
+jq -r '.sha // empty' "$REPOS_JSON" | LC_ALL=C sort -u | while read -r sha; do
+  blob_file="$BLOBS_DIR/$sha.yml"
+  [ -s "$blob_file" ] && continue
+  # Any repo carrying this SHA will do; first() avoids a `| head -1` SIGPIPE.
+  repo=$(jq -rn --arg s "$sha" 'first(inputs | select(.sha == $s) | .repo)' "$REPOS_JSON")
+  if gh api "/repos/hyperpolymath/$repo/git/blobs/$sha" --jq '.content' 2>/dev/null \
+    | base64 -d > "$blob_file.part" && [ -s "$blob_file.part" ]; then
+    mv -- "$blob_file.part" "$blob_file"
+  else
+    rm -f -- "$blob_file.part"
+    echo "::warn fetch failed for $sha ($repo)" >&2
+  fi
+done
+
+# 2. Classify every cached blob; the file's basename is its SHA.
+declare -A SHA_CLASS=()
+echo "[classify] classifying unique blobs..." >&2
+for blob in "$BLOBS_DIR"/*.yml; do
+  [ -s "$blob" ] || continue
+  sha=$(basename "$blob" .yml)
+  SHA_CLASS[$sha]=$(classify_blob "$blob")
+done
+
+# 3. Emit one TSV row per repo. jq renders a JSON null SHA as the string "null".
+jq -r '"\(.repo)\t\(.sha)"' "$REPOS_JSON" | while IFS=$'\t' read -r repo sha; do
+  if [ -z "$sha" ] || [ "$sha" = "null" ]; then
+    printf '%s\t%s\tNEEDS_REVIEW\tnull_sha\t-\t-\n' "$repo" "$sha"
+    continue
+  fi
+  printf '%s\t%s\t%s\n' "$repo" "$sha" "${SHA_CLASS[$sha]:-$FETCH_FAILED}"
+done