diff --git a/AGENTS.md b/AGENTS.md index 8fab5fee7..212e964ed 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -61,6 +61,80 @@ If you genuinely need a doc-kind not yet listed: 2. Add a routing entry to `.cleo/canon.yml`. 3. Re-run `pnpm --filter @cleocode/cleo run build` and the gate stays green. +## Skill Maintenance Discipline (Saga T9799 · Epic T9960) + +Canonical `ct-*` skills under `packages/skills/skills/` describe how CLEO +works to every spawned agent. When core systems change but the skill text +does not, agents act on stale instructions. The T9540 release-system +rewrite is the canonical example — `ct-release-orchestrator` still +described the deleted `cleo release ship` monolith for weeks. + +**Rule**: when you edit a path declared in the coverage map, you MUST +update the corresponding skill in the same PR — or acknowledge the +deferral explicitly. + +### Coverage map (internal-only — never ships) + +`packages/skills/internal/skill-coverage.yml` maps each canonical skill +to the code paths it documents. The file is listed in +`packages/skills/.npmignore` so it never lands in the published +`@cleocode/skills` bundle. Sibling tooling under `packages/skills/internal/` +(drift-check.mjs, the git-hook runners) is excluded the same way. + +### Shipped per-skill metadata (in SKILL.md frontmatter) + +Every canonical SKILL.md MUST carry a `metadata:` block. These fields +DO ship — they are documentation, not enforcement, and they let +consumers and the curator daemon reason about freshness: + +```yaml +metadata: + version: 2.0.0 # bump on every material change + lastReviewed: 2026-05-21 # ISO date — set by the human/agent who reviewed + stability: stable # experimental | stable | deprecated +``` + +### Enforcement (T9960 — in progress) + +- **Pre-commit hook**: regenerates `packages/skills/skills.json` from + SKILL.md frontmatter. Drift between frontmatter and `skills.json` fails + the hook. 
+- **CI gate `Skill Drift Check`**: scans the PR diff against the coverage + map. If a covered path is touched but the matching SKILL.md is not, the + PR fails with `E_SKILL_DRIFT_UNACKNOWLEDGED`. +- **Trailer override**: a commit trailer + `Skill-Drift-Acknowledged: <reason>` bypasses the gate AND auto-files a + sentient follow-up task for retroactive skill update. +- **Tier-0 skills get NO override** — `ct-cleo`, `ct-orchestrator`, + `ct-task-executor`, `ct-dev-workflow`, `ct-documentor`, and + `CLEO-INJECTION.md` must be kept current in the same PR. The trailer + is rejected for these. + +### Tier-0 core skills (strict — no drift tolerated) + +These define the agent protocol surface. Edit the matching code path, +edit the skill in the same PR. Period. + +- `ct-cleo` — CLI protocol + session lifecycle +- `ct-orchestrator` — spawn/delegation contract +- `ct-task-executor` — worker contract +- `ct-dev-workflow` — commit / branch / release flow +- `ct-documentor` — docs SSoT routing +- `CLEO-INJECTION.md` (template, not a skill folder) — protocol injected + into every spawn prompt + +### Tier-1 LOOM-stage skills (trailer override permitted) + +One per LOOM stage in `packages/core/src/validation/protocols/`. Same +rule applies; trailer override is allowed for non-blocking deferrals. + +### Internal-only validator + +`ct-skill-validator` ships with `disable-model-invocation: true` and is +listed in `packages/skills/.npmignore` so it never reaches consumers. +It is the developer-side toolchain that drives the drift check, depth +audit, and quality evals. 
+ ## Worktree Location (ADR-055 · Saga T9800 · Decision D009) ALL git worktrees provisioned for agent tasks MUST live under the canonical diff --git a/packages/cleo/src/dispatch/domains/__tests__/docs-slug-type-project.test.ts b/packages/cleo/src/dispatch/domains/__tests__/docs-slug-type-project.test.ts index d7f3523bb..baebbae3c 100644 --- a/packages/cleo/src/dispatch/domains/__tests__/docs-slug-type-project.test.ts +++ b/packages/cleo/src/dispatch/domains/__tests__/docs-slug-type-project.test.ts @@ -290,4 +290,112 @@ describe('docs dispatch — slug/type/project (T9636/T9637/T9638)', () => { const fetched = await handler.query('fetch', { attachmentRef: prefix }); expect(fetched.success).toBe(true); }); + + // ──────────────────────────────────────────────────────────────────────── + // T9965 — docs.fetch round-trip: slug + type + kind non-null regression + // + // Guards against regression where docs.fetch returned all-null payload + // ({slug:null, type:null, kind:null}) when resolving by slug or uuid. 
+ // ──────────────────────────────────────────────────────────────────────── + + it('T9965 RT-1: add → list → fetch by slug: slug/type/kind all non-null in metadata', async () => { + const handler = new DocsHandler(); + + // AC1/AC2: docs add with slug + type + const add = await handler.mutate('add', { + ownerId: 'T9965', + file: fixtureA, + slug: 'sg-arch-solid-session-1-handoff', + type: 'handoff', + }); + expect(add.success, `docs.add failed: ${JSON.stringify(add.error)}`).toBe(true); + const addData = add.data as { attachmentId: string; slug: string; type: string }; + expect(addData.slug).toBe('sg-arch-solid-session-1-handoff'); + expect(addData.type).toBe('handoff'); + + // Verify list shows slug + type + const list = await handler.query('list', { task: 'T9965' }); + expect(list.success).toBe(true); + const listData = list.data as { + attachments: Array<{ id: string; slug?: string; type?: string; kind: string }>; + }; + expect(listData.attachments).toHaveLength(1); + const listedRow = listData.attachments[0]; + expect(listedRow?.slug).toBe('sg-arch-solid-session-1-handoff'); + expect(listedRow?.type).toBe('handoff'); + expect(listedRow?.kind).toBe('local-file'); + + // AC1: fetch by slug returns populated slug/type/kind + const fetchBySlug = await handler.query('fetch', { + attachmentRef: 'sg-arch-solid-session-1-handoff', + }); + expect( + fetchBySlug.success, + `docs.fetch by slug failed: ${JSON.stringify(fetchBySlug.error)}`, + ).toBe(true); + const slugData = fetchBySlug.data as { + metadata: { id: string; slug?: string; type?: string; kind: string }; + sizeBytes: number; + inlined: boolean; + }; + expect(slugData.metadata.id).toBe(addData.attachmentId); + // Regression assertions: these were null before T9965 fix + expect(slugData.metadata.slug, 'metadata.slug must not be null after fetch by slug').toBe( + 'sg-arch-solid-session-1-handoff', + ); + expect(slugData.metadata.type, 'metadata.type must not be null after fetch by slug').toBe( + 'handoff', + ); 
+ expect(slugData.metadata.kind, 'metadata.kind must not be null after fetch by slug').toBe( + 'local-file', + ); + expect(slugData.sizeBytes).toBeGreaterThan(0); + }); + + it('T9965 RT-2: add → fetch by uuid: slug/type/kind all non-null in metadata', async () => { + const handler = new DocsHandler(); + + // AC2: docs add with slug + type, then fetch by UUID + const add = await handler.mutate('add', { + ownerId: 'T9965-uuid', + file: fixtureB, + slug: 'handoff-for-uuid-test', + type: 'handoff', + }); + expect(add.success, `docs.add failed: ${JSON.stringify(add.error)}`).toBe(true); + const addData = add.data as { + attachmentId: string; + sha256: string; + slug: string; + type: string; + }; + expect(addData.slug).toBe('handoff-for-uuid-test'); + expect(addData.type).toBe('handoff'); + + // AC2: fetch by UUID returns populated slug/type/kind + const fetchByUuid = await handler.query('fetch', { + attachmentRef: addData.attachmentId, + }); + expect( + fetchByUuid.success, + `docs.fetch by uuid failed: ${JSON.stringify(fetchByUuid.error)}`, + ).toBe(true); + const uuidData = fetchByUuid.data as { + metadata: { id: string; slug?: string; type?: string; kind: string }; + sizeBytes: number; + inlined: boolean; + }; + expect(uuidData.metadata.id).toBe(addData.attachmentId); + // Regression assertions: these were null before T9965 fix + expect(uuidData.metadata.slug, 'metadata.slug must not be null after fetch by uuid').toBe( + 'handoff-for-uuid-test', + ); + expect(uuidData.metadata.type, 'metadata.type must not be null after fetch by uuid').toBe( + 'handoff', + ); + expect(uuidData.metadata.kind, 'metadata.kind must not be null after fetch by uuid').toBe( + 'local-file', + ); + expect(uuidData.sizeBytes).toBeGreaterThan(0); + }); }); diff --git a/packages/cleo/vitest.config.ts b/packages/cleo/vitest.config.ts index 5dc9f6864..6e51ae213 100644 --- a/packages/cleo/vitest.config.ts +++ b/packages/cleo/vitest.config.ts @@ -262,6 +262,22 @@ export default defineConfig({ 
).pathname, '@cleocode/core': new URL('../../packages/core/src/index.ts', import.meta.url).pathname, '@cleocode/lafs': new URL('../../packages/lafs/src/index.ts', import.meta.url).pathname, + // T9965: @a2a-js/sdk is a dep of @cleocode/lafs; in worktrees it resolves + // through lafs/node_modules rather than root node_modules. + '@a2a-js/sdk': new URL( + '../../packages/lafs/node_modules/@a2a-js/sdk/dist/index.js', + import.meta.url, + ).pathname, + // T9965: js-yaml + @iarna/toml are deps of @cleocode/caamp; in worktrees + // they resolve through caamp/node_modules rather than root node_modules. + '@iarna/toml': new URL( + '../../packages/caamp/node_modules/@iarna/toml/toml.js', + import.meta.url, + ).pathname, + 'js-yaml': new URL( + '../../packages/caamp/node_modules/js-yaml/index.js', + import.meta.url, + ).pathname, // T1113: nexus code sub-path exports — legacy dist-path imports used in nexus.ts '@cleocode/nexus/dist/src/code/unfold.js': new URL( '../../packages/nexus/src/code/unfold.ts', diff --git a/packages/skills/skills/ct-skill-validator/SKILL.md b/packages/skills/skills/ct-skill-validator/SKILL.md index f765f6bb6..b5dea4e8c 100644 --- a/packages/skills/skills/ct-skill-validator/SKILL.md +++ b/packages/skills/skills/ct-skill-validator/SKILL.md @@ -34,14 +34,18 @@ python ${CLAUDE_SKILL_DIR}/scripts/validate.py --json # Deep body quality audit (optional, run alongside validate.py): python ${CLAUDE_SKILL_DIR}/scripts/audit_body.py -# Manifest alignment check: -python ${CLAUDE_SKILL_DIR}/scripts/check_manifest.py +# Manifest alignment check: bundled into validate.py Tier 4. 
Use: +# python validate.py --manifest --dispatch-config # Progressive-disclosure depth check (T9684 — CI gate): python ${CLAUDE_SKILL_DIR}/scripts/check_depth.py # Repo-wide depth sweep: python ${CLAUDE_SKILL_DIR}/scripts/check_depth.py --all + +# Allowlist audit (CI / cron — exit 1 on findings): +python ${CLAUDE_SKILL_DIR}/scripts/check_depth.py --audit-allowlist +python ${CLAUDE_SKILL_DIR}/scripts/check_depth.py --audit-allowlist --json ``` **Depth rule (T9684):** A skill PASSES when ANY of: @@ -54,6 +58,13 @@ Pre-existing stubs are allowlisted with follow-up task IDs in `scripts/check_depth.py::ALLOWLIST`. Gold-standard skills: `ct-orchestrator` (9 refs) and `ct-skill-creator` (7 refs). +**Allowlist hygiene:** every entry carries `last_reviewed: YYYY-MM-DD HH:MM:SS`. +`check_depth.py` runs a silent background audit on every invocation and emits +WARNs to stderr for malformed or stale (> 30 days) entries. Use +`--audit-allowlist` for an explicit pass that exits 1 on any finding — +suitable for a CI cron job. The threshold is tunable via +`ALLOWLIST_STALE_DAYS` at the top of `check_depth.py`. + The depth check runs on every PR touching `packages/skills/skills/**` via `.github/workflows/skills-depth-check.yml`. @@ -119,40 +130,68 @@ Repeat until verdict is `PASS` or `PASS_WITH_WARNINGS`. WARN is acceptable; ERRO ## Phase 3: Quality A/B Eval Tests whether the skill actually improves agent output quality vs. no skill context. -Uses the eval infrastructure from ct-skill-creator. +Phase 3 is **delegated** — `ct-skill-validator` does static analysis; runtime +quality evals live in a dedicated skill (`skill-evaluator` preferred, +`ct-skill-creator` as legacy fallback). + +> **Scope boundary:** `ct-skill-validator` is *static* — it checks structure, +> frontmatter, body, manifest, depth, ecosystem fit. For deep runtime A/B +> benchmarking, regression detection, and auto-improvement, the dispatcher +> below routes to `skill-evaluator`, which owns that workflow end-to-end. 
+ +The two eval files in `evals/` serve different purposes: +- `evals/trigger_queries.json` — trigger queries (does the description activate correctly?) +- `evals/quality_evals.json` — output-quality scenarios (does the validator produce the right report?) + +### Dispatch (no hardcoded cross-skill paths) + +`scripts/run_quality_eval.py` uses `_skill_finder.py` to dynamically locate +the eval skill at runtime. It searches: + +1. `$SKILL_FINDER_PATH` (colon-separated override), then roots listed in `~/.claude/skill-finder-paths.txt` +2. Direct sibling of this skill +3. `<this-skill>/../../skills/<name>/` (CLEO / awesome-skills layouts) +4. Walk-up ancestors + their project-shaped children (cross-project) +5. `~/.claude/skills/<name>/` + +Show what would be used (without running anything): +```bash +python ${CLAUDE_SKILL_DIR}/scripts/run_quality_eval.py --list +``` **Trigger accuracy** — does the skill description trigger correctly? ```bash -python ${CLAUDE_SKILL_DIR}/../ct-skill-creator/scripts/run_eval.py \ - --eval-set ${CLAUDE_SKILL_DIR}/evals/eval_set.json \ - --skill-path ${CLAUDE_SKILL_DIR} +python ${CLAUDE_SKILL_DIR}/scripts/run_quality_eval.py \ + --trigger --evals ${CLAUDE_SKILL_DIR}/evals/trigger_queries.json ``` -**Optimize description** (if trigger accuracy < 80%): +**Quality eval** (with/without skill A/B + grading + blind comparison): ```bash -python ${CLAUDE_SKILL_DIR}/../ct-skill-creator/scripts/run_loop.py \ - --eval-set ${CLAUDE_SKILL_DIR}/evals/eval_set.json \ - --skill-path ${CLAUDE_SKILL_DIR} \ - --model claude-sonnet-4-6 \ - --max-iterations 5 +python ${CLAUDE_SKILL_DIR}/scripts/run_quality_eval.py \ + --runs 3 --executor api \ + --evals ${CLAUDE_SKILL_DIR}/evals/quality_evals.json ``` -`run_loop.py` opens a live HTML accuracy report in the browser automatically. - -**Quality eval** (with/without skill A/B): -1. Spawn two agents in the SAME turn: one WITH skill context loaded, one WITHOUT (baseline) -2. Give both the same task prompt from [evals/evals.json](evals/evals.json) -3. 
Grade each with the grader agent → `grading.json`: - `${CLAUDE_SKILL_DIR}/../ct-skill-creator/agents/grader.md` -4. Blind A/B comparison with the comparator agent → `comparison.json`: - `${CLAUDE_SKILL_DIR}/../ct-skill-creator/agents/comparator.md` -5. Post-hoc analysis with the analyzer agent → `analysis.json`: - `${CLAUDE_SKILL_DIR}/../ct-skill-creator/agents/analyzer.md` -6. Serve the full eval review: - `python ${CLAUDE_SKILL_DIR}/../ct-skill-creator/eval-viewer/generate_review.py ` - (Opens browser at localhost:3117) - -See [references/validation-rules.md](references/validation-rules.md) and -`${CLAUDE_SKILL_DIR}/../ct-skill-creator/references/schemas.md` for JSON output schemas. + +When `skill-evaluator` is the resolved target, the wrapper drives its full +loop: generate → run → grade → aggregate → analyze → detect-regression → +propose. See `skill-evaluator/SKILL.md` for the workflow it actually +executes. + +When `ct-skill-creator` is the resolved fallback, the wrapper invokes its +`run_eval.py` with the same arguments translated to its CLI shape. + +### Manual A/B (if you want to drive runs yourself) + +If you need direct control of how runs are spawned (e.g. inside a real +Claude Code session with subagent isolation), invoke the resolved eval +skill's scripts directly — locate them with: + +```bash +EVAL_SKILL=$(python ${CLAUDE_SKILL_DIR}/scripts/_skill_finder.py skill-evaluator) +``` + +then drive that skill's documented workflow without any further hardcoded +paths in this file. 
--- diff --git a/packages/skills/skills/ct-skill-validator/evals/evals.json b/packages/skills/skills/ct-skill-validator/evals/quality_evals.json similarity index 97% rename from packages/skills/skills/ct-skill-validator/evals/evals.json rename to packages/skills/skills/ct-skill-validator/evals/quality_evals.json index 6837a978d..5b329e8e1 100644 --- a/packages/skills/skills/ct-skill-validator/evals/evals.json +++ b/packages/skills/skills/ct-skill-validator/evals/quality_evals.json @@ -42,7 +42,7 @@ "prompt": "Run the manifest alignment check for ct-skill-validator against the CLEO manifest", "expected_output": "Manifest alignment results showing whether ct-skill-validator is registered correctly in manifest.json and dispatch-config.json", "expectations": [ - "Claude passes --manifest to validate.py or check_manifest.py", + "Claude passes --manifest to validate.py (Tier 4 check)", "The manifest.json path is correctly resolved", "The output shows Tier 4 CLEO Integration results", "Claude reports whether the skill is found in manifest.json" diff --git a/packages/skills/skills/ct-skill-validator/evals/eval_set.json b/packages/skills/skills/ct-skill-validator/evals/trigger_queries.json similarity index 100% rename from packages/skills/skills/ct-skill-validator/evals/eval_set.json rename to packages/skills/skills/ct-skill-validator/evals/trigger_queries.json diff --git a/packages/skills/skills/ct-skill-validator/references/validation-rules.md b/packages/skills/skills/ct-skill-validator/references/validation-rules.md index 0aad4a53d..964d6d8e3 100644 --- a/packages/skills/skills/ct-skill-validator/references/validation-rules.md +++ b/packages/skills/skills/ct-skill-validator/references/validation-rules.md @@ -1,10 +1,10 @@ -# CLEO Skill Validator v2 — Validation Rules +# CLEO Skill Validator — Validation Rules Complete rule reference for the 5-tier validation system. 
## Overview -The CLEO Skill Validator v2 enforces compliance across five tiers of increasing depth: +The CLEO Skill Validator enforces compliance across five tiers of increasing depth: 1. **Structure** — Does the skill have the required files and valid frontmatter? 2. **Frontmatter Quality** — Are all frontmatter fields correct, well-formed, and non-contradictory? @@ -16,42 +16,79 @@ Tiers 1-3 run on every validation. Tiers 4-5 are opt-in via CLI flags. ## Allowed vs Forbidden Fields -### V2_STANDARD (allowed in SKILL.md frontmatter) +### Allowed in SKILL.md frontmatter + +Two groups: **agentskills.io spec fields** (the open standard) and +**Claude Code harness extensions** (honored by the runtime but not part of +the open spec). + +#### From the agentskills.io spec | Field | Type | Required | Description | |-------|------|----------|-------------| -| `name` | string | Yes | Skill identifier, hyphen-case, max 64 chars | +| `name` | string | Yes | Skill identifier, hyphen-case, max 64 chars, must match parent directory name | | `description` | string | Yes | What the skill does and when to use it, max 1024 chars | +| `license` | string | No | License name or reference to a bundled LICENSE file | +| `compatibility` | string | No | Environment requirements (max 500 chars). Only include when the skill needs specific runtime, packages, or network access | +| `metadata` | dict | No | Map from string keys to string values for additional metadata not defined by the spec | +| `allowed-tools` | string or list | No | Tools pre-approved without per-use prompts (experimental in spec) | + +##### Recommended `metadata` sub-keys (string values per spec) + +The agentskills.io spec defines `metadata` as "a map from string keys to string +values". 
Use it for authorship and version info that the spec doesn't define +fields for: + +| Sub-key | Convention | Example | +|---------|-----------|---------| +| `author` | Author name or org | `author: example-org` | +| `version` | Skill version (always quoted as string) | `version: "1.0.0"` | +| `last_updated` | ISO timestamp `YYYY-MM-DD HH:MM:SS` (always quoted) | `last_updated: "2026-05-21 14:00:18"` | +| `related` | Related skills | `related: skill-creator, skill-evaluator` | +| `spec` | Spec the skill claims to follow | `spec: https://agentskills.io/specification.md` | + +The validator emits a WARN when `metadata` is present without any of +`author`, `version`, or `last_updated`. Numeric values like `version: 1.0` are +flagged — the spec requires string values, so quote them: `version: "1.0"`. + +#### Claude Code harness extensions + +These are honored by the Claude Code runtime but are NOT part of the +agentskills.io open spec. Skills targeting other agent runtimes should +either omit them or document the dependency in `compatibility`. + +| Field | Type | Required | Description | +|-------|------|----------|-------------| | `argument-hint` | string | No | Shown in autocomplete, max 100 chars | | `disable-model-invocation` | boolean | No | Prevent model from auto-invoking | | `user-invocable` | boolean | No | Whether skill appears as slash command | -| `allowed-tools` | string or list | No | Tools pre-approved without per-use prompts | | `model` | string | No | Override model for this skill | | `context` | string | No | Must be "fork" if present | | `agent` | string | No | Subagent type (Explore, Plan, etc.) | | `hooks` | dict | No | Skill-scoped lifecycle hooks | -| `license` | string | No | License identifier | -### CLEO_ONLY (forbidden in SKILL.md, belongs in manifest.json) +### CLEO-only fields (forbidden in SKILL.md; belong in `manifest-entry.json`) + +These fields hold CLEO-specific structured data that the Claude runtime +doesn't read. 
They live in `manifest-entry.json` so they don't bloat +the SKILL.md frontmatter or violate the agentskills.io spec. | Field | Destination | |-------|-------------| -| `version` | manifest.json | -| `tier` | manifest.json | -| `core` | manifest.json | -| `category` | manifest.json | -| `protocol` | manifest.json | -| `dependencies` | manifest.json | -| `sharedResources` | manifest.json | -| `compatibility` | manifest.json | -| `token_budget` | manifest.json | -| `capabilities` | manifest.json | -| `constraints` | manifest.json | -| `metadata` | manifest.json | -| `tags` | manifest.json | -| `triggers` | manifest.json | -| `mvi_scope` | manifest.json | -| `requires_tiers` | manifest.json | +| `version` | manifest-entry.json (or `metadata.version` in SKILL.md) | +| `tier` | manifest-entry.json | +| `core` | manifest-entry.json | +| `category` | manifest-entry.json | +| `protocol` | manifest-entry.json | +| `dependencies` | manifest-entry.json | +| `sharedResources` | manifest-entry.json | +| `token_budget` | manifest-entry.json | +| `capabilities` | manifest-entry.json | +| `constraints` | manifest-entry.json | +| `tags` | manifest-entry.json | +| `triggers` | manifest-entry.json | +| `mvi_scope` | manifest-entry.json | +| `requires_tiers` | manifest-entry.json | ## Tier 1 — Structure Rules @@ -94,14 +131,21 @@ Tiers 1-3 run on every validation. Tiers 4-5 are opt-in via CLI flags. 
| T2-024 | `model` is a string if present | ERROR | Use model ID string | | T2-025 | `agent` is a string if present | ERROR | Use agent type string | | T2-026 | `hooks` is a dict if present | ERROR | Use key: value structure | +| T2-027 | `compatibility` is a string if present | ERROR | Use a plain string value | +| T2-028 | `compatibility` is 500 characters or fewer (agentskills.io spec) | ERROR | Shorten or move detail to `references/` | +| T2-029 | `metadata` is a dict if present (agentskills.io spec) | ERROR | Use key: value structure | +| T2-030 | `metadata` keys are all strings (agentskills.io spec) | ERROR | Quote non-string keys | +| T2-031 | `metadata` values are all strings (agentskills.io spec) | WARN | Quote numeric versions: `version: "1.0"` | +| T2-032 | `metadata` includes at least one of: author, version, last_updated | WARN | Add recommended traceability keys | +| T2-033 | `metadata.last_updated` (and `metadata.last_reviewed` if present) match `YYYY-MM-DD HH:MM:SS` | WARN | Use precise timestamp format, e.g. 
`"2026-05-21 14:00:18"` | ## Tier 3 — Body Quality Rules | Rule ID | Check | Severity | Fix | |---------|-------|----------|-----| | T3-001 | Body is present (non-empty content after frontmatter) | WARN | Add content below the closing `---` | -| T3-002 | Body is under 600 lines | ERROR | Split into sub-documents or trim | -| T3-003 | Body is under 400 lines | WARN | Consider trimming for token efficiency | +| T3-002 | Body is under 600 lines (hard cap) | ERROR | Split into sub-documents or trim | +| T3-003 | Body is under 500 lines (agentskills.io spec recommendation) | WARN | Move detail to `references/` for progressive disclosure | | T3-004 | No placeholder text (`[Required:`, `TODO`, `REPLACE`, `[Add content`, `FIXME`, `TBD`) | WARN (per match) | Replace placeholders with real content | | T3-005 | Bodies over 200 lines have `## ` section headers | WARN | Add section structure for readability | | T3-006 | File references (`references/`, `scripts/`) point to existing files | WARN | Create the referenced file or fix the path | @@ -156,9 +200,16 @@ Tiers 1-3 run on every validation. Tiers 4-5 are opt-in via CLI flags. 
| T2-024 | 2 | `model` is string | ERROR | | T2-025 | 2 | `agent` is string | ERROR | | T2-026 | 2 | `hooks` is dict | ERROR | +| T2-027 | 2 | `compatibility` is string | ERROR | +| T2-028 | 2 | `compatibility` max 500 chars | ERROR | +| T2-029 | 2 | `metadata` is dict | ERROR | +| T2-030 | 2 | `metadata` keys are strings | ERROR | +| T2-031 | 2 | `metadata` values are strings | WARN | +| T2-032 | 2 | `metadata` has author/version/last_updated | WARN | +| T2-033 | 2 | `metadata` timestamp keys match `YYYY-MM-DD HH:MM:SS` | WARN | | T3-001 | 3 | Body present | WARN | | T3-002 | 3 | Body under 600 lines | ERROR | -| T3-003 | 3 | Body under 400 lines | WARN | +| T3-003 | 3 | Body under 500 lines (spec) | WARN | | T3-004 | 3 | No placeholder text | WARN | | T3-005 | 3 | Section headers in long bodies | WARN | | T3-006 | 3 | File references exist | WARN | diff --git a/packages/skills/skills/ct-skill-validator/scripts/__init__.py b/packages/skills/skills/ct-skill-validator/scripts/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc b/packages/skills/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc deleted file mode 100644 index 78af56b54..000000000 Binary files a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/audit_body.cpython-314.pyc and /dev/null differ diff --git a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc b/packages/skills/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc deleted file mode 100644 index 8215d81ba..000000000 Binary files a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/check_ecosystem.cpython-314.pyc and /dev/null differ diff --git a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc 
b/packages/skills/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc deleted file mode 100644 index 5de0e3932..000000000 Binary files a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/generate_validation_report.cpython-314.pyc and /dev/null differ diff --git a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc b/packages/skills/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc deleted file mode 100644 index dafd49459..000000000 Binary files a/packages/skills/skills/ct-skill-validator/scripts/__pycache__/validate.cpython-314.pyc and /dev/null differ diff --git a/packages/skills/skills/ct-skill-validator/scripts/_skill_finder.py b/packages/skills/skills/ct-skill-validator/scripts/_skill_finder.py new file mode 100644 index 000000000..5373d91b0 --- /dev/null +++ b/packages/skills/skills/ct-skill-validator/scripts/_skill_finder.py @@ -0,0 +1,211 @@ +#!/usr/bin/env python3 +"""Dynamic skill resolution — finds a sibling skill directory by name without +any hardcoded paths. + +Search order (first match wins): + + 1. $SKILL_FINDER_PATH environment variable — colon-separated override paths. + Each entry is treated as a directory that may contain `<name>/SKILL.md` + OR may itself be the skill directory if its basename matches. + + 2. `~/.claude/skill-finder-paths.txt` — user config; newline-separated root + directories to search. Useful when the install lives in `~/.claude/skills/` + but the project skills live under `/mnt/projects/<project>/skills/`. + Lines starting with `#` are ignored. + + 3. Sibling of the calling skill — `<caller>/../<name>/SKILL.md`. + Most common in awesome-skills layouts where every skill is a peer. + + 4. Two-up + `skills/<name>/` — `<caller>/../../skills/<name>/SKILL.md`. + Matches cleocode `packages/skills/skills/...` and `repo/skills/...`. + + 5. Walk up from the calling skill looking for a `skills/<name>/SKILL.md` + on the ancestor chain AND its project-shaped children (depth-limited). 
+ + 6. `~/.claude/skills/<name>/SKILL.md` — installed Claude Code skill. + +The caller is determined from `Path(__file__).resolve()` — so a skill +running its own script can find a peer without knowing absolute paths. + +Use: + from _skill_finder import find_skill + evaluator = find_skill("skill-evaluator") + if evaluator is None: + sys.exit("skill-evaluator not found") + +CLI: + python _skill_finder.py <name> # prints path, exits 1 if not found + python _skill_finder.py <name> --json +""" + +from __future__ import annotations + +import argparse +import json +import os +import sys +from pathlib import Path +from typing import Iterator + + +MAX_WALK_DEPTH = 6 + + +def _candidates(name: str, caller_skill_dir: Path) -> Iterator[Path]: + """Yield candidate paths in priority order. Each may or may not exist.""" + + # 1. Explicit override via env var (colon-separated paths) + env = os.environ.get("SKILL_FINDER_PATH", "") + for entry in (e for e in env.split(":") if e): + p = Path(entry).expanduser() + # If the entry IS the skill directory, accept it; else treat as parent + if p.is_dir() and p.name == name and (p / "SKILL.md").exists(): + yield p + continue + yield p / name + # Also probe common "skills/" subdirs under each user-configured root + yield p / "skills" / name + yield p / "packages" / "skills" / "skills" / name + + # 2. 
User config file: ~/.claude/skill-finder-paths.txt + cfg = Path.home() / ".claude" / "skill-finder-paths.txt" + if cfg.exists(): + try: + for line in cfg.read_text(encoding="utf-8").splitlines(): + line = line.strip() + if not line or line.startswith("#"): + continue + root = Path(line).expanduser() + yield root / name + yield root / "skills" / name + yield root / "packages" / "skills" / "skills" / name + # Also peek into project-shaped children of the root + if root.is_dir(): + try: + for child in root.iterdir(): + if not child.is_dir() or child.name.startswith("."): + continue + if (child / "skills").is_dir(): + yield child / "skills" / name + if (child / "packages" / "skills" / "skills").is_dir(): + yield child / "packages" / "skills" / "skills" / name + except PermissionError: + pass + except (OSError, UnicodeDecodeError): + pass + + # 3. Direct sibling of the caller + yield caller_skill_dir.parent / name + + # 4. Two-up + skills/<name>/ + yield caller_skill_dir.parent.parent / "skills" / name + + # 5. Walk up looking for skills/<name>/ on the ancestor chain. + # At each ancestor `cur`, probe: + # - cur/skills/<name> (standard layout) + # - cur/packages/skills/skills/<name> (CLEO layout) + # AND iterate the *children* of cur (which are potential project + # roots) and probe: + # - <child>/skills/<name> + # - <child>/packages/skills/skills/<name> + # This finds skills under sibling project roots — e.g. when the + # caller is in /mnt/projects/cleocode/.../ct-skill-validator/ and + # the target is in /mnt/projects/proxmox/skills/<name>/. The walk + # eventually reaches the common ancestor /mnt/projects/ where both + # cleocode and proxmox are children. Bounded by MAX_WALK_DEPTH + # ancestor levels to keep this fast on populated roots. + cur = caller_skill_dir.parent + for _ in range(MAX_WALK_DEPTH): + yield cur / "skills" / name + yield cur / "packages" / "skills" / "skills" / name + # Iterate children, pre-filtering to project-shaped dirs only. 
+ # A project-shaped dir is one that has a `skills/` or + # `packages/skills/skills/` subdir on disk. The pre-filter is one + # extra stat per child but eliminates the vast majority of irrelevant + # candidates before they enter the search, keeping a fully-populated + # ancestor (e.g. 200+ peer projects) sub-second. + if cur.is_dir(): + try: + for child in cur.iterdir(): + if not child.is_dir() or child.name.startswith("."): + continue + if (child / "skills").is_dir(): + yield child / "skills" / name + if (child / "packages" / "skills" / "skills").is_dir(): + yield child / "packages" / "skills" / "skills" / name + except PermissionError: + pass + parent = cur.parent + if parent == cur: + break + cur = parent + + # 6. Installed Claude Code skill + yield Path.home() / ".claude" / "skills" / name + + +def caller_skill_dir() -> Path: + """Resolve the directory of the skill that invoked this helper. + + Assumes this file lives at `<skill>/scripts/_skill_finder.py`. If moved, + the caller can pass `caller=` to `find_skill()` explicitly. + """ + return Path(__file__).resolve().parent.parent + + +def find_skill(name: str, *, caller: Path | None = None) -> Path | None: + """Return the resolved path of the named skill, or None if not found. + + A skill is considered found if `<path>/SKILL.md` exists. + """ + caller_dir = (caller or caller_skill_dir()).resolve() + seen: set[Path] = set() + for cand in _candidates(name, caller_dir): + try: + resolved = cand.resolve() + except (OSError, RuntimeError): + continue + if resolved in seen: + continue + seen.add(resolved) + if (resolved / "SKILL.md").exists(): + return resolved + return None + + +def find_first(names: list[str], *, caller: Path | None = None) -> tuple[str, Path] | None: + """Return the (name, path) of the first found skill from a preference list. + + Lets callers say "prefer skill-evaluator, fall back to ct-skill-creator" + without hardcoding either path. 
+ """ + for n in names: + p = find_skill(n, caller=caller) + if p is not None: + return n, p + return None + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + ap.add_argument("name", help="skill name to resolve") + ap.add_argument("--caller", help="override the calling-skill directory") + ap.add_argument("--json", action="store_true", help="emit JSON instead of bare path") + args = ap.parse_args() + + caller = Path(args.caller).expanduser().resolve() if args.caller else None + path = find_skill(args.name, caller=caller) + + if args.json: + print(json.dumps({"name": args.name, "found": path is not None, + "path": str(path) if path else None}, indent=2)) + else: + if path is None: + print(f"error: skill '{args.name}' not found on search path", file=sys.stderr) + return 1 + print(str(path)) + return 0 if path is not None else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/packages/skills/skills/ct-skill-validator/scripts/audit_body.py b/packages/skills/skills/ct-skill-validator/scripts/audit_body.py index 6b8115938..3bfb43a6b 100644 --- a/packages/skills/skills/ct-skill-validator/scripts/audit_body.py +++ b/packages/skills/skills/ct-skill-validator/scripts/audit_body.py @@ -60,9 +60,13 @@ def ok(section, msg): total_lines = len(body_lines) # ── Section analysis ──────────────────────────────────────────────── - h1_headers = re.findall(r"^# .+", body, re.MULTILINE) - h2_headers = re.findall(r"^## .+", body, re.MULTILINE) - h3_headers = re.findall(r"^### .+", body, re.MULTILINE) + # Strip fenced code blocks first — '#' inside bash blocks are comments, + # not markdown headings, and matching them produces false positives. 
+ body_no_fences = re.sub(r"```[\s\S]*?```", "", body) + body_no_fences_lines = body_no_fences.split("\n") + h1_headers = re.findall(r"^# .+", body_no_fences, re.MULTILINE) + h2_headers = re.findall(r"^## .+", body_no_fences, re.MULTILINE) + h3_headers = re.findall(r"^### .+", body_no_fences, re.MULTILINE) total_sections = len(h2_headers) + len(h3_headers) if len(h1_headers) > 1: @@ -70,7 +74,7 @@ def ok(section, msg): first_h2_line = None first_h3_line = None - for i, line in enumerate(body_lines): + for i, line in enumerate(body_no_fences_lines): if first_h2_line is None and line.startswith("## "): first_h2_line = i if first_h3_line is None and line.startswith("### "): @@ -146,7 +150,8 @@ def ok(section, msg): ok("placeholder-scan", "No placeholder text found") # ── Duplicate headings ────────────────────────────────────────────── - all_headings = re.findall(r"^(#{1,6} .+)", body, re.MULTILINE) + # Reuse code-fence-stripped body for the same reason as section analysis. + all_headings = re.findall(r"^(#{1,6} .+)", body_no_fences, re.MULTILINE) seen: dict[str, bool] = {} dup_found = False for heading in all_headings: diff --git a/packages/skills/skills/ct-skill-validator/scripts/check_depth.py b/packages/skills/skills/ct-skill-validator/scripts/check_depth.py index 18fcb7a4c..e7fb5976d 100644 --- a/packages/skills/skills/ct-skill-validator/scripts/check_depth.py +++ b/packages/skills/skills/ct-skill-validator/scripts/check_depth.py @@ -33,6 +33,7 @@ import re import json import argparse +import datetime from pathlib import Path @@ -42,14 +43,28 @@ MIN_REF_FILES = 3 GOLD_STANDARDS = ("ct-orchestrator", "ct-skill-creator") +# Cadence for the allowlist audit — entries older than this are flagged stale. +ALLOWLIST_STALE_DAYS = 30 +LAST_REVIEWED_RE = re.compile(r"last_reviewed:\s*(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})") + # Allowlist — pre-existing stub skills exempted at T9567 (E-SKILLS-DEPTH-BACKFILL). -# Each entry MUST have a follow-up task ID. 
Remove the entry once that task lands -# a depth backfill. New entries require owner approval — do not add silently. +# Each entry MUST have a follow-up task ID. Remove the entry once that task +# lands a depth backfill. New entries require owner approval — do not add +# silently. +# +# AUDIT CADENCE: review every release cycle (or every 30 days, whichever +# comes first). Each entry below carries `last_reviewed` — if that timestamp +# is older than the cadence, run `python check_depth.py <skill-dir>` for each +# allowlisted skill, decide keep / remove, and bump `last_reviewed` to a +# fresh `date '+%Y-%m-%d %H:%M:%S'` value. See ALLOWLIST_STALE_DAYS above +# for the actual threshold the audit enforces. +# +# Format: skill-name -> "task-id: rationale | last_reviewed: YYYY-MM-DD HH:MM:SS" ALLOWLIST: dict[str, str] = { - "ct-codebase-mapper": "T9567-followup: pre-existing; depth-backfill deferred", - "ct-master-tac": "T9567-followup: pre-existing; depth-backfill deferred", - "ct-memory": "T9567-followup: pre-existing; depth-backfill deferred", - "ct-stickynote": "T9567-followup: ephemeral note skill; minimal-by-design", + "ct-codebase-mapper": "T9567-followup: pre-existing; depth-backfill deferred | last_reviewed: 2026-05-21 14:00:18", + "ct-master-tac": "T9567-followup: pre-existing; depth-backfill deferred | last_reviewed: 2026-05-21 14:00:18", + "ct-memory": "T9567-followup: pre-existing; depth-backfill deferred | last_reviewed: 2026-05-21 14:00:18", + "ct-stickynote": "T9567-followup: ephemeral note skill; minimal-by-design | last_reviewed: 2026-05-21 14:00:18", } @@ -218,6 +233,60 @@ def _print_report(report: dict) -> None: print(f" * {r}") +def audit_allowlist( + *, + now: datetime.datetime | None = None, + stale_days: int = ALLOWLIST_STALE_DAYS, +) -> list[dict]: + """Audit the ALLOWLIST for malformed or stale `last_reviewed` stamps. + + Returns a list of finding dicts: each has `skill`, `severity` (WARN), + `message`, and (when parseable) `age_days`. 
An empty list means every + allowlist entry has a well-formed, fresh stamp. + + `last_reviewed:` must match `YYYY-MM-DD HH:MM:SS`. Stamps older than + `stale_days` are flagged for re-audit. + """ + now = now or datetime.datetime.now() + findings: list[dict] = [] + for skill, rationale in ALLOWLIST.items(): + m = LAST_REVIEWED_RE.search(rationale) + if not m: + findings.append({ + "skill": skill, "severity": "WARN", + "message": "missing or malformed 'last_reviewed: YYYY-MM-DD HH:MM:SS' stamp", + }) + continue + stamp = m.group(1) + try: + ts = datetime.datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S") + except ValueError as e: + findings.append({ + "skill": skill, "severity": "WARN", + "message": f"invalid timestamp '{stamp}': {e}", + }) + continue + age_days = (now - ts).days + if age_days > stale_days: + findings.append({ + "skill": skill, "severity": "WARN", "age_days": age_days, + "message": ( + f"stale: last_reviewed was {age_days}d ago " + f"(cadence: {stale_days}d). Audit and bump the stamp." + ), + }) + return findings + + +def _print_allowlist_audit(findings: list[dict], *, stream=sys.stderr) -> None: + if not findings: + return + print("=== allowlist audit ===", file=stream) + for f in findings: + print(f" ⚠️ {f['skill']}: {f['message']}", file=stream) + print(file=stream) + + def walk_all_skills(root: Path) -> list[Path]: """Find all skill directories under packages/skills/skills/. Skips manifest.json, _shared/, and any dir without SKILL.md.""" @@ -246,8 +315,42 @@ def main() -> int: help="Walk every skill under packages/skills/skills/", ) parser.add_argument("--json", action="store_true", help="Output JSON instead of text") + parser.add_argument( + "--audit-allowlist", action="store_true", + help=( + "Audit ALLOWLIST entries for malformed or stale `last_reviewed` " + "stamps and exit. Exits 1 if any finding is reported." + ), + ) args = parser.parse_args() + # Standalone audit mode — for CI / cron use. 
+ if args.audit_allowlist: + findings = audit_allowlist() + if args.json: + print(json.dumps({ + "stale_days_cadence": ALLOWLIST_STALE_DAYS, + "findings": findings, + "passed": len(findings) == 0, + }, indent=2)) + else: + if findings: + _print_allowlist_audit(findings, stream=sys.stdout) + print(f"=== SUMMARY ===\nFindings: {len(findings)}\nResult: FAIL", + file=sys.stdout) + else: + print("=== allowlist audit ===", file=sys.stdout) + print(f" ✅ all {len(ALLOWLIST)} entries have fresh stamps " + f"(cadence: {ALLOWLIST_STALE_DAYS}d)", file=sys.stdout) + print(f"\n=== SUMMARY ===\nFindings: 0\nResult: PASS", + file=sys.stdout) + return 1 if findings else 0 + + # Background audit — runs on every text-mode invocation (skipped under + # --json), silent when clean, emits to stderr so stdout reports stay clean. + if not args.json: + _print_allowlist_audit(audit_allowlist()) + arg_path = Path(args.skill_dir).resolve() manifest = Path(args.manifest).resolve() if args.manifest else None diff --git a/packages/skills/skills/ct-skill-validator/scripts/check_manifest.py b/packages/skills/skills/ct-skill-validator/scripts/check_manifest.py deleted file mode 100644 index 023d2ec12..000000000 --- a/packages/skills/skills/ct-skill-validator/scripts/check_manifest.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 -""" -CLEO manifest alignment checker. 
-Usage: check_manifest.py [--dispatch-config dispatch-config.json] -""" -import sys -import json -import re -import yaml -import argparse -from pathlib import Path - -MANIFEST_REQUIRED_FIELDS = [ - "name", "version", "description", "path", "status", - "tier", "token_budget", "capabilities", "constraints", -] - - -def check_manifest(skill_path, manifest_path, dispatch_config_path=None): - """Check manifest alignment for a skill.""" - skill_dir = Path(skill_path).resolve() - skill_name = skill_dir.name - manifest_file = Path(manifest_path).resolve() - errors = 0 - warnings = 0 - - def error(msg): - nonlocal errors - errors += 1 - print(f" \u274c ERROR: {msg}") - - def warn(msg): - nonlocal warnings - warnings += 1 - print(f" \u26a0\ufe0f WARN: {msg}") - - def ok(msg): - print(f" \u2705 {msg}") - - print(f"\n=== CLEO Manifest Check: {skill_name} ===\n") - - # ── Read SKILL.md frontmatter ─────────────────────────────────────── - print("--- SKILL.md ---") - skill_md = skill_dir / "SKILL.md" - if not skill_md.exists(): - error("SKILL.md does not exist") - _print_summary(errors, warnings) - return errors - - raw_content = skill_md.read_text(encoding="utf-8") - fm_match = re.match(r"^---\n(.*?)\n---", raw_content, re.DOTALL) - if not fm_match: - error("Could not extract frontmatter from SKILL.md") - _print_summary(errors, warnings) - return errors - - try: - frontmatter = yaml.safe_load(fm_match.group(1)) - except yaml.YAMLError as e: - error(f"Frontmatter YAML parse error: {e}") - _print_summary(errors, warnings) - return errors - - if not isinstance(frontmatter, dict): - error("Frontmatter is not a dict") - _print_summary(errors, warnings) - return errors - - fm_name = frontmatter.get("name", skill_name) - ok(f"SKILL.md frontmatter read (name: '{fm_name}')") - - # ── Read manifest.json ────────────────────────────────────────────── - print("\n--- Manifest ---") - if not manifest_file.exists(): - error(f"Manifest file not found: {manifest_path}") - _print_summary(errors, 
warnings) - return errors - - try: - manifest_data = json.loads(manifest_file.read_text(encoding="utf-8")) - except json.JSONDecodeError as e: - error(f"Manifest is not valid JSON: {e}") - _print_summary(errors, warnings) - return errors - - ok("Manifest parsed successfully") - - skills_list = manifest_data.get("skills", []) - matching = [s for s in skills_list if s.get("name") == fm_name] - - if not matching: - error(f"Skill '{fm_name}' not found in manifest.json skills[] array") - _print_summary(errors, warnings) - return errors - - ok(f"Skill '{fm_name}' found in manifest.json") - entry = matching[0] - - # Check required fields - print("\n--- Required Fields ---") - missing_fields = [] - for field in MANIFEST_REQUIRED_FIELDS: - if field not in entry: - warn(f"Missing required field: '{field}'") - missing_fields.append(field) - else: - ok(f"'{field}' present") - - # ── Dispatch config check ─────────────────────────────────────────── - if dispatch_config_path: - print("\n--- Dispatch Config ---") - dc_file = Path(dispatch_config_path).resolve() - if not dc_file.exists(): - error(f"Dispatch config not found: {dispatch_config_path}") - else: - try: - dc_data = json.loads(dc_file.read_text(encoding="utf-8")) - except json.JSONDecodeError as e: - error(f"Dispatch config is not valid JSON: {e}") - dc_data = None - - if dc_data is not None: - overrides = dc_data.get("skill_overrides", {}) - if fm_name not in overrides: - warn(f"Skill '{fm_name}' not found in dispatch-config.json skill_overrides") - else: - ok(f"Skill '{fm_name}' found in dispatch-config.json") - - _print_summary(errors, warnings) - return errors - - -def _print_summary(errors, warnings): - """Print the check summary.""" - print(f"\n=== SUMMARY ===") - print(f"Errors: {errors}") - print(f"Warnings: {warnings}") - - if errors > 0: - print(f"Result: FAIL") - elif warnings > 0: - print(f"Result: PASS (with warnings)") - else: - print(f"Result: PASS") - - -def main(): - parser = argparse.ArgumentParser( - 
description="CLEO manifest alignment checker" - ) - parser.add_argument("skill_dir", help="Path to the skill directory") - parser.add_argument("manifest", help="Path to manifest.json") - parser.add_argument("--dispatch-config", help="Path to dispatch-config.json") - - args = parser.parse_args() - - skill_path = Path(args.skill_dir).resolve() - if not skill_path.is_dir(): - print(f"Error: '{args.skill_dir}' is not a directory", file=sys.stderr) - sys.exit(1) - - error_count = check_manifest( - skill_path, - args.manifest, - dispatch_config_path=args.dispatch_config, - ) - - sys.exit(1 if error_count > 0 else 0) - - -if __name__ == "__main__": - main() diff --git a/packages/skills/skills/ct-skill-validator/scripts/generate_validation_report.py b/packages/skills/skills/ct-skill-validator/scripts/generate_validation_report.py index c29b623d2..fa6e30961 100644 --- a/packages/skills/skills/ct-skill-validator/scripts/generate_validation_report.py +++ b/packages/skills/skills/ct-skill-validator/scripts/generate_validation_report.py @@ -254,7 +254,7 @@ def generate_html( eco_html = _ecosystem_section(ecosystem) if ecosystem else '
CLEO Ecosystem Compliance — Not yet run
Run: python check_ecosystem.py <skill-dir> | ecosystem-checker agent | save to ecosystem-check.json
' - grading_html = _grading_section(grading) if grading else '
Quality Eval — Grading not yet run
Run A/B eval using ct-skill-creator agents/grader.md then pass --grading grading.json
' + grading_html = _grading_section(grading) if grading else '
Quality Eval — Grading not yet run
Run: python scripts/run_quality_eval.py <skill-dir> (dispatches dynamically to skill-evaluator), then pass --grading grading.json
' comparison_html = _comparison_section(comparison) if comparison else "" diff --git a/packages/skills/skills/ct-skill-validator/scripts/run_quality_eval.py b/packages/skills/skills/ct-skill-validator/scripts/run_quality_eval.py new file mode 100644 index 000000000..7343363a3 --- /dev/null +++ b/packages/skills/skills/ct-skill-validator/scripts/run_quality_eval.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python3 +"""Phase 3 dispatcher — delegates runtime quality eval to a dedicated skill. + +Prefers `skill-evaluator` (the dedicated quality-eval skill) and falls back +to `ct-skill-creator` (legacy eval infrastructure) when skill-evaluator +isn't found. Uses `_skill_finder.py` to resolve the target dynamically — +no hardcoded cross-skill paths. + +Usage: + run_quality_eval.py <skill-dir> # full quality eval + run_quality_eval.py <skill-dir> --trigger # trigger-accuracy only + run_quality_eval.py <skill-dir> --runs 3 --executor api + run_quality_eval.py --list # show what's reachable + +Exit codes: + 0 — eval ran (or was prepared, in --executor print mode) + 1 — eval skill, target skill directory, or expected eval script not found + 2 — eval script inside the target skill exited non-zero +""" + +from __future__ import annotations + +import argparse +import subprocess +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parent)) +from _skill_finder import find_first # noqa: E402 + + +# Preference order — first found wins. Lets the user opt-in to a different +# eval skill via $SKILL_FINDER_PATH without code changes. 
+PREFERENCE = ["skill-evaluator", "ct-skill-creator"] + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__.splitlines()[0]) + ap.add_argument("skill_dir", nargs="?", help="path to the skill being evaluated") + ap.add_argument("--trigger", action="store_true", + help="run trigger-accuracy eval only (description_eval.py)") + ap.add_argument("--runs", type=int, default=3, help="repeated runs per case") + ap.add_argument("--executor", default=None, + help="executor for the eval (passes through to the target script)") + ap.add_argument("--list", action="store_true", + help="show what eval skill would be used and exit") + ap.add_argument("--evals", default=None, help="explicit evals.json path") + args, extra = ap.parse_known_args() + + resolved = find_first(PREFERENCE) + if resolved is None: + print( + f"error: none of {PREFERENCE} were found on the search path. " + f"Set SKILL_FINDER_PATH or install one of them.", + file=sys.stderr, + ) + return 1 + eval_skill_name, eval_skill_path = resolved + + if args.list: + print(f"will use: {eval_skill_name} at {eval_skill_path}") + return 0 + + if not args.skill_dir: + ap.error("skill_dir is required unless --list is given") + + target_skill = Path(args.skill_dir).expanduser().resolve() + if not (target_skill / "SKILL.md").exists(): + print(f"error: '{args.skill_dir}' is not a skill directory (no SKILL.md)", + file=sys.stderr) + return 1 + + # Pick the right script per target eval skill + if eval_skill_name == "skill-evaluator": + if args.trigger: + script = eval_skill_path / "scripts" / "description_eval.py" + cmd = ["python3", str(script), "--skill", str(target_skill), "--runs", str(args.runs)] + else: + script = eval_skill_path / "scripts" / "run_eval.py" + cmd = ["python3", str(script), "--skill", str(target_skill), "--runs", str(args.runs)] + if args.evals: + cmd += ["--evals", args.evals] + if args.executor: + cmd += ["--executor", args.executor] + else: + # ct-skill-creator legacy paths + if 
args.trigger: + script = eval_skill_path / "scripts" / "run_eval.py" + cmd = ["python3", str(script), "--skill-path", str(target_skill)] + if args.evals: + cmd += ["--eval-set", args.evals] + else: + script = eval_skill_path / "scripts" / "run_eval.py" + cmd = ["python3", str(script), "--skill-path", str(target_skill)] + if args.evals: + cmd += ["--eval-set", args.evals] + + if not script.exists(): + print(f"error: expected script not found: {script}", file=sys.stderr) + return 1 + + cmd += extra # pass any additional flags straight through + print(f"[run_quality_eval] dispatching to {eval_skill_name}: {' '.join(cmd)}", + file=sys.stderr) + rc = subprocess.run(cmd).returncode + return 0 if rc == 0 else 2 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/packages/skills/skills/ct-skill-validator/scripts/validate.py b/packages/skills/skills/ct-skill-validator/scripts/validate.py index 461ec0a2b..c009d9b18 100644 --- a/packages/skills/skills/ct-skill-validator/scripts/validate.py +++ b/packages/skills/skills/ct-skill-validator/scripts/validate.py @@ -17,16 +17,44 @@ import argparse from pathlib import Path -V2_STANDARD = { - "name", "description", "argument-hint", "disable-model-invocation", - "user-invocable", "allowed-tools", "model", "context", "agent", "hooks", - "license", +# Frontmatter fields allowed directly in SKILL.md. +# +# Sources (in order of authority): +# 1. agentskills.io spec — name, description, license, compatibility, +# metadata, allowed-tools +# https://agentskills.io/specification.md +# 2. Claude Code harness extensions — argument-hint, disable-model-invocation, +# user-invocable, model, context, agent, hooks +# (honored by the runtime but not part of the open spec) +# +# Anything in CLEO_ONLY is reserved for manifest-entry.json — the validator +# rejects those at SKILL.md top level. 
+# +# Per-spec author conventions for `metadata` (sub-keys, all strings): +# author, version, last_updated, related, spec +RECOMMENDED_METADATA_KEYS = {"author", "version", "last_updated"} + +# Timestamp keys inside `metadata` whose value should match the precision +# convention. The agentskills.io spec doesn't pin a format, but we enforce +# YYYY-MM-DD HH:MM:SS for both `last_updated` (metadata convention) and +# `last_reviewed` (audit/allowlist convention) so audit trails stay precise. +TIMESTAMP_METADATA_KEYS = ("last_updated", "last_reviewed") +TIMESTAMP_FORMAT_RE = re.compile(r"^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$") + +SPEC_FRONTMATTER = { + "name", "description", "license", "compatibility", "metadata", "allowed-tools", } +HARNESS_EXTENSIONS = { + "argument-hint", "disable-model-invocation", "user-invocable", + "model", "context", "agent", "hooks", +} +ALLOWED_FRONTMATTER = SPEC_FRONTMATTER | HARNESS_EXTENSIONS + CLEO_ONLY = { "version", "tier", "core", "category", "protocol", - "dependencies", "sharedResources", "compatibility", + "dependencies", "sharedResources", "token_budget", "capabilities", "constraints", - "metadata", "tags", "triggers", "mvi_scope", "requires_tiers", + "tags", "triggers", "mvi_scope", "requires_tiers", } MANIFEST_REQUIRED_FIELDS = [ @@ -213,6 +241,53 @@ def ok(tier, msg): if hooks_val is not None and not isinstance(hooks_val, dict): error(tier, "'hooks' must be a dict") + # compatibility checks (agentskills.io spec: max 500 chars) + compat_val = frontmatter.get("compatibility") + if compat_val is not None: + if not isinstance(compat_val, str): + error(tier, "'compatibility' must be a string") + elif len(compat_val) > 500: + error(tier, f"'compatibility' exceeds 500 characters (got: {len(compat_val)})") + else: + ok(tier, "'compatibility' is valid") + + # metadata checks (agentskills.io spec: map from string keys to string values) + metadata_val = frontmatter.get("metadata") + if metadata_val is not None: + if not 
isinstance(metadata_val, dict): + error(tier, "'metadata' must be a dict (map from string keys to string values)") + else: + non_string_keys = [k for k in metadata_val if not isinstance(k, str)] + if non_string_keys: + error(tier, f"'metadata' keys must all be strings (got non-string: {non_string_keys[:3]})") + non_string_vals = [k for k, v in metadata_val.items() if not isinstance(v, str)] + if non_string_vals: + warn(tier, ( + f"'metadata' values should be strings per agentskills.io spec; " + f"non-string keys: {non_string_vals[:3]} (quote numeric versions: \"1.0\" not 1.0)" + )) + present_recommended = RECOMMENDED_METADATA_KEYS & set(metadata_val.keys()) + if not present_recommended: + warn(tier, ( + "'metadata' present but contains none of the recommended keys " + f"({', '.join(sorted(RECOMMENDED_METADATA_KEYS))}); consider adding for traceability" + )) + else: + ok(tier, f"'metadata' has recommended key(s): {', '.join(sorted(present_recommended))}") + # Timestamp format check on convention keys (precision: YYYY-MM-DD HH:MM:SS). + # The spec is silent on format; this is our audit-precision convention. + for ts_key in TIMESTAMP_METADATA_KEYS: + ts_val = metadata_val.get(ts_key) + if ts_val is None or not isinstance(ts_val, str): + continue + if not TIMESTAMP_FORMAT_RE.match(ts_val): + warn(tier, ( + f"'metadata.{ts_key}' should match 'YYYY-MM-DD HH:MM:SS' " + f"(got: {ts_val[:40]!r}); use a value like \"2026-05-21 14:00:18\"" + )) + else: + ok(tier, f"'metadata.{ts_key}' has valid timestamp format") + # ── Tier 3 — Body Quality ─────────────────────────────────────────── tier = 3 @@ -228,10 +303,13 @@ def ok(tier, msg): body_lines = body.split("\n") line_count = len(body_lines) + # Thresholds aligned with agentskills.io spec recommendation + # ("Keep your main SKILL.md under 500 lines"). 600 is the hard cap; + # 500 is the soft cap from the spec. 
if line_count >= 600: - error(tier, f"Body is too long: {line_count} lines (max 600)") - elif line_count >= 400: - warn(tier, f"Body is getting long: {line_count} lines (warn threshold: 400)") + error(tier, f"Body is too long: {line_count} lines (hard cap 600)") + elif line_count >= 500: + warn(tier, f"Body exceeds spec recommendation: {line_count} lines (keep under 500)") else: ok(tier, f"Body length OK ({line_count} lines)")